Commit
·
6806363
1
Parent(s):
8da3518
Custom env variables should now override the defaults for the Lambda function. Usage logs should now be created correctly when running with the Lambda function.
Browse files
- lambda_entrypoint.py +36 -6
- tools/cli_usage_logger.py +5 -5
- tools/file_redaction.py +1 -3
lambda_entrypoint.py
CHANGED
|
@@ -199,8 +199,26 @@ def lambda_handler(event, context):
|
|
| 199 |
print("Detected .env file, loading environment variables...")
|
| 200 |
|
| 201 |
# Load environment variables from the .env file
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
# Extract the actual input file path from environment variables
|
| 206 |
# Look for common environment variable names that might contain the input file path
|
|
@@ -252,6 +270,14 @@ def lambda_handler(event, context):
|
|
| 252 |
# 4. Prepare arguments for the CLI function
|
| 253 |
# This dictionary should mirror the one in your app.py's "direct mode"
|
| 254 |
# If we loaded a .env file, use environment variables as defaults
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
cli_args = {
|
| 256 |
# Task Selection
|
| 257 |
"task": arguments.get("task", os.getenv("DIRECT_MODE_TASK", "redact")),
|
|
@@ -289,7 +315,7 @@ def lambda_handler(event, context):
|
|
| 289 |
"do_initial_clean", os.getenv("DO_INITIAL_TABULAR_DATA_CLEAN", "False")
|
| 290 |
),
|
| 291 |
"save_logs_to_csv": arguments.get(
|
| 292 |
-
"save_logs_to_csv", os.getenv("SAVE_LOGS_TO_CSV", "
|
| 293 |
),
|
| 294 |
"save_logs_to_dynamodb": arguments.get(
|
| 295 |
"save_logs_to_dynamodb", os.getenv("SAVE_LOGS_TO_DYNAMODB", "False")
|
|
@@ -325,9 +351,7 @@ def lambda_handler(event, context):
|
|
| 325 |
os.getenv("SPACY_MODEL_PATH", os.environ["SPACY_MODEL_PATH"]),
|
| 326 |
),
|
| 327 |
# PDF/Image Redaction Arguments
|
| 328 |
-
"ocr_method": arguments.get(
|
| 329 |
-
"ocr_method", os.getenv("TESSERACT_TEXT_EXTRACT_OPTION", "Local OCR")
|
| 330 |
-
),
|
| 331 |
"page_min": int(
|
| 332 |
arguments.get("page_min", os.getenv("DEFAULT_PAGE_MIN", DEFAULT_PAGE_MIN))
|
| 333 |
),
|
|
@@ -471,6 +495,12 @@ def lambda_handler(event, context):
|
|
| 471 |
"prepare_images": arguments.get("prepare_images", True),
|
| 472 |
}
|
| 473 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
# Combine extraction options
|
| 475 |
extraction_options = (
|
| 476 |
_get_env_list(cli_args["handwrite_signature_extraction"])
|
|
|
|
| 199 |
print("Detected .env file, loading environment variables...")
|
| 200 |
|
| 201 |
# Load environment variables from the .env file
|
| 202 |
+
print(f"Loading .env file from: {input_file_path}")
|
| 203 |
+
|
| 204 |
+
# Check if file exists and is readable
|
| 205 |
+
if os.path.exists(input_file_path):
|
| 206 |
+
print(".env file exists and is readable")
|
| 207 |
+
with open(input_file_path, "r") as f:
|
| 208 |
+
content = f.read()
|
| 209 |
+
print(f".env file content preview: {content[:200]}...")
|
| 210 |
+
else:
|
| 211 |
+
print(f"ERROR: .env file does not exist at {input_file_path}")
|
| 212 |
+
|
| 213 |
+
load_dotenv(input_file_path, override=True)
|
| 214 |
+
print("Environment variables loaded from .env file (with override=True)")
|
| 215 |
+
|
| 216 |
+
# Debug: Print the loaded environment variables
|
| 217 |
+
print(f"DEFAULT_PAGE_MIN from env: {os.getenv('DEFAULT_PAGE_MIN')}")
|
| 218 |
+
print(f"DEFAULT_PAGE_MAX from env: {os.getenv('DEFAULT_PAGE_MAX')}")
|
| 219 |
+
print(
|
| 220 |
+
f"All DEFAULT_PAGE_* env vars: {[k for k in os.environ.keys() if 'DEFAULT_PAGE' in k]}"
|
| 221 |
+
)
|
| 222 |
|
| 223 |
# Extract the actual input file path from environment variables
|
| 224 |
# Look for common environment variable names that might contain the input file path
|
|
|
|
| 270 |
# 4. Prepare arguments for the CLI function
|
| 271 |
# This dictionary should mirror the one in your app.py's "direct mode"
|
| 272 |
# If we loaded a .env file, use environment variables as defaults
|
| 273 |
+
|
| 274 |
+
# Debug: Print environment variables before constructing cli_args
|
| 275 |
+
print("Before cli_args construction:")
|
| 276 |
+
print(f" DEFAULT_PAGE_MIN from env: {os.getenv('DEFAULT_PAGE_MIN')}")
|
| 277 |
+
print(f" DEFAULT_PAGE_MAX from env: {os.getenv('DEFAULT_PAGE_MAX')}")
|
| 278 |
+
print(f" DEFAULT_PAGE_MIN from config: {DEFAULT_PAGE_MIN}")
|
| 279 |
+
print(f" DEFAULT_PAGE_MAX from config: {DEFAULT_PAGE_MAX}")
|
| 280 |
+
|
| 281 |
cli_args = {
|
| 282 |
# Task Selection
|
| 283 |
"task": arguments.get("task", os.getenv("DIRECT_MODE_TASK", "redact")),
|
|
|
|
| 315 |
"do_initial_clean", os.getenv("DO_INITIAL_TABULAR_DATA_CLEAN", "False")
|
| 316 |
),
|
| 317 |
"save_logs_to_csv": arguments.get(
|
| 318 |
+
"save_logs_to_csv", os.getenv("SAVE_LOGS_TO_CSV", "True")
|
| 319 |
),
|
| 320 |
"save_logs_to_dynamodb": arguments.get(
|
| 321 |
"save_logs_to_dynamodb", os.getenv("SAVE_LOGS_TO_DYNAMODB", "False")
|
|
|
|
| 351 |
os.getenv("SPACY_MODEL_PATH", os.environ["SPACY_MODEL_PATH"]),
|
| 352 |
),
|
| 353 |
# PDF/Image Redaction Arguments
|
| 354 |
+
"ocr_method": arguments.get("ocr_method", os.getenv("OCR_METHOD", "Local OCR")),
|
|
|
|
|
|
|
| 355 |
"page_min": int(
|
| 356 |
arguments.get("page_min", os.getenv("DEFAULT_PAGE_MIN", DEFAULT_PAGE_MIN))
|
| 357 |
),
|
|
|
|
| 495 |
"prepare_images": arguments.get("prepare_images", True),
|
| 496 |
}
|
| 497 |
|
| 498 |
+
# Debug: Print the final page_min and page_max values
|
| 499 |
+
print(f"Final cli_args page_min: {cli_args['page_min']}")
|
| 500 |
+
print(f"Final cli_args page_max: {cli_args['page_max']}")
|
| 501 |
+
print(f"Final cli_args save_logs_to_csv: {cli_args['save_logs_to_csv']}")
|
| 502 |
+
print(f"Final cli_args usage_logs_folder: {cli_args['usage_logs_folder']}")
|
| 503 |
+
|
| 504 |
# Combine extraction options
|
| 505 |
extraction_options = (
|
| 506 |
_get_env_list(cli_args["handwrite_signature_extraction"])
|
tools/cli_usage_logger.py
CHANGED
|
@@ -217,13 +217,13 @@ def create_cli_usage_logger(logs_folder: str = None) -> CLIUsageLogger:
|
|
| 217 |
Returns:
|
| 218 |
Configured CLIUsageLogger instance
|
| 219 |
"""
|
| 220 |
-
#
|
| 221 |
-
import json
|
| 222 |
-
|
| 223 |
try:
|
| 224 |
-
headers =
|
|
|
|
|
|
|
| 225 |
except Exception as e:
|
| 226 |
-
print(f"Error
|
| 227 |
# Fallback headers if parsing fails
|
| 228 |
headers = [
|
| 229 |
"session_hash_textbox",
|
|
|
|
| 217 |
Returns:
|
| 218 |
Configured CLIUsageLogger instance
|
| 219 |
"""
|
| 220 |
+
# Use CSV headers from config (already parsed as list)
|
|
|
|
|
|
|
| 221 |
try:
|
| 222 |
+
headers = CSV_USAGE_LOG_HEADERS
|
| 223 |
+
if not headers or len(headers) == 0:
|
| 224 |
+
raise ValueError("Empty headers list")
|
| 225 |
except Exception as e:
|
| 226 |
+
print(f"Error using CSV usage log headers: {e}")
|
| 227 |
# Fallback headers if parsing fails
|
| 228 |
headers = [
|
| 229 |
"session_hash_textbox",
|
tools/file_redaction.py
CHANGED
|
@@ -1298,9 +1298,7 @@ def choose_and_run_redactor(
|
|
| 1298 |
output_folder + pdf_file_name_without_ext + "_redacted.pdf"
|
| 1299 |
)
|
| 1300 |
# Add page range suffix if partial processing
|
| 1301 |
-
|
| 1302 |
-
f"page_min: {page_min}, current_loop_page: {current_loop_page}, number_of_pages: {number_of_pages}"
|
| 1303 |
-
)
|
| 1304 |
out_redacted_pdf_file_path = add_page_range_suffix_to_file_path(
|
| 1305 |
out_redacted_pdf_file_path,
|
| 1306 |
page_min,
|
|
|
|
| 1298 |
output_folder + pdf_file_name_without_ext + "_redacted.pdf"
|
| 1299 |
)
|
| 1300 |
# Add page range suffix if partial processing
|
| 1301 |
+
|
|
|
|
|
|
|
| 1302 |
out_redacted_pdf_file_path = add_page_range_suffix_to_file_path(
|
| 1303 |
out_redacted_pdf_file_path,
|
| 1304 |
page_min,
|