Commit
·
5f824f4
1
Parent(s):
6806363
Revised environment variables for consistency.
Browse files
- README.md +2 -2
- app.py +7 -24
- cdk/cdk_config.py +12 -0
- cdk/cdk_functions.py +2 -2
- cli_redact.py +3 -1
- example_config.env +1 -1
- lambda_entrypoint.py +80 -38
- src/app_settings.qmd +6 -6
- src/installation_guide.qmd +2 -2
- tools/aws_functions.py +14 -14
- tools/aws_textract.py +7 -10
- tools/cli_usage_logger.py +3 -3
- tools/config.py +125 -216
- tools/data_anonymise.py +2 -2
- tools/file_conversion.py +1 -1
- tools/file_redaction.py +6 -6
README.md
CHANGED
|
@@ -176,8 +176,8 @@ These settings are useful for all users, regardless of whether you are using AWS
|
|
| 176 |
|
| 177 |
These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
|
| 178 |
|
| 179 |
-
* `RUN_AWS_FUNCTIONS=
|
| 180 |
-
* **This is the master switch.** You must set this to `
|
| 181 |
|
| 182 |
* **UI Options:**
|
| 183 |
* `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
|
|
|
|
| 176 |
|
| 177 |
These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
|
| 178 |
|
| 179 |
+
* `RUN_AWS_FUNCTIONS=True`
|
| 180 |
+
* **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
|
| 181 |
|
| 182 |
* **UI Options:**
|
| 183 |
* `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
|
app.py
CHANGED
|
@@ -5806,7 +5806,7 @@ with blocks:
|
|
| 5806 |
if (
|
| 5807 |
not os.path.exists(ALLOW_LIST_PATH)
|
| 5808 |
and S3_ALLOW_LIST_PATH
|
| 5809 |
-
and RUN_AWS_FUNCTIONS
|
| 5810 |
):
|
| 5811 |
print("Downloading allow list from S3")
|
| 5812 |
blocks.load(
|
|
@@ -5840,7 +5840,7 @@ with blocks:
|
|
| 5840 |
if (
|
| 5841 |
not os.path.exists(COST_CODES_PATH)
|
| 5842 |
and S3_COST_CODES_PATH
|
| 5843 |
-
and RUN_AWS_FUNCTIONS
|
| 5844 |
):
|
| 5845 |
print("Downloading cost codes from S3")
|
| 5846 |
blocks.load(
|
|
@@ -6423,9 +6423,9 @@ with blocks:
|
|
| 6423 |
default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT),
|
| 6424 |
)
|
| 6425 |
|
| 6426 |
-
if RUN_DIRECT_MODE
|
| 6427 |
# If running through command line with uvicorn
|
| 6428 |
-
if RUN_FASTAPI
|
| 6429 |
if ALLOWED_ORIGINS:
|
| 6430 |
print(f"CORS enabled. Allowing origins: {ALLOWED_ORIGINS}")
|
| 6431 |
app.add_middleware(
|
|
@@ -6448,7 +6448,7 @@ with blocks:
|
|
| 6448 |
app,
|
| 6449 |
blocks,
|
| 6450 |
show_error=True,
|
| 6451 |
-
auth=authenticate_user if COGNITO_AUTH
|
| 6452 |
max_file_size=MAX_FILE_SIZE,
|
| 6453 |
path=FASTAPI_ROOT_PATH,
|
| 6454 |
favicon_path=Path(FAVICON_PATH),
|
|
@@ -6459,7 +6459,7 @@ with blocks:
|
|
| 6459 |
|
| 6460 |
else:
|
| 6461 |
if __name__ == "__main__":
|
| 6462 |
-
if COGNITO_AUTH
|
| 6463 |
blocks.launch(
|
| 6464 |
show_error=True,
|
| 6465 |
inbrowser=True,
|
|
@@ -6519,7 +6519,7 @@ with blocks:
|
|
| 6519 |
"save_logs_to_csv": SAVE_LOGS_TO_CSV,
|
| 6520 |
"save_logs_to_dynamodb": SAVE_LOGS_TO_DYNAMODB,
|
| 6521 |
"display_file_names_in_logs": DISPLAY_FILE_NAMES_IN_LOGS,
|
| 6522 |
-
"upload_logs_to_s3": RUN_AWS_FUNCTIONS
|
| 6523 |
"s3_logs_prefix": S3_USAGE_LOGS_FOLDER,
|
| 6524 |
"feedback_logs_folder": FEEDBACK_LOGS_FOLDER,
|
| 6525 |
"access_logs_folder": ACCESS_LOGS_FOLDER,
|
|
@@ -6601,20 +6601,3 @@ with blocks:
|
|
| 6601 |
|
| 6602 |
# Run the CLI main function with direct mode arguments
|
| 6603 |
main(direct_mode_args=direct_mode_args)
|
| 6604 |
-
|
| 6605 |
-
# Combine extraction options
|
| 6606 |
-
extraction_options = (
|
| 6607 |
-
list(direct_mode_args["handwrite_signature_extraction"])
|
| 6608 |
-
if direct_mode_args["handwrite_signature_extraction"]
|
| 6609 |
-
else []
|
| 6610 |
-
)
|
| 6611 |
-
if direct_mode_args["extract_forms"]:
|
| 6612 |
-
extraction_options.append("Extract forms")
|
| 6613 |
-
if direct_mode_args["extract_tables"]:
|
| 6614 |
-
extraction_options.append("Extract tables")
|
| 6615 |
-
if direct_mode_args["extract_layout"]:
|
| 6616 |
-
extraction_options.append("Extract layout")
|
| 6617 |
-
direct_mode_args["handwrite_signature_extraction"] = extraction_options
|
| 6618 |
-
|
| 6619 |
-
# Run the CLI main function with direct mode arguments
|
| 6620 |
-
main(direct_mode_args=direct_mode_args)
|
|
|
|
| 5806 |
if (
|
| 5807 |
not os.path.exists(ALLOW_LIST_PATH)
|
| 5808 |
and S3_ALLOW_LIST_PATH
|
| 5809 |
+
and RUN_AWS_FUNCTIONS
|
| 5810 |
):
|
| 5811 |
print("Downloading allow list from S3")
|
| 5812 |
blocks.load(
|
|
|
|
| 5840 |
if (
|
| 5841 |
not os.path.exists(COST_CODES_PATH)
|
| 5842 |
and S3_COST_CODES_PATH
|
| 5843 |
+
and RUN_AWS_FUNCTIONS
|
| 5844 |
):
|
| 5845 |
print("Downloading cost codes from S3")
|
| 5846 |
blocks.load(
|
|
|
|
| 6423 |
default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT),
|
| 6424 |
)
|
| 6425 |
|
| 6426 |
+
if not RUN_DIRECT_MODE:
|
| 6427 |
# If running through command line with uvicorn
|
| 6428 |
+
if RUN_FASTAPI:
|
| 6429 |
if ALLOWED_ORIGINS:
|
| 6430 |
print(f"CORS enabled. Allowing origins: {ALLOWED_ORIGINS}")
|
| 6431 |
app.add_middleware(
|
|
|
|
| 6448 |
app,
|
| 6449 |
blocks,
|
| 6450 |
show_error=True,
|
| 6451 |
+
auth=authenticate_user if COGNITO_AUTH else None,
|
| 6452 |
max_file_size=MAX_FILE_SIZE,
|
| 6453 |
path=FASTAPI_ROOT_PATH,
|
| 6454 |
favicon_path=Path(FAVICON_PATH),
|
|
|
|
| 6459 |
|
| 6460 |
else:
|
| 6461 |
if __name__ == "__main__":
|
| 6462 |
+
if COGNITO_AUTH:
|
| 6463 |
blocks.launch(
|
| 6464 |
show_error=True,
|
| 6465 |
inbrowser=True,
|
|
|
|
| 6519 |
"save_logs_to_csv": SAVE_LOGS_TO_CSV,
|
| 6520 |
"save_logs_to_dynamodb": SAVE_LOGS_TO_DYNAMODB,
|
| 6521 |
"display_file_names_in_logs": DISPLAY_FILE_NAMES_IN_LOGS,
|
| 6522 |
+
"upload_logs_to_s3": RUN_AWS_FUNCTIONS,
|
| 6523 |
"s3_logs_prefix": S3_USAGE_LOGS_FOLDER,
|
| 6524 |
"feedback_logs_folder": FEEDBACK_LOGS_FOLDER,
|
| 6525 |
"access_logs_folder": ACCESS_LOGS_FOLDER,
|
|
|
|
| 6601 |
|
| 6602 |
# Run the CLI main function with direct mode arguments
|
| 6603 |
main(direct_mode_args=direct_mode_args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cdk/cdk_config.py
CHANGED
|
@@ -6,6 +6,18 @@ from dotenv import load_dotenv
|
|
| 6 |
# Set or retrieve configuration variables for CDK redaction deployment
|
| 7 |
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
|
| 10 |
"""
|
| 11 |
Get an environmental variable, and set it to a default value if it doesn't exist
|
|
|
|
| 6 |
# Set or retrieve configuration variables for CDK redaction deployment
|
| 7 |
|
| 8 |
|
| 9 |
+
def convert_string_to_boolean(value: str) -> bool:
    """Convert a boolean-like configuration value to a real ``bool``.

    Matching is case-insensitive and ignores surrounding whitespace, so
    ``"True"``, ``"true"``, ``" TRUE "`` and ``"1"`` all map to ``True``;
    ``"False"``, ``"false"`` and ``"0"`` map to ``False``.

    Args:
        value: The value to interpret. Normally a string read from the
            environment; an existing ``bool`` is returned unchanged and the
            integers ``0`` / ``1`` are accepted as well.

    Returns:
        The boolean meaning of ``value``.

    Raises:
        ValueError: If ``value`` is not one of the recognised spellings
            (including ``None`` and the empty string).
    """
    # bool must be tested before int: bool is a subclass of int in Python.
    if isinstance(value, bool):
        return value

    # Treat integer 0/1 the same as their string spellings.
    if isinstance(value, int):
        value = str(value)

    if isinstance(value, str):
        normalised = value.strip().lower()
        if normalised in ("true", "1"):
            return True
        if normalised in ("false", "0"):
            return False

    # Anything else is rejected loudly rather than silently read as False.
    raise ValueError(f"Invalid boolean value: {value}")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
|
| 22 |
"""
|
| 23 |
Get an environmental variable, and set it to a default value if it doesn't exist
|
cdk/cdk_functions.py
CHANGED
|
@@ -1335,8 +1335,8 @@ def create_basic_config_env(
|
|
| 1335 |
Create a basic config.env file for the user to use with their newly deployed redaction app.
|
| 1336 |
"""
|
| 1337 |
variables = {
|
| 1338 |
-
"COGNITO_AUTH": "
|
| 1339 |
-
"RUN_AWS_FUNCTIONS": "
|
| 1340 |
"DISPLAY_FILE_NAMES_IN_LOGS": "False",
|
| 1341 |
"SESSION_OUTPUT_FOLDER": "True",
|
| 1342 |
"SAVE_LOGS_TO_DYNAMODB": "True",
|
|
|
|
| 1335 |
Create a basic config.env file for the user to use with their newly deployed redaction app.
|
| 1336 |
"""
|
| 1337 |
variables = {
|
| 1338 |
+
"COGNITO_AUTH": "True",
|
| 1339 |
+
"RUN_AWS_FUNCTIONS": "True",
|
| 1340 |
"DISPLAY_FILE_NAMES_IN_LOGS": "False",
|
| 1341 |
"SESSION_OUTPUT_FOLDER": "True",
|
| 1342 |
"SAVE_LOGS_TO_DYNAMODB": "True",
|
cli_redact.py
CHANGED
|
@@ -341,7 +341,7 @@ python cli_redact.py --task textract --textract_action list
|
|
| 341 |
)
|
| 342 |
general_group.add_argument(
|
| 343 |
"--upload_logs_to_s3",
|
| 344 |
-
default=RUN_AWS_FUNCTIONS
|
| 345 |
help="Upload log files to S3 after processing.",
|
| 346 |
)
|
| 347 |
general_group.add_argument(
|
|
@@ -762,6 +762,8 @@ python cli_redact.py --task textract --textract_action list
|
|
| 762 |
output_folder=args.output_dir,
|
| 763 |
input_folder=args.input_dir,
|
| 764 |
prepare_images=args.prepare_images,
|
|
|
|
|
|
|
| 765 |
)
|
| 766 |
print(f"Preparation complete. {prep_summary}")
|
| 767 |
|
|
|
|
| 341 |
)
|
| 342 |
general_group.add_argument(
|
| 343 |
"--upload_logs_to_s3",
|
| 344 |
+
default=RUN_AWS_FUNCTIONS,
|
| 345 |
help="Upload log files to S3 after processing.",
|
| 346 |
)
|
| 347 |
general_group.add_argument(
|
|
|
|
| 762 |
output_folder=args.output_dir,
|
| 763 |
input_folder=args.input_dir,
|
| 764 |
prepare_images=args.prepare_images,
|
| 765 |
+
page_min=args.page_min,
|
| 766 |
+
page_max=args.page_max,
|
| 767 |
)
|
| 768 |
print(f"Preparation complete. {prep_summary}")
|
| 769 |
|
example_config.env
CHANGED
|
@@ -6,7 +6,7 @@ CHOSEN_LOCAL_OCR_MODEL=tesseract
|
|
| 6 |
SESSION_OUTPUT_FOLDER=False
|
| 7 |
DISPLAY_FILE_NAMES_IN_LOGS=False
|
| 8 |
|
| 9 |
-
RUN_AWS_FUNCTIONS=
|
| 10 |
SAVE_LOGS_TO_DYNAMODB=True
|
| 11 |
S3_COST_CODES_PATH=cost_codes.csv
|
| 12 |
SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
|
|
|
|
| 6 |
SESSION_OUTPUT_FOLDER=False
|
| 7 |
DISPLAY_FILE_NAMES_IN_LOGS=False
|
| 8 |
|
| 9 |
+
RUN_AWS_FUNCTIONS=True # Set to False if you don't want to run AWS functions
|
| 10 |
SAVE_LOGS_TO_DYNAMODB=True
|
| 11 |
S3_COST_CODES_PATH=cost_codes.csv
|
| 12 |
SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
|
lambda_entrypoint.py
CHANGED
|
@@ -43,6 +43,18 @@ def _get_env_list(env_var_name: str | list[str] | None) -> list[str]:
|
|
| 43 |
return [s.strip() for s in value.split(",") if s.strip()]
|
| 44 |
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
print("Lambda entrypoint loading...")
|
| 47 |
|
| 48 |
# Initialize S3 client outside the handler for connection reuse
|
|
@@ -293,8 +305,10 @@ def lambda_handler(event, context):
|
|
| 293 |
"username": arguments.get(
|
| 294 |
"username", os.getenv("DIRECT_MODE_DEFAULT_USER", "lambda_user")
|
| 295 |
),
|
| 296 |
-
"save_to_user_folders":
|
| 297 |
-
|
|
|
|
|
|
|
| 298 |
),
|
| 299 |
"local_redact_entities": _get_env_list(
|
| 300 |
arguments.get(
|
|
@@ -312,20 +326,26 @@ def lambda_handler(event, context):
|
|
| 312 |
"aws_region": os.getenv("AWS_REGION", ""),
|
| 313 |
"s3_bucket": bucket_name,
|
| 314 |
"do_initial_clean": arguments.get(
|
| 315 |
-
"do_initial_clean",
|
|
|
|
|
|
|
|
|
|
| 316 |
),
|
| 317 |
-
"save_logs_to_csv":
|
| 318 |
-
"save_logs_to_csv", os.getenv("SAVE_LOGS_TO_CSV", "True")
|
| 319 |
),
|
| 320 |
"save_logs_to_dynamodb": arguments.get(
|
| 321 |
-
"save_logs_to_dynamodb",
|
|
|
|
| 322 |
),
|
| 323 |
-
"display_file_names_in_logs":
|
| 324 |
-
|
| 325 |
-
|
|
|
|
|
|
|
| 326 |
),
|
| 327 |
-
"upload_logs_to_s3":
|
| 328 |
-
"upload_logs_to_s3", os.getenv("RUN_AWS_FUNCTIONS", "False")
|
| 329 |
),
|
| 330 |
"s3_logs_prefix": arguments.get(
|
| 331 |
"s3_logs_prefix", os.getenv("S3_USAGE_LOGS_FOLDER", "")
|
|
@@ -364,15 +384,21 @@ def lambda_handler(event, context):
|
|
| 364 |
"chosen_local_ocr_model": arguments.get(
|
| 365 |
"chosen_local_ocr_model", os.getenv("CHOSEN_LOCAL_OCR_MODEL", "tesseract")
|
| 366 |
),
|
| 367 |
-
"preprocess_local_ocr_images":
|
| 368 |
-
|
| 369 |
-
|
|
|
|
|
|
|
| 370 |
),
|
| 371 |
-
"compress_redacted_pdf":
|
| 372 |
-
|
|
|
|
|
|
|
| 373 |
),
|
| 374 |
-
"return_pdf_end_of_redaction":
|
| 375 |
-
|
|
|
|
|
|
|
| 376 |
),
|
| 377 |
"deny_list_file": arguments.get(
|
| 378 |
"deny_list_file", os.getenv("DENY_LIST_PATH", "")
|
|
@@ -392,17 +418,23 @@ def lambda_handler(event, context):
|
|
| 392 |
),
|
| 393 |
)
|
| 394 |
),
|
| 395 |
-
"extract_forms":
|
| 396 |
-
|
| 397 |
-
|
|
|
|
|
|
|
| 398 |
),
|
| 399 |
-
"extract_tables":
|
| 400 |
-
|
| 401 |
-
|
|
|
|
|
|
|
| 402 |
),
|
| 403 |
-
"extract_layout":
|
| 404 |
-
|
| 405 |
-
|
|
|
|
|
|
|
| 406 |
),
|
| 407 |
# Word/Tabular Anonymisation Arguments
|
| 408 |
"anon_strategy": arguments.get(
|
|
@@ -424,9 +456,11 @@ def lambda_handler(event, context):
|
|
| 424 |
),
|
| 425 |
)
|
| 426 |
),
|
| 427 |
-
"match_fuzzy_whole_phrase_bool":
|
| 428 |
-
|
| 429 |
-
|
|
|
|
|
|
|
| 430 |
),
|
| 431 |
# Duplicate Detection Arguments
|
| 432 |
"duplicate_type": arguments.get(
|
|
@@ -455,19 +489,25 @@ def lambda_handler(event, context):
|
|
| 455 |
),
|
| 456 |
)
|
| 457 |
),
|
| 458 |
-
"greedy_match":
|
| 459 |
-
|
|
|
|
|
|
|
| 460 |
),
|
| 461 |
-
"combine_pages":
|
| 462 |
-
"combine_pages", os.getenv("DEFAULT_COMBINE_PAGES", "True")
|
| 463 |
),
|
| 464 |
-
"remove_duplicate_rows":
|
| 465 |
-
|
|
|
|
|
|
|
| 466 |
),
|
| 467 |
# Textract Batch Operations Arguments
|
| 468 |
"textract_action": arguments.get("textract_action", ""),
|
| 469 |
"job_id": arguments.get("job_id", ""),
|
| 470 |
-
"extract_signatures":
|
|
|
|
|
|
|
| 471 |
"textract_bucket": arguments.get(
|
| 472 |
"textract_bucket", os.getenv("TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET", "")
|
| 473 |
),
|
|
@@ -492,7 +532,9 @@ def lambda_handler(event, context):
|
|
| 492 |
"search_query": arguments.get(
|
| 493 |
"search_query", os.getenv("DEFAULT_SEARCH_QUERY", "")
|
| 494 |
),
|
| 495 |
-
"prepare_images":
|
|
|
|
|
|
|
| 496 |
}
|
| 497 |
|
| 498 |
# Debug: Print the final page_min and page_max values
|
|
|
|
| 43 |
return [s.strip() for s in value.split(",") if s.strip()]
|
| 44 |
|
| 45 |
|
| 46 |
+
def convert_string_to_boolean(value: str) -> bool:
|
| 47 |
+
"""Convert string to boolean, handling various formats."""
|
| 48 |
+
if isinstance(value, bool):
|
| 49 |
+
return value
|
| 50 |
+
elif value in ["True", "1", "true", "TRUE"]:
|
| 51 |
+
return True
|
| 52 |
+
elif value in ["False", "0", "false", "FALSE"]:
|
| 53 |
+
return False
|
| 54 |
+
else:
|
| 55 |
+
raise ValueError(f"Invalid boolean value: {value}")
|
| 56 |
+
|
| 57 |
+
|
| 58 |
print("Lambda entrypoint loading...")
|
| 59 |
|
| 60 |
# Initialize S3 client outside the handler for connection reuse
|
|
|
|
| 305 |
"username": arguments.get(
|
| 306 |
"username", os.getenv("DIRECT_MODE_DEFAULT_USER", "lambda_user")
|
| 307 |
),
|
| 308 |
+
"save_to_user_folders": convert_string_to_boolean(
|
| 309 |
+
arguments.get(
|
| 310 |
+
"save_to_user_folders", os.getenv("SESSION_OUTPUT_FOLDER", "False")
|
| 311 |
+
)
|
| 312 |
),
|
| 313 |
"local_redact_entities": _get_env_list(
|
| 314 |
arguments.get(
|
|
|
|
| 326 |
"aws_region": os.getenv("AWS_REGION", ""),
|
| 327 |
"s3_bucket": bucket_name,
|
| 328 |
"do_initial_clean": arguments.get(
|
| 329 |
+
"do_initial_clean",
|
| 330 |
+
convert_string_to_boolean(
|
| 331 |
+
os.getenv("DO_INITIAL_TABULAR_DATA_CLEAN", "False")
|
| 332 |
+
),
|
| 333 |
),
|
| 334 |
+
"save_logs_to_csv": convert_string_to_boolean(
|
| 335 |
+
arguments.get("save_logs_to_csv", os.getenv("SAVE_LOGS_TO_CSV", "True"))
|
| 336 |
),
|
| 337 |
"save_logs_to_dynamodb": arguments.get(
|
| 338 |
+
"save_logs_to_dynamodb",
|
| 339 |
+
convert_string_to_boolean(os.getenv("SAVE_LOGS_TO_DYNAMODB", "False")),
|
| 340 |
),
|
| 341 |
+
"display_file_names_in_logs": convert_string_to_boolean(
|
| 342 |
+
arguments.get(
|
| 343 |
+
"display_file_names_in_logs",
|
| 344 |
+
os.getenv("DISPLAY_FILE_NAMES_IN_LOGS", "True"),
|
| 345 |
+
)
|
| 346 |
),
|
| 347 |
+
"upload_logs_to_s3": convert_string_to_boolean(
|
| 348 |
+
arguments.get("upload_logs_to_s3", os.getenv("RUN_AWS_FUNCTIONS", "False"))
|
| 349 |
),
|
| 350 |
"s3_logs_prefix": arguments.get(
|
| 351 |
"s3_logs_prefix", os.getenv("S3_USAGE_LOGS_FOLDER", "")
|
|
|
|
| 384 |
"chosen_local_ocr_model": arguments.get(
|
| 385 |
"chosen_local_ocr_model", os.getenv("CHOSEN_LOCAL_OCR_MODEL", "tesseract")
|
| 386 |
),
|
| 387 |
+
"preprocess_local_ocr_images": convert_string_to_boolean(
|
| 388 |
+
arguments.get(
|
| 389 |
+
"preprocess_local_ocr_images",
|
| 390 |
+
os.getenv("PREPROCESS_LOCAL_OCR_IMAGES", "True"),
|
| 391 |
+
)
|
| 392 |
),
|
| 393 |
+
"compress_redacted_pdf": convert_string_to_boolean(
|
| 394 |
+
arguments.get(
|
| 395 |
+
"compress_redacted_pdf", os.getenv("COMPRESS_REDACTED_PDF", "True")
|
| 396 |
+
)
|
| 397 |
),
|
| 398 |
+
"return_pdf_end_of_redaction": convert_string_to_boolean(
|
| 399 |
+
arguments.get(
|
| 400 |
+
"return_pdf_end_of_redaction", os.getenv("RETURN_REDACTED_PDF", "True")
|
| 401 |
+
)
|
| 402 |
),
|
| 403 |
"deny_list_file": arguments.get(
|
| 404 |
"deny_list_file", os.getenv("DENY_LIST_PATH", "")
|
|
|
|
| 418 |
),
|
| 419 |
)
|
| 420 |
),
|
| 421 |
+
"extract_forms": convert_string_to_boolean(
|
| 422 |
+
arguments.get(
|
| 423 |
+
"extract_forms",
|
| 424 |
+
os.getenv("INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION", "False"),
|
| 425 |
+
)
|
| 426 |
),
|
| 427 |
+
"extract_tables": convert_string_to_boolean(
|
| 428 |
+
arguments.get(
|
| 429 |
+
"extract_tables",
|
| 430 |
+
os.getenv("INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION", "False"),
|
| 431 |
+
)
|
| 432 |
),
|
| 433 |
+
"extract_layout": convert_string_to_boolean(
|
| 434 |
+
arguments.get(
|
| 435 |
+
"extract_layout",
|
| 436 |
+
os.getenv("INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION", "False"),
|
| 437 |
+
)
|
| 438 |
),
|
| 439 |
# Word/Tabular Anonymisation Arguments
|
| 440 |
"anon_strategy": arguments.get(
|
|
|
|
| 456 |
),
|
| 457 |
)
|
| 458 |
),
|
| 459 |
+
"match_fuzzy_whole_phrase_bool": convert_string_to_boolean(
|
| 460 |
+
arguments.get(
|
| 461 |
+
"match_fuzzy_whole_phrase_bool",
|
| 462 |
+
os.getenv("MATCH_FUZZY_WHOLE_PHRASE_BOOL", "True"),
|
| 463 |
+
)
|
| 464 |
),
|
| 465 |
# Duplicate Detection Arguments
|
| 466 |
"duplicate_type": arguments.get(
|
|
|
|
| 489 |
),
|
| 490 |
)
|
| 491 |
),
|
| 492 |
+
"greedy_match": convert_string_to_boolean(
|
| 493 |
+
arguments.get(
|
| 494 |
+
"greedy_match", os.getenv("USE_GREEDY_DUPLICATE_DETECTION", "False")
|
| 495 |
+
)
|
| 496 |
),
|
| 497 |
+
"combine_pages": convert_string_to_boolean(
|
| 498 |
+
arguments.get("combine_pages", os.getenv("DEFAULT_COMBINE_PAGES", "True"))
|
| 499 |
),
|
| 500 |
+
"remove_duplicate_rows": convert_string_to_boolean(
|
| 501 |
+
arguments.get(
|
| 502 |
+
"remove_duplicate_rows", os.getenv("REMOVE_DUPLICATE_ROWS", "False")
|
| 503 |
+
)
|
| 504 |
),
|
| 505 |
# Textract Batch Operations Arguments
|
| 506 |
"textract_action": arguments.get("textract_action", ""),
|
| 507 |
"job_id": arguments.get("job_id", ""),
|
| 508 |
+
"extract_signatures": convert_string_to_boolean(
|
| 509 |
+
arguments.get("extract_signatures", "False")
|
| 510 |
+
),
|
| 511 |
"textract_bucket": arguments.get(
|
| 512 |
"textract_bucket", os.getenv("TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET", "")
|
| 513 |
),
|
|
|
|
| 532 |
"search_query": arguments.get(
|
| 533 |
"search_query", os.getenv("DEFAULT_SEARCH_QUERY", "")
|
| 534 |
),
|
| 535 |
+
"prepare_images": convert_string_to_boolean(
|
| 536 |
+
arguments.get("prepare_images", "True")
|
| 537 |
+
),
|
| 538 |
}
|
| 539 |
|
| 540 |
# Debug: Print the final page_min and page_max values
|
src/app_settings.qmd
CHANGED
|
@@ -28,8 +28,8 @@ This section covers configurations related to AWS services used by the applicati
|
|
| 28 |
* **Configuration:** Set as an environment variable directly. This variable defines an additional source for AWS-specific configurations.
|
| 29 |
|
| 30 |
* **`RUN_AWS_FUNCTIONS`**
|
| 31 |
-
* **Description:** Enables or disables AWS-specific functionalities within the application. Set to `"
|
| 32 |
-
* **Default Value:** `"
|
| 33 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
|
| 34 |
|
| 35 |
* **`AWS_REGION`**
|
|
@@ -392,13 +392,13 @@ General runtime configurations for the application.
|
|
| 392 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
| 393 |
|
| 394 |
* **`COGNITO_AUTH`**
|
| 395 |
-
* **Description:** Enables or disables AWS Cognito authentication for the application. Set to `'
|
| 396 |
-
* **Default Value:** `'
|
| 397 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
|
| 398 |
|
| 399 |
* **`RUN_DIRECT_MODE`**
|
| 400 |
-
* **Description:** If set to `'
|
| 401 |
-
* **Default Value:** `'
|
| 402 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
| 403 |
|
| 404 |
* **`MAX_QUEUE_SIZE`**
|
|
|
|
| 28 |
* **Configuration:** Set as an environment variable directly. This variable defines an additional source for AWS-specific configurations.
|
| 29 |
|
| 30 |
* **`RUN_AWS_FUNCTIONS`**
|
| 31 |
+
* **Description:** Enables or disables AWS-specific functionalities within the application. Set to `"True"` to enable and `"False"` to disable.
|
| 32 |
+
* **Default Value:** `"False"`
|
| 33 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
|
| 34 |
|
| 35 |
* **`AWS_REGION`**
|
|
|
|
| 392 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
| 393 |
|
| 394 |
* **`COGNITO_AUTH`**
|
| 395 |
+
* **Description:** Enables or disables AWS Cognito authentication for the application. Set to `'True'` to enable.
|
| 396 |
+
* **Default Value:** `'False'`
|
| 397 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
|
| 398 |
|
| 399 |
* **`RUN_DIRECT_MODE`**
|
| 400 |
+
* **Description:** If set to `'True'`, runs the application in a "direct mode", which might alter certain behaviors (e.g., UI elements, processing flow).
|
| 401 |
+
* **Default Value:** `'False'`
|
| 402 |
* **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
|
| 403 |
|
| 404 |
* **`MAX_QUEUE_SIZE`**
|
src/installation_guide.qmd
CHANGED
|
@@ -83,7 +83,7 @@ AWS_ACCOUNT_ID=1234567890 # AWS account ID that has administrator access that yo
|
|
| 83 |
CDK_FOLDER=C:/path_to_cdk_folder/ # The place where the cdk folder code is located
|
| 84 |
CONTEXT_FILE=C:/path_to_cdk_folder/cdk.context.json
|
| 85 |
COGNITO_USER_POOL_DOMAIN_PREFIX=redaction-12345 # The prefix of the login / user sign up domain that you want to use with Cognito login. Should not contain the terms amazon, aws, or cognito.
|
| 86 |
-
COGNITO_AUTH=
|
| 87 |
USE_CLOUDFRONT=True # Recommended. If you intend to use CloudFront as the front URL to your application load balancer (ALB). This has some extra security features that you won't get with just an ALB, e.g. limiting app access by country.
|
| 88 |
RUN_USEAST_STACK=False # Set this to True only if you have permissions to create a Cloudfront distribution and web ACL on top of it in the us-east-1 region. If you don't, the section below shows how you can create the CloudFront resource manually and map it to your application load balancer (as you should have permissions for that if you are admin in your region).
|
| 89 |
CLOUDFRONT_DOMAIN=<example>.cloudfront.net # If you already know the domain of the CloudFront distribution that you want to use, you can add this here.
|
|
@@ -155,7 +155,7 @@ if you want to do this manually:
|
|
| 155 |
Create a `config.env` file to upload to the S3 bucket that has at least the following variables:
|
| 156 |
|
| 157 |
```ini
|
| 158 |
-
COGNITO_AUTH=
|
| 159 |
RUN_AWS_FUNCTIONS=1 # This will enable the app to communicate with AWS services.
|
| 160 |
SESSION_OUTPUT_FOLDER=True # This will put outputs for each user in separate output folders.
|
| 161 |
```
|
|
|
|
| 83 |
CDK_FOLDER=C:/path_to_cdk_folder/ # The place where the cdk folder code is located
|
| 84 |
CONTEXT_FILE=C:/path_to_cdk_folder/cdk.context.json
|
| 85 |
COGNITO_USER_POOL_DOMAIN_PREFIX=redaction-12345 # The prefix of the login / user sign up domain that you want to use with Cognito login. Should not contain the terms amazon, aws, or cognito.
|
| 86 |
+
COGNITO_AUTH=0 # Whether to enable in-app authentication (username and password only). Not necessary if you are using an SSL certificate, as recommended below.
|
| 87 |
USE_CLOUDFRONT=True # Recommended. If you intend to use CloudFront as the front URL to your application load balancer (ALB). This has some extra security features that you won't get with just an ALB, e.g. limiting app access by country.
|
| 88 |
RUN_USEAST_STACK=False # Set this to True only if you have permissions to create a Cloudfront distribution and web ACL on top of it in the us-east-1 region. If you don't, the section below shows how you can create the CloudFront resource manually and map it to your application load balancer (as you should have permissions for that if you are admin in your region).
|
| 89 |
CLOUDFRONT_DOMAIN=<example>.cloudfront.net # If you already know the domain of the CloudFront distribution that you want to use, you can add this here.
|
|
|
|
| 155 |
Create a `config.env` file to upload to the S3 bucket that has at least the following variables:
|
| 156 |
|
| 157 |
```ini
|
| 158 |
+
COGNITO_AUTH=0 # If you are using an SSL certificate with your application load balancer, you will be logging in there. Set this to 0 to turn off the default login screen.
|
| 159 |
RUN_AWS_FUNCTIONS=1 # This will enable the app to communicate with AWS services.
|
| 160 |
SESSION_OUTPUT_FOLDER=True # This will put outputs for each user in separate output folders.
|
| 161 |
```
|
tools/aws_functions.py
CHANGED
|
@@ -29,7 +29,7 @@ def get_assumed_role_info():
|
|
| 29 |
return assumed_role_arn, assumed_role_name
|
| 30 |
|
| 31 |
|
| 32 |
-
if RUN_AWS_FUNCTIONS
|
| 33 |
try:
|
| 34 |
session = boto3.Session(region_name=AWS_REGION)
|
| 35 |
|
|
@@ -52,10 +52,10 @@ def download_file_from_s3(
|
|
| 52 |
bucket_name: str,
|
| 53 |
key: str,
|
| 54 |
local_file_path_and_name: str,
|
| 55 |
-
RUN_AWS_FUNCTIONS:
|
| 56 |
):
|
| 57 |
|
| 58 |
-
if RUN_AWS_FUNCTIONS
|
| 59 |
|
| 60 |
try:
|
| 61 |
# Ensure the local directory exists
|
|
@@ -74,12 +74,12 @@ def download_folder_from_s3(
|
|
| 74 |
bucket_name: str,
|
| 75 |
s3_folder: str,
|
| 76 |
local_folder: str,
|
| 77 |
-
RUN_AWS_FUNCTIONS:
|
| 78 |
):
|
| 79 |
"""
|
| 80 |
Download all files from an S3 folder to a local folder.
|
| 81 |
"""
|
| 82 |
-
if RUN_AWS_FUNCTIONS
|
| 83 |
if bucket_name and s3_folder and local_folder:
|
| 84 |
|
| 85 |
s3 = boto3.client("s3", region_name=AWS_REGION)
|
|
@@ -117,13 +117,13 @@ def download_files_from_s3(
|
|
| 117 |
s3_folder: str,
|
| 118 |
local_folder: str,
|
| 119 |
filenames: List[str],
|
| 120 |
-
RUN_AWS_FUNCTIONS:
|
| 121 |
):
|
| 122 |
"""
|
| 123 |
Download specific files from an S3 folder to a local folder.
|
| 124 |
"""
|
| 125 |
|
| 126 |
-
if RUN_AWS_FUNCTIONS
|
| 127 |
if bucket_name and s3_folder and local_folder and filenames:
|
| 128 |
|
| 129 |
s3 = boto3.client("s3", region_name=AWS_REGION)
|
|
@@ -169,7 +169,7 @@ def upload_file_to_s3(
|
|
| 169 |
local_file_paths: List[str],
|
| 170 |
s3_key: str,
|
| 171 |
s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
|
| 172 |
-
RUN_AWS_FUNCTIONS:
|
| 173 |
):
|
| 174 |
"""
|
| 175 |
Uploads a file from local machine to Amazon S3.
|
|
@@ -182,10 +182,10 @@ def upload_file_to_s3(
|
|
| 182 |
Returns:
|
| 183 |
- Message as variable/printed to console
|
| 184 |
"""
|
| 185 |
-
final_out_message =
|
| 186 |
final_out_message_str = ""
|
| 187 |
|
| 188 |
-
if RUN_AWS_FUNCTIONS
|
| 189 |
try:
|
| 190 |
if s3_bucket and s3_key and local_file_paths:
|
| 191 |
|
|
@@ -236,8 +236,8 @@ def upload_log_file_to_s3(
|
|
| 236 |
local_file_paths: List[str],
|
| 237 |
s3_key: str,
|
| 238 |
s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
|
| 239 |
-
RUN_AWS_FUNCTIONS:
|
| 240 |
-
SAVE_LOGS_TO_CSV:
|
| 241 |
):
|
| 242 |
"""
|
| 243 |
Uploads a log file from local machine to Amazon S3.
|
|
@@ -250,10 +250,10 @@ def upload_log_file_to_s3(
|
|
| 250 |
Returns:
|
| 251 |
- Message as variable/printed to console
|
| 252 |
"""
|
| 253 |
-
final_out_message =
|
| 254 |
final_out_message_str = ""
|
| 255 |
|
| 256 |
-
if RUN_AWS_FUNCTIONS
|
| 257 |
try:
|
| 258 |
if s3_bucket and s3_key and local_file_paths:
|
| 259 |
|
|
|
|
| 29 |
return assumed_role_arn, assumed_role_name
|
| 30 |
|
| 31 |
|
| 32 |
+
if RUN_AWS_FUNCTIONS:
|
| 33 |
try:
|
| 34 |
session = boto3.Session(region_name=AWS_REGION)
|
| 35 |
|
|
|
|
| 52 |
bucket_name: str,
|
| 53 |
key: str,
|
| 54 |
local_file_path_and_name: str,
|
| 55 |
+
RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
|
| 56 |
):
|
| 57 |
|
| 58 |
+
if RUN_AWS_FUNCTIONS:
|
| 59 |
|
| 60 |
try:
|
| 61 |
# Ensure the local directory exists
|
|
|
|
| 74 |
bucket_name: str,
|
| 75 |
s3_folder: str,
|
| 76 |
local_folder: str,
|
| 77 |
+
RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
|
| 78 |
):
|
| 79 |
"""
|
| 80 |
Download all files from an S3 folder to a local folder.
|
| 81 |
"""
|
| 82 |
+
if RUN_AWS_FUNCTIONS:
|
| 83 |
if bucket_name and s3_folder and local_folder:
|
| 84 |
|
| 85 |
s3 = boto3.client("s3", region_name=AWS_REGION)
|
|
|
|
| 117 |
s3_folder: str,
|
| 118 |
local_folder: str,
|
| 119 |
filenames: List[str],
|
| 120 |
+
RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
|
| 121 |
):
|
| 122 |
"""
|
| 123 |
Download specific files from an S3 folder to a local folder.
|
| 124 |
"""
|
| 125 |
|
| 126 |
+
if RUN_AWS_FUNCTIONS:
|
| 127 |
if bucket_name and s3_folder and local_folder and filenames:
|
| 128 |
|
| 129 |
s3 = boto3.client("s3", region_name=AWS_REGION)
|
|
|
|
| 169 |
local_file_paths: List[str],
|
| 170 |
s3_key: str,
|
| 171 |
s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
|
| 172 |
+
RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
|
| 173 |
):
|
| 174 |
"""
|
| 175 |
Uploads a file from local machine to Amazon S3.
|
|
|
|
| 182 |
Returns:
|
| 183 |
- Message as variable/printed to console
|
| 184 |
"""
|
| 185 |
+
final_out_message = list()
|
| 186 |
final_out_message_str = ""
|
| 187 |
|
| 188 |
+
if RUN_AWS_FUNCTIONS:
|
| 189 |
try:
|
| 190 |
if s3_bucket and s3_key and local_file_paths:
|
| 191 |
|
|
|
|
| 236 |
local_file_paths: List[str],
|
| 237 |
s3_key: str,
|
| 238 |
s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
|
| 239 |
+
RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
|
| 240 |
+
SAVE_LOGS_TO_CSV: bool = SAVE_LOGS_TO_CSV,
|
| 241 |
):
|
| 242 |
"""
|
| 243 |
Uploads a log file from local machine to Amazon S3.
|
|
|
|
| 250 |
Returns:
|
| 251 |
- Message as variable/printed to console
|
| 252 |
"""
|
| 253 |
+
final_out_message = list()
|
| 254 |
final_out_message_str = ""
|
| 255 |
|
| 256 |
+
if RUN_AWS_FUNCTIONS and SAVE_LOGS_TO_CSV:
|
| 257 |
try:
|
| 258 |
if s3_bucket and s3_key and local_file_paths:
|
| 259 |
|
tools/aws_textract.py
CHANGED
|
@@ -38,8 +38,8 @@ def analyse_page_with_textract(
|
|
| 38 |
textract_output_found: bool = False,
|
| 39 |
aws_access_question_textbox: str = AWS_ACCESS_KEY,
|
| 40 |
aws_secret_question_textbox: str = AWS_SECRET_KEY,
|
| 41 |
-
RUN_AWS_FUNCTIONS:
|
| 42 |
-
PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
|
| 43 |
):
|
| 44 |
"""
|
| 45 |
Analyzes a single page of a document using AWS Textract to extract text and other features.
|
|
@@ -62,12 +62,12 @@ def analyse_page_with_textract(
|
|
| 62 |
SSO or environment variables. Defaults to AWS_ACCESS_KEY.
|
| 63 |
aws_secret_question_textbox (str, optional): AWS secret question provided by the user, if not using
|
| 64 |
SSO or environment variables. Defaults to AWS_SECRET_KEY.
|
| 65 |
-
RUN_AWS_FUNCTIONS (
|
| 66 |
disable AWS functions. Defaults to RUN_AWS_FUNCTIONS.
|
| 67 |
-
PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS (
|
| 68 |
to prioritize AWS SSO credentials
|
| 69 |
over environment variables.
|
| 70 |
-
Defaults to
|
| 71 |
|
| 72 |
Returns:
|
| 73 |
Tuple[List[Dict], str]: A tuple containing:
|
|
@@ -79,10 +79,7 @@ def analyse_page_with_textract(
|
|
| 79 |
if client == "":
|
| 80 |
try:
|
| 81 |
# Try to connect to AWS Textract Client if using that text extraction method
|
| 82 |
-
if
|
| 83 |
-
RUN_AWS_FUNCTIONS == "1"
|
| 84 |
-
and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1"
|
| 85 |
-
):
|
| 86 |
print("Connecting to Textract via existing SSO connection")
|
| 87 |
client = boto3.client("textract", region_name=AWS_REGION)
|
| 88 |
elif aws_access_question_textbox and aws_secret_question_textbox:
|
|
@@ -95,7 +92,7 @@ def analyse_page_with_textract(
|
|
| 95 |
aws_secret_access_question=aws_secret_question_textbox,
|
| 96 |
region_name=AWS_REGION,
|
| 97 |
)
|
| 98 |
-
elif RUN_AWS_FUNCTIONS
|
| 99 |
print("Connecting to Textract via existing SSO connection")
|
| 100 |
client = boto3.client("textract", region_name=AWS_REGION)
|
| 101 |
elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
|
|
|
|
| 38 |
textract_output_found: bool = False,
|
| 39 |
aws_access_question_textbox: str = AWS_ACCESS_KEY,
|
| 40 |
aws_secret_question_textbox: str = AWS_SECRET_KEY,
|
| 41 |
+
RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
|
| 42 |
+
PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS: bool = PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
|
| 43 |
):
|
| 44 |
"""
|
| 45 |
Analyzes a single page of a document using AWS Textract to extract text and other features.
|
|
|
|
| 62 |
SSO or environment variables. Defaults to AWS_ACCESS_KEY.
|
| 63 |
aws_secret_question_textbox (str, optional): AWS secret question provided by the user, if not using
|
| 64 |
SSO or environment variables. Defaults to AWS_SECRET_KEY.
|
| 65 |
+
RUN_AWS_FUNCTIONS (bool, optional): Configuration flag to enable or
|
| 66 |
disable AWS functions. Defaults to RUN_AWS_FUNCTIONS.
|
| 67 |
+
PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS (bool, optional): Configuration flag (e.g., True or False)
|
| 68 |
to prioritize AWS SSO credentials
|
| 69 |
over environment variables.
|
| 70 |
+
Defaults to True.
|
| 71 |
|
| 72 |
Returns:
|
| 73 |
Tuple[List[Dict], str]: A tuple containing:
|
|
|
|
| 79 |
if client == "":
|
| 80 |
try:
|
| 81 |
# Try to connect to AWS Textract Client if using that text extraction method
|
| 82 |
+
if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
|
|
|
|
|
|
|
|
|
|
| 83 |
print("Connecting to Textract via existing SSO connection")
|
| 84 |
client = boto3.client("textract", region_name=AWS_REGION)
|
| 85 |
elif aws_access_question_textbox and aws_secret_question_textbox:
|
|
|
|
| 92 |
aws_secret_access_question=aws_secret_question_textbox,
|
| 93 |
region_name=AWS_REGION,
|
| 94 |
)
|
| 95 |
+
elif RUN_AWS_FUNCTIONS is True:
|
| 96 |
print("Connecting to Textract via existing SSO connection")
|
| 97 |
client = boto3.client("textract", region_name=AWS_REGION)
|
| 98 |
elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
|
tools/cli_usage_logger.py
CHANGED
|
@@ -112,11 +112,11 @@ class CLIUsageLogger:
|
|
| 112 |
"""
|
| 113 |
# Use config defaults if not specified
|
| 114 |
if save_to_csv is None:
|
| 115 |
-
save_to_csv = SAVE_LOGS_TO_CSV
|
| 116 |
if save_to_dynamodb is None:
|
| 117 |
-
save_to_dynamodb = SAVE_LOGS_TO_DYNAMODB
|
| 118 |
if save_to_s3 is None:
|
| 119 |
-
save_to_s3 = RUN_AWS_FUNCTIONS
|
| 120 |
if s3_bucket is None:
|
| 121 |
s3_bucket = DOCUMENT_REDACTION_BUCKET
|
| 122 |
if s3_key_prefix is None:
|
|
|
|
| 112 |
"""
|
| 113 |
# Use config defaults if not specified
|
| 114 |
if save_to_csv is None:
|
| 115 |
+
save_to_csv = SAVE_LOGS_TO_CSV
|
| 116 |
if save_to_dynamodb is None:
|
| 117 |
+
save_to_dynamodb = SAVE_LOGS_TO_DYNAMODB
|
| 118 |
if save_to_s3 is None:
|
| 119 |
+
save_to_s3 = RUN_AWS_FUNCTIONS and SAVE_LOGS_TO_CSV
|
| 120 |
if s3_bucket is None:
|
| 121 |
s3_bucket = DOCUMENT_REDACTION_BUCKET
|
| 122 |
if s3_key_prefix is None:
|
tools/config.py
CHANGED
|
@@ -25,6 +25,18 @@ def _get_env_list(env_var_name: str) -> List[str]:
|
|
| 25 |
# Set or retrieve configuration variables for the redaction app
|
| 26 |
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
|
| 29 |
"""
|
| 30 |
Get an environmental variable, and set it to a default value if it doesn't exist
|
|
@@ -100,7 +112,9 @@ if AWS_CONFIG_PATH:
|
|
| 100 |
else:
|
| 101 |
print("AWS config file not found at location:", AWS_CONFIG_PATH)
|
| 102 |
|
| 103 |
-
RUN_AWS_FUNCTIONS =
|
|
|
|
|
|
|
| 104 |
|
| 105 |
AWS_REGION = get_or_create_env_var("AWS_REGION", "")
|
| 106 |
|
|
@@ -119,8 +133,8 @@ AWS_SECRET_KEY = get_or_create_env_var("AWS_SECRET_KEY", "")
|
|
| 119 |
DOCUMENT_REDACTION_BUCKET = get_or_create_env_var("DOCUMENT_REDACTION_BUCKET", "")
|
| 120 |
|
| 121 |
# Should the app prioritise using AWS SSO over using API keys stored in environment variables/secrets (defaults to yes)
|
| 122 |
-
PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS =
|
| 123 |
-
"PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS", "
|
| 124 |
)
|
| 125 |
|
| 126 |
# Custom headers e.g. if routing traffic through Cloudfront
|
|
@@ -134,7 +148,9 @@ CUSTOM_HEADER_VALUE = get_or_create_env_var("CUSTOM_HEADER_VALUE", "")
|
|
| 134 |
# Image options
|
| 135 |
###
|
| 136 |
IMAGES_DPI = float(get_or_create_env_var("IMAGES_DPI", "300.0"))
|
| 137 |
-
LOAD_TRUNCATED_IMAGES =
|
|
|
|
|
|
|
| 138 |
MAX_IMAGE_PIXELS = get_or_create_env_var(
|
| 139 |
"MAX_IMAGE_PIXELS", ""
|
| 140 |
) # Changed to None if blank in file_conversion.py
|
|
@@ -173,15 +189,19 @@ MPLCONFIGDIR = get_or_create_env_var("MPLCONFIGDIR", "") # Matplotlib cache fol
|
|
| 173 |
# By default, logs are put into a subfolder named after today's date and the host name of the instance running the app. This is to avoid, as far as possible, log files from one instance overwriting the logs of another instance on S3. If you always run the app on a single system, or just locally, it is not necessary to make the log folders so specific.
|
| 174 |
# Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
|
| 175 |
|
| 176 |
-
SAVE_LOGS_TO_CSV =
|
|
|
|
|
|
|
| 177 |
|
| 178 |
-
USE_LOG_SUBFOLDERS =
|
|
|
|
|
|
|
| 179 |
|
| 180 |
FEEDBACK_LOGS_FOLDER = get_or_create_env_var("FEEDBACK_LOGS_FOLDER", "feedback/")
|
| 181 |
ACCESS_LOGS_FOLDER = get_or_create_env_var("ACCESS_LOGS_FOLDER", "logs/")
|
| 182 |
USAGE_LOGS_FOLDER = get_or_create_env_var("USAGE_LOGS_FOLDER", "usage/")
|
| 183 |
|
| 184 |
-
if USE_LOG_SUBFOLDERS
|
| 185 |
day_log_subfolder = today_rev + "/"
|
| 186 |
host_name_subfolder = HOST_NAME + "/"
|
| 187 |
full_log_subfolder = day_log_subfolder + host_name_subfolder
|
|
@@ -201,8 +221,8 @@ S3_USAGE_LOGS_FOLDER = get_or_create_env_var(
|
|
| 201 |
)
|
| 202 |
|
| 203 |
# Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
|
| 204 |
-
DISPLAY_FILE_NAMES_IN_LOGS =
|
| 205 |
-
"DISPLAY_FILE_NAMES_IN_LOGS", "False"
|
| 206 |
)
|
| 207 |
|
| 208 |
# Further customisation options for CSV logs
|
|
@@ -218,7 +238,9 @@ CSV_USAGE_LOG_HEADERS = get_or_create_env_var(
|
|
| 218 |
) # If blank, uses component labels
|
| 219 |
|
| 220 |
### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
|
| 221 |
-
SAVE_LOGS_TO_DYNAMODB =
|
|
|
|
|
|
|
| 222 |
|
| 223 |
ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
|
| 224 |
"ACCESS_LOG_DYNAMODB_TABLE_NAME", "redaction_access_log"
|
|
@@ -238,9 +260,9 @@ USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
|
|
| 238 |
DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var("DYNAMODB_USAGE_LOG_HEADERS", "")
|
| 239 |
|
| 240 |
# Report logging to console?
|
| 241 |
-
LOGGING = get_or_create_env_var("LOGGING", "False")
|
| 242 |
|
| 243 |
-
if LOGGING
|
| 244 |
# Configure logging
|
| 245 |
logging.basicConfig(
|
| 246 |
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
|
@@ -257,7 +279,7 @@ FEEDBACK_LOG_FILE_NAME = get_or_create_env_var("FEEDBACK_LOG_FILE_NAME", LOG_FIL
|
|
| 257 |
|
| 258 |
FAVICON_PATH = get_or_create_env_var("FAVICON_PATH", "favicon.png")
|
| 259 |
|
| 260 |
-
RUN_FASTAPI = get_or_create_env_var("RUN_FASTAPI", "
|
| 261 |
|
| 262 |
MAX_QUEUE_SIZE = int(get_or_create_env_var("MAX_QUEUE_SIZE", "5"))
|
| 263 |
|
|
@@ -291,8 +313,8 @@ MAX_OPEN_TEXT_CHARACTERS = int(
|
|
| 291 |
)
|
| 292 |
|
| 293 |
# When loading for review, should PDFs have existing redaction annotations loaded in?
|
| 294 |
-
LOAD_REDACTION_ANNOTATIONS_FROM_PDF =
|
| 295 |
-
"LOAD_REDACTION_ANNOTATIONS_FROM_PDF", "True"
|
| 296 |
)
|
| 297 |
|
| 298 |
|
|
@@ -313,8 +335,8 @@ if POPPLER_FOLDER:
|
|
| 313 |
add_folder_to_path(POPPLER_FOLDER)
|
| 314 |
|
| 315 |
# Extraction and PII options open by default:
|
| 316 |
-
EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT =
|
| 317 |
-
"EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT", "True"
|
| 318 |
)
|
| 319 |
|
| 320 |
# List of models to use for text extraction and PII detection
|
|
@@ -336,62 +358,56 @@ NO_REDACTION_PII_OPTION = get_or_create_env_var(
|
|
| 336 |
LOCAL_PII_OPTION = get_or_create_env_var("LOCAL_PII_OPTION", "Local")
|
| 337 |
AWS_PII_OPTION = get_or_create_env_var("AWS_PII_OPTION", "AWS Comprehend")
|
| 338 |
|
| 339 |
-
SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS =
|
| 340 |
-
"SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS", "True"
|
| 341 |
)
|
| 342 |
-
SHOW_AWS_TEXT_EXTRACTION_OPTIONS =
|
| 343 |
-
"SHOW_AWS_TEXT_EXTRACTION_OPTIONS", "True"
|
| 344 |
)
|
| 345 |
|
| 346 |
# Show at least local options if everything mistakenly removed
|
| 347 |
-
if
|
| 348 |
-
SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS
|
| 349 |
-
and SHOW_AWS_TEXT_EXTRACTION_OPTIONS != "True"
|
| 350 |
-
):
|
| 351 |
-
SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = "True"
|
| 352 |
|
| 353 |
local_model_options = list()
|
| 354 |
aws_model_options = list()
|
| 355 |
text_extraction_models = list()
|
| 356 |
|
| 357 |
-
if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS
|
| 358 |
local_model_options.append(SELECTABLE_TEXT_EXTRACT_OPTION)
|
| 359 |
local_model_options.append(TESSERACT_TEXT_EXTRACT_OPTION)
|
| 360 |
|
| 361 |
-
if SHOW_AWS_TEXT_EXTRACTION_OPTIONS
|
| 362 |
aws_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)
|
| 363 |
|
| 364 |
TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options
|
| 365 |
-
DO_INITIAL_TABULAR_DATA_CLEAN =
|
| 366 |
-
"DO_INITIAL_TABULAR_DATA_CLEAN", "True"
|
| 367 |
)
|
| 368 |
|
| 369 |
-
SHOW_LOCAL_PII_DETECTION_OPTIONS =
|
| 370 |
-
"SHOW_LOCAL_PII_DETECTION_OPTIONS", "True"
|
| 371 |
)
|
| 372 |
-
SHOW_AWS_PII_DETECTION_OPTIONS =
|
| 373 |
-
"SHOW_AWS_PII_DETECTION_OPTIONS", "True"
|
| 374 |
)
|
| 375 |
|
| 376 |
-
if
|
| 377 |
-
SHOW_LOCAL_PII_DETECTION_OPTIONS
|
| 378 |
-
and SHOW_AWS_PII_DETECTION_OPTIONS != "True"
|
| 379 |
-
):
|
| 380 |
-
SHOW_LOCAL_PII_DETECTION_OPTIONS = "True"
|
| 381 |
|
| 382 |
local_model_options = [NO_REDACTION_PII_OPTION]
|
| 383 |
aws_model_options = list()
|
| 384 |
pii_detection_models = list()
|
| 385 |
|
| 386 |
-
if SHOW_LOCAL_PII_DETECTION_OPTIONS
|
| 387 |
local_model_options.append(LOCAL_PII_OPTION)
|
| 388 |
|
| 389 |
-
if SHOW_AWS_PII_DETECTION_OPTIONS
|
| 390 |
aws_model_options.append(AWS_PII_OPTION)
|
| 391 |
|
| 392 |
PII_DETECTION_MODELS = local_model_options + aws_model_options
|
| 393 |
|
| 394 |
-
if SHOW_AWS_TEXT_EXTRACTION_OPTIONS
|
| 395 |
DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var(
|
| 396 |
"DEFAULT_TEXT_EXTRACTION_MODEL", TEXTRACT_TEXT_EXTRACT_OPTION
|
| 397 |
)
|
|
@@ -400,7 +416,7 @@ else:
|
|
| 400 |
"DEFAULT_TEXT_EXTRACTION_MODEL", SELECTABLE_TEXT_EXTRACT_OPTION
|
| 401 |
)
|
| 402 |
|
| 403 |
-
if SHOW_AWS_PII_DETECTION_OPTIONS
|
| 404 |
DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var(
|
| 405 |
"DEFAULT_PII_DETECTION_MODEL", AWS_PII_OPTION
|
| 406 |
)
|
|
@@ -426,10 +442,10 @@ CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var(
|
|
| 426 |
"CHOSEN_LOCAL_OCR_MODEL", "tesseract"
|
| 427 |
) # Choose between "tesseract", "hybrid", and "paddle". "paddle" is accurate for whole line text extraction, but word-level extract is not natively supported, and so word bounding boxes will be inaccurate. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with the chosen hybrid model (default PaddleOCR) on words with low confidence.
|
| 428 |
|
| 429 |
-
SHOW_LOCAL_OCR_MODEL_OPTIONS =
|
| 430 |
-
"SHOW_LOCAL_OCR_MODEL_OPTIONS", "False"
|
| 431 |
)
|
| 432 |
-
if SHOW_LOCAL_OCR_MODEL_OPTIONS
|
| 433 |
LOCAL_OCR_MODEL_OPTIONS = [
|
| 434 |
"tesseract",
|
| 435 |
"hybrid",
|
|
@@ -445,18 +461,18 @@ HYBRID_OCR_PADDING = int(
|
|
| 445 |
get_or_create_env_var("HYBRID_OCR_PADDING", "1")
|
| 446 |
) # The padding to add to the text when passing it to PaddleOCR for re-extraction using the hybrid OCR method.
|
| 447 |
|
| 448 |
-
PADDLE_USE_TEXTLINE_ORIENTATION =
|
| 449 |
-
"PADDLE_USE_TEXTLINE_ORIENTATION", "False"
|
| 450 |
)
|
| 451 |
|
| 452 |
PADDLE_DET_DB_UNCLIP_RATIO = get_or_create_env_var("PADDLE_DET_DB_UNCLIP_RATIO", "1.2")
|
| 453 |
|
| 454 |
-
SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES =
|
| 455 |
-
"SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES", "False"
|
| 456 |
) # Whether to save example images of Tesseract vs PaddleOCR re-extraction in hybrid OCR mode.
|
| 457 |
|
| 458 |
-
SAVE_PADDLE_VISUALISATIONS =
|
| 459 |
-
"SAVE_PADDLE_VISUALISATIONS", "False"
|
| 460 |
) # Whether to save visualisations of PaddleOCR bounding boxes.
|
| 461 |
|
| 462 |
# Model storage paths for Lambda compatibility
|
|
@@ -543,7 +559,9 @@ DEFAULT_PAGE_MAX = int(get_or_create_env_var("DEFAULT_PAGE_MAX", "0"))
|
|
| 543 |
|
| 544 |
### Language selection options
|
| 545 |
|
| 546 |
-
SHOW_LANGUAGE_SELECTION =
|
|
|
|
|
|
|
| 547 |
|
| 548 |
DEFAULT_LANGUAGE_FULL_NAME = get_or_create_env_var(
|
| 549 |
"DEFAULT_LANGUAGE_FULL_NAME", "english"
|
|
@@ -581,22 +599,24 @@ DEFAULT_DUPLICATE_DETECTION_THRESHOLD = float(
|
|
| 581 |
DEFAULT_MIN_CONSECUTIVE_PAGES = int(
|
| 582 |
get_or_create_env_var("DEFAULT_MIN_CONSECUTIVE_PAGES", "1")
|
| 583 |
)
|
| 584 |
-
USE_GREEDY_DUPLICATE_DETECTION =
|
| 585 |
-
"USE_GREEDY_DUPLICATE_DETECTION", "True"
|
| 586 |
)
|
| 587 |
-
DEFAULT_COMBINE_PAGES =
|
| 588 |
-
"DEFAULT_COMBINE_PAGES", "True"
|
| 589 |
) # Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.
|
| 590 |
DEFAULT_MIN_WORD_COUNT = int(get_or_create_env_var("DEFAULT_MIN_WORD_COUNT", "10"))
|
| 591 |
-
REMOVE_DUPLICATE_ROWS =
|
|
|
|
|
|
|
| 592 |
|
| 593 |
|
| 594 |
###
|
| 595 |
# File output options
|
| 596 |
###
|
| 597 |
# Should the output pdf redaction boxes be drawn using the custom box colour?
|
| 598 |
-
USE_GUI_BOX_COLOURS_FOR_OUTPUTS =
|
| 599 |
-
"USE_GUI_BOX_COLOURS_FOR_OUTPUTS", "False"
|
| 600 |
)
|
| 601 |
|
| 602 |
# This is the colour of the output pdf redaction boxes. Should be a tuple of three integers between 0 and 255
|
|
@@ -632,14 +652,16 @@ APPLY_REDACTIONS_TEXT = int(
|
|
| 632 |
) # The default PDF_REDACT_TEXT_REMOVE | 0 removes all characters whose boundary box overlaps any redaction rectangle. This complies with the original legal / data protection intentions of redaction annotations. Other use cases however may require to keep text while redacting vector graphics or images. This can be achieved by setting text=True|PDF_REDACT_TEXT_NONE | 1. This does not comply with the data protection intentions of redaction annotations. Do so at your own risk.
|
| 633 |
|
| 634 |
# If you don't want to redact the text, but instead just draw a box over it, set this to True
|
| 635 |
-
RETURN_PDF_FOR_REVIEW =
|
|
|
|
|
|
|
| 636 |
|
| 637 |
-
RETURN_REDACTED_PDF =
|
| 638 |
-
"RETURN_REDACTED_PDF", "True"
|
| 639 |
) # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
|
| 640 |
|
| 641 |
-
COMPRESS_REDACTED_PDF =
|
| 642 |
-
"COMPRESS_REDACTED_PDF", "False"
|
| 643 |
) # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
|
| 644 |
|
| 645 |
###
|
|
@@ -654,7 +676,7 @@ except Exception as e:
|
|
| 654 |
extract = TLDExtract(cache_dir=None)
|
| 655 |
|
| 656 |
# Get some environment variables and Launch the Gradio app
|
| 657 |
-
COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "
|
| 658 |
|
| 659 |
|
| 660 |
# Link to user guide - ensure it is a valid URL
|
|
@@ -709,12 +731,18 @@ USER_GUIDE_URL = validate_safe_url(
|
|
| 709 |
)
|
| 710 |
)
|
| 711 |
|
| 712 |
-
SHOW_EXAMPLES =
|
| 713 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 714 |
|
| 715 |
FILE_INPUT_HEIGHT = int(get_or_create_env_var("FILE_INPUT_HEIGHT", "200"))
|
| 716 |
|
| 717 |
-
RUN_DIRECT_MODE =
|
|
|
|
|
|
|
| 718 |
|
| 719 |
# Direct mode configuration options
|
| 720 |
DIRECT_MODE_DEFAULT_USER = get_or_create_env_var(
|
|
@@ -736,7 +764,9 @@ DIRECT_MODE_DUPLICATE_TYPE = get_or_create_env_var(
|
|
| 736 |
|
| 737 |
### ALLOW LIST
|
| 738 |
|
| 739 |
-
GET_DEFAULT_ALLOW_LIST =
|
|
|
|
|
|
|
| 740 |
|
| 741 |
ALLOW_LIST_PATH = get_or_create_env_var(
|
| 742 |
"ALLOW_LIST_PATH", ""
|
|
@@ -753,7 +783,9 @@ else:
|
|
| 753 |
|
| 754 |
### DENY LIST
|
| 755 |
|
| 756 |
-
GET_DEFAULT_DENY_LIST =
|
|
|
|
|
|
|
| 757 |
|
| 758 |
S3_DENY_LIST_PATH = get_or_create_env_var(
|
| 759 |
"S3_DENY_LIST_PATH", ""
|
|
@@ -793,9 +825,11 @@ else:
|
|
| 793 |
# COST CODE OPTIONS
|
| 794 |
###
|
| 795 |
|
| 796 |
-
SHOW_COSTS = get_or_create_env_var("SHOW_COSTS", "False")
|
| 797 |
|
| 798 |
-
GET_COST_CODES =
|
|
|
|
|
|
|
| 799 |
|
| 800 |
DEFAULT_COST_CODE = get_or_create_env_var("DEFAULT_COST_CODE", "")
|
| 801 |
|
|
@@ -813,20 +847,21 @@ if COST_CODES_PATH:
|
|
| 813 |
else:
|
| 814 |
OUTPUT_COST_CODES_PATH = "config/cost_codes.csv"
|
| 815 |
|
| 816 |
-
ENFORCE_COST_CODES =
|
| 817 |
-
"ENFORCE_COST_CODES", "False"
|
| 818 |
-
)
|
|
|
|
| 819 |
|
| 820 |
-
if ENFORCE_COST_CODES
|
| 821 |
-
GET_COST_CODES =
|
| 822 |
|
| 823 |
|
| 824 |
###
|
| 825 |
# WHOLE DOCUMENT API OPTIONS
|
| 826 |
###
|
| 827 |
|
| 828 |
-
SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS =
|
| 829 |
-
"SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS", "False"
|
| 830 |
) # This feature not currently implemented
|
| 831 |
|
| 832 |
TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET = get_or_create_env_var(
|
|
@@ -841,9 +876,10 @@ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var(
|
|
| 841 |
"TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER", "output"
|
| 842 |
)
|
| 843 |
|
| 844 |
-
LOAD_PREVIOUS_TEXTRACT_JOBS_S3 =
|
| 845 |
-
"LOAD_PREVIOUS_TEXTRACT_JOBS_S3", "False"
|
| 846 |
-
)
|
|
|
|
| 847 |
|
| 848 |
TEXTRACT_JOBS_S3_LOC = get_or_create_env_var(
|
| 849 |
"TEXTRACT_JOBS_S3_LOC", "output"
|
|
@@ -865,114 +901,15 @@ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(
|
|
| 865 |
###
|
| 866 |
# Config vars output format
|
| 867 |
###
|
| 868 |
-
# Ensure that config variables are in the correct format for subsequent use elsewhere
|
| 869 |
-
|
| 870 |
-
if LOAD_REDACTION_ANNOTATIONS_FROM_PDF == "True":
|
| 871 |
-
LOAD_REDACTION_ANNOTATIONS_FROM_PDF = True
|
| 872 |
-
else:
|
| 873 |
-
LOAD_REDACTION_ANNOTATIONS_FROM_PDF = False
|
| 874 |
|
| 875 |
# Convert string environment variables to string or list
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
|
| 879 |
-
SAVE_LOGS_TO_CSV = False
|
| 880 |
-
if SAVE_LOGS_TO_DYNAMODB == "True":
|
| 881 |
-
SAVE_LOGS_TO_DYNAMODB = True
|
| 882 |
-
else:
|
| 883 |
-
SAVE_LOGS_TO_DYNAMODB = False
|
| 884 |
-
if SHOW_LANGUAGE_SELECTION == "True":
|
| 885 |
-
SHOW_LANGUAGE_SELECTION = True
|
| 886 |
-
else:
|
| 887 |
-
SHOW_LANGUAGE_SELECTION = False
|
| 888 |
-
if DISPLAY_FILE_NAMES_IN_LOGS == "True":
|
| 889 |
-
DISPLAY_FILE_NAMES_IN_LOGS = True
|
| 890 |
-
else:
|
| 891 |
-
DISPLAY_FILE_NAMES_IN_LOGS = False
|
| 892 |
-
if DO_INITIAL_TABULAR_DATA_CLEAN == "True":
|
| 893 |
-
DO_INITIAL_TABULAR_DATA_CLEAN = True
|
| 894 |
-
else:
|
| 895 |
-
DO_INITIAL_TABULAR_DATA_CLEAN = False
|
| 896 |
-
if COMPRESS_REDACTED_PDF == "True":
|
| 897 |
-
COMPRESS_REDACTED_PDF = True
|
| 898 |
-
else:
|
| 899 |
-
COMPRESS_REDACTED_PDF = False
|
| 900 |
-
if RETURN_REDACTED_PDF == "True":
|
| 901 |
-
RETURN_REDACTED_PDF = True
|
| 902 |
-
else:
|
| 903 |
-
RETURN_REDACTED_PDF = False
|
| 904 |
-
if USE_GREEDY_DUPLICATE_DETECTION == "True":
|
| 905 |
-
USE_GREEDY_DUPLICATE_DETECTION = True
|
| 906 |
-
else:
|
| 907 |
-
USE_GREEDY_DUPLICATE_DETECTION = False
|
| 908 |
-
if DEFAULT_COMBINE_PAGES == "True":
|
| 909 |
-
DEFAULT_COMBINE_PAGES = True
|
| 910 |
-
else:
|
| 911 |
-
DEFAULT_COMBINE_PAGES = False
|
| 912 |
-
if REMOVE_DUPLICATE_ROWS == "True":
|
| 913 |
-
REMOVE_DUPLICATE_ROWS = True
|
| 914 |
-
else:
|
| 915 |
-
REMOVE_DUPLICATE_ROWS = False
|
| 916 |
-
|
| 917 |
-
if GET_COST_CODES == "True":
|
| 918 |
-
GET_COST_CODES = True
|
| 919 |
-
else:
|
| 920 |
-
GET_COST_CODES = False
|
| 921 |
-
|
| 922 |
-
if ENFORCE_COST_CODES == "True":
|
| 923 |
-
ENFORCE_COST_CODES = True
|
| 924 |
-
else:
|
| 925 |
-
ENFORCE_COST_CODES = False
|
| 926 |
-
|
| 927 |
-
if SHOW_COSTS == "True":
|
| 928 |
-
SHOW_COSTS = True
|
| 929 |
-
else:
|
| 930 |
-
SHOW_COSTS = False
|
| 931 |
-
|
| 932 |
-
if GET_DEFAULT_ALLOW_LIST == "True":
|
| 933 |
-
GET_DEFAULT_ALLOW_LIST = True
|
| 934 |
-
else:
|
| 935 |
-
GET_DEFAULT_ALLOW_LIST = False
|
| 936 |
-
|
| 937 |
-
if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
|
| 938 |
-
SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = True
|
| 939 |
-
else:
|
| 940 |
-
SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = False
|
| 941 |
-
|
| 942 |
-
if SHOW_LOCAL_OCR_MODEL_OPTIONS == "True":
|
| 943 |
-
SHOW_LOCAL_OCR_MODEL_OPTIONS = True
|
| 944 |
-
else:
|
| 945 |
-
SHOW_LOCAL_OCR_MODEL_OPTIONS = False
|
| 946 |
-
|
| 947 |
-
if SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES == "True":
|
| 948 |
-
SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES = True
|
| 949 |
-
else:
|
| 950 |
-
SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES = False
|
| 951 |
-
|
| 952 |
-
if SAVE_PADDLE_VISUALISATIONS == "True":
|
| 953 |
-
SAVE_PADDLE_VISUALISATIONS = True
|
| 954 |
-
else:
|
| 955 |
-
SAVE_PADDLE_VISUALISATIONS = False
|
| 956 |
-
|
| 957 |
-
if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == "True":
|
| 958 |
-
SHOW_AWS_TEXT_EXTRACTION_OPTIONS = True
|
| 959 |
-
else:
|
| 960 |
-
SHOW_AWS_TEXT_EXTRACTION_OPTIONS = False
|
| 961 |
-
|
| 962 |
-
if CSV_ACCESS_LOG_HEADERS:
|
| 963 |
-
CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
|
| 964 |
-
if CSV_FEEDBACK_LOG_HEADERS:
|
| 965 |
-
CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)
|
| 966 |
-
if CSV_USAGE_LOG_HEADERS:
|
| 967 |
-
CSV_USAGE_LOG_HEADERS = _get_env_list(CSV_USAGE_LOG_HEADERS)
|
| 968 |
-
|
| 969 |
-
if DYNAMODB_ACCESS_LOG_HEADERS:
|
| 970 |
-
DYNAMODB_ACCESS_LOG_HEADERS = _get_env_list(DYNAMODB_ACCESS_LOG_HEADERS)
|
| 971 |
-
if DYNAMODB_FEEDBACK_LOG_HEADERS:
|
| 972 |
-
DYNAMODB_FEEDBACK_LOG_HEADERS = _get_env_list(DYNAMODB_FEEDBACK_LOG_HEADERS)
|
| 973 |
-
if DYNAMODB_USAGE_LOG_HEADERS:
|
| 974 |
-
DYNAMODB_USAGE_LOG_HEADERS = _get_env_list(DYNAMODB_USAGE_LOG_HEADERS)
|
| 975 |
|
|
|
|
|
|
|
|
|
|
| 976 |
if CHOSEN_COMPREHEND_ENTITIES:
|
| 977 |
CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
|
| 978 |
if FULL_COMPREHEND_ENTITY_LIST:
|
|
@@ -1000,31 +937,3 @@ if ALLOWED_ORIGINS:
|
|
| 1000 |
|
| 1001 |
if ALLOWED_HOSTS:
|
| 1002 |
ALLOWED_HOSTS = _get_env_list(ALLOWED_HOSTS)
|
| 1003 |
-
|
| 1004 |
-
USE_GUI_BOX_COLOURS_FOR_OUTPUTS = USE_GUI_BOX_COLOURS_FOR_OUTPUTS.lower() == "true"
|
| 1005 |
-
RETURN_PDF_FOR_REVIEW = RETURN_PDF_FOR_REVIEW.lower() == "true"
|
| 1006 |
-
|
| 1007 |
-
if DO_INITIAL_TABULAR_DATA_CLEAN == "True":
|
| 1008 |
-
DO_INITIAL_TABULAR_DATA_CLEAN = True
|
| 1009 |
-
else:
|
| 1010 |
-
DO_INITIAL_TABULAR_DATA_CLEAN = False
|
| 1011 |
-
|
| 1012 |
-
if REMOVE_DUPLICATE_ROWS == "True":
|
| 1013 |
-
REMOVE_DUPLICATE_ROWS = True
|
| 1014 |
-
else:
|
| 1015 |
-
REMOVE_DUPLICATE_ROWS = False
|
| 1016 |
-
|
| 1017 |
-
if EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT == "True":
|
| 1018 |
-
EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT = True
|
| 1019 |
-
else:
|
| 1020 |
-
EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT = False
|
| 1021 |
-
|
| 1022 |
-
if PADDLE_USE_TEXTLINE_ORIENTATION == "True":
|
| 1023 |
-
PADDLE_USE_TEXTLINE_ORIENTATION = True
|
| 1024 |
-
else:
|
| 1025 |
-
PADDLE_USE_TEXTLINE_ORIENTATION = False
|
| 1026 |
-
|
| 1027 |
-
if PADDLE_DET_DB_UNCLIP_RATIO == "True":
|
| 1028 |
-
PADDLE_DET_DB_UNCLIP_RATIO = True
|
| 1029 |
-
else:
|
| 1030 |
-
PADDLE_DET_DB_UNCLIP_RATIO = False
|
|
|
|
| 25 |
# Set or retrieve configuration variables for the redaction app
|
| 26 |
|
| 27 |
|
| 28 |
+
def convert_string_to_boolean(value: str) -> bool:
    """
    Convert a boolean-like string (typically an environment variable) to a bool.

    Matching ignores letter case and surrounding whitespace, so values such as
    " True", "TRUE\\r\\n" (e.g. from a .env file with Windows line endings) or
    "false " are accepted in addition to the exact spellings.

    Args:
        value (str): The value to convert. Accepted strings are "true"/"1"
            (True) and "false"/"0" (False). A value that is already a bool is
            returned unchanged.

    Returns:
        bool: The boolean interpretation of ``value``.

    Raises:
        ValueError: If ``value`` is not a bool and is not one of the accepted
            strings (this includes non-string values such as None or ints).
    """
    # Already a boolean (e.g. the helper is applied twice): pass it through.
    if isinstance(value, bool):
        return value

    if isinstance(value, str):
        # Normalise once so stray whitespace or unusual casing in the
        # environment does not abort configuration loading.
        normalised = value.strip().lower()
        if normalised in ("true", "1"):
            return True
        if normalised in ("false", "0"):
            return False

    raise ValueError(f"Invalid boolean value: {value}")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
|
| 41 |
"""
|
| 42 |
Get an environmental variable, and set it to a default value if it doesn't exist
|
|
|
|
| 112 |
else:
|
| 113 |
print("AWS config file not found at location:", AWS_CONFIG_PATH)
|
| 114 |
|
| 115 |
+
RUN_AWS_FUNCTIONS = convert_string_to_boolean(
|
| 116 |
+
get_or_create_env_var("RUN_AWS_FUNCTIONS", "False")
|
| 117 |
+
)
|
| 118 |
|
| 119 |
AWS_REGION = get_or_create_env_var("AWS_REGION", "")
|
| 120 |
|
|
|
|
| 133 |
DOCUMENT_REDACTION_BUCKET = get_or_create_env_var("DOCUMENT_REDACTION_BUCKET", "")
|
| 134 |
|
| 135 |
# Should the app prioritise using AWS SSO over using API keys stored in environment variables/secrets (defaults to yes)
|
| 136 |
+
PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS = convert_string_to_boolean(
|
| 137 |
+
get_or_create_env_var("PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS", "True")
|
| 138 |
)
|
| 139 |
|
| 140 |
# Custom headers e.g. if routing traffic through Cloudfront
|
|
|
|
| 148 |
# Image options
|
| 149 |
###
|
| 150 |
IMAGES_DPI = float(get_or_create_env_var("IMAGES_DPI", "300.0"))
|
| 151 |
+
LOAD_TRUNCATED_IMAGES = convert_string_to_boolean(
|
| 152 |
+
get_or_create_env_var("LOAD_TRUNCATED_IMAGES", "True")
|
| 153 |
+
)
|
| 154 |
MAX_IMAGE_PIXELS = get_or_create_env_var(
|
| 155 |
"MAX_IMAGE_PIXELS", ""
|
| 156 |
) # Changed to None if blank in file_conversion.py
|
|
|
|
| 189 |
# By default, logs are put into a subfolder named after today's date and the host name of the instance running the app. This is to avoid, as far as possible, log files from one instance overwriting the logs of another instance on S3. If the app always runs on one system, or only locally, it is not necessary to make the log folders so specific.
|
| 190 |
# Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
|
| 191 |
|
| 192 |
+
SAVE_LOGS_TO_CSV = convert_string_to_boolean(
|
| 193 |
+
get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
|
| 194 |
+
)
|
| 195 |
|
| 196 |
+
USE_LOG_SUBFOLDERS = convert_string_to_boolean(
|
| 197 |
+
get_or_create_env_var("USE_LOG_SUBFOLDERS", "True")
|
| 198 |
+
)
|
| 199 |
|
| 200 |
FEEDBACK_LOGS_FOLDER = get_or_create_env_var("FEEDBACK_LOGS_FOLDER", "feedback/")
|
| 201 |
ACCESS_LOGS_FOLDER = get_or_create_env_var("ACCESS_LOGS_FOLDER", "logs/")
|
| 202 |
USAGE_LOGS_FOLDER = get_or_create_env_var("USAGE_LOGS_FOLDER", "usage/")
|
| 203 |
|
| 204 |
+
if USE_LOG_SUBFOLDERS:
|
| 205 |
day_log_subfolder = today_rev + "/"
|
| 206 |
host_name_subfolder = HOST_NAME + "/"
|
| 207 |
full_log_subfolder = day_log_subfolder + host_name_subfolder
|
|
|
|
| 221 |
)
|
| 222 |
|
| 223 |
# Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
|
| 224 |
+
DISPLAY_FILE_NAMES_IN_LOGS = convert_string_to_boolean(
|
| 225 |
+
get_or_create_env_var("DISPLAY_FILE_NAMES_IN_LOGS", "False")
|
| 226 |
)
|
| 227 |
|
| 228 |
# Further customisation options for CSV logs
|
|
|
|
| 238 |
) # If blank, uses component labels
|
| 239 |
|
| 240 |
### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
|
| 241 |
+
SAVE_LOGS_TO_DYNAMODB = convert_string_to_boolean(
|
| 242 |
+
get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "False")
|
| 243 |
+
)
|
| 244 |
|
| 245 |
ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
|
| 246 |
"ACCESS_LOG_DYNAMODB_TABLE_NAME", "redaction_access_log"
|
|
|
|
| 260 |
DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var("DYNAMODB_USAGE_LOG_HEADERS", "")
|
| 261 |
|
| 262 |
# Report logging to console?
|
| 263 |
+
LOGGING = convert_string_to_boolean(get_or_create_env_var("LOGGING", "False"))
|
| 264 |
|
| 265 |
+
if LOGGING:
|
| 266 |
# Configure logging
|
| 267 |
logging.basicConfig(
|
| 268 |
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
|
|
|
| 279 |
|
| 280 |
FAVICON_PATH = get_or_create_env_var("FAVICON_PATH", "favicon.png")
|
| 281 |
|
| 282 |
+
RUN_FASTAPI = convert_string_to_boolean(get_or_create_env_var("RUN_FASTAPI", "False"))
|
| 283 |
|
| 284 |
MAX_QUEUE_SIZE = int(get_or_create_env_var("MAX_QUEUE_SIZE", "5"))
|
| 285 |
|
|
|
|
| 313 |
)
|
| 314 |
|
| 315 |
# When loading for review, should PDFs have existing redaction annotations loaded in?
|
| 316 |
+
LOAD_REDACTION_ANNOTATIONS_FROM_PDF = convert_string_to_boolean(
|
| 317 |
+
get_or_create_env_var("LOAD_REDACTION_ANNOTATIONS_FROM_PDF", "True")
|
| 318 |
)
|
| 319 |
|
| 320 |
|
|
|
|
| 335 |
add_folder_to_path(POPPLER_FOLDER)
|
| 336 |
|
| 337 |
# Extraction and PII options open by default:
|
| 338 |
+
EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT = convert_string_to_boolean(
|
| 339 |
+
get_or_create_env_var("EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT", "True")
|
| 340 |
)
|
| 341 |
|
| 342 |
# List of models to use for text extraction and PII detection
|
|
|
|
| 358 |
LOCAL_PII_OPTION = get_or_create_env_var("LOCAL_PII_OPTION", "Local")
|
| 359 |
AWS_PII_OPTION = get_or_create_env_var("AWS_PII_OPTION", "AWS Comprehend")
|
| 360 |
|
| 361 |
+
SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = convert_string_to_boolean(
|
| 362 |
+
get_or_create_env_var("SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS", "True")
|
| 363 |
)
|
| 364 |
+
SHOW_AWS_TEXT_EXTRACTION_OPTIONS = convert_string_to_boolean(
|
| 365 |
+
get_or_create_env_var("SHOW_AWS_TEXT_EXTRACTION_OPTIONS", "True")
|
| 366 |
)
|
| 367 |
|
| 368 |
# Show at least local options if everything mistakenly removed
|
| 369 |
+
if not SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS and not SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
|
| 370 |
+
SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = True
|
|
|
|
|
|
|
|
|
|
| 371 |
|
| 372 |
local_model_options = list()
|
| 373 |
aws_model_options = list()
|
| 374 |
text_extraction_models = list()
|
| 375 |
|
| 376 |
+
if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS:
|
| 377 |
local_model_options.append(SELECTABLE_TEXT_EXTRACT_OPTION)
|
| 378 |
local_model_options.append(TESSERACT_TEXT_EXTRACT_OPTION)
|
| 379 |
|
| 380 |
+
if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
|
| 381 |
aws_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)
|
| 382 |
|
| 383 |
TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options
|
| 384 |
+
DO_INITIAL_TABULAR_DATA_CLEAN = convert_string_to_boolean(
|
| 385 |
+
get_or_create_env_var("DO_INITIAL_TABULAR_DATA_CLEAN", "True")
|
| 386 |
)
|
| 387 |
|
| 388 |
+
SHOW_LOCAL_PII_DETECTION_OPTIONS = convert_string_to_boolean(
|
| 389 |
+
get_or_create_env_var("SHOW_LOCAL_PII_DETECTION_OPTIONS", "True")
|
| 390 |
)
|
| 391 |
+
SHOW_AWS_PII_DETECTION_OPTIONS = convert_string_to_boolean(
|
| 392 |
+
get_or_create_env_var("SHOW_AWS_PII_DETECTION_OPTIONS", "True")
|
| 393 |
)
|
| 394 |
|
| 395 |
+
if not SHOW_LOCAL_PII_DETECTION_OPTIONS and not SHOW_AWS_PII_DETECTION_OPTIONS:
|
| 396 |
+
SHOW_LOCAL_PII_DETECTION_OPTIONS = True
|
|
|
|
|
|
|
|
|
|
| 397 |
|
| 398 |
local_model_options = [NO_REDACTION_PII_OPTION]
|
| 399 |
aws_model_options = list()
|
| 400 |
pii_detection_models = list()
|
| 401 |
|
| 402 |
+
if SHOW_LOCAL_PII_DETECTION_OPTIONS:
|
| 403 |
local_model_options.append(LOCAL_PII_OPTION)
|
| 404 |
|
| 405 |
+
if SHOW_AWS_PII_DETECTION_OPTIONS:
|
| 406 |
aws_model_options.append(AWS_PII_OPTION)
|
| 407 |
|
| 408 |
PII_DETECTION_MODELS = local_model_options + aws_model_options
|
| 409 |
|
| 410 |
+
if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
|
| 411 |
DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var(
|
| 412 |
"DEFAULT_TEXT_EXTRACTION_MODEL", TEXTRACT_TEXT_EXTRACT_OPTION
|
| 413 |
)
|
|
|
|
| 416 |
"DEFAULT_TEXT_EXTRACTION_MODEL", SELECTABLE_TEXT_EXTRACT_OPTION
|
| 417 |
)
|
| 418 |
|
| 419 |
+
if SHOW_AWS_PII_DETECTION_OPTIONS:
|
| 420 |
DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var(
|
| 421 |
"DEFAULT_PII_DETECTION_MODEL", AWS_PII_OPTION
|
| 422 |
)
|
|
|
|
| 442 |
"CHOSEN_LOCAL_OCR_MODEL", "tesseract"
|
| 443 |
) # Choose between "tesseract", "hybrid", and "paddle". "paddle" is accurate for whole line text extraction, but word-level extract is not natively supported, and so word bounding boxes will be inaccurate. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with the chosen hybrid model (default PaddleOCR) on words with low confidence.
|
| 444 |
|
| 445 |
+
SHOW_LOCAL_OCR_MODEL_OPTIONS = convert_string_to_boolean(
|
| 446 |
+
get_or_create_env_var("SHOW_LOCAL_OCR_MODEL_OPTIONS", "False")
|
| 447 |
)
|
| 448 |
+
if SHOW_LOCAL_OCR_MODEL_OPTIONS:
|
| 449 |
LOCAL_OCR_MODEL_OPTIONS = [
|
| 450 |
"tesseract",
|
| 451 |
"hybrid",
|
|
|
|
| 461 |
get_or_create_env_var("HYBRID_OCR_PADDING", "1")
|
| 462 |
) # The padding to add to the text when passing it to PaddleOCR for re-extraction using the hybrid OCR method.
|
| 463 |
|
| 464 |
+
PADDLE_USE_TEXTLINE_ORIENTATION = convert_string_to_boolean(
|
| 465 |
+
get_or_create_env_var("PADDLE_USE_TEXTLINE_ORIENTATION", "False")
|
| 466 |
)
|
| 467 |
|
| 468 |
PADDLE_DET_DB_UNCLIP_RATIO = get_or_create_env_var("PADDLE_DET_DB_UNCLIP_RATIO", "1.2")
|
| 469 |
|
| 470 |
+
SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES = convert_string_to_boolean(
|
| 471 |
+
get_or_create_env_var("SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES", "False")
|
| 472 |
) # Whether to save example images of Tesseract vs PaddleOCR re-extraction in hybrid OCR mode.
|
| 473 |
|
| 474 |
+
SAVE_PADDLE_VISUALISATIONS = convert_string_to_boolean(
|
| 475 |
+
get_or_create_env_var("SAVE_PADDLE_VISUALISATIONS", "False")
|
| 476 |
) # Whether to save visualisations of PaddleOCR bounding boxes.
|
| 477 |
|
| 478 |
# Model storage paths for Lambda compatibility
|
|
|
|
| 559 |
|
| 560 |
### Language selection options
|
| 561 |
|
| 562 |
+
SHOW_LANGUAGE_SELECTION = convert_string_to_boolean(
|
| 563 |
+
get_or_create_env_var("SHOW_LANGUAGE_SELECTION", "False")
|
| 564 |
+
)
|
| 565 |
|
| 566 |
DEFAULT_LANGUAGE_FULL_NAME = get_or_create_env_var(
|
| 567 |
"DEFAULT_LANGUAGE_FULL_NAME", "english"
|
|
|
|
| 599 |
DEFAULT_MIN_CONSECUTIVE_PAGES = int(
|
| 600 |
get_or_create_env_var("DEFAULT_MIN_CONSECUTIVE_PAGES", "1")
|
| 601 |
)
|
| 602 |
+
USE_GREEDY_DUPLICATE_DETECTION = convert_string_to_boolean(
|
| 603 |
+
get_or_create_env_var("USE_GREEDY_DUPLICATE_DETECTION", "True")
|
| 604 |
)
|
| 605 |
+
DEFAULT_COMBINE_PAGES = convert_string_to_boolean(
|
| 606 |
+
get_or_create_env_var("DEFAULT_COMBINE_PAGES", "True")
|
| 607 |
) # Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.
|
| 608 |
DEFAULT_MIN_WORD_COUNT = int(get_or_create_env_var("DEFAULT_MIN_WORD_COUNT", "10"))
|
| 609 |
+
REMOVE_DUPLICATE_ROWS = convert_string_to_boolean(
|
| 610 |
+
get_or_create_env_var("REMOVE_DUPLICATE_ROWS", "False")
|
| 611 |
+
)
|
| 612 |
|
| 613 |
|
| 614 |
###
|
| 615 |
# File output options
|
| 616 |
###
|
| 617 |
# Should the output pdf redaction boxes be drawn using the custom box colour?
|
| 618 |
+
USE_GUI_BOX_COLOURS_FOR_OUTPUTS = convert_string_to_boolean(
|
| 619 |
+
get_or_create_env_var("USE_GUI_BOX_COLOURS_FOR_OUTPUTS", "False")
|
| 620 |
)
|
| 621 |
|
| 622 |
# This is the colour of the output pdf redaction boxes. Should be a tuple of three integers between 0 and 255
|
|
|
|
| 652 |
) # The default PDF_REDACT_TEXT_REMOVE | 0 removes all characters whose boundary box overlaps any redaction rectangle. This complies with the original legal / data protection intentions of redaction annotations. Other use cases however may require to keep text while redacting vector graphics or images. This can be achieved by setting text=True|PDF_REDACT_TEXT_NONE | 1. This does not comply with the data protection intentions of redaction annotations. Do so at your own risk.
|
| 653 |
|
| 654 |
# If you don't want to redact the text, but instead just draw a box over it, set this to True
|
| 655 |
+
RETURN_PDF_FOR_REVIEW = convert_string_to_boolean(
|
| 656 |
+
get_or_create_env_var("RETURN_PDF_FOR_REVIEW", "True")
|
| 657 |
+
)
|
| 658 |
|
| 659 |
+
RETURN_REDACTED_PDF = convert_string_to_boolean(
|
| 660 |
+
get_or_create_env_var("RETURN_REDACTED_PDF", "True")
|
| 661 |
) # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
|
| 662 |
|
| 663 |
+
COMPRESS_REDACTED_PDF = convert_string_to_boolean(
|
| 664 |
+
get_or_create_env_var("COMPRESS_REDACTED_PDF", "False")
|
| 665 |
) # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
|
| 666 |
|
| 667 |
###
|
|
|
|
| 676 |
extract = TLDExtract(cache_dir=None)
|
| 677 |
|
| 678 |
# Get some environment variables and Launch the Gradio app
|
| 679 |
+
COGNITO_AUTH = convert_string_to_boolean(get_or_create_env_var("COGNITO_AUTH", "False"))
|
| 680 |
|
| 681 |
|
| 682 |
# Link to user guide - ensure it is a valid URL
|
|
|
|
| 731 |
)
|
| 732 |
)
|
| 733 |
|
| 734 |
+
SHOW_EXAMPLES = convert_string_to_boolean(
|
| 735 |
+
get_or_create_env_var("SHOW_EXAMPLES", "True")
|
| 736 |
+
)
|
| 737 |
+
SHOW_AWS_EXAMPLES = convert_string_to_boolean(
|
| 738 |
+
get_or_create_env_var("SHOW_AWS_EXAMPLES", "False")
|
| 739 |
+
)
|
| 740 |
|
| 741 |
FILE_INPUT_HEIGHT = int(get_or_create_env_var("FILE_INPUT_HEIGHT", "200"))
|
| 742 |
|
| 743 |
+
RUN_DIRECT_MODE = convert_string_to_boolean(
|
| 744 |
+
get_or_create_env_var("RUN_DIRECT_MODE", "False")
|
| 745 |
+
)
|
| 746 |
|
| 747 |
# Direct mode configuration options
|
| 748 |
DIRECT_MODE_DEFAULT_USER = get_or_create_env_var(
|
|
|
|
| 764 |
|
| 765 |
### ALLOW LIST
|
| 766 |
|
| 767 |
+
GET_DEFAULT_ALLOW_LIST = convert_string_to_boolean(
|
| 768 |
+
get_or_create_env_var("GET_DEFAULT_ALLOW_LIST", "False")
|
| 769 |
+
)
|
| 770 |
|
| 771 |
ALLOW_LIST_PATH = get_or_create_env_var(
|
| 772 |
"ALLOW_LIST_PATH", ""
|
|
|
|
| 783 |
|
| 784 |
### DENY LIST
|
| 785 |
|
| 786 |
+
GET_DEFAULT_DENY_LIST = convert_string_to_boolean(
|
| 787 |
+
get_or_create_env_var("GET_DEFAULT_DENY_LIST", "False")
|
| 788 |
+
)
|
| 789 |
|
| 790 |
S3_DENY_LIST_PATH = get_or_create_env_var(
|
| 791 |
"S3_DENY_LIST_PATH", ""
|
|
|
|
| 825 |
# COST CODE OPTIONS
|
| 826 |
###
|
| 827 |
|
| 828 |
+
SHOW_COSTS = convert_string_to_boolean(get_or_create_env_var("SHOW_COSTS", "False"))
|
| 829 |
|
| 830 |
+
GET_COST_CODES = convert_string_to_boolean(
|
| 831 |
+
get_or_create_env_var("GET_COST_CODES", "False")
|
| 832 |
+
)
|
| 833 |
|
| 834 |
DEFAULT_COST_CODE = get_or_create_env_var("DEFAULT_COST_CODE", "")
|
| 835 |
|
|
|
|
| 847 |
else:
|
| 848 |
OUTPUT_COST_CODES_PATH = "config/cost_codes.csv"
|
| 849 |
|
| 850 |
+
ENFORCE_COST_CODES = convert_string_to_boolean(
|
| 851 |
+
get_or_create_env_var("ENFORCE_COST_CODES", "False")
|
| 852 |
+
)
|
| 853 |
+
# If you have cost codes listed, is it compulsory to choose one before redacting?
|
| 854 |
|
| 855 |
+
if ENFORCE_COST_CODES:
|
| 856 |
+
GET_COST_CODES = True
|
| 857 |
|
| 858 |
|
| 859 |
###
|
| 860 |
# WHOLE DOCUMENT API OPTIONS
|
| 861 |
###
|
| 862 |
|
| 863 |
+
SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = convert_string_to_boolean(
|
| 864 |
+
get_or_create_env_var("SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS", "False")
|
| 865 |
) # This feature not currently implemented
|
| 866 |
|
| 867 |
TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET = get_or_create_env_var(
|
|
|
|
| 876 |
"TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER", "output"
|
| 877 |
)
|
| 878 |
|
| 879 |
+
LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = convert_string_to_boolean(
|
| 880 |
+
get_or_create_env_var("LOAD_PREVIOUS_TEXTRACT_JOBS_S3", "False")
|
| 881 |
+
)
|
| 882 |
+
# Whether or not to load previous Textract jobs from S3
|
| 883 |
|
| 884 |
TEXTRACT_JOBS_S3_LOC = get_or_create_env_var(
|
| 885 |
"TEXTRACT_JOBS_S3_LOC", "output"
|
|
|
|
| 901 |
###
|
| 902 |
# Config vars output format
|
| 903 |
###
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 904 |
|
| 905 |
# Convert string environment variables to string or list
|
| 906 |
+
CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
|
| 907 |
+
CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)
|
| 908 |
+
CSV_USAGE_LOG_HEADERS = _get_env_list(CSV_USAGE_LOG_HEADERS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 909 |
|
| 910 |
+
DYNAMODB_ACCESS_LOG_HEADERS = _get_env_list(DYNAMODB_ACCESS_LOG_HEADERS)
|
| 911 |
+
DYNAMODB_FEEDBACK_LOG_HEADERS = _get_env_list(DYNAMODB_FEEDBACK_LOG_HEADERS)
|
| 912 |
+
DYNAMODB_USAGE_LOG_HEADERS = _get_env_list(DYNAMODB_USAGE_LOG_HEADERS)
|
| 913 |
if CHOSEN_COMPREHEND_ENTITIES:
|
| 914 |
CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
|
| 915 |
if FULL_COMPREHEND_ENTITY_LIST:
|
|
|
|
| 937 |
|
| 938 |
if ALLOWED_HOSTS:
|
| 939 |
ALLOWED_HOSTS = _get_env_list(ALLOWED_HOSTS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tools/data_anonymise.py
CHANGED
|
@@ -545,7 +545,7 @@ def anonymise_files_with_open_text(
|
|
| 545 |
# Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
|
| 546 |
if pii_identification_method == "AWS Comprehend":
|
| 547 |
print("Trying to connect to AWS Comprehend service")
|
| 548 |
-
if RUN_AWS_FUNCTIONS
|
| 549 |
print("Connecting to Comprehend via existing SSO connection")
|
| 550 |
comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
|
| 551 |
elif aws_access_key_textbox and aws_secret_key_textbox:
|
|
@@ -557,7 +557,7 @@ def anonymise_files_with_open_text(
|
|
| 557 |
aws_access_key_id=aws_access_key_textbox,
|
| 558 |
aws_secret_access_key=aws_secret_key_textbox,
|
| 559 |
)
|
| 560 |
-
elif RUN_AWS_FUNCTIONS
|
| 561 |
print("Connecting to Comprehend via existing SSO connection")
|
| 562 |
comprehend_client = boto3.client("comprehend")
|
| 563 |
elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
|
|
|
|
| 545 |
# Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is True, otherwise an environment variable or direct textbox input is needed.
|
| 546 |
if pii_identification_method == "AWS Comprehend":
|
| 547 |
print("Trying to connect to AWS Comprehend service")
|
| 548 |
+
if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
|
| 549 |
print("Connecting to Comprehend via existing SSO connection")
|
| 550 |
comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
|
| 551 |
elif aws_access_key_textbox and aws_secret_key_textbox:
|
|
|
|
| 557 |
aws_access_key_id=aws_access_key_textbox,
|
| 558 |
aws_secret_access_key=aws_secret_key_textbox,
|
| 559 |
)
|
| 560 |
+
elif RUN_AWS_FUNCTIONS:
|
| 561 |
print("Connecting to Comprehend via existing SSO connection")
|
| 562 |
comprehend_client = boto3.client("comprehend")
|
| 563 |
elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
|
tools/file_conversion.py
CHANGED
|
@@ -48,7 +48,7 @@ if not MAX_IMAGE_PIXELS:
|
|
| 48 |
else:
|
| 49 |
Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
|
| 50 |
|
| 51 |
-
ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES
|
| 52 |
|
| 53 |
|
| 54 |
def is_pdf_or_image(filename):
|
|
|
|
| 48 |
else:
|
| 49 |
Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
|
| 50 |
|
| 51 |
+
ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES
|
| 52 |
|
| 53 |
|
| 54 |
def is_pdf_or_image(filename):
|
tools/file_redaction.py
CHANGED
|
@@ -107,7 +107,7 @@ from tools.secure_path_utils import (
|
|
| 107 |
validate_path_containment,
|
| 108 |
)
|
| 109 |
|
| 110 |
-
ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES
|
| 111 |
if not MAX_IMAGE_PIXELS:
|
| 112 |
Image.MAX_IMAGE_PIXELS = None
|
| 113 |
else:
|
|
@@ -803,9 +803,9 @@ def choose_and_run_redactor(
|
|
| 803 |
|
| 804 |
### Load/create PII identification method
|
| 805 |
|
| 806 |
-
# Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is
|
| 807 |
if pii_identification_method == AWS_PII_OPTION:
|
| 808 |
-
if RUN_AWS_FUNCTIONS
|
| 809 |
print("Connecting to Comprehend via existing SSO connection")
|
| 810 |
comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
|
| 811 |
elif aws_access_key_textbox and aws_secret_key_textbox:
|
|
@@ -818,7 +818,7 @@ def choose_and_run_redactor(
|
|
| 818 |
aws_secret_access_key=aws_secret_key_textbox,
|
| 819 |
region_name=AWS_REGION,
|
| 820 |
)
|
| 821 |
-
elif RUN_AWS_FUNCTIONS
|
| 822 |
print("Connecting to Comprehend via existing SSO connection")
|
| 823 |
comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
|
| 824 |
elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
|
|
@@ -839,7 +839,7 @@ def choose_and_run_redactor(
|
|
| 839 |
|
| 840 |
# Try to connect to AWS Textract Client if using that text extraction method
|
| 841 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 842 |
-
if RUN_AWS_FUNCTIONS
|
| 843 |
print("Connecting to Textract via existing SSO connection")
|
| 844 |
textract_client = boto3.client("textract", region_name=AWS_REGION)
|
| 845 |
elif aws_access_key_textbox and aws_secret_key_textbox:
|
|
@@ -852,7 +852,7 @@ def choose_and_run_redactor(
|
|
| 852 |
aws_secret_access_key=aws_secret_key_textbox,
|
| 853 |
region_name=AWS_REGION,
|
| 854 |
)
|
| 855 |
-
elif RUN_AWS_FUNCTIONS
|
| 856 |
print("Connecting to Textract via existing SSO connection")
|
| 857 |
textract_client = boto3.client("textract", region_name=AWS_REGION)
|
| 858 |
elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
|
|
|
|
| 107 |
validate_path_containment,
|
| 108 |
)
|
| 109 |
|
| 110 |
+
ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES
|
| 111 |
if not MAX_IMAGE_PIXELS:
|
| 112 |
Image.MAX_IMAGE_PIXELS = None
|
| 113 |
else:
|
|
|
|
| 803 |
|
| 804 |
### Load/create PII identification method
|
| 805 |
|
| 806 |
+
# Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is True, otherwise an environment variable or direct textbox input is needed.
|
| 807 |
if pii_identification_method == AWS_PII_OPTION:
|
| 808 |
+
if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
|
| 809 |
print("Connecting to Comprehend via existing SSO connection")
|
| 810 |
comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
|
| 811 |
elif aws_access_key_textbox and aws_secret_key_textbox:
|
|
|
|
| 818 |
aws_secret_access_key=aws_secret_key_textbox,
|
| 819 |
region_name=AWS_REGION,
|
| 820 |
)
|
| 821 |
+
elif RUN_AWS_FUNCTIONS:
|
| 822 |
print("Connecting to Comprehend via existing SSO connection")
|
| 823 |
comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
|
| 824 |
elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
|
|
|
|
| 839 |
|
| 840 |
# Try to connect to AWS Textract Client if using that text extraction method
|
| 841 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 842 |
+
if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
|
| 843 |
print("Connecting to Textract via existing SSO connection")
|
| 844 |
textract_client = boto3.client("textract", region_name=AWS_REGION)
|
| 845 |
elif aws_access_key_textbox and aws_secret_key_textbox:
|
|
|
|
| 852 |
aws_secret_access_key=aws_secret_key_textbox,
|
| 853 |
region_name=AWS_REGION,
|
| 854 |
)
|
| 855 |
+
elif RUN_AWS_FUNCTIONS:
|
| 856 |
print("Connecting to Textract via existing SSO connection")
|
| 857 |
textract_client = boto3.client("textract", region_name=AWS_REGION)
|
| 858 |
elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
|