seanpedrickcase committed on
Commit
5f824f4
·
1 Parent(s): 6806363

Revised environment variables for consistency.

Browse files
README.md CHANGED
@@ -176,8 +176,8 @@ These settings are useful for all users, regardless of whether you are using AWS
176
 
177
  These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
178
 
179
- * `RUN_AWS_FUNCTIONS=1`
180
- * **This is the master switch.** You must set this to `1` to enable any AWS functionality. If it is `0`, all other AWS settings will be ignored.
181
 
182
  * **UI Options:**
183
  * `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
 
176
 
177
  These settings are only relevant if you intend to use AWS services like Textract for OCR and Comprehend for PII detection.
178
 
179
+ * `RUN_AWS_FUNCTIONS=True`
180
+ * **This is the master switch.** You must set this to `True` to enable any AWS functionality. If it is `False`, all other AWS settings will be ignored.
181
 
182
  * **UI Options:**
183
  * `SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True`: Adds "AWS Textract" as an option in the text extraction dropdown.
app.py CHANGED
@@ -5806,7 +5806,7 @@ with blocks:
5806
  if (
5807
  not os.path.exists(ALLOW_LIST_PATH)
5808
  and S3_ALLOW_LIST_PATH
5809
- and RUN_AWS_FUNCTIONS == "1"
5810
  ):
5811
  print("Downloading allow list from S3")
5812
  blocks.load(
@@ -5840,7 +5840,7 @@ with blocks:
5840
  if (
5841
  not os.path.exists(COST_CODES_PATH)
5842
  and S3_COST_CODES_PATH
5843
- and RUN_AWS_FUNCTIONS == "1"
5844
  ):
5845
  print("Downloading cost codes from S3")
5846
  blocks.load(
@@ -6423,9 +6423,9 @@ with blocks:
6423
  default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT),
6424
  )
6425
 
6426
- if RUN_DIRECT_MODE == "0":
6427
  # If running through command line with uvicorn
6428
- if RUN_FASTAPI == "1":
6429
  if ALLOWED_ORIGINS:
6430
  print(f"CORS enabled. Allowing origins: {ALLOWED_ORIGINS}")
6431
  app.add_middleware(
@@ -6448,7 +6448,7 @@ with blocks:
6448
  app,
6449
  blocks,
6450
  show_error=True,
6451
- auth=authenticate_user if COGNITO_AUTH == "1" else None,
6452
  max_file_size=MAX_FILE_SIZE,
6453
  path=FASTAPI_ROOT_PATH,
6454
  favicon_path=Path(FAVICON_PATH),
@@ -6459,7 +6459,7 @@ with blocks:
6459
 
6460
  else:
6461
  if __name__ == "__main__":
6462
- if COGNITO_AUTH == "1":
6463
  blocks.launch(
6464
  show_error=True,
6465
  inbrowser=True,
@@ -6519,7 +6519,7 @@ with blocks:
6519
  "save_logs_to_csv": SAVE_LOGS_TO_CSV,
6520
  "save_logs_to_dynamodb": SAVE_LOGS_TO_DYNAMODB,
6521
  "display_file_names_in_logs": DISPLAY_FILE_NAMES_IN_LOGS,
6522
- "upload_logs_to_s3": RUN_AWS_FUNCTIONS == "1",
6523
  "s3_logs_prefix": S3_USAGE_LOGS_FOLDER,
6524
  "feedback_logs_folder": FEEDBACK_LOGS_FOLDER,
6525
  "access_logs_folder": ACCESS_LOGS_FOLDER,
@@ -6601,20 +6601,3 @@ with blocks:
6601
 
6602
  # Run the CLI main function with direct mode arguments
6603
  main(direct_mode_args=direct_mode_args)
6604
-
6605
- # Combine extraction options
6606
- extraction_options = (
6607
- list(direct_mode_args["handwrite_signature_extraction"])
6608
- if direct_mode_args["handwrite_signature_extraction"]
6609
- else []
6610
- )
6611
- if direct_mode_args["extract_forms"]:
6612
- extraction_options.append("Extract forms")
6613
- if direct_mode_args["extract_tables"]:
6614
- extraction_options.append("Extract tables")
6615
- if direct_mode_args["extract_layout"]:
6616
- extraction_options.append("Extract layout")
6617
- direct_mode_args["handwrite_signature_extraction"] = extraction_options
6618
-
6619
- # Run the CLI main function with direct mode arguments
6620
- main(direct_mode_args=direct_mode_args)
 
5806
  if (
5807
  not os.path.exists(ALLOW_LIST_PATH)
5808
  and S3_ALLOW_LIST_PATH
5809
+ and RUN_AWS_FUNCTIONS
5810
  ):
5811
  print("Downloading allow list from S3")
5812
  blocks.load(
 
5840
  if (
5841
  not os.path.exists(COST_CODES_PATH)
5842
  and S3_COST_CODES_PATH
5843
+ and RUN_AWS_FUNCTIONS
5844
  ):
5845
  print("Downloading cost codes from S3")
5846
  blocks.load(
 
6423
  default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT),
6424
  )
6425
 
6426
+ if not RUN_DIRECT_MODE:
6427
  # If running through command line with uvicorn
6428
+ if RUN_FASTAPI:
6429
  if ALLOWED_ORIGINS:
6430
  print(f"CORS enabled. Allowing origins: {ALLOWED_ORIGINS}")
6431
  app.add_middleware(
 
6448
  app,
6449
  blocks,
6450
  show_error=True,
6451
+ auth=authenticate_user if COGNITO_AUTH else None,
6452
  max_file_size=MAX_FILE_SIZE,
6453
  path=FASTAPI_ROOT_PATH,
6454
  favicon_path=Path(FAVICON_PATH),
 
6459
 
6460
  else:
6461
  if __name__ == "__main__":
6462
+ if COGNITO_AUTH:
6463
  blocks.launch(
6464
  show_error=True,
6465
  inbrowser=True,
 
6519
  "save_logs_to_csv": SAVE_LOGS_TO_CSV,
6520
  "save_logs_to_dynamodb": SAVE_LOGS_TO_DYNAMODB,
6521
  "display_file_names_in_logs": DISPLAY_FILE_NAMES_IN_LOGS,
6522
+ "upload_logs_to_s3": RUN_AWS_FUNCTIONS,
6523
  "s3_logs_prefix": S3_USAGE_LOGS_FOLDER,
6524
  "feedback_logs_folder": FEEDBACK_LOGS_FOLDER,
6525
  "access_logs_folder": ACCESS_LOGS_FOLDER,
 
6601
 
6602
  # Run the CLI main function with direct mode arguments
6603
  main(direct_mode_args=direct_mode_args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cdk/cdk_config.py CHANGED
@@ -6,6 +6,18 @@ from dotenv import load_dotenv
6
  # Set or retrieve configuration variables for CDK redaction deployment
7
 
8
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
10
  """
11
  Get an environmental variable, and set it to a default value if it doesn't exist
 
6
  # Set or retrieve configuration variables for CDK redaction deployment
7
 
8
 
9
def convert_string_to_boolean(value: str) -> bool:
    """Convert a boolean-like string to a real ``bool``.

    Values that are already ``bool`` are returned unchanged. Strings are
    compared case-insensitively after stripping surrounding whitespace, so
    ``" true "`` and ``"TrUe"`` are accepted alongside the spellings
    ``"True"``, ``"true"``, ``"TRUE"`` and ``"1"`` (and the false
    counterparts ``"False"``, ``"false"``, ``"FALSE"`` and ``"0"``).

    Args:
        value: The string (or bool) to interpret.

    Returns:
        The boolean that ``value`` represents.

    Raises:
        ValueError: If ``value`` is neither a bool nor one of the
            recognised spellings.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        # Environment values often carry stray whitespace or mixed case;
        # normalise once instead of enumerating every spelling.
        normalised = value.strip().lower()
        if normalised in ("true", "1"):
            return True
        if normalised in ("false", "0"):
            return False
    raise ValueError(f"Invalid boolean value: {value}")
19
+
20
+
21
  def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
22
  """
23
  Get an environmental variable, and set it to a default value if it doesn't exist
cdk/cdk_functions.py CHANGED
@@ -1335,8 +1335,8 @@ def create_basic_config_env(
1335
  Create a basic config.env file for the user to use with their newly deployed redaction app.
1336
  """
1337
  variables = {
1338
- "COGNITO_AUTH": "1",
1339
- "RUN_AWS_FUNCTIONS": "1",
1340
  "DISPLAY_FILE_NAMES_IN_LOGS": "False",
1341
  "SESSION_OUTPUT_FOLDER": "True",
1342
  "SAVE_LOGS_TO_DYNAMODB": "True",
 
1335
  Create a basic config.env file for the user to use with their newly deployed redaction app.
1336
  """
1337
  variables = {
1338
+ "COGNITO_AUTH": "True",
1339
+ "RUN_AWS_FUNCTIONS": "True",
1340
  "DISPLAY_FILE_NAMES_IN_LOGS": "False",
1341
  "SESSION_OUTPUT_FOLDER": "True",
1342
  "SAVE_LOGS_TO_DYNAMODB": "True",
cli_redact.py CHANGED
@@ -341,7 +341,7 @@ python cli_redact.py --task textract --textract_action list
341
  )
342
  general_group.add_argument(
343
  "--upload_logs_to_s3",
344
- default=RUN_AWS_FUNCTIONS == "1",
345
  help="Upload log files to S3 after processing.",
346
  )
347
  general_group.add_argument(
@@ -762,6 +762,8 @@ python cli_redact.py --task textract --textract_action list
762
  output_folder=args.output_dir,
763
  input_folder=args.input_dir,
764
  prepare_images=args.prepare_images,
 
 
765
  )
766
  print(f"Preparation complete. {prep_summary}")
767
 
 
341
  )
342
  general_group.add_argument(
343
  "--upload_logs_to_s3",
344
+ default=RUN_AWS_FUNCTIONS,
345
  help="Upload log files to S3 after processing.",
346
  )
347
  general_group.add_argument(
 
762
  output_folder=args.output_dir,
763
  input_folder=args.input_dir,
764
  prepare_images=args.prepare_images,
765
+ page_min=args.page_min,
766
+ page_max=args.page_max,
767
  )
768
  print(f"Preparation complete. {prep_summary}")
769
 
example_config.env CHANGED
@@ -6,7 +6,7 @@ CHOSEN_LOCAL_OCR_MODEL=tesseract
6
  SESSION_OUTPUT_FOLDER=False
7
  DISPLAY_FILE_NAMES_IN_LOGS=False
8
 
9
- RUN_AWS_FUNCTIONS=1 # Set to 0 if you don't want to run AWS functions
10
  SAVE_LOGS_TO_DYNAMODB=True
11
  S3_COST_CODES_PATH=cost_codes.csv
12
  SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
 
6
  SESSION_OUTPUT_FOLDER=False
7
  DISPLAY_FILE_NAMES_IN_LOGS=False
8
 
9
+ RUN_AWS_FUNCTIONS=True # Set to False if you don't want to run AWS functions
10
  SAVE_LOGS_TO_DYNAMODB=True
11
  S3_COST_CODES_PATH=cost_codes.csv
12
  SHOW_AWS_TEXT_EXTRACTION_OPTIONS=True
lambda_entrypoint.py CHANGED
@@ -43,6 +43,18 @@ def _get_env_list(env_var_name: str | list[str] | None) -> list[str]:
43
  return [s.strip() for s in value.split(",") if s.strip()]
44
 
45
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  print("Lambda entrypoint loading...")
47
 
48
  # Initialize S3 client outside the handler for connection reuse
@@ -293,8 +305,10 @@ def lambda_handler(event, context):
293
  "username": arguments.get(
294
  "username", os.getenv("DIRECT_MODE_DEFAULT_USER", "lambda_user")
295
  ),
296
- "save_to_user_folders": arguments.get(
297
- "save_to_user_folders", os.getenv("SESSION_OUTPUT_FOLDER", "False")
 
 
298
  ),
299
  "local_redact_entities": _get_env_list(
300
  arguments.get(
@@ -312,20 +326,26 @@ def lambda_handler(event, context):
312
  "aws_region": os.getenv("AWS_REGION", ""),
313
  "s3_bucket": bucket_name,
314
  "do_initial_clean": arguments.get(
315
- "do_initial_clean", os.getenv("DO_INITIAL_TABULAR_DATA_CLEAN", "False")
 
 
 
316
  ),
317
- "save_logs_to_csv": arguments.get(
318
- "save_logs_to_csv", os.getenv("SAVE_LOGS_TO_CSV", "True")
319
  ),
320
  "save_logs_to_dynamodb": arguments.get(
321
- "save_logs_to_dynamodb", os.getenv("SAVE_LOGS_TO_DYNAMODB", "False")
 
322
  ),
323
- "display_file_names_in_logs": arguments.get(
324
- "display_file_names_in_logs",
325
- os.getenv("DISPLAY_FILE_NAMES_IN_LOGS", "True"),
 
 
326
  ),
327
- "upload_logs_to_s3": arguments.get(
328
- "upload_logs_to_s3", os.getenv("RUN_AWS_FUNCTIONS", "False")
329
  ),
330
  "s3_logs_prefix": arguments.get(
331
  "s3_logs_prefix", os.getenv("S3_USAGE_LOGS_FOLDER", "")
@@ -364,15 +384,21 @@ def lambda_handler(event, context):
364
  "chosen_local_ocr_model": arguments.get(
365
  "chosen_local_ocr_model", os.getenv("CHOSEN_LOCAL_OCR_MODEL", "tesseract")
366
  ),
367
- "preprocess_local_ocr_images": arguments.get(
368
- "preprocess_local_ocr_images",
369
- os.getenv("PREPROCESS_LOCAL_OCR_IMAGES", "False"),
 
 
370
  ),
371
- "compress_redacted_pdf": arguments.get(
372
- "compress_redacted_pdf", os.getenv("COMPRESS_REDACTED_PDF", "False")
 
 
373
  ),
374
- "return_pdf_end_of_redaction": arguments.get(
375
- "return_pdf_end_of_redaction", os.getenv("RETURN_REDACTED_PDF", "True")
 
 
376
  ),
377
  "deny_list_file": arguments.get(
378
  "deny_list_file", os.getenv("DENY_LIST_PATH", "")
@@ -392,17 +418,23 @@ def lambda_handler(event, context):
392
  ),
393
  )
394
  ),
395
- "extract_forms": arguments.get(
396
- "extract_forms",
397
- os.getenv("INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION", "False") == "True",
 
 
398
  ),
399
- "extract_tables": arguments.get(
400
- "extract_tables",
401
- os.getenv("INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION", "False") == "True",
 
 
402
  ),
403
- "extract_layout": arguments.get(
404
- "extract_layout",
405
- os.getenv("INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION", "False") == "True",
 
 
406
  ),
407
  # Word/Tabular Anonymisation Arguments
408
  "anon_strategy": arguments.get(
@@ -424,9 +456,11 @@ def lambda_handler(event, context):
424
  ),
425
  )
426
  ),
427
- "match_fuzzy_whole_phrase_bool": arguments.get(
428
- "match_fuzzy_whole_phrase_bool",
429
- os.getenv("MATCH_FUZZY_WHOLE_PHRASE_BOOL", "True"),
 
 
430
  ),
431
  # Duplicate Detection Arguments
432
  "duplicate_type": arguments.get(
@@ -455,19 +489,25 @@ def lambda_handler(event, context):
455
  ),
456
  )
457
  ),
458
- "greedy_match": arguments.get(
459
- "greedy_match", os.getenv("USE_GREEDY_DUPLICATE_DETECTION", "False")
 
 
460
  ),
461
- "combine_pages": arguments.get(
462
- "combine_pages", os.getenv("DEFAULT_COMBINE_PAGES", "True")
463
  ),
464
- "remove_duplicate_rows": arguments.get(
465
- "remove_duplicate_rows", os.getenv("REMOVE_DUPLICATE_ROWS", "False")
 
 
466
  ),
467
  # Textract Batch Operations Arguments
468
  "textract_action": arguments.get("textract_action", ""),
469
  "job_id": arguments.get("job_id", ""),
470
- "extract_signatures": arguments.get("extract_signatures", False),
 
 
471
  "textract_bucket": arguments.get(
472
  "textract_bucket", os.getenv("TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET", "")
473
  ),
@@ -492,7 +532,9 @@ def lambda_handler(event, context):
492
  "search_query": arguments.get(
493
  "search_query", os.getenv("DEFAULT_SEARCH_QUERY", "")
494
  ),
495
- "prepare_images": arguments.get("prepare_images", True),
 
 
496
  }
497
 
498
  # Debug: Print the final page_min and page_max values
 
43
  return [s.strip() for s in value.split(",") if s.strip()]
44
 
45
 
46
def convert_string_to_boolean(value: str) -> bool:
    """Convert a boolean-like string to a real ``bool``.

    Values that are already ``bool`` are returned unchanged. Strings are
    compared case-insensitively after stripping surrounding whitespace, so
    ``" true "`` and ``"TrUe"`` are accepted alongside the spellings
    ``"True"``, ``"true"``, ``"TRUE"`` and ``"1"`` (and the false
    counterparts ``"False"``, ``"false"``, ``"FALSE"`` and ``"0"``).

    Args:
        value: The string (or bool) to interpret.

    Returns:
        The boolean that ``value`` represents.

    Raises:
        ValueError: If ``value`` is neither a bool nor one of the
            recognised spellings.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        # Event arguments and environment values may carry stray whitespace
        # or mixed case; normalise once instead of enumerating spellings.
        normalised = value.strip().lower()
        if normalised in ("true", "1"):
            return True
        if normalised in ("false", "0"):
            return False
    raise ValueError(f"Invalid boolean value: {value}")
56
+
57
+
58
  print("Lambda entrypoint loading...")
59
 
60
  # Initialize S3 client outside the handler for connection reuse
 
305
  "username": arguments.get(
306
  "username", os.getenv("DIRECT_MODE_DEFAULT_USER", "lambda_user")
307
  ),
308
+ "save_to_user_folders": convert_string_to_boolean(
309
+ arguments.get(
310
+ "save_to_user_folders", os.getenv("SESSION_OUTPUT_FOLDER", "False")
311
+ )
312
  ),
313
  "local_redact_entities": _get_env_list(
314
  arguments.get(
 
326
  "aws_region": os.getenv("AWS_REGION", ""),
327
  "s3_bucket": bucket_name,
328
  "do_initial_clean": arguments.get(
329
+ "do_initial_clean",
330
+ convert_string_to_boolean(
331
+ os.getenv("DO_INITIAL_TABULAR_DATA_CLEAN", "False")
332
+ ),
333
  ),
334
+ "save_logs_to_csv": convert_string_to_boolean(
335
+ arguments.get("save_logs_to_csv", os.getenv("SAVE_LOGS_TO_CSV", "True"))
336
  ),
337
  "save_logs_to_dynamodb": arguments.get(
338
+ "save_logs_to_dynamodb",
339
+ convert_string_to_boolean(os.getenv("SAVE_LOGS_TO_DYNAMODB", "False")),
340
  ),
341
+ "display_file_names_in_logs": convert_string_to_boolean(
342
+ arguments.get(
343
+ "display_file_names_in_logs",
344
+ os.getenv("DISPLAY_FILE_NAMES_IN_LOGS", "True"),
345
+ )
346
  ),
347
+ "upload_logs_to_s3": convert_string_to_boolean(
348
+ arguments.get("upload_logs_to_s3", os.getenv("RUN_AWS_FUNCTIONS", "False"))
349
  ),
350
  "s3_logs_prefix": arguments.get(
351
  "s3_logs_prefix", os.getenv("S3_USAGE_LOGS_FOLDER", "")
 
384
  "chosen_local_ocr_model": arguments.get(
385
  "chosen_local_ocr_model", os.getenv("CHOSEN_LOCAL_OCR_MODEL", "tesseract")
386
  ),
387
+ "preprocess_local_ocr_images": convert_string_to_boolean(
388
+ arguments.get(
389
+ "preprocess_local_ocr_images",
390
+ os.getenv("PREPROCESS_LOCAL_OCR_IMAGES", "True"),
391
+ )
392
  ),
393
+ "compress_redacted_pdf": convert_string_to_boolean(
394
+ arguments.get(
395
+ "compress_redacted_pdf", os.getenv("COMPRESS_REDACTED_PDF", "True")
396
+ )
397
  ),
398
+ "return_pdf_end_of_redaction": convert_string_to_boolean(
399
+ arguments.get(
400
+ "return_pdf_end_of_redaction", os.getenv("RETURN_REDACTED_PDF", "True")
401
+ )
402
  ),
403
  "deny_list_file": arguments.get(
404
  "deny_list_file", os.getenv("DENY_LIST_PATH", "")
 
418
  ),
419
  )
420
  ),
421
+ "extract_forms": convert_string_to_boolean(
422
+ arguments.get(
423
+ "extract_forms",
424
+ os.getenv("INCLUDE_FORM_EXTRACTION_TEXTRACT_OPTION", "False"),
425
+ )
426
  ),
427
+ "extract_tables": convert_string_to_boolean(
428
+ arguments.get(
429
+ "extract_tables",
430
+ os.getenv("INCLUDE_TABLE_EXTRACTION_TEXTRACT_OPTION", "False"),
431
+ )
432
  ),
433
+ "extract_layout": convert_string_to_boolean(
434
+ arguments.get(
435
+ "extract_layout",
436
+ os.getenv("INCLUDE_LAYOUT_EXTRACTION_TEXTRACT_OPTION", "False"),
437
+ )
438
  ),
439
  # Word/Tabular Anonymisation Arguments
440
  "anon_strategy": arguments.get(
 
456
  ),
457
  )
458
  ),
459
+ "match_fuzzy_whole_phrase_bool": convert_string_to_boolean(
460
+ arguments.get(
461
+ "match_fuzzy_whole_phrase_bool",
462
+ os.getenv("MATCH_FUZZY_WHOLE_PHRASE_BOOL", "True"),
463
+ )
464
  ),
465
  # Duplicate Detection Arguments
466
  "duplicate_type": arguments.get(
 
489
  ),
490
  )
491
  ),
492
+ "greedy_match": convert_string_to_boolean(
493
+ arguments.get(
494
+ "greedy_match", os.getenv("USE_GREEDY_DUPLICATE_DETECTION", "False")
495
+ )
496
  ),
497
+ "combine_pages": convert_string_to_boolean(
498
+ arguments.get("combine_pages", os.getenv("DEFAULT_COMBINE_PAGES", "True"))
499
  ),
500
+ "remove_duplicate_rows": convert_string_to_boolean(
501
+ arguments.get(
502
+ "remove_duplicate_rows", os.getenv("REMOVE_DUPLICATE_ROWS", "False")
503
+ )
504
  ),
505
  # Textract Batch Operations Arguments
506
  "textract_action": arguments.get("textract_action", ""),
507
  "job_id": arguments.get("job_id", ""),
508
+ "extract_signatures": convert_string_to_boolean(
509
+ arguments.get("extract_signatures", "False")
510
+ ),
511
  "textract_bucket": arguments.get(
512
  "textract_bucket", os.getenv("TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET", "")
513
  ),
 
532
  "search_query": arguments.get(
533
  "search_query", os.getenv("DEFAULT_SEARCH_QUERY", "")
534
  ),
535
+ "prepare_images": convert_string_to_boolean(
536
+ arguments.get("prepare_images", "True")
537
+ ),
538
  }
539
 
540
  # Debug: Print the final page_min and page_max values
src/app_settings.qmd CHANGED
@@ -28,8 +28,8 @@ This section covers configurations related to AWS services used by the applicati
28
  * **Configuration:** Set as an environment variable directly. This variable defines an additional source for AWS-specific configurations.
29
 
30
  * **`RUN_AWS_FUNCTIONS`**
31
- * **Description:** Enables or disables AWS-specific functionalities within the application. Set to `"1"` to enable and `"0"` to disable.
32
- * **Default Value:** `"0"`
33
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
34
 
35
  * **`AWS_REGION`**
@@ -392,13 +392,13 @@ General runtime configurations for the application.
392
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
393
 
394
  * **`COGNITO_AUTH`**
395
- * **Description:** Enables or disables AWS Cognito authentication for the application. Set to `'1'` to enable.
396
- * **Default Value:** `'0'`
397
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
398
 
399
  * **`RUN_DIRECT_MODE`**
400
- * **Description:** If set to `'1'`, runs the application in a "direct mode", which might alter certain behaviors (e.g., UI elements, processing flow).
401
- * **Default Value:** `'0'`
402
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
403
 
404
  * **`MAX_QUEUE_SIZE`**
 
28
  * **Configuration:** Set as an environment variable directly. This variable defines an additional source for AWS-specific configurations.
29
 
30
  * **`RUN_AWS_FUNCTIONS`**
31
+ * **Description:** Enables or disables AWS-specific functionalities within the application. Set to `"True"` to enable and `"False"` to disable.
32
+ * **Default Value:** `"False"`
33
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
34
 
35
  * **`AWS_REGION`**
 
392
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
393
 
394
  * **`COGNITO_AUTH`**
395
+ * **Description:** Enables or disables AWS Cognito authentication for the application. Set to `'True'` to enable.
396
+ * **Default Value:** `'False'`
397
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env` (or `config/aws_config.env` if `AWS_CONFIG_PATH` is configured).
398
 
399
  * **`RUN_DIRECT_MODE`**
400
+ * **Description:** If set to `'True'`, runs the application in a "direct mode", which might alter certain behaviors (e.g., UI elements, processing flow).
401
+ * **Default Value:** `'False'`
402
  * **Configuration:** Set as an environment variable directly, or include in `config/app_config.env`.
403
 
404
  * **`MAX_QUEUE_SIZE`**
src/installation_guide.qmd CHANGED
@@ -83,7 +83,7 @@ AWS_ACCOUNT_ID=1234567890 # AWS account ID that has administrator access that yo
83
  CDK_FOLDER=C:/path_to_cdk_folder/ # The place where the cdk folder code is located
84
  CONTEXT_FILE=C:/path_to_cdk_folder/cdk.context.json
85
  COGNITO_USER_POOL_DOMAIN_PREFIX=redaction-12345 # The prefix of the login / user sign up domain that you want to use with Cognito login. Should not contain the terms amazon, aws, or cognito.
86
- COGNITO_AUTH=1 # Do you want to do in-app authentication (username and password only, not necessary if you are using an SSL certificate as recommended below)
87
  USE_CLOUDFRONT=True # Recommended. If you intend to use CloudFront as the front URL to your application load balancer (ALB). This has some extra security features that you won't get with just an ALB, e.g. limiting app access by country.
88
  RUN_USEAST_STACK=False # Set this to True only if you have permissions to create a Cloudfront distribution and web ACL on top of it in the us-east-1 region. If you don't, the section below shows how you can create the CloudFront resource manually and map it to your application load balancer (as you should have permissions for that if you are admin in your region).
89
  CLOUDFRONT_DOMAIN=<example>.cloudfront.net # If you already know the domain of the CloudFront distribution that you want to use, you can add this here.
@@ -155,7 +155,7 @@ if you want to do this manually:
155
  Create a `config.env` file to upload to the S3 bucket that has at least the following variables:
156
 
157
  ```ini
158
- COGNITO_AUTH=1 # If you are using an SSL certificate with your application load balancer, you will be logging in there. Set this to 0 to turn off the default login screen.
159
  RUN_AWS_FUNCTIONS=1 # This will enable the app to communicate with AWS services.
160
  SESSION_OUTPUT_FOLDER=True # This will put outputs for each user in separate output folders.
161
  ```
 
83
  CDK_FOLDER=C:/path_to_cdk_folder/ # The place where the cdk folder code is located
84
  CONTEXT_FILE=C:/path_to_cdk_folder/cdk.context.json
85
  COGNITO_USER_POOL_DOMAIN_PREFIX=redaction-12345 # The prefix of the login / user sign up domain that you want to use with Cognito login. Should not contain the terms amazon, aws, or cognito.
86
+ COGNITO_AUTH=0 # Do you want to do in-app authentication (username and password only, not necessary if you are using an SSL certificate as recommended below)
87
  USE_CLOUDFRONT=True # Recommended. If you intend to use CloudFront as the front URL to your application load balancer (ALB). This has some extra security features that you won't get with just an ALB, e.g. limiting app access by country.
88
  RUN_USEAST_STACK=False # Set this to True only if you have permissions to create a Cloudfront distribution and web ACL on top of it in the us-east-1 region. If you don't, the section below shows how you can create the CloudFront resource manually and map it to your application load balancer (as you should have permissions for that if you are admin in your region).
89
  CLOUDFRONT_DOMAIN=<example>.cloudfront.net # If you already know the domain of the CloudFront distribution that you want to use, you can add this here.
 
155
  Create a `config.env` file to upload to the S3 bucket that has at least the following variables:
156
 
157
  ```ini
158
+ COGNITO_AUTH=0 # If you are using an SSL certificate with your application load balancer, you will be logging in there, so 0 keeps the default in-app login screen turned off. Set this to 1 to turn it on.
159
  RUN_AWS_FUNCTIONS=1 # This will enable the app to communicate with AWS services.
160
  SESSION_OUTPUT_FOLDER=True # This will put outputs for each user in separate output folders.
161
  ```
tools/aws_functions.py CHANGED
@@ -29,7 +29,7 @@ def get_assumed_role_info():
29
  return assumed_role_arn, assumed_role_name
30
 
31
 
32
- if RUN_AWS_FUNCTIONS == "1":
33
  try:
34
  session = boto3.Session(region_name=AWS_REGION)
35
 
@@ -52,10 +52,10 @@ def download_file_from_s3(
52
  bucket_name: str,
53
  key: str,
54
  local_file_path_and_name: str,
55
- RUN_AWS_FUNCTIONS: str = RUN_AWS_FUNCTIONS,
56
  ):
57
 
58
- if RUN_AWS_FUNCTIONS == "1":
59
 
60
  try:
61
  # Ensure the local directory exists
@@ -74,12 +74,12 @@ def download_folder_from_s3(
74
  bucket_name: str,
75
  s3_folder: str,
76
  local_folder: str,
77
- RUN_AWS_FUNCTIONS: str = RUN_AWS_FUNCTIONS,
78
  ):
79
  """
80
  Download all files from an S3 folder to a local folder.
81
  """
82
- if RUN_AWS_FUNCTIONS == "1":
83
  if bucket_name and s3_folder and local_folder:
84
 
85
  s3 = boto3.client("s3", region_name=AWS_REGION)
@@ -117,13 +117,13 @@ def download_files_from_s3(
117
  s3_folder: str,
118
  local_folder: str,
119
  filenames: List[str],
120
- RUN_AWS_FUNCTIONS: str = RUN_AWS_FUNCTIONS,
121
  ):
122
  """
123
  Download specific files from an S3 folder to a local folder.
124
  """
125
 
126
- if RUN_AWS_FUNCTIONS == "1":
127
  if bucket_name and s3_folder and local_folder and filenames:
128
 
129
  s3 = boto3.client("s3", region_name=AWS_REGION)
@@ -169,7 +169,7 @@ def upload_file_to_s3(
169
  local_file_paths: List[str],
170
  s3_key: str,
171
  s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
172
- RUN_AWS_FUNCTIONS: str = RUN_AWS_FUNCTIONS,
173
  ):
174
  """
175
  Uploads a file from local machine to Amazon S3.
@@ -182,10 +182,10 @@ def upload_file_to_s3(
182
  Returns:
183
  - Message as variable/printed to console
184
  """
185
- final_out_message = []
186
  final_out_message_str = ""
187
 
188
- if RUN_AWS_FUNCTIONS == "1":
189
  try:
190
  if s3_bucket and s3_key and local_file_paths:
191
 
@@ -236,8 +236,8 @@ def upload_log_file_to_s3(
236
  local_file_paths: List[str],
237
  s3_key: str,
238
  s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
239
- RUN_AWS_FUNCTIONS: str = RUN_AWS_FUNCTIONS,
240
- SAVE_LOGS_TO_CSV: str = SAVE_LOGS_TO_CSV,
241
  ):
242
  """
243
  Uploads a log file from local machine to Amazon S3.
@@ -250,10 +250,10 @@ def upload_log_file_to_s3(
250
  Returns:
251
  - Message as variable/printed to console
252
  """
253
- final_out_message = []
254
  final_out_message_str = ""
255
 
256
- if RUN_AWS_FUNCTIONS == "1" and SAVE_LOGS_TO_CSV is True:
257
  try:
258
  if s3_bucket and s3_key and local_file_paths:
259
 
 
29
  return assumed_role_arn, assumed_role_name
30
 
31
 
32
+ if RUN_AWS_FUNCTIONS:
33
  try:
34
  session = boto3.Session(region_name=AWS_REGION)
35
 
 
52
  bucket_name: str,
53
  key: str,
54
  local_file_path_and_name: str,
55
+ RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
56
  ):
57
 
58
+ if RUN_AWS_FUNCTIONS:
59
 
60
  try:
61
  # Ensure the local directory exists
 
74
  bucket_name: str,
75
  s3_folder: str,
76
  local_folder: str,
77
+ RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
78
  ):
79
  """
80
  Download all files from an S3 folder to a local folder.
81
  """
82
+ if RUN_AWS_FUNCTIONS:
83
  if bucket_name and s3_folder and local_folder:
84
 
85
  s3 = boto3.client("s3", region_name=AWS_REGION)
 
117
  s3_folder: str,
118
  local_folder: str,
119
  filenames: List[str],
120
+ RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
121
  ):
122
  """
123
  Download specific files from an S3 folder to a local folder.
124
  """
125
 
126
+ if RUN_AWS_FUNCTIONS:
127
  if bucket_name and s3_folder and local_folder and filenames:
128
 
129
  s3 = boto3.client("s3", region_name=AWS_REGION)
 
169
  local_file_paths: List[str],
170
  s3_key: str,
171
  s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
172
+ RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
173
  ):
174
  """
175
  Uploads a file from local machine to Amazon S3.
 
182
  Returns:
183
  - Message as variable/printed to console
184
  """
185
+ final_out_message = list()
186
  final_out_message_str = ""
187
 
188
+ if RUN_AWS_FUNCTIONS:
189
  try:
190
  if s3_bucket and s3_key and local_file_paths:
191
 
 
236
  local_file_paths: List[str],
237
  s3_key: str,
238
  s3_bucket: str = DOCUMENT_REDACTION_BUCKET,
239
+ RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
240
+ SAVE_LOGS_TO_CSV: bool = SAVE_LOGS_TO_CSV,
241
  ):
242
  """
243
  Uploads a log file from local machine to Amazon S3.
 
250
  Returns:
251
  - Message as variable/printed to console
252
  """
253
+ final_out_message = list()
254
  final_out_message_str = ""
255
 
256
+ if RUN_AWS_FUNCTIONS and SAVE_LOGS_TO_CSV:
257
  try:
258
  if s3_bucket and s3_key and local_file_paths:
259
 
tools/aws_textract.py CHANGED
@@ -38,8 +38,8 @@ def analyse_page_with_textract(
38
  textract_output_found: bool = False,
39
  aws_access_question_textbox: str = AWS_ACCESS_KEY,
40
  aws_secret_question_textbox: str = AWS_SECRET_KEY,
41
- RUN_AWS_FUNCTIONS: str = RUN_AWS_FUNCTIONS,
42
- PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS: str = PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
43
  ):
44
  """
45
  Analyzes a single page of a document using AWS Textract to extract text and other features.
@@ -62,12 +62,12 @@ def analyse_page_with_textract(
62
  SSO or environment variables. Defaults to AWS_ACCESS_KEY.
63
  aws_secret_question_textbox (str, optional): AWS secret question provided by the user, if not using
64
  SSO or environment variables. Defaults to AWS_SECRET_KEY.
65
- RUN_AWS_FUNCTIONS (str, optional): Configuration flag (e.g., "1" or "0") to enable or
66
  disable AWS functions. Defaults to RUN_AWS_FUNCTIONS.
67
- PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS (str, optional): Configuration flag (e.g., "1" or "0")
68
  to prioritize AWS SSO credentials
69
  over environment variables.
70
- Defaults to PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS.
71
 
72
  Returns:
73
  Tuple[List[Dict], str]: A tuple containing:
@@ -79,10 +79,7 @@ def analyse_page_with_textract(
79
  if client == "":
80
  try:
81
  # Try to connect to AWS Textract Client if using that text extraction method
82
- if (
83
- RUN_AWS_FUNCTIONS == "1"
84
- and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1"
85
- ):
86
  print("Connecting to Textract via existing SSO connection")
87
  client = boto3.client("textract", region_name=AWS_REGION)
88
  elif aws_access_question_textbox and aws_secret_question_textbox:
@@ -95,7 +92,7 @@ def analyse_page_with_textract(
95
  aws_secret_access_question=aws_secret_question_textbox,
96
  region_name=AWS_REGION,
97
  )
98
- elif RUN_AWS_FUNCTIONS == "1":
99
  print("Connecting to Textract via existing SSO connection")
100
  client = boto3.client("textract", region_name=AWS_REGION)
101
  elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
 
38
  textract_output_found: bool = False,
39
  aws_access_question_textbox: str = AWS_ACCESS_KEY,
40
  aws_secret_question_textbox: str = AWS_SECRET_KEY,
41
+ RUN_AWS_FUNCTIONS: bool = RUN_AWS_FUNCTIONS,
42
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS: bool = PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
43
  ):
44
  """
45
  Analyzes a single page of a document using AWS Textract to extract text and other features.
 
62
  SSO or environment variables. Defaults to AWS_ACCESS_KEY.
63
  aws_secret_question_textbox (str, optional): AWS secret question provided by the user, if not using
64
  SSO or environment variables. Defaults to AWS_SECRET_KEY.
65
+ RUN_AWS_FUNCTIONS (bool, optional): Configuration flag to enable or
66
  disable AWS functions. Defaults to RUN_AWS_FUNCTIONS.
67
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS (bool, optional): Configuration flag (e.g., True or False)
68
  to prioritize AWS SSO credentials
69
  over environment variables.
70
+ Defaults to True.
71
 
72
  Returns:
73
  Tuple[List[Dict], str]: A tuple containing:
 
79
  if client == "":
80
  try:
81
  # Try to connect to AWS Textract Client if using that text extraction method
82
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
 
 
 
83
  print("Connecting to Textract via existing SSO connection")
84
  client = boto3.client("textract", region_name=AWS_REGION)
85
  elif aws_access_question_textbox and aws_secret_question_textbox:
 
92
  aws_secret_access_question=aws_secret_question_textbox,
93
  region_name=AWS_REGION,
94
  )
95
+ elif RUN_AWS_FUNCTIONS is True:
96
  print("Connecting to Textract via existing SSO connection")
97
  client = boto3.client("textract", region_name=AWS_REGION)
98
  elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
tools/cli_usage_logger.py CHANGED
@@ -112,11 +112,11 @@ class CLIUsageLogger:
112
  """
113
  # Use config defaults if not specified
114
  if save_to_csv is None:
115
- save_to_csv = SAVE_LOGS_TO_CSV == "True"
116
  if save_to_dynamodb is None:
117
- save_to_dynamodb = SAVE_LOGS_TO_DYNAMODB == "True"
118
  if save_to_s3 is None:
119
- save_to_s3 = RUN_AWS_FUNCTIONS == "1" and SAVE_LOGS_TO_CSV == "True"
120
  if s3_bucket is None:
121
  s3_bucket = DOCUMENT_REDACTION_BUCKET
122
  if s3_key_prefix is None:
 
112
  """
113
  # Use config defaults if not specified
114
  if save_to_csv is None:
115
+ save_to_csv = SAVE_LOGS_TO_CSV
116
  if save_to_dynamodb is None:
117
+ save_to_dynamodb = SAVE_LOGS_TO_DYNAMODB
118
  if save_to_s3 is None:
119
+ save_to_s3 = RUN_AWS_FUNCTIONS and SAVE_LOGS_TO_CSV
120
  if s3_bucket is None:
121
  s3_bucket = DOCUMENT_REDACTION_BUCKET
122
  if s3_key_prefix is None:
tools/config.py CHANGED
@@ -25,6 +25,18 @@ def _get_env_list(env_var_name: str) -> List[str]:
25
  # Set or retrieve configuration variables for the redaction app
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
29
  """
30
  Get an environmental variable, and set it to a default value if it doesn't exist
@@ -100,7 +112,9 @@ if AWS_CONFIG_PATH:
100
  else:
101
  print("AWS config file not found at location:", AWS_CONFIG_PATH)
102
 
103
- RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
 
 
104
 
105
  AWS_REGION = get_or_create_env_var("AWS_REGION", "")
106
 
@@ -119,8 +133,8 @@ AWS_SECRET_KEY = get_or_create_env_var("AWS_SECRET_KEY", "")
119
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var("DOCUMENT_REDACTION_BUCKET", "")
120
 
121
  # Should the app prioritise using AWS SSO over using API keys stored in environment variables/secrets (defaults to yes)
122
- PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS = get_or_create_env_var(
123
- "PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS", "1"
124
  )
125
 
126
  # Custom headers e.g. if routing traffic through Cloudfront
@@ -134,7 +148,9 @@ CUSTOM_HEADER_VALUE = get_or_create_env_var("CUSTOM_HEADER_VALUE", "")
134
  # Image options
135
  ###
136
  IMAGES_DPI = float(get_or_create_env_var("IMAGES_DPI", "300.0"))
137
- LOAD_TRUNCATED_IMAGES = get_or_create_env_var("LOAD_TRUNCATED_IMAGES", "True")
 
 
138
  MAX_IMAGE_PIXELS = get_or_create_env_var(
139
  "MAX_IMAGE_PIXELS", ""
140
  ) # Changed to None if blank in file_conversion.py
@@ -173,15 +189,19 @@ MPLCONFIGDIR = get_or_create_env_var("MPLCONFIGDIR", "") # Matplotlib cache fol
173
  # By default, logs are put into a subfolder named for today's date and the host name of the instance running the app. This avoids, as far as possible, log files from one instance overwriting the logs of another instance on S3. If the app always runs on one system, or just locally, it is not necessary to make the log folders so specific.
174
  # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
175
 
176
- SAVE_LOGS_TO_CSV = get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
 
 
177
 
178
- USE_LOG_SUBFOLDERS = get_or_create_env_var("USE_LOG_SUBFOLDERS", "True")
 
 
179
 
180
  FEEDBACK_LOGS_FOLDER = get_or_create_env_var("FEEDBACK_LOGS_FOLDER", "feedback/")
181
  ACCESS_LOGS_FOLDER = get_or_create_env_var("ACCESS_LOGS_FOLDER", "logs/")
182
  USAGE_LOGS_FOLDER = get_or_create_env_var("USAGE_LOGS_FOLDER", "usage/")
183
 
184
- if USE_LOG_SUBFOLDERS == "True":
185
  day_log_subfolder = today_rev + "/"
186
  host_name_subfolder = HOST_NAME + "/"
187
  full_log_subfolder = day_log_subfolder + host_name_subfolder
@@ -201,8 +221,8 @@ S3_USAGE_LOGS_FOLDER = get_or_create_env_var(
201
  )
202
 
203
  # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
204
- DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var(
205
- "DISPLAY_FILE_NAMES_IN_LOGS", "False"
206
  )
207
 
208
  # Further customisation options for CSV logs
@@ -218,7 +238,9 @@ CSV_USAGE_LOG_HEADERS = get_or_create_env_var(
218
  ) # If blank, uses component labels
219
 
220
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
221
- SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "False")
 
 
222
 
223
  ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
224
  "ACCESS_LOG_DYNAMODB_TABLE_NAME", "redaction_access_log"
@@ -238,9 +260,9 @@ USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
238
  DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var("DYNAMODB_USAGE_LOG_HEADERS", "")
239
 
240
  # Report logging to console?
241
- LOGGING = get_or_create_env_var("LOGGING", "False")
242
 
243
- if LOGGING == "True":
244
  # Configure logging
245
  logging.basicConfig(
246
  level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
@@ -257,7 +279,7 @@ FEEDBACK_LOG_FILE_NAME = get_or_create_env_var("FEEDBACK_LOG_FILE_NAME", LOG_FIL
257
 
258
  FAVICON_PATH = get_or_create_env_var("FAVICON_PATH", "favicon.png")
259
 
260
- RUN_FASTAPI = get_or_create_env_var("RUN_FASTAPI", "0")
261
 
262
  MAX_QUEUE_SIZE = int(get_or_create_env_var("MAX_QUEUE_SIZE", "5"))
263
 
@@ -291,8 +313,8 @@ MAX_OPEN_TEXT_CHARACTERS = int(
291
  )
292
 
293
  # When loading for review, should PDFs have existing redaction annotations loaded in?
294
- LOAD_REDACTION_ANNOTATIONS_FROM_PDF = get_or_create_env_var(
295
- "LOAD_REDACTION_ANNOTATIONS_FROM_PDF", "True"
296
  )
297
 
298
 
@@ -313,8 +335,8 @@ if POPPLER_FOLDER:
313
  add_folder_to_path(POPPLER_FOLDER)
314
 
315
  # Extraction and PII options open by default:
316
- EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT = get_or_create_env_var(
317
- "EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT", "True"
318
  )
319
 
320
  # List of models to use for text extraction and PII detection
@@ -336,62 +358,56 @@ NO_REDACTION_PII_OPTION = get_or_create_env_var(
336
  LOCAL_PII_OPTION = get_or_create_env_var("LOCAL_PII_OPTION", "Local")
337
  AWS_PII_OPTION = get_or_create_env_var("AWS_PII_OPTION", "AWS Comprehend")
338
 
339
- SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = get_or_create_env_var(
340
- "SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS", "True"
341
  )
342
- SHOW_AWS_TEXT_EXTRACTION_OPTIONS = get_or_create_env_var(
343
- "SHOW_AWS_TEXT_EXTRACTION_OPTIONS", "True"
344
  )
345
 
346
  # Show at least local options if everything mistakenly removed
347
- if (
348
- SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS != "True"
349
- and SHOW_AWS_TEXT_EXTRACTION_OPTIONS != "True"
350
- ):
351
- SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = "True"
352
 
353
  local_model_options = list()
354
  aws_model_options = list()
355
  text_extraction_models = list()
356
 
357
- if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS == "True":
358
  local_model_options.append(SELECTABLE_TEXT_EXTRACT_OPTION)
359
  local_model_options.append(TESSERACT_TEXT_EXTRACT_OPTION)
360
 
361
- if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == "True":
362
  aws_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)
363
 
364
  TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options
365
- DO_INITIAL_TABULAR_DATA_CLEAN = get_or_create_env_var(
366
- "DO_INITIAL_TABULAR_DATA_CLEAN", "True"
367
  )
368
 
369
- SHOW_LOCAL_PII_DETECTION_OPTIONS = get_or_create_env_var(
370
- "SHOW_LOCAL_PII_DETECTION_OPTIONS", "True"
371
  )
372
- SHOW_AWS_PII_DETECTION_OPTIONS = get_or_create_env_var(
373
- "SHOW_AWS_PII_DETECTION_OPTIONS", "True"
374
  )
375
 
376
- if (
377
- SHOW_LOCAL_PII_DETECTION_OPTIONS != "True"
378
- and SHOW_AWS_PII_DETECTION_OPTIONS != "True"
379
- ):
380
- SHOW_LOCAL_PII_DETECTION_OPTIONS = "True"
381
 
382
  local_model_options = [NO_REDACTION_PII_OPTION]
383
  aws_model_options = list()
384
  pii_detection_models = list()
385
 
386
- if SHOW_LOCAL_PII_DETECTION_OPTIONS == "True":
387
  local_model_options.append(LOCAL_PII_OPTION)
388
 
389
- if SHOW_AWS_PII_DETECTION_OPTIONS == "True":
390
  aws_model_options.append(AWS_PII_OPTION)
391
 
392
  PII_DETECTION_MODELS = local_model_options + aws_model_options
393
 
394
- if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == "True":
395
  DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var(
396
  "DEFAULT_TEXT_EXTRACTION_MODEL", TEXTRACT_TEXT_EXTRACT_OPTION
397
  )
@@ -400,7 +416,7 @@ else:
400
  "DEFAULT_TEXT_EXTRACTION_MODEL", SELECTABLE_TEXT_EXTRACT_OPTION
401
  )
402
 
403
- if SHOW_AWS_PII_DETECTION_OPTIONS == "True":
404
  DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var(
405
  "DEFAULT_PII_DETECTION_MODEL", AWS_PII_OPTION
406
  )
@@ -426,10 +442,10 @@ CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var(
426
  "CHOSEN_LOCAL_OCR_MODEL", "tesseract"
427
  ) # Choose between "tesseract", "hybrid", and "paddle". "paddle" is accurate for whole line text extraction, but word-level extract is not natively supported, and so word bounding boxes will be inaccurate. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with the chosen hybrid model (default PaddleOCR) on words with low confidence.
428
 
429
- SHOW_LOCAL_OCR_MODEL_OPTIONS = get_or_create_env_var(
430
- "SHOW_LOCAL_OCR_MODEL_OPTIONS", "False"
431
  )
432
- if SHOW_LOCAL_OCR_MODEL_OPTIONS == "True":
433
  LOCAL_OCR_MODEL_OPTIONS = [
434
  "tesseract",
435
  "hybrid",
@@ -445,18 +461,18 @@ HYBRID_OCR_PADDING = int(
445
  get_or_create_env_var("HYBRID_OCR_PADDING", "1")
446
  ) # The padding to add to the text when passing it to PaddleOCR for re-extraction using the hybrid OCR method.
447
 
448
- PADDLE_USE_TEXTLINE_ORIENTATION = get_or_create_env_var(
449
- "PADDLE_USE_TEXTLINE_ORIENTATION", "False"
450
  )
451
 
452
  PADDLE_DET_DB_UNCLIP_RATIO = get_or_create_env_var("PADDLE_DET_DB_UNCLIP_RATIO", "1.2")
453
 
454
- SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES = get_or_create_env_var(
455
- "SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES", "False"
456
  ) # Whether to save example images of Tesseract vs PaddleOCR re-extraction in hybrid OCR mode.
457
 
458
- SAVE_PADDLE_VISUALISATIONS = get_or_create_env_var(
459
- "SAVE_PADDLE_VISUALISATIONS", "False"
460
  ) # Whether to save visualisations of PaddleOCR bounding boxes.
461
 
462
  # Model storage paths for Lambda compatibility
@@ -543,7 +559,9 @@ DEFAULT_PAGE_MAX = int(get_or_create_env_var("DEFAULT_PAGE_MAX", "0"))
543
 
544
  ### Language selection options
545
 
546
- SHOW_LANGUAGE_SELECTION = get_or_create_env_var("SHOW_LANGUAGE_SELECTION", "False")
 
 
547
 
548
  DEFAULT_LANGUAGE_FULL_NAME = get_or_create_env_var(
549
  "DEFAULT_LANGUAGE_FULL_NAME", "english"
@@ -581,22 +599,24 @@ DEFAULT_DUPLICATE_DETECTION_THRESHOLD = float(
581
  DEFAULT_MIN_CONSECUTIVE_PAGES = int(
582
  get_or_create_env_var("DEFAULT_MIN_CONSECUTIVE_PAGES", "1")
583
  )
584
- USE_GREEDY_DUPLICATE_DETECTION = get_or_create_env_var(
585
- "USE_GREEDY_DUPLICATE_DETECTION", "True"
586
  )
587
- DEFAULT_COMBINE_PAGES = get_or_create_env_var(
588
- "DEFAULT_COMBINE_PAGES", "True"
589
  ) # Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.
590
  DEFAULT_MIN_WORD_COUNT = int(get_or_create_env_var("DEFAULT_MIN_WORD_COUNT", "10"))
591
- REMOVE_DUPLICATE_ROWS = get_or_create_env_var("REMOVE_DUPLICATE_ROWS", "False")
 
 
592
 
593
 
594
  ###
595
  # File output options
596
  ###
597
  # Should the output pdf redaction boxes be drawn using the custom box colour?
598
- USE_GUI_BOX_COLOURS_FOR_OUTPUTS = get_or_create_env_var(
599
- "USE_GUI_BOX_COLOURS_FOR_OUTPUTS", "False"
600
  )
601
 
602
  # This is the colour of the output pdf redaction boxes. Should be a tuple of three integers between 0 and 255
@@ -632,14 +652,16 @@ APPLY_REDACTIONS_TEXT = int(
632
  ) # The default PDF_REDACT_TEXT_REMOVE | 0 removes all characters whose boundary box overlaps any redaction rectangle. This complies with the original legal / data protection intentions of redaction annotations. Other use cases however may require to keep text while redacting vector graphics or images. This can be achieved by setting text=True|PDF_REDACT_TEXT_NONE | 1. This does not comply with the data protection intentions of redaction annotations. Do so at your own risk.
633
 
634
  # If you don't want to redact the text, but instead just draw a box over it, set this to True
635
- RETURN_PDF_FOR_REVIEW = get_or_create_env_var("RETURN_PDF_FOR_REVIEW", "True")
 
 
636
 
637
- RETURN_REDACTED_PDF = get_or_create_env_var(
638
- "RETURN_REDACTED_PDF", "True"
639
  ) # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
640
 
641
- COMPRESS_REDACTED_PDF = get_or_create_env_var(
642
- "COMPRESS_REDACTED_PDF", "False"
643
  ) # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
644
 
645
  ###
@@ -654,7 +676,7 @@ except Exception as e:
654
  extract = TLDExtract(cache_dir=None)
655
 
656
  # Get some environment variables and Launch the Gradio app
657
- COGNITO_AUTH = get_or_create_env_var("COGNITO_AUTH", "0")
658
 
659
 
660
  # Link to user guide - ensure it is a valid URL
@@ -709,12 +731,18 @@ USER_GUIDE_URL = validate_safe_url(
709
  )
710
  )
711
 
712
- SHOW_EXAMPLES = get_or_create_env_var("SHOW_EXAMPLES", "True")
713
- SHOW_AWS_EXAMPLES = get_or_create_env_var("SHOW_AWS_EXAMPLES", "False")
 
 
 
 
714
 
715
  FILE_INPUT_HEIGHT = int(get_or_create_env_var("FILE_INPUT_HEIGHT", "200"))
716
 
717
- RUN_DIRECT_MODE = get_or_create_env_var("RUN_DIRECT_MODE", "0")
 
 
718
 
719
  # Direct mode configuration options
720
  DIRECT_MODE_DEFAULT_USER = get_or_create_env_var(
@@ -736,7 +764,9 @@ DIRECT_MODE_DUPLICATE_TYPE = get_or_create_env_var(
736
 
737
  ### ALLOW LIST
738
 
739
- GET_DEFAULT_ALLOW_LIST = get_or_create_env_var("GET_DEFAULT_ALLOW_LIST", "False")
 
 
740
 
741
  ALLOW_LIST_PATH = get_or_create_env_var(
742
  "ALLOW_LIST_PATH", ""
@@ -753,7 +783,9 @@ else:
753
 
754
  ### DENY LIST
755
 
756
- GET_DEFAULT_DENY_LIST = get_or_create_env_var("GET_DEFAULT_DENY_LIST", "False")
 
 
757
 
758
  S3_DENY_LIST_PATH = get_or_create_env_var(
759
  "S3_DENY_LIST_PATH", ""
@@ -793,9 +825,11 @@ else:
793
  # COST CODE OPTIONS
794
  ###
795
 
796
- SHOW_COSTS = get_or_create_env_var("SHOW_COSTS", "False")
797
 
798
- GET_COST_CODES = get_or_create_env_var("GET_COST_CODES", "False")
 
 
799
 
800
  DEFAULT_COST_CODE = get_or_create_env_var("DEFAULT_COST_CODE", "")
801
 
@@ -813,20 +847,21 @@ if COST_CODES_PATH:
813
  else:
814
  OUTPUT_COST_CODES_PATH = "config/cost_codes.csv"
815
 
816
- ENFORCE_COST_CODES = get_or_create_env_var(
817
- "ENFORCE_COST_CODES", "False"
818
- ) # If you have cost codes listed, is it compulsory to choose one before redacting?
 
819
 
820
- if ENFORCE_COST_CODES == "True":
821
- GET_COST_CODES = "True"
822
 
823
 
824
  ###
825
  # WHOLE DOCUMENT API OPTIONS
826
  ###
827
 
828
- SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = get_or_create_env_var(
829
- "SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS", "False"
830
  ) # This feature not currently implemented
831
 
832
  TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET = get_or_create_env_var(
@@ -841,9 +876,10 @@ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var(
841
  "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER", "output"
842
  )
843
 
844
- LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var(
845
- "LOAD_PREVIOUS_TEXTRACT_JOBS_S3", "False"
846
- ) # Whether or not to load previous Textract jobs from S3
 
847
 
848
  TEXTRACT_JOBS_S3_LOC = get_or_create_env_var(
849
  "TEXTRACT_JOBS_S3_LOC", "output"
@@ -865,114 +901,15 @@ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(
865
  ###
866
  # Config vars output format
867
  ###
868
- # Ensure that config variables are in the correct format for subsequent use elsewhere
869
-
870
- if LOAD_REDACTION_ANNOTATIONS_FROM_PDF == "True":
871
- LOAD_REDACTION_ANNOTATIONS_FROM_PDF = True
872
- else:
873
- LOAD_REDACTION_ANNOTATIONS_FROM_PDF = False
874
 
875
  # Convert string environment variables to string or list
876
- if SAVE_LOGS_TO_CSV == "True":
877
- SAVE_LOGS_TO_CSV = True
878
- else:
879
- SAVE_LOGS_TO_CSV = False
880
- if SAVE_LOGS_TO_DYNAMODB == "True":
881
- SAVE_LOGS_TO_DYNAMODB = True
882
- else:
883
- SAVE_LOGS_TO_DYNAMODB = False
884
- if SHOW_LANGUAGE_SELECTION == "True":
885
- SHOW_LANGUAGE_SELECTION = True
886
- else:
887
- SHOW_LANGUAGE_SELECTION = False
888
- if DISPLAY_FILE_NAMES_IN_LOGS == "True":
889
- DISPLAY_FILE_NAMES_IN_LOGS = True
890
- else:
891
- DISPLAY_FILE_NAMES_IN_LOGS = False
892
- if DO_INITIAL_TABULAR_DATA_CLEAN == "True":
893
- DO_INITIAL_TABULAR_DATA_CLEAN = True
894
- else:
895
- DO_INITIAL_TABULAR_DATA_CLEAN = False
896
- if COMPRESS_REDACTED_PDF == "True":
897
- COMPRESS_REDACTED_PDF = True
898
- else:
899
- COMPRESS_REDACTED_PDF = False
900
- if RETURN_REDACTED_PDF == "True":
901
- RETURN_REDACTED_PDF = True
902
- else:
903
- RETURN_REDACTED_PDF = False
904
- if USE_GREEDY_DUPLICATE_DETECTION == "True":
905
- USE_GREEDY_DUPLICATE_DETECTION = True
906
- else:
907
- USE_GREEDY_DUPLICATE_DETECTION = False
908
- if DEFAULT_COMBINE_PAGES == "True":
909
- DEFAULT_COMBINE_PAGES = True
910
- else:
911
- DEFAULT_COMBINE_PAGES = False
912
- if REMOVE_DUPLICATE_ROWS == "True":
913
- REMOVE_DUPLICATE_ROWS = True
914
- else:
915
- REMOVE_DUPLICATE_ROWS = False
916
-
917
- if GET_COST_CODES == "True":
918
- GET_COST_CODES = True
919
- else:
920
- GET_COST_CODES = False
921
-
922
- if ENFORCE_COST_CODES == "True":
923
- ENFORCE_COST_CODES = True
924
- else:
925
- ENFORCE_COST_CODES = False
926
-
927
- if SHOW_COSTS == "True":
928
- SHOW_COSTS = True
929
- else:
930
- SHOW_COSTS = False
931
-
932
- if GET_DEFAULT_ALLOW_LIST == "True":
933
- GET_DEFAULT_ALLOW_LIST = True
934
- else:
935
- GET_DEFAULT_ALLOW_LIST = False
936
-
937
- if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
938
- SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = True
939
- else:
940
- SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = False
941
-
942
- if SHOW_LOCAL_OCR_MODEL_OPTIONS == "True":
943
- SHOW_LOCAL_OCR_MODEL_OPTIONS = True
944
- else:
945
- SHOW_LOCAL_OCR_MODEL_OPTIONS = False
946
-
947
- if SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES == "True":
948
- SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES = True
949
- else:
950
- SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES = False
951
-
952
- if SAVE_PADDLE_VISUALISATIONS == "True":
953
- SAVE_PADDLE_VISUALISATIONS = True
954
- else:
955
- SAVE_PADDLE_VISUALISATIONS = False
956
-
957
- if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == "True":
958
- SHOW_AWS_TEXT_EXTRACTION_OPTIONS = True
959
- else:
960
- SHOW_AWS_TEXT_EXTRACTION_OPTIONS = False
961
-
962
- if CSV_ACCESS_LOG_HEADERS:
963
- CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
964
- if CSV_FEEDBACK_LOG_HEADERS:
965
- CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)
966
- if CSV_USAGE_LOG_HEADERS:
967
- CSV_USAGE_LOG_HEADERS = _get_env_list(CSV_USAGE_LOG_HEADERS)
968
-
969
- if DYNAMODB_ACCESS_LOG_HEADERS:
970
- DYNAMODB_ACCESS_LOG_HEADERS = _get_env_list(DYNAMODB_ACCESS_LOG_HEADERS)
971
- if DYNAMODB_FEEDBACK_LOG_HEADERS:
972
- DYNAMODB_FEEDBACK_LOG_HEADERS = _get_env_list(DYNAMODB_FEEDBACK_LOG_HEADERS)
973
- if DYNAMODB_USAGE_LOG_HEADERS:
974
- DYNAMODB_USAGE_LOG_HEADERS = _get_env_list(DYNAMODB_USAGE_LOG_HEADERS)
975
 
 
 
 
976
  if CHOSEN_COMPREHEND_ENTITIES:
977
  CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
978
  if FULL_COMPREHEND_ENTITY_LIST:
@@ -1000,31 +937,3 @@ if ALLOWED_ORIGINS:
1000
 
1001
  if ALLOWED_HOSTS:
1002
  ALLOWED_HOSTS = _get_env_list(ALLOWED_HOSTS)
1003
-
1004
- USE_GUI_BOX_COLOURS_FOR_OUTPUTS = USE_GUI_BOX_COLOURS_FOR_OUTPUTS.lower() == "true"
1005
- RETURN_PDF_FOR_REVIEW = RETURN_PDF_FOR_REVIEW.lower() == "true"
1006
-
1007
- if DO_INITIAL_TABULAR_DATA_CLEAN == "True":
1008
- DO_INITIAL_TABULAR_DATA_CLEAN = True
1009
- else:
1010
- DO_INITIAL_TABULAR_DATA_CLEAN = False
1011
-
1012
- if REMOVE_DUPLICATE_ROWS == "True":
1013
- REMOVE_DUPLICATE_ROWS = True
1014
- else:
1015
- REMOVE_DUPLICATE_ROWS = False
1016
-
1017
- if EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT == "True":
1018
- EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT = True
1019
- else:
1020
- EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT = False
1021
-
1022
- if PADDLE_USE_TEXTLINE_ORIENTATION == "True":
1023
- PADDLE_USE_TEXTLINE_ORIENTATION = True
1024
- else:
1025
- PADDLE_USE_TEXTLINE_ORIENTATION = False
1026
-
1027
- if PADDLE_DET_DB_UNCLIP_RATIO == "True":
1028
- PADDLE_DET_DB_UNCLIP_RATIO = True
1029
- else:
1030
- PADDLE_DET_DB_UNCLIP_RATIO = False
 
25
  # Set or retrieve configuration variables for the redaction app
26
 
27
 
28
def convert_string_to_boolean(value: str) -> bool:
    """Convert an environment-variable style string to a boolean.

    Values that are already ``bool`` are returned unchanged. Strings are
    compared case-insensitively after surrounding whitespace is removed, so
    values such as ``"True"``, ``"true "``, ``"TRUE"`` and ``"1"`` map to
    ``True`` and ``"False"``, ``"false"``, ``"FALSE"`` and ``"0"`` map to
    ``False``.

    Args:
        value: The raw value to interpret (normally a string read from the
            environment).

    Returns:
        The boolean the value represents.

    Raises:
        ValueError: If the value is not a bool and is not one of the
            recognised true/false spellings.
    """
    if isinstance(value, bool):
        return value

    if isinstance(value, str):
        # Values coming from .env files often carry stray whitespace or
        # unexpected casing; normalise before matching so these do not
        # abort configuration loading.
        normalised = value.strip().lower()
        if normalised in ("true", "1"):
            return True
        if normalised in ("false", "0"):
            return False

    raise ValueError(f"Invalid boolean value: {value}")
38
+
39
+
40
  def get_or_create_env_var(var_name: str, default_value: str, print_val: bool = False):
41
  """
42
  Get an environmental variable, and set it to a default value if it doesn't exist
 
112
  else:
113
  print("AWS config file not found at location:", AWS_CONFIG_PATH)
114
 
115
+ RUN_AWS_FUNCTIONS = convert_string_to_boolean(
116
+ get_or_create_env_var("RUN_AWS_FUNCTIONS", "False")
117
+ )
118
 
119
  AWS_REGION = get_or_create_env_var("AWS_REGION", "")
120
 
 
133
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var("DOCUMENT_REDACTION_BUCKET", "")
134
 
135
  # Should the app prioritise using AWS SSO over using API keys stored in environment variables/secrets (defaults to yes)
136
+ PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS = convert_string_to_boolean(
137
+ get_or_create_env_var("PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS", "True")
138
  )
139
 
140
  # Custom headers e.g. if routing traffic through Cloudfront
 
148
  # Image options
149
  ###
150
  IMAGES_DPI = float(get_or_create_env_var("IMAGES_DPI", "300.0"))
151
+ LOAD_TRUNCATED_IMAGES = convert_string_to_boolean(
152
+ get_or_create_env_var("LOAD_TRUNCATED_IMAGES", "True")
153
+ )
154
  MAX_IMAGE_PIXELS = get_or_create_env_var(
155
  "MAX_IMAGE_PIXELS", ""
156
  ) # Changed to None if blank in file_conversion.py
 
189
  # By default, logs are put into a subfolder named for today's date and the host name of the instance running the app. This avoids, as far as possible, log files from one instance overwriting the logs of another instance on S3. If the app always runs on one system, or just locally, it is not necessary to make the log folders so specific.
190
  # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
191
 
192
+ SAVE_LOGS_TO_CSV = convert_string_to_boolean(
193
+ get_or_create_env_var("SAVE_LOGS_TO_CSV", "True")
194
+ )
195
 
196
+ USE_LOG_SUBFOLDERS = convert_string_to_boolean(
197
+ get_or_create_env_var("USE_LOG_SUBFOLDERS", "True")
198
+ )
199
 
200
  FEEDBACK_LOGS_FOLDER = get_or_create_env_var("FEEDBACK_LOGS_FOLDER", "feedback/")
201
  ACCESS_LOGS_FOLDER = get_or_create_env_var("ACCESS_LOGS_FOLDER", "logs/")
202
  USAGE_LOGS_FOLDER = get_or_create_env_var("USAGE_LOGS_FOLDER", "usage/")
203
 
204
+ if USE_LOG_SUBFOLDERS:
205
  day_log_subfolder = today_rev + "/"
206
  host_name_subfolder = HOST_NAME + "/"
207
  full_log_subfolder = day_log_subfolder + host_name_subfolder
 
221
  )
222
 
223
  # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
224
+ DISPLAY_FILE_NAMES_IN_LOGS = convert_string_to_boolean(
225
+ get_or_create_env_var("DISPLAY_FILE_NAMES_IN_LOGS", "False")
226
  )
227
 
228
  # Further customisation options for CSV logs
 
238
  ) # If blank, uses component labels
239
 
240
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
241
+ SAVE_LOGS_TO_DYNAMODB = convert_string_to_boolean(
242
+ get_or_create_env_var("SAVE_LOGS_TO_DYNAMODB", "False")
243
+ )
244
 
245
  ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var(
246
  "ACCESS_LOG_DYNAMODB_TABLE_NAME", "redaction_access_log"
 
260
  DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var("DYNAMODB_USAGE_LOG_HEADERS", "")
261
 
262
  # Report logging to console?
263
+ LOGGING = convert_string_to_boolean(get_or_create_env_var("LOGGING", "False"))
264
 
265
+ if LOGGING:
266
  # Configure logging
267
  logging.basicConfig(
268
  level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 
279
 
280
  FAVICON_PATH = get_or_create_env_var("FAVICON_PATH", "favicon.png")
281
 
282
+ RUN_FASTAPI = convert_string_to_boolean(get_or_create_env_var("RUN_FASTAPI", "False"))
283
 
284
  MAX_QUEUE_SIZE = int(get_or_create_env_var("MAX_QUEUE_SIZE", "5"))
285
 
 
313
  )
314
 
315
  # When loading for review, should PDFs have existing redaction annotations loaded in?
316
+ LOAD_REDACTION_ANNOTATIONS_FROM_PDF = convert_string_to_boolean(
317
+ get_or_create_env_var("LOAD_REDACTION_ANNOTATIONS_FROM_PDF", "True")
318
  )
319
 
320
 
 
335
  add_folder_to_path(POPPLER_FOLDER)
336
 
337
  # Extraction and PII options open by default:
338
+ EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT = convert_string_to_boolean(
339
+ get_or_create_env_var("EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT", "True")
340
  )
341
 
342
  # List of models to use for text extraction and PII detection
 
358
  LOCAL_PII_OPTION = get_or_create_env_var("LOCAL_PII_OPTION", "Local")
359
  AWS_PII_OPTION = get_or_create_env_var("AWS_PII_OPTION", "AWS Comprehend")
360
 
361
+ SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = convert_string_to_boolean(
362
+ get_or_create_env_var("SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS", "True")
363
  )
364
+ SHOW_AWS_TEXT_EXTRACTION_OPTIONS = convert_string_to_boolean(
365
+ get_or_create_env_var("SHOW_AWS_TEXT_EXTRACTION_OPTIONS", "True")
366
  )
367
 
368
  # Show at least local options if everything mistakenly removed
369
+ if not SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS and not SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
370
+ SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = True
 
 
 
371
 
372
  local_model_options = list()
373
  aws_model_options = list()
374
  text_extraction_models = list()
375
 
376
+ if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS:
377
  local_model_options.append(SELECTABLE_TEXT_EXTRACT_OPTION)
378
  local_model_options.append(TESSERACT_TEXT_EXTRACT_OPTION)
379
 
380
+ if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
381
  aws_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)
382
 
383
  TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options
384
+ DO_INITIAL_TABULAR_DATA_CLEAN = convert_string_to_boolean(
385
+ get_or_create_env_var("DO_INITIAL_TABULAR_DATA_CLEAN", "True")
386
  )
387
 
388
+ SHOW_LOCAL_PII_DETECTION_OPTIONS = convert_string_to_boolean(
389
+ get_or_create_env_var("SHOW_LOCAL_PII_DETECTION_OPTIONS", "True")
390
  )
391
+ SHOW_AWS_PII_DETECTION_OPTIONS = convert_string_to_boolean(
392
+ get_or_create_env_var("SHOW_AWS_PII_DETECTION_OPTIONS", "True")
393
  )
394
 
395
+ if not SHOW_LOCAL_PII_DETECTION_OPTIONS and not SHOW_AWS_PII_DETECTION_OPTIONS:
396
+ SHOW_LOCAL_PII_DETECTION_OPTIONS = True
 
 
 
397
 
398
  local_model_options = [NO_REDACTION_PII_OPTION]
399
  aws_model_options = list()
400
  pii_detection_models = list()
401
 
402
+ if SHOW_LOCAL_PII_DETECTION_OPTIONS:
403
  local_model_options.append(LOCAL_PII_OPTION)
404
 
405
+ if SHOW_AWS_PII_DETECTION_OPTIONS:
406
  aws_model_options.append(AWS_PII_OPTION)
407
 
408
  PII_DETECTION_MODELS = local_model_options + aws_model_options
409
 
410
+ if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
411
  DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var(
412
  "DEFAULT_TEXT_EXTRACTION_MODEL", TEXTRACT_TEXT_EXTRACT_OPTION
413
  )
 
416
  "DEFAULT_TEXT_EXTRACTION_MODEL", SELECTABLE_TEXT_EXTRACT_OPTION
417
  )
418
 
419
+ if SHOW_AWS_PII_DETECTION_OPTIONS:
420
  DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var(
421
  "DEFAULT_PII_DETECTION_MODEL", AWS_PII_OPTION
422
  )
 
442
  "CHOSEN_LOCAL_OCR_MODEL", "tesseract"
443
  ) # Choose between "tesseract", "hybrid", and "paddle". "paddle" is accurate for whole line text extraction, but word-level extract is not natively supported, and so word bounding boxes will be inaccurate. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with the chosen hybrid model (default PaddleOCR) on words with low confidence.
444
 
445
+ SHOW_LOCAL_OCR_MODEL_OPTIONS = convert_string_to_boolean(
446
+ get_or_create_env_var("SHOW_LOCAL_OCR_MODEL_OPTIONS", "False")
447
  )
448
+ if SHOW_LOCAL_OCR_MODEL_OPTIONS:
449
  LOCAL_OCR_MODEL_OPTIONS = [
450
  "tesseract",
451
  "hybrid",
 
461
  get_or_create_env_var("HYBRID_OCR_PADDING", "1")
462
  ) # The padding to add to the text when passing it to PaddleOCR for re-extraction using the hybrid OCR method.
463
 
464
+ PADDLE_USE_TEXTLINE_ORIENTATION = convert_string_to_boolean(
465
+ get_or_create_env_var("PADDLE_USE_TEXTLINE_ORIENTATION", "False")
466
  )
467
 
468
  PADDLE_DET_DB_UNCLIP_RATIO = get_or_create_env_var("PADDLE_DET_DB_UNCLIP_RATIO", "1.2")
469
 
470
+ SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES = convert_string_to_boolean(
471
+ get_or_create_env_var("SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES", "False")
472
  ) # Whether to save example images of Tesseract vs PaddleOCR re-extraction in hybrid OCR mode.
473
 
474
+ SAVE_PADDLE_VISUALISATIONS = convert_string_to_boolean(
475
+ get_or_create_env_var("SAVE_PADDLE_VISUALISATIONS", "False")
476
  ) # Whether to save visualisations of PaddleOCR bounding boxes.
477
 
478
  # Model storage paths for Lambda compatibility
 
559
 
560
  ### Language selection options
561
 
562
+ SHOW_LANGUAGE_SELECTION = convert_string_to_boolean(
563
+ get_or_create_env_var("SHOW_LANGUAGE_SELECTION", "False")
564
+ )
565
 
566
  DEFAULT_LANGUAGE_FULL_NAME = get_or_create_env_var(
567
  "DEFAULT_LANGUAGE_FULL_NAME", "english"
 
599
  DEFAULT_MIN_CONSECUTIVE_PAGES = int(
600
  get_or_create_env_var("DEFAULT_MIN_CONSECUTIVE_PAGES", "1")
601
  )
602
+ USE_GREEDY_DUPLICATE_DETECTION = convert_string_to_boolean(
603
+ get_or_create_env_var("USE_GREEDY_DUPLICATE_DETECTION", "True")
604
  )
605
+ DEFAULT_COMBINE_PAGES = convert_string_to_boolean(
606
+ get_or_create_env_var("DEFAULT_COMBINE_PAGES", "True")
607
  ) # Combine text from the same page number within a file. Alternative will enable line-level duplicate detection.
608
  DEFAULT_MIN_WORD_COUNT = int(get_or_create_env_var("DEFAULT_MIN_WORD_COUNT", "10"))
609
+ REMOVE_DUPLICATE_ROWS = convert_string_to_boolean(
610
+ get_or_create_env_var("REMOVE_DUPLICATE_ROWS", "False")
611
+ )
612
 
613
 
614
  ###
615
  # File output options
616
  ###
617
  # Should the output pdf redaction boxes be drawn using the custom box colour?
618
+ USE_GUI_BOX_COLOURS_FOR_OUTPUTS = convert_string_to_boolean(
619
+ get_or_create_env_var("USE_GUI_BOX_COLOURS_FOR_OUTPUTS", "False")
620
  )
621
 
622
  # This is the colour of the output pdf redaction boxes. Should be a tuple of three integers between 0 and 255
 
652
  ) # The default PDF_REDACT_TEXT_REMOVE | 0 removes all characters whose boundary box overlaps any redaction rectangle. This complies with the original legal / data protection intentions of redaction annotations. Other use cases, however, may require keeping text while redacting vector graphics or images. This can be achieved by setting text=True|PDF_REDACT_TEXT_NONE | 1. This does not comply with the data protection intentions of redaction annotations. Do so at your own risk.
653
 
654
  # If you don't want to redact the text, but instead just draw a box over it, set this to True
655
+ RETURN_PDF_FOR_REVIEW = convert_string_to_boolean(
656
+ get_or_create_env_var("RETURN_PDF_FOR_REVIEW", "True")
657
+ )
658
 
659
+ RETURN_REDACTED_PDF = convert_string_to_boolean(
660
+ get_or_create_env_var("RETURN_REDACTED_PDF", "True")
661
  ) # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
662
 
663
+ COMPRESS_REDACTED_PDF = convert_string_to_boolean(
664
+ get_or_create_env_var("COMPRESS_REDACTED_PDF", "False")
665
  ) # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
666
 
667
  ###
 
676
  extract = TLDExtract(cache_dir=None)
677
 
678
  # Get some environment variables and Launch the Gradio app
679
+ COGNITO_AUTH = convert_string_to_boolean(get_or_create_env_var("COGNITO_AUTH", "False"))
680
 
681
 
682
  # Link to user guide - ensure it is a valid URL
 
731
  )
732
  )
733
 
734
+ SHOW_EXAMPLES = convert_string_to_boolean(
735
+ get_or_create_env_var("SHOW_EXAMPLES", "True")
736
+ )
737
+ SHOW_AWS_EXAMPLES = convert_string_to_boolean(
738
+ get_or_create_env_var("SHOW_AWS_EXAMPLES", "False")
739
+ )
740
 
741
  FILE_INPUT_HEIGHT = int(get_or_create_env_var("FILE_INPUT_HEIGHT", "200"))
742
 
743
+ RUN_DIRECT_MODE = convert_string_to_boolean(
744
+ get_or_create_env_var("RUN_DIRECT_MODE", "False")
745
+ )
746
 
747
  # Direct mode configuration options
748
  DIRECT_MODE_DEFAULT_USER = get_or_create_env_var(
 
764
 
765
  ### ALLOW LIST
766
 
767
+ GET_DEFAULT_ALLOW_LIST = convert_string_to_boolean(
768
+ get_or_create_env_var("GET_DEFAULT_ALLOW_LIST", "False")
769
+ )
770
 
771
  ALLOW_LIST_PATH = get_or_create_env_var(
772
  "ALLOW_LIST_PATH", ""
 
783
 
784
  ### DENY LIST
785
 
786
+ GET_DEFAULT_DENY_LIST = convert_string_to_boolean(
787
+ get_or_create_env_var("GET_DEFAULT_DENY_LIST", "False")
788
+ )
789
 
790
  S3_DENY_LIST_PATH = get_or_create_env_var(
791
  "S3_DENY_LIST_PATH", ""
 
825
  # COST CODE OPTIONS
826
  ###
827
 
828
+ SHOW_COSTS = convert_string_to_boolean(get_or_create_env_var("SHOW_COSTS", "False"))
829
 
830
+ GET_COST_CODES = convert_string_to_boolean(
831
+ get_or_create_env_var("GET_COST_CODES", "False")
832
+ )
833
 
834
  DEFAULT_COST_CODE = get_or_create_env_var("DEFAULT_COST_CODE", "")
835
 
 
847
  else:
848
  OUTPUT_COST_CODES_PATH = "config/cost_codes.csv"
849
 
850
+ ENFORCE_COST_CODES = convert_string_to_boolean(
851
+ get_or_create_env_var("ENFORCE_COST_CODES", "False")
852
+ )
853
+ # If you have cost codes listed, is it compulsory to choose one before redacting?
854
 
855
+ if ENFORCE_COST_CODES:
856
+ GET_COST_CODES = True
857
 
858
 
859
  ###
860
  # WHOLE DOCUMENT API OPTIONS
861
  ###
862
 
863
+ SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = convert_string_to_boolean(
864
+ get_or_create_env_var("SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS", "False")
865
  ) # This feature is not currently implemented
866
 
867
  TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET = get_or_create_env_var(
 
876
  "TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER", "output"
877
  )
878
 
879
+ LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = convert_string_to_boolean(
880
+ get_or_create_env_var("LOAD_PREVIOUS_TEXTRACT_JOBS_S3", "False")
881
+ )
882
+ # Whether or not to load previous Textract jobs from S3
883
 
884
  TEXTRACT_JOBS_S3_LOC = get_or_create_env_var(
885
  "TEXTRACT_JOBS_S3_LOC", "output"
 
901
  ###
902
  # Config vars output format
903
  ###
 
 
 
 
 
 
904
 
905
  # Convert string environment variables to string or list
906
+ CSV_ACCESS_LOG_HEADERS = _get_env_list(CSV_ACCESS_LOG_HEADERS)
907
+ CSV_FEEDBACK_LOG_HEADERS = _get_env_list(CSV_FEEDBACK_LOG_HEADERS)
908
+ CSV_USAGE_LOG_HEADERS = _get_env_list(CSV_USAGE_LOG_HEADERS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
909
 
910
+ DYNAMODB_ACCESS_LOG_HEADERS = _get_env_list(DYNAMODB_ACCESS_LOG_HEADERS)
911
+ DYNAMODB_FEEDBACK_LOG_HEADERS = _get_env_list(DYNAMODB_FEEDBACK_LOG_HEADERS)
912
+ DYNAMODB_USAGE_LOG_HEADERS = _get_env_list(DYNAMODB_USAGE_LOG_HEADERS)
913
  if CHOSEN_COMPREHEND_ENTITIES:
914
  CHOSEN_COMPREHEND_ENTITIES = _get_env_list(CHOSEN_COMPREHEND_ENTITIES)
915
  if FULL_COMPREHEND_ENTITY_LIST:
 
937
 
938
  if ALLOWED_HOSTS:
939
  ALLOWED_HOSTS = _get_env_list(ALLOWED_HOSTS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tools/data_anonymise.py CHANGED
@@ -545,7 +545,7 @@ def anonymise_files_with_open_text(
545
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
546
  if pii_identification_method == "AWS Comprehend":
547
  print("Trying to connect to AWS Comprehend service")
548
- if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
549
  print("Connecting to Comprehend via existing SSO connection")
550
  comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
551
  elif aws_access_key_textbox and aws_secret_key_textbox:
@@ -557,7 +557,7 @@ def anonymise_files_with_open_text(
557
  aws_access_key_id=aws_access_key_textbox,
558
  aws_secret_access_key=aws_secret_key_textbox,
559
  )
560
- elif RUN_AWS_FUNCTIONS == "1":
561
  print("Connecting to Comprehend via existing SSO connection")
562
  comprehend_client = boto3.client("comprehend")
563
  elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
 
545
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is True, otherwise an environment variable or direct textbox input is needed.
546
  if pii_identification_method == "AWS Comprehend":
547
  print("Trying to connect to AWS Comprehend service")
548
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
549
  print("Connecting to Comprehend via existing SSO connection")
550
  comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
551
  elif aws_access_key_textbox and aws_secret_key_textbox:
 
557
  aws_access_key_id=aws_access_key_textbox,
558
  aws_secret_access_key=aws_secret_key_textbox,
559
  )
560
+ elif RUN_AWS_FUNCTIONS:
561
  print("Connecting to Comprehend via existing SSO connection")
562
  comprehend_client = boto3.client("comprehend")
563
  elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
tools/file_conversion.py CHANGED
@@ -48,7 +48,7 @@ if not MAX_IMAGE_PIXELS:
48
  else:
49
  Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
50
 
51
- ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
52
 
53
 
54
  def is_pdf_or_image(filename):
 
48
  else:
49
  Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
50
 
51
+ ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES
52
 
53
 
54
  def is_pdf_or_image(filename):
tools/file_redaction.py CHANGED
@@ -107,7 +107,7 @@ from tools.secure_path_utils import (
107
  validate_path_containment,
108
  )
109
 
110
- ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
111
  if not MAX_IMAGE_PIXELS:
112
  Image.MAX_IMAGE_PIXELS = None
113
  else:
@@ -803,9 +803,9 @@ def choose_and_run_redactor(
803
 
804
  ### Load/create PII identification method
805
 
806
- # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
807
  if pii_identification_method == AWS_PII_OPTION:
808
- if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
809
  print("Connecting to Comprehend via existing SSO connection")
810
  comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
811
  elif aws_access_key_textbox and aws_secret_key_textbox:
@@ -818,7 +818,7 @@ def choose_and_run_redactor(
818
  aws_secret_access_key=aws_secret_key_textbox,
819
  region_name=AWS_REGION,
820
  )
821
- elif RUN_AWS_FUNCTIONS == "1":
822
  print("Connecting to Comprehend via existing SSO connection")
823
  comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
824
  elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
@@ -839,7 +839,7 @@ def choose_and_run_redactor(
839
 
840
  # Try to connect to AWS Textract Client if using that text extraction method
841
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
842
- if RUN_AWS_FUNCTIONS == "1" and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS == "1":
843
  print("Connecting to Textract via existing SSO connection")
844
  textract_client = boto3.client("textract", region_name=AWS_REGION)
845
  elif aws_access_key_textbox and aws_secret_key_textbox:
@@ -852,7 +852,7 @@ def choose_and_run_redactor(
852
  aws_secret_access_key=aws_secret_key_textbox,
853
  region_name=AWS_REGION,
854
  )
855
- elif RUN_AWS_FUNCTIONS == "1":
856
  print("Connecting to Textract via existing SSO connection")
857
  textract_client = boto3.client("textract", region_name=AWS_REGION)
858
  elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
 
107
  validate_path_containment,
108
  )
109
 
110
+ ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES
111
  if not MAX_IMAGE_PIXELS:
112
  Image.MAX_IMAGE_PIXELS = None
113
  else:
 
803
 
804
  ### Load/create PII identification method
805
 
806
+ # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is True, otherwise an environment variable or direct textbox input is needed.
807
  if pii_identification_method == AWS_PII_OPTION:
808
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
809
  print("Connecting to Comprehend via existing SSO connection")
810
  comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
811
  elif aws_access_key_textbox and aws_secret_key_textbox:
 
818
  aws_secret_access_key=aws_secret_key_textbox,
819
  region_name=AWS_REGION,
820
  )
821
+ elif RUN_AWS_FUNCTIONS:
822
  print("Connecting to Comprehend via existing SSO connection")
823
  comprehend_client = boto3.client("comprehend", region_name=AWS_REGION)
824
  elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
 
839
 
840
  # Try to connect to AWS Textract Client if using that text extraction method
841
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
842
+ if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
843
  print("Connecting to Textract via existing SSO connection")
844
  textract_client = boto3.client("textract", region_name=AWS_REGION)
845
  elif aws_access_key_textbox and aws_secret_key_textbox:
 
852
  aws_secret_access_key=aws_secret_key_textbox,
853
  region_name=AWS_REGION,
854
  )
855
+ elif RUN_AWS_FUNCTIONS:
856
  print("Connecting to Textract via existing SSO connection")
857
  textract_client = boto3.client("textract", region_name=AWS_REGION)
858
  elif AWS_ACCESS_KEY and AWS_SECRET_KEY: