seanpedrickcase committed on
Commit
409bdc5
·
1 Parent(s): 150a8d9

Input image creation during redaction should now respect input folders. Minor JSON output path change.

Browse files
entrypoint.sh CHANGED
@@ -21,7 +21,7 @@ else
21
  GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT:-7860}
22
 
23
  # Start uvicorn server.
24
- echo "Starting with Uvicorn on $GRADIO_SERVER_NAME:$GRADIO_SERVER_PORT with root path $ROOT_PATH"
25
  exec uvicorn app:app \
26
  --host $GRADIO_SERVER_NAME \
27
  --port $GRADIO_SERVER_PORT \
 
21
  GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT:-7860}
22
 
23
  # Start uvicorn server.
24
+ echo "Starting with Uvicorn on $GRADIO_SERVER_NAME:$GRADIO_SERVER_PORT"
25
  exec uvicorn app:app \
26
  --host $GRADIO_SERVER_NAME \
27
  --port $GRADIO_SERVER_PORT \
tools/file_redaction.py CHANGED
@@ -985,6 +985,7 @@ def choose_and_run_redactor(
985
  log_files_output_paths=log_files_output_paths,
986
  nlp_analyser=nlp_analyser,
987
  output_folder=output_folder,
 
988
  )
989
 
990
  # This line creates a copy of out_file_paths to break potential links with log_files_output_paths
@@ -1040,6 +1041,7 @@ def choose_and_run_redactor(
1040
  document_cropboxes,
1041
  text_extraction_only,
1042
  output_folder=output_folder,
 
1043
  )
1044
  else:
1045
  out_message = "No redaction method selected"
@@ -1372,9 +1374,6 @@ def choose_and_run_redactor(
1372
  log_files_output_paths.append(
1373
  all_page_line_level_ocr_results_with_words_json_file_path[0]
1374
  )
1375
- log_files_output_paths.append(
1376
- all_page_line_level_ocr_results_with_words_json_file_path
1377
- )
1378
 
1379
  if (
1380
  all_page_line_level_ocr_results_with_words_df_file_path
@@ -2083,6 +2082,7 @@ def redact_page_with_pymupdf(
2083
  page_sizes_df: pd.DataFrame = pd.DataFrame(),
2084
  return_pdf_for_review: bool = RETURN_PDF_FOR_REVIEW,
2085
  return_pdf_end_of_redaction: bool = RETURN_REDACTED_PDF,
 
2086
  ):
2087
  """
2088
  Applies redactions to a single PyMuPDF page based on provided annotations.
@@ -2160,8 +2160,17 @@ def redact_page_with_pymupdf(
2160
  all_image_annotation_boxes = list()
2161
 
2162
  if isinstance(image, Image.Image):
2163
- image_path = move_page_info(str(page))
2164
- image.save(image_path)
 
 
 
 
 
 
 
 
 
2165
  elif isinstance(image, str):
2166
  # Normalize and validate path safety before checking existence
2167
  normalized_path = os.path.normpath(os.path.abspath(image))
@@ -2620,6 +2629,7 @@ def redact_image_pdf(
2620
  max_time: int = int(MAX_TIME_VALUE),
2621
  nlp_analyser: AnalyzerEngine = nlp_analyser,
2622
  output_folder: str = OUTPUT_FOLDER,
 
2623
  progress=Progress(track_tqdm=True),
2624
  ):
2625
  """
@@ -2661,7 +2671,7 @@ def redact_image_pdf(
2661
  - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
2662
  - output_folder (str, optional): The folder for file outputs.
2663
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
2664
-
2665
  The function returns a redacted PDF document along with processing output objects.
2666
  """
2667
 
@@ -3140,6 +3150,7 @@ def redact_image_pdf(
3140
  redact_whole_page=redact_whole_page,
3141
  original_cropbox=original_cropbox,
3142
  page_sizes_df=page_sizes_df,
 
3143
  )
3144
 
3145
  # Handle dual page objects if returned
@@ -3901,6 +3912,7 @@ def redact_text_pdf(
3901
  original_cropboxes: List[dict] = list(),
3902
  text_extraction_only: bool = False,
3903
  output_folder: str = OUTPUT_FOLDER,
 
3904
  page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
3905
  max_time: int = int(MAX_TIME_VALUE),
3906
  nlp_analyser: AnalyzerEngine = nlp_analyser,
@@ -3936,6 +3948,7 @@ def redact_text_pdf(
3936
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
3937
  - language (str, optional): The language to do AWS Comprehend calls. Defaults to value of language if not provided.
3938
  - output_folder (str, optional): The output folder for the function
 
3939
  - page_break_val: Value for page break
3940
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
3941
  - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
@@ -4203,6 +4216,7 @@ def redact_text_pdf(
4203
  convert_pikepdf_to_pymupdf_coords=True,
4204
  original_cropbox=original_cropboxes[page_no],
4205
  page_sizes_df=page_sizes_df,
 
4206
  )
4207
 
4208
  # Handle dual page objects if returned
 
985
  log_files_output_paths=log_files_output_paths,
986
  nlp_analyser=nlp_analyser,
987
  output_folder=output_folder,
988
+ input_folder=input_folder,
989
  )
990
 
991
  # This line creates a copy of out_file_paths to break potential links with log_files_output_paths
 
1041
  document_cropboxes,
1042
  text_extraction_only,
1043
  output_folder=output_folder,
1044
+ input_folder=input_folder,
1045
  )
1046
  else:
1047
  out_message = "No redaction method selected"
 
1374
  log_files_output_paths.append(
1375
  all_page_line_level_ocr_results_with_words_json_file_path[0]
1376
  )
 
 
 
1377
 
1378
  if (
1379
  all_page_line_level_ocr_results_with_words_df_file_path
 
2082
  page_sizes_df: pd.DataFrame = pd.DataFrame(),
2083
  return_pdf_for_review: bool = RETURN_PDF_FOR_REVIEW,
2084
  return_pdf_end_of_redaction: bool = RETURN_REDACTED_PDF,
2085
+ input_folder: str = INPUT_FOLDER,
2086
  ):
2087
  """
2088
  Applies redactions to a single PyMuPDF page based on provided annotations.
 
2160
  all_image_annotation_boxes = list()
2161
 
2162
  if isinstance(image, Image.Image):
2163
+ # Create an image path using the input folder with PDF filename
2164
+ # Get the PDF filename from the page's parent document
2165
+ pdf_filename = (
2166
+ os.path.basename(page.parent.name)
2167
+ if hasattr(page.parent, "name") and page.parent.name
2168
+ else "document"
2169
+ )
2170
+ # pdf_name_without_ext = os.path.splitext(pdf_filename)[0]
2171
+ image_path = os.path.join(input_folder, f"{pdf_filename}_{page.number}.png")
2172
+ if not os.path.exists(image_path):
2173
+ image.save(image_path)
2174
  elif isinstance(image, str):
2175
  # Normalize and validate path safety before checking existence
2176
  normalized_path = os.path.normpath(os.path.abspath(image))
 
2629
  max_time: int = int(MAX_TIME_VALUE),
2630
  nlp_analyser: AnalyzerEngine = nlp_analyser,
2631
  output_folder: str = OUTPUT_FOLDER,
2632
+ input_folder: str = INPUT_FOLDER,
2633
  progress=Progress(track_tqdm=True),
2634
  ):
2635
  """
 
2671
  - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
2672
  - output_folder (str, optional): The folder for file outputs.
2673
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
2674
+ - input_folder (str, optional): The folder for file inputs.
2675
  The function returns a redacted PDF document along with processing output objects.
2676
  """
2677
 
 
3150
  redact_whole_page=redact_whole_page,
3151
  original_cropbox=original_cropbox,
3152
  page_sizes_df=page_sizes_df,
3153
+ input_folder=input_folder,
3154
  )
3155
 
3156
  # Handle dual page objects if returned
 
3912
  original_cropboxes: List[dict] = list(),
3913
  text_extraction_only: bool = False,
3914
  output_folder: str = OUTPUT_FOLDER,
3915
+ input_folder: str = INPUT_FOLDER,
3916
  page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
3917
  max_time: int = int(MAX_TIME_VALUE),
3918
  nlp_analyser: AnalyzerEngine = nlp_analyser,
 
3948
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
3949
  - language (str, optional): The language to do AWS Comprehend calls. Defaults to value of language if not provided.
3950
  - output_folder (str, optional): The output folder for the function
3951
+ - input_folder (str, optional): The folder for file inputs.
3952
  - page_break_val: Value for page break
3953
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
3954
  - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
 
4216
  convert_pikepdf_to_pymupdf_coords=True,
4217
  original_cropbox=original_cropboxes[page_no],
4218
  page_sizes_df=page_sizes_df,
4219
+ input_folder=input_folder,
4220
  )
4221
 
4222
  # Handle dual page objects if returned
tools/redaction_review.py CHANGED
@@ -1594,6 +1594,7 @@ def apply_redactions_to_review_df_and_files(
1594
  save_pdf: bool = True,
1595
  page_sizes: List[dict] = list(),
1596
  COMPRESS_REDACTED_PDF: bool = COMPRESS_REDACTED_PDF,
 
1597
  progress=gr.Progress(track_tqdm=True),
1598
  ):
1599
  """
@@ -1619,6 +1620,8 @@ def apply_redactions_to_review_df_and_files(
1619
  Defaults to an empty list.
1620
  COMPRESS_REDACTED_PDF (bool, optional): If True, the output PDF will be compressed.
1621
  Defaults to COMPRESS_REDACTED_PDF.
 
 
1622
  progress (gr.Progress, optional): Gradio progress object for tracking task progress.
1623
  Defaults to gr.Progress(track_tqdm=True).
1624
 
@@ -1636,6 +1639,9 @@ def apply_redactions_to_review_df_and_files(
1636
  pdf_doc = list()
1637
  review_df = review_file_state
1638
 
 
 
 
1639
  page_image_annotator_object = all_image_annotations[current_page - 1]
1640
 
1641
  # This replaces the numpy array image object with the image file path
@@ -1774,6 +1780,7 @@ def apply_redactions_to_review_df_and_files(
1774
  page_sizes_df=page_sizes_df,
1775
  return_pdf_for_review=True,
1776
  return_pdf_end_of_redaction=False,
 
1777
  )
1778
 
1779
  # Apply redactions to final page (with text removed)
@@ -1785,6 +1792,7 @@ def apply_redactions_to_review_df_and_files(
1785
  page_sizes_df=page_sizes_df,
1786
  return_pdf_for_review=False,
1787
  return_pdf_end_of_redaction=False,
 
1788
  )
1789
  else:
1790
  print("File type not recognised.")
 
1594
  save_pdf: bool = True,
1595
  page_sizes: List[dict] = list(),
1596
  COMPRESS_REDACTED_PDF: bool = COMPRESS_REDACTED_PDF,
1597
+ input_folder: str = INPUT_FOLDER,
1598
  progress=gr.Progress(track_tqdm=True),
1599
  ):
1600
  """
 
1620
  Defaults to an empty list.
1621
  COMPRESS_REDACTED_PDF (bool, optional): If True, the output PDF will be compressed.
1622
  Defaults to COMPRESS_REDACTED_PDF.
1623
+ input_folder (str, optional): The directory where input files are located and where
1624
+ page images should be saved. Defaults to INPUT_FOLDER.
1625
  progress (gr.Progress, optional): Gradio progress object for tracking task progress.
1626
  Defaults to gr.Progress(track_tqdm=True).
1627
 
 
1639
  pdf_doc = list()
1640
  review_df = review_file_state
1641
 
1642
+ # Always use the provided input_folder parameter
1643
+ # This ensures images are created in the specified input folder, not in example_data
1644
+
1645
  page_image_annotator_object = all_image_annotations[current_page - 1]
1646
 
1647
  # This replaces the numpy array image object with the image file path
 
1780
  page_sizes_df=page_sizes_df,
1781
  return_pdf_for_review=True,
1782
  return_pdf_end_of_redaction=False,
1783
+ input_folder=input_folder,
1784
  )
1785
 
1786
  # Apply redactions to final page (with text removed)
 
1792
  page_sizes_df=page_sizes_df,
1793
  return_pdf_for_review=False,
1794
  return_pdf_end_of_redaction=False,
1795
+ input_folder=input_folder,
1796
  )
1797
  else:
1798
  print("File type not recognised.")