seanpedrickcase committed on
Commit
5086da0
·
1 Parent(s): 78403ba

Fixed whole-page redactions being incorrectly positioned and missing IDs. Fixed duplicate pages output issue. Minor changes to output redaction box format and related code.

Browse files
app.py CHANGED
@@ -277,6 +277,47 @@ in_redact_comprehend_entities = gr.Dropdown(
277
  label="AWS Comprehend PII identification model (click empty space in box for full list)",
278
  )
279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  ## Deduplication examples
281
  in_duplicate_pages = gr.File(
282
  label="Upload one or multiple 'ocr_output.csv' files to find duplicate pages and subdocuments",
@@ -973,6 +1014,8 @@ with blocks:
973
  "example_data/example_complaint_letter.jpg",
974
  "example_data/graduate-job-example-cover-letter.pdf",
975
  "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
 
 
976
  ]
977
 
978
  available_examples = list()
@@ -990,6 +1033,10 @@ with blocks:
990
  CHOSEN_COMPREHEND_ENTITIES,
991
  [example_files[0]],
992
  example_files[0],
 
 
 
 
993
  ]
994
  )
995
  example_labels.append("PDF with selectable text redaction")
@@ -1005,6 +1052,10 @@ with blocks:
1005
  CHOSEN_COMPREHEND_ENTITIES,
1006
  [example_files[1]],
1007
  example_files[1],
 
 
 
 
1008
  ]
1009
  )
1010
  example_labels.append("Image redaction with local OCR")
@@ -1020,6 +1071,10 @@ with blocks:
1020
  CHOSEN_COMPREHEND_ENTITIES,
1021
  [example_files[2]],
1022
  example_files[2],
 
 
 
 
1023
  ]
1024
  )
1025
  example_labels.append(
@@ -1038,11 +1093,37 @@ with blocks:
1038
  CHOSEN_COMPREHEND_ENTITIES,
1039
  [example_files[3]],
1040
  example_files[3],
 
 
 
 
1041
  ]
1042
  )
1043
  example_labels.append(
1044
  "PDF redaction with AWS services and signature detection"
1045
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1046
 
1047
  # Only create examples if we have available files
1048
  if available_examples:
@@ -1056,6 +1137,10 @@ with blocks:
1056
  in_redact_comprehend_entities,
1057
  prepared_pdf_state,
1058
  doc_full_file_name_textbox,
 
 
 
 
1059
  ):
1060
  gr.Info(
1061
  "Example data loaded. Now click on 'Extract text and redact document' below to run the example redaction."
@@ -1072,6 +1157,10 @@ with blocks:
1072
  in_redact_comprehend_entities,
1073
  prepared_pdf_state,
1074
  doc_full_file_name_textbox,
 
 
 
 
1075
  ],
1076
  example_labels=example_labels,
1077
  fn=show_info_box_on_click,
@@ -2091,19 +2180,11 @@ with blocks:
2091
  in_allow_list_text = gr.Textbox(
2092
  label="Custom allow list load status"
2093
  )
2094
- with gr.Column():
2095
- in_deny_list = gr.File(
2096
- label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.",
2097
- file_count="multiple",
2098
- height=FILE_INPUT_HEIGHT,
2099
- )
2100
  in_deny_list_text = gr.Textbox(label="Custom deny list load status")
2101
  with gr.Column():
2102
- in_fully_redacted_list = gr.File(
2103
- label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.",
2104
- file_count="multiple",
2105
- height=FILE_INPUT_HEIGHT,
2106
- )
2107
  in_fully_redacted_list_text = gr.Textbox(
2108
  label="Fully redacted page list load status"
2109
  )
@@ -2125,33 +2206,10 @@ with blocks:
2125
  show_copy_button=True,
2126
  wrap=True,
2127
  )
2128
- in_deny_list_state = gr.Dataframe(
2129
- value=pd.DataFrame(),
2130
- headers=["deny_list"],
2131
- col_count=(1, "fixed"),
2132
- row_count=(0, "dynamic"),
2133
- label="Deny list",
2134
- visible=True,
2135
- type="pandas",
2136
- interactive=True,
2137
- show_fullscreen_button=True,
2138
- show_copy_button=True,
2139
- wrap=True,
2140
- )
2141
- in_fully_redacted_list_state = gr.Dataframe(
2142
- value=pd.DataFrame(),
2143
- headers=["fully_redacted_pages_list"],
2144
- col_count=(1, "fixed"),
2145
- row_count=(0, "dynamic"),
2146
- label="Fully redacted pages",
2147
- visible=True,
2148
- type="pandas",
2149
- interactive=True,
2150
- show_fullscreen_button=True,
2151
- show_copy_button=True,
2152
- datatype="number",
2153
- wrap=True,
2154
- )
2155
  with gr.Row():
2156
  with gr.Column(scale=2):
2157
  markdown_placeholder = gr.Markdown("")
 
277
  label="AWS Comprehend PII identification model (click empty space in box for full list)",
278
  )
279
 
280
+ in_deny_list = gr.File(
281
+ label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.",
282
+ file_count="multiple",
283
+ height=FILE_INPUT_HEIGHT,
284
+ )
285
+
286
+ in_deny_list_state = gr.Dataframe(
287
+ value=pd.DataFrame(),
288
+ headers=["deny_list"],
289
+ col_count=(1, "fixed"),
290
+ row_count=(0, "dynamic"),
291
+ label="Deny list",
292
+ visible=True,
293
+ type="pandas",
294
+ interactive=True,
295
+ show_fullscreen_button=True,
296
+ show_copy_button=True,
297
+ wrap=True,
298
+ )
299
+
300
+ in_fully_redacted_list = gr.File(
301
+ label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.",
302
+ file_count="multiple",
303
+ height=FILE_INPUT_HEIGHT,
304
+ )
305
+
306
+ in_fully_redacted_list_state = gr.Dataframe(
307
+ value=pd.DataFrame(),
308
+ headers=["fully_redacted_pages_list"],
309
+ col_count=(1, "fixed"),
310
+ row_count=(0, "dynamic"),
311
+ label="Fully redacted pages",
312
+ visible=True,
313
+ type="pandas",
314
+ interactive=True,
315
+ show_fullscreen_button=True,
316
+ show_copy_button=True,
317
+ wrap=True,
318
+ )
319
+
320
+
321
  ## Deduplication examples
322
  in_duplicate_pages = gr.File(
323
  label="Upload one or multiple 'ocr_output.csv' files to find duplicate pages and subdocuments",
 
1014
  "example_data/example_complaint_letter.jpg",
1015
  "example_data/graduate-job-example-cover-letter.pdf",
1016
  "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
1017
+ "example_data/partnership_toolkit_redact_custom_deny_list.csv",
1018
+ "example_data/partnership_toolkit_redact_some_pages.csv",
1019
  ]
1020
 
1021
  available_examples = list()
 
1033
  CHOSEN_COMPREHEND_ENTITIES,
1034
  [example_files[0]],
1035
  example_files[0],
1036
+ [],
1037
+ pd.DataFrame(),
1038
+ [],
1039
+ pd.DataFrame(),
1040
  ]
1041
  )
1042
  example_labels.append("PDF with selectable text redaction")
 
1052
  CHOSEN_COMPREHEND_ENTITIES,
1053
  [example_files[1]],
1054
  example_files[1],
1055
+ [],
1056
+ pd.DataFrame(),
1057
+ [],
1058
+ pd.DataFrame(),
1059
  ]
1060
  )
1061
  example_labels.append("Image redaction with local OCR")
 
1071
  CHOSEN_COMPREHEND_ENTITIES,
1072
  [example_files[2]],
1073
  example_files[2],
1074
+ [],
1075
+ pd.DataFrame(),
1076
+ [],
1077
+ pd.DataFrame(),
1078
  ]
1079
  )
1080
  example_labels.append(
 
1093
  CHOSEN_COMPREHEND_ENTITIES,
1094
  [example_files[3]],
1095
  example_files[3],
1096
+ [],
1097
+ pd.DataFrame(),
1098
+ [],
1099
+ pd.DataFrame(),
1100
  ]
1101
  )
1102
  example_labels.append(
1103
  "PDF redaction with AWS services and signature detection"
1104
+ )
1105
+
1106
+ # Add new example for custom deny list and whole page redaction
1107
+ if os.path.exists(example_files[3]) and os.path.exists(example_files[4]) and os.path.exists(example_files[5]):
1108
+ available_examples.append(
1109
+ [
1110
+ [example_files[3]],
1111
+ "Local OCR model - PDFs without selectable text",
1112
+ "Local",
1113
+ [],
1114
+ ["CUSTOM"], # Use CUSTOM entity to enable deny list functionality
1115
+ CHOSEN_COMPREHEND_ENTITIES,
1116
+ [example_files[3]],
1117
+ example_files[3],
1118
+ [example_files[4]],
1119
+ pd.DataFrame(data={"deny_list": ["Sister", "Sister City", "Sister Cities", "Friendship City"]}),
1120
+ [example_files[5]],
1121
+ pd.DataFrame(data={"fully_redacted_pages_list": [2, 5]}),
1122
+ ]
1123
+ )
1124
+ example_labels.append(
1125
+ "PDF redaction with custom deny list and whole page redaction"
1126
+ )
1127
 
1128
  # Only create examples if we have available files
1129
  if available_examples:
 
1137
  in_redact_comprehend_entities,
1138
  prepared_pdf_state,
1139
  doc_full_file_name_textbox,
1140
+ in_deny_list,
1141
+ in_deny_list_state,
1142
+ in_fully_redacted_list,
1143
+ in_fully_redacted_list_state,
1144
  ):
1145
  gr.Info(
1146
  "Example data loaded. Now click on 'Extract text and redact document' below to run the example redaction."
 
1157
  in_redact_comprehend_entities,
1158
  prepared_pdf_state,
1159
  doc_full_file_name_textbox,
1160
+ in_deny_list,
1161
+ in_deny_list_state,
1162
+ in_fully_redacted_list,
1163
+ in_fully_redacted_list_state,
1164
  ],
1165
  example_labels=example_labels,
1166
  fn=show_info_box_on_click,
 
2180
  in_allow_list_text = gr.Textbox(
2181
  label="Custom allow list load status"
2182
  )
2183
+ with gr.Column():
2184
+ in_deny_list.render() # Defined at beginning of file
 
 
 
 
2185
  in_deny_list_text = gr.Textbox(label="Custom deny list load status")
2186
  with gr.Column():
2187
+ in_fully_redacted_list.render() # Defined at beginning of file
 
 
 
 
2188
  in_fully_redacted_list_text = gr.Textbox(
2189
  label="Fully redacted page list load status"
2190
  )
 
2206
  show_copy_button=True,
2207
  wrap=True,
2208
  )
2209
+
2210
+ in_deny_list_state.render() # Defined at beginning of file
2211
+
2212
+ in_fully_redacted_list_state.render() # Defined at beginning of file
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2213
  with gr.Row():
2214
  with gr.Column(scale=2):
2215
  markdown_placeholder = gr.Markdown("")
tools/file_conversion.py CHANGED
@@ -574,7 +574,7 @@ def redact_single_box(
574
 
575
  Returns:
576
  Page or Tuple[Page, Page]: If return_pdf_end_of_redaction is True and retain_text is True,
577
- returns a tuple of (review_page, final_page). Otherwise returns a single Page.
578
  """
579
 
580
  pymupdf_x1 = pymupdf_rect[0]
@@ -582,31 +582,45 @@ def redact_single_box(
582
  pymupdf_x2 = pymupdf_rect[2]
583
  pymupdf_y2 = pymupdf_rect[3]
584
 
585
- full_size_redaction_box = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
 
587
  out_colour = define_box_colour(
588
  custom_colours, img_annotation_box, CUSTOM_BOX_COLOUR
589
  )
590
 
 
 
 
591
  # Create a copy of the page for final redaction if needed
592
- final_page = None
593
  if return_pdf_end_of_redaction and retain_text:
594
  # Create a deep copy of the page for final redaction
595
- import fitz
596
 
597
- final_page = fitz.open()
598
- final_page.insert_pdf(
599
  pymupdf_page.parent,
600
  from_page=pymupdf_page.number,
601
  to_page=pymupdf_page.number,
602
  )
603
- final_page = final_page[0]
604
 
605
- # Handle review page (retain_text = True)
606
- if retain_text is True:
607
-
608
- img_annotation_box["text"] = img_annotation_box.get("text") or ""
609
- img_annotation_box["label"] = img_annotation_box.get("label") or "Redaction"
610
 
611
  annot = pymupdf_page.add_redact_annot(full_size_redaction_box)
612
  annot.set_colors(stroke=out_colour, fill=out_colour, colors=out_colour)
@@ -620,51 +634,27 @@ def redact_single_box(
620
  )
621
  annot.update(opacity=0.5, cross_out=False)
622
 
623
- # If we need both review and final pages, apply final redaction to the copy
624
- if return_pdf_end_of_redaction and final_page is not None:
625
- # Apply final redaction to the copy
626
- redact_bottom_y = pymupdf_y1 + 2
627
- redact_top_y = pymupdf_y2 - 2
628
-
629
- # Calculate the middle y value and set a small height if default values are too close together
630
- if (redact_top_y - redact_bottom_y) < 1:
631
- middle_y = (pymupdf_y1 + pymupdf_y2) / 2
632
- redact_bottom_y = middle_y - 1
633
- redact_top_y = middle_y + 1
634
-
635
- rect_small_pixel_height = Rect(
636
- pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y
637
- ) # Slightly smaller than outside box
638
 
639
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
640
- final_page.add_redact_annot(rect_small_pixel_height)
641
 
642
  # Only create a box over the whole rect if we want to delete the text
643
- shape = final_page.new_shape()
644
  shape.draw_rect(pymupdf_rect)
645
 
646
  # Use solid fill for normal redaction
647
  shape.finish(color=out_colour, fill=out_colour)
648
  shape.commit()
649
 
650
- return pymupdf_page, final_page
651
  else:
652
  return pymupdf_page
653
- else:
654
- # Calculate area to actually remove text from the pdf (different from black box size)
655
- redact_bottom_y = pymupdf_y1 + 2
656
- redact_top_y = pymupdf_y2 - 2
657
-
658
- # Calculate the middle y value and set a small height if default values are too close together
659
- if (redact_top_y - redact_bottom_y) < 1:
660
- middle_y = (pymupdf_y1 + pymupdf_y2) / 2
661
- redact_bottom_y = middle_y - 1
662
- redact_top_y = middle_y + 1
663
-
664
- rect_small_pixel_height = Rect(
665
- pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y
666
- ) # Slightly smaller than outside box
667
 
 
 
668
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
669
  pymupdf_page.add_redact_annot(rect_small_pixel_height)
670
 
@@ -792,27 +782,30 @@ def redact_whole_pymupdf_page(
792
  """
793
  # Small border to page that remains white
794
 
795
- # Define the coordinates for the Rect
796
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
797
-
798
- # If border is a tiny value, assume that we want relative values
799
- if border < 0.1:
800
- whole_page_x2, whole_page_y2 = 1 - border, 1 - border # Top-right corner
801
- else:
802
- whole_page_x2, whole_page_y2 = (
803
- rect_width - border,
804
- rect_height - border,
805
- ) # Top-right corner
806
 
807
  # Create new image annotation element based on whole page coordinates
808
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
809
 
810
- # Write whole page annotation to annotation boxes
 
 
 
 
 
 
 
 
811
  whole_page_img_annotation_box = dict()
812
- whole_page_img_annotation_box["xmin"] = whole_page_x1
813
- whole_page_img_annotation_box["ymin"] = whole_page_y1
814
- whole_page_img_annotation_box["xmax"] = whole_page_x2
815
- whole_page_img_annotation_box["ymax"] = whole_page_y2
816
  whole_page_img_annotation_box["color"] = (0, 0, 0)
817
  whole_page_img_annotation_box["label"] = "Whole page"
818
 
 
574
 
575
  Returns:
576
  Page or Tuple[Page, Page]: If return_pdf_end_of_redaction is True and retain_text is True,
577
+ returns a tuple of (review_page, applied_redaction_page). Otherwise returns a single Page.
578
  """
579
 
580
  pymupdf_x1 = pymupdf_rect[0]
 
582
  pymupdf_x2 = pymupdf_rect[2]
583
  pymupdf_y2 = pymupdf_rect[3]
584
 
585
+ # Full size redaction box for covering all the text of a word
586
+ full_size_redaction_box = Rect(pymupdf_x1-1, pymupdf_y1-1, pymupdf_x2+1, pymupdf_y2+1)
587
+
588
+ # Calculate tiny height redaction box so that it doesn't delete text from adjacent lines
589
+ redact_bottom_y = pymupdf_y1 + 2
590
+ redact_top_y = pymupdf_y2 - 2
591
+
592
+ # Calculate the middle y value and set a small height if default values are too close together
593
+ if (redact_top_y - redact_bottom_y) < 1:
594
+ middle_y = (pymupdf_y1 + pymupdf_y2) / 2
595
+ redact_bottom_y = middle_y - 1
596
+ redact_top_y = middle_y + 1
597
+
598
+ rect_small_pixel_height = Rect(
599
+ pymupdf_x1 + 2, redact_bottom_y, pymupdf_x2 - 2, redact_top_y
600
+ ) # Slightly smaller than outside box
601
 
602
  out_colour = define_box_colour(
603
  custom_colours, img_annotation_box, CUSTOM_BOX_COLOUR
604
  )
605
 
606
+ img_annotation_box["text"] = img_annotation_box.get("text") or ""
607
+ img_annotation_box["label"] = img_annotation_box.get("label") or "Redaction"
608
+
609
  # Create a copy of the page for final redaction if needed
610
+ applied_redaction_page = None
611
  if return_pdf_end_of_redaction and retain_text:
612
  # Create a deep copy of the page for final redaction
 
613
 
614
+ applied_redaction_page = pymupdf.open()
615
+ applied_redaction_page.insert_pdf(
616
  pymupdf_page.parent,
617
  from_page=pymupdf_page.number,
618
  to_page=pymupdf_page.number,
619
  )
620
+ applied_redaction_page = applied_redaction_page[0]
621
 
622
+ # Handle review page first, then deal with final redacted page (retain_text = True)
623
+ if retain_text is True:
 
 
 
624
 
625
  annot = pymupdf_page.add_redact_annot(full_size_redaction_box)
626
  annot.set_colors(stroke=out_colour, fill=out_colour, colors=out_colour)
 
634
  )
635
  annot.update(opacity=0.5, cross_out=False)
636
 
637
+ # If we need both review and final pages, and the applied redaction page has been prepared, apply final redaction to the copy
638
+ if return_pdf_end_of_redaction and applied_redaction_page is not None:
639
+ # Apply final redaction to the copy
 
 
 
 
 
 
 
 
 
 
 
 
640
 
641
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
642
+ applied_redaction_page.add_redact_annot(rect_small_pixel_height)
643
 
644
  # Only create a box over the whole rect if we want to delete the text
645
+ shape = applied_redaction_page.new_shape()
646
  shape.draw_rect(pymupdf_rect)
647
 
648
  # Use solid fill for normal redaction
649
  shape.finish(color=out_colour, fill=out_colour)
650
  shape.commit()
651
 
652
+ return pymupdf_page, applied_redaction_page
653
  else:
654
  return pymupdf_page
 
 
 
 
 
 
 
 
 
 
 
 
 
 
655
 
656
+ # If we don't need to retain the text, we only have one page which is the applied redaction page, so just apply the redaction to the page
657
+ else:
658
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
659
  pymupdf_page.add_redact_annot(rect_small_pixel_height)
660
 
 
782
  """
783
  # Small border to page that remains white
784
 
785
+ # Define the coordinates for the Rect (PDF coordinates for actual redaction)
786
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
787
+ whole_page_x2, whole_page_y2 = (
788
+ rect_width - border,
789
+ rect_height - border,
790
+ ) # Top-right corner
 
 
 
 
 
791
 
792
  # Create new image annotation element based on whole page coordinates
793
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
794
 
795
+ # Calculate relative coordinates for the annotation box (0-1 range)
796
+ # This ensures the coordinates are already in relative format for output files
797
+ relative_border = border / min(rect_width, rect_height) # Scale border proportionally
798
+ relative_x1 = relative_border
799
+ relative_y1 = relative_border
800
+ relative_x2 = 1 - relative_border
801
+ relative_y2 = 1 - relative_border
802
+
803
+ # Write whole page annotation to annotation boxes using relative coordinates
804
  whole_page_img_annotation_box = dict()
805
+ whole_page_img_annotation_box["xmin"] = relative_x1
806
+ whole_page_img_annotation_box["ymin"] = relative_y1
807
+ whole_page_img_annotation_box["xmax"] = relative_x2
808
+ whole_page_img_annotation_box["ymax"] = relative_y2
809
  whole_page_img_annotation_box["color"] = (0, 0, 0)
810
  whole_page_img_annotation_box["label"] = "Whole page"
811
 
tools/file_redaction.py CHANGED
@@ -404,7 +404,7 @@ def choose_and_run_redactor(
404
  if prepared_pdf_file_paths:
405
  review_out_file_paths = [prepared_pdf_file_paths[0]]
406
  else:
407
- review_out_file_paths = []
408
 
409
  # Choose the correct file to prepare
410
  if isinstance(file_paths, str):
@@ -1095,111 +1095,111 @@ def choose_and_run_redactor(
1095
 
1096
  else:
1097
  # Check if we have dual PDF documents to save
1098
- final_pymupdf_doc = None
1099
 
1100
  if RETURN_PDF_FOR_REVIEW and RETURN_REDACTED_PDF:
1101
  if (
1102
- hasattr(redact_image_pdf, "_final_pages")
1103
- and redact_image_pdf._final_pages
1104
  ):
1105
 
1106
  # Create final document by copying the original document and replacing specific pages
1107
- final_pymupdf_doc = pymupdf.open()
1108
- final_pymupdf_doc.insert_pdf(pymupdf_doc)
1109
 
1110
  # Create a mapping of original page numbers to final pages
1111
- final_pages_map = {}
1112
- for final_page_data in redact_image_pdf._final_pages:
1113
- if isinstance(final_page_data, tuple):
1114
- final_page, original_page_number = (
1115
- final_page_data
1116
  )
1117
- final_pages_map[original_page_number] = (
1118
- final_page
1119
  )
1120
  else:
1121
- final_page = final_page_data
1122
- final_pages_map[0] = (
1123
- final_page # Default to page 0 if no original number
1124
  )
1125
 
1126
  # Replace pages in the final document with their final versions
1127
  for (
1128
  original_page_number,
1129
- final_page,
1130
- ) in final_pages_map.items():
1131
  if (
1132
  original_page_number
1133
- < final_pymupdf_doc.page_count
1134
  ):
1135
  # Remove the original page and insert the final page
1136
- final_pymupdf_doc.delete_page(
1137
  original_page_number
1138
  )
1139
- final_pymupdf_doc.insert_pdf(
1140
- final_page.parent,
1141
- from_page=final_page.number,
1142
- to_page=final_page.number,
1143
  start_at=original_page_number,
1144
  )
1145
- # Apply redactions to the final page
1146
- final_pymupdf_doc[
1147
  original_page_number
1148
- ].apply_redactions(images=2, graphics=0, text=0)
1149
  # Clear the stored final pages
1150
- delattr(redact_image_pdf, "_final_pages")
1151
  elif (
1152
- hasattr(redact_text_pdf, "_final_pages")
1153
- and redact_text_pdf._final_pages
1154
  ):
1155
  # Create final document by copying the original document and replacing specific pages
1156
- final_pymupdf_doc = pymupdf.open()
1157
- final_pymupdf_doc.insert_pdf(pymupdf_doc)
1158
 
1159
  # Create a mapping of original page numbers to final pages
1160
- final_pages_map = {}
1161
- for final_page_data in redact_text_pdf._final_pages:
1162
- if isinstance(final_page_data, tuple):
1163
- final_page, original_page_number = (
1164
- final_page_data
1165
  )
1166
- final_pages_map[original_page_number] = (
1167
- final_page
1168
  )
1169
  else:
1170
- final_page = final_page_data
1171
- final_pages_map[0] = (
1172
- final_page # Default to page 0 if no original number
1173
  )
1174
 
1175
  # Replace pages in the final document with their final versions
1176
  for (
1177
  original_page_number,
1178
- final_page,
1179
- ) in final_pages_map.items():
1180
  if (
1181
  original_page_number
1182
- < final_pymupdf_doc.page_count
1183
  ):
1184
  # Remove the original page and insert the final page
1185
- final_pymupdf_doc.delete_page(
1186
  original_page_number
1187
  )
1188
- final_pymupdf_doc.insert_pdf(
1189
- final_page.parent,
1190
- from_page=final_page.number,
1191
- to_page=final_page.number,
1192
  start_at=original_page_number,
1193
  )
1194
- # Apply redactions to the final page
1195
- final_pymupdf_doc[
1196
  original_page_number
1197
- ].apply_redactions(images=2, graphics=0, text=0)
1198
  # Clear the stored final pages
1199
- delattr(redact_text_pdf, "_final_pages")
1200
 
1201
  # Save final redacted PDF if we have dual outputs or if RETURN_PDF_FOR_REVIEW is False
1202
- if RETURN_PDF_FOR_REVIEW is False or final_pymupdf_doc:
1203
  out_redacted_pdf_file_path = (
1204
  output_folder
1205
  + pdf_file_name_without_ext
@@ -1211,7 +1211,7 @@ def choose_and_run_redactor(
1211
 
1212
  # Use final document if available, otherwise use main document
1213
  doc_to_save = (
1214
- final_pymupdf_doc if final_pymupdf_doc else pymupdf_doc
1215
  )
1216
 
1217
  if out_redacted_pdf_file_path:
@@ -2104,7 +2104,7 @@ def redact_page_with_pymupdf(
2104
  Tuple[Page, dict] or Tuple[Tuple[Page, Page], dict]: A tuple containing:
2105
  - page (Page or Tuple[Page, Page]): The PyMuPDF page object(s) with redactions applied.
2106
  If return_pdf_end_of_redaction is True and return_pdf_for_review is True,
2107
- returns a tuple of (review_page, final_page).
2108
  - out_annotation_boxes (dict): A dictionary containing the processed annotation boxes
2109
  for the page, including the image path.
2110
  """
@@ -2271,14 +2271,14 @@ def redact_page_with_pymupdf(
2271
 
2272
  # Handle dual page objects if returned
2273
  if isinstance(redact_result, tuple):
2274
- page, final_page = redact_result
2275
  # Store the final page for later use
2276
- if not hasattr(redact_page_with_pymupdf, "_final_page"):
2277
- redact_page_with_pymupdf._final_page = final_page
2278
  else:
2279
  # If we already have a final page, we need to handle multiple pages
2280
  # For now, we'll use the last final page
2281
- redact_page_with_pymupdf._final_page = final_page
2282
 
2283
  # If whole page is to be redacted, do that here
2284
  if redact_whole_page is True:
@@ -2286,72 +2286,71 @@ def redact_page_with_pymupdf(
2286
  whole_page_img_annotation_box = redact_whole_pymupdf_page(
2287
  rect_height, rect_width, page, custom_colours, border=5
2288
  )
 
 
2289
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
2290
 
2291
  # Handle dual page objects for whole page redaction if needed
2292
  if return_pdf_end_of_redaction and return_pdf_for_review:
2293
  # Create a copy of the page for final redaction using the same approach as redact_single_box
2294
 
2295
- final_page_doc = pymupdf.open()
2296
- final_page_doc.insert_pdf(
2297
  page.parent,
2298
  from_page=page.number,
2299
  to_page=page.number,
2300
  )
2301
- final_page = final_page_doc[0]
2302
 
2303
  # Apply the whole page redaction to the final page as well
2304
  redact_whole_pymupdf_page(
2305
- rect_height, rect_width, final_page, custom_colours, border=5
2306
  )
2307
 
2308
  # Store the final page with its original page number for later use
2309
- if not hasattr(redact_page_with_pymupdf, "_final_page"):
2310
- redact_page_with_pymupdf._final_page = (final_page, page.number)
2311
  else:
2312
  # If we already have a final page, we need to handle multiple pages
2313
  # For now, we'll use the last final page
2314
- redact_page_with_pymupdf._final_page = (final_page, page.number)
2315
 
2316
  out_annotation_boxes = {
2317
  "image": image_path, # Image.open(image_path), #image_path,
2318
  "boxes": all_image_annotation_boxes,
2319
  }
2320
 
 
2321
  if return_pdf_for_review is False:
2322
- # Remove text and all images
2323
- # page.apply_redactions(images=2, graphics=2)
2324
- page.apply_redactions(images=2, graphics=0, text=0)
2325
- # else:
2326
- # # Just apply the box, don't remove images or text
2327
- # page.apply_redactions(images=0, graphics=0, text=1)
2328
 
2329
  set_cropbox_safely(page, original_cropbox)
2330
- # page.set_cropbox(original_cropbox)
2331
- # Set CropBox to original size
2332
  page.clean_contents()
2333
 
2334
  # Handle dual page objects if we have a final page
2335
  if (
2336
  return_pdf_end_of_redaction
2337
  and return_pdf_for_review
2338
- and hasattr(redact_page_with_pymupdf, "_final_page")
2339
  ):
2340
- final_page_data = redact_page_with_pymupdf._final_page
2341
  # Handle both tuple format (new) and single page format (backward compatibility)
2342
- if isinstance(final_page_data, tuple):
2343
- final_page, original_page_number = final_page_data
2344
  else:
2345
- final_page = final_page_data
2346
 
2347
- # Apply redactions to final page
2348
- if return_pdf_for_review is False:
2349
- final_page.apply_redactions(images=2, graphics=0, text=0)
2350
- set_cropbox_safely(final_page, original_cropbox)
2351
- final_page.clean_contents()
 
2352
  # Clear the stored final page
2353
- delattr(redact_page_with_pymupdf, "_final_page")
2354
- return (page, final_page), out_annotation_boxes
 
2355
  else:
2356
  return page, out_annotation_boxes
2357
 
@@ -3116,14 +3115,14 @@ def redact_image_pdf(
3116
 
3117
  # Handle dual page objects if returned
3118
  if isinstance(redact_result[0], tuple):
3119
- (pymupdf_page, pymupdf_final_page), page_image_annotations = (
3120
  redact_result
3121
  )
3122
  # Store the final page with its original page number for later use
3123
- if not hasattr(redact_image_pdf, "_final_pages"):
3124
- redact_image_pdf._final_pages = []
3125
- redact_image_pdf._final_pages.append(
3126
- (pymupdf_final_page, page_no)
3127
  )
3128
  else:
3129
  pymupdf_page, page_image_annotations = redact_result
@@ -4178,14 +4177,14 @@ def redact_text_pdf(
4178
 
4179
  # Handle dual page objects if returned
4180
  if isinstance(redact_result[0], tuple):
4181
- (pymupdf_page, pymupdf_final_page), page_image_annotations = (
4182
  redact_result
4183
  )
4184
  # Store the final page with its original page number for later use
4185
- if not hasattr(redact_text_pdf, "_final_pages"):
4186
- redact_text_pdf._final_pages = []
4187
- redact_text_pdf._final_pages.append(
4188
- (pymupdf_final_page, page_no)
4189
  )
4190
  else:
4191
  pymupdf_page, page_image_annotations = redact_result
@@ -4205,7 +4204,6 @@ def redact_text_pdf(
4205
  # Else, user chose not to run redaction
4206
  else:
4207
  pass
4208
- # print("Not redacting page:", page_no)
4209
 
4210
  # Join extracted text outputs for all lines together
4211
  if not page_text_ocr_outputs.empty:
 
404
  if prepared_pdf_file_paths:
405
  review_out_file_paths = [prepared_pdf_file_paths[0]]
406
  else:
407
+ review_out_file_paths = list()
408
 
409
  # Choose the correct file to prepare
410
  if isinstance(file_paths, str):
 
1095
 
1096
  else:
1097
  # Check if we have dual PDF documents to save
1098
+ applied_redaction_pymupdf_doc = None
1099
 
1100
  if RETURN_PDF_FOR_REVIEW and RETURN_REDACTED_PDF:
1101
  if (
1102
+ hasattr(redact_image_pdf, "_applied_redaction_pages")
1103
+ and redact_image_pdf._applied_redaction_pages
1104
  ):
1105
 
1106
  # Create final document by copying the original document and replacing specific pages
1107
+ applied_redaction_pymupdf_doc = pymupdf.open()
1108
+ applied_redaction_pymupdf_doc.insert_pdf(pymupdf_doc)
1109
 
1110
  # Create a mapping of original page numbers to final pages
1111
+ applied_redaction_pages_map = {}
1112
+ for applied_redaction_page_data in redact_image_pdf._applied_redaction_pages:
1113
+ if isinstance(applied_redaction_page_data, tuple):
1114
+ applied_redaction_page, original_page_number = (
1115
+ applied_redaction_page_data
1116
  )
1117
+ applied_redaction_pages_map[original_page_number] = (
1118
+ applied_redaction_page
1119
  )
1120
  else:
1121
+ applied_redaction_page = applied_redaction_page_data
1122
+ applied_redaction_pages_map[0] = (
1123
+ applied_redaction_page # Default to page 0 if no original number
1124
  )
1125
 
1126
  # Replace pages in the final document with their final versions
1127
  for (
1128
  original_page_number,
1129
+ applied_redaction_page,
1130
+ ) in applied_redaction_pages_map.items():
1131
  if (
1132
  original_page_number
1133
+ < applied_redaction_pymupdf_doc.page_count
1134
  ):
1135
  # Remove the original page and insert the final page
1136
+ applied_redaction_pymupdf_doc.delete_page(
1137
  original_page_number
1138
  )
1139
+ applied_redaction_pymupdf_doc.insert_pdf(
1140
+ applied_redaction_page.parent,
1141
+ from_page=applied_redaction_page.number,
1142
+ to_page=applied_redaction_page.number,
1143
  start_at=original_page_number,
1144
  )
1145
+ # Remove text. Graphic text is effectively removed by the overlapping rectangle shape that becomes an embedded part of the document.
1146
+ applied_redaction_pymupdf_doc[
1147
  original_page_number
1148
+ ].apply_redactions(images=0, graphics=0, text=0)
1149
  # Clear the stored final pages
1150
+ delattr(redact_image_pdf, "_applied_redaction_pages")
1151
  elif (
1152
+ hasattr(redact_text_pdf, "_applied_redaction_pages")
1153
+ and redact_text_pdf._applied_redaction_pages
1154
  ):
1155
  # Create final document by copying the original document and replacing specific pages
1156
+ applied_redaction_pymupdf_doc = pymupdf.open()
1157
+ applied_redaction_pymupdf_doc.insert_pdf(pymupdf_doc)
1158
 
1159
  # Create a mapping of original page numbers to final pages
1160
+ applied_redaction_pages_map = {}
1161
+ for applied_redaction_page_data in redact_text_pdf._applied_redaction_pages:
1162
+ if isinstance(applied_redaction_page_data, tuple):
1163
+ applied_redaction_page, original_page_number = (
1164
+ applied_redaction_page_data
1165
  )
1166
+ applied_redaction_pages_map[original_page_number] = (
1167
+ applied_redaction_page
1168
  )
1169
  else:
1170
+ applied_redaction_page = applied_redaction_page_data
1171
+ applied_redaction_pages_map[0] = (
1172
+ applied_redaction_page # Default to page 0 if no original number
1173
  )
1174
 
1175
  # Replace pages in the final document with their final versions
1176
  for (
1177
  original_page_number,
1178
+ applied_redaction_page,
1179
+ ) in applied_redaction_pages_map.items():
1180
  if (
1181
  original_page_number
1182
+ < applied_redaction_pymupdf_doc.page_count
1183
  ):
1184
  # Remove the original page and insert the final page
1185
+ applied_redaction_pymupdf_doc.delete_page(
1186
  original_page_number
1187
  )
1188
+ applied_redaction_pymupdf_doc.insert_pdf(
1189
+ applied_redaction_page.parent,
1190
+ from_page=applied_redaction_page.number,
1191
+ to_page=applied_redaction_page.number,
1192
  start_at=original_page_number,
1193
  )
1194
+ # Remove text. Graphic text is effectively removed by the overlapping rectangle shape that becomes an embedded part of the document.
1195
+ applied_redaction_pymupdf_doc[
1196
  original_page_number
1197
+ ].apply_redactions(images=0, graphics=0, text=0)
1198
  # Clear the stored final pages
1199
+ delattr(redact_text_pdf, "_applied_redaction_pages")
1200
 
1201
  # Save final redacted PDF if we have dual outputs or if RETURN_PDF_FOR_REVIEW is False
1202
+ if RETURN_PDF_FOR_REVIEW is False or applied_redaction_pymupdf_doc:
1203
  out_redacted_pdf_file_path = (
1204
  output_folder
1205
  + pdf_file_name_without_ext
 
1211
 
1212
  # Use final document if available, otherwise use main document
1213
  doc_to_save = (
1214
+ applied_redaction_pymupdf_doc if applied_redaction_pymupdf_doc else pymupdf_doc
1215
  )
1216
 
1217
  if out_redacted_pdf_file_path:
 
2104
  Tuple[Page, dict] or Tuple[Tuple[Page, Page], dict]: A tuple containing:
2105
  - page (Page or Tuple[Page, Page]): The PyMuPDF page object(s) with redactions applied.
2106
  If return_pdf_end_of_redaction is True and return_pdf_for_review is True,
2107
+ returns a tuple of (review_page, applied_redaction_page).
2108
  - out_annotation_boxes (dict): A dictionary containing the processed annotation boxes
2109
  for the page, including the image path.
2110
  """
 
2271
 
2272
  # Handle dual page objects if returned
2273
  if isinstance(redact_result, tuple):
2274
+ page, applied_redaction_page = redact_result
2275
  # Store the final page for later use
2276
+ if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
2277
+ redact_page_with_pymupdf._applied_redaction_page = applied_redaction_page
2278
  else:
2279
  # If we already have a final page, we need to handle multiple pages
2280
  # For now, we'll use the last final page
2281
+ redact_page_with_pymupdf._applied_redaction_page = applied_redaction_page
2282
 
2283
  # If whole page is to be redacted, do that here
2284
  if redact_whole_page is True:
 
2286
  whole_page_img_annotation_box = redact_whole_pymupdf_page(
2287
  rect_height, rect_width, page, custom_colours, border=5
2288
  )
2289
+ # Ensure the whole page annotation box has a unique ID
2290
+ whole_page_img_annotation_box = fill_missing_box_ids(whole_page_img_annotation_box)
2291
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
2292
 
2293
  # Handle dual page objects for whole page redaction if needed
2294
  if return_pdf_end_of_redaction and return_pdf_for_review:
2295
  # Create a copy of the page for final redaction using the same approach as redact_single_box
2296
 
2297
+ applied_redaction_doc = pymupdf.open()
2298
+ applied_redaction_doc.insert_pdf(
2299
  page.parent,
2300
  from_page=page.number,
2301
  to_page=page.number,
2302
  )
2303
+ applied_redaction_page = applied_redaction_doc[0]
2304
 
2305
  # Apply the whole page redaction to the final page as well
2306
  redact_whole_pymupdf_page(
2307
+ rect_height, rect_width, applied_redaction_page, custom_colours, border=5
2308
  )
2309
 
2310
  # Store the final page with its original page number for later use
2311
+ if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
2312
+ redact_page_with_pymupdf._applied_redaction_page = (applied_redaction_page, page.number)
2313
  else:
2314
  # If we already have a final page, we need to handle multiple pages
2315
  # For now, we'll use the last final page
2316
+ redact_page_with_pymupdf._applied_redaction_page = (applied_redaction_page, page.number)
2317
 
2318
  out_annotation_boxes = {
2319
  "image": image_path, # Image.open(image_path), #image_path,
2320
  "boxes": all_image_annotation_boxes,
2321
  }
2322
 
2323
+ # If we are not returning the review page, apply the redactions directly to this page (removes the redacted text; images and vector graphics are left untouched)
2324
  if return_pdf_for_review is False:
2325
+ # Remove text. Graphic text is effectively removed by the overlapping rectangle shape that becomes an embedded part of the document.
2326
+ page.apply_redactions(images=0, graphics=0, text=0)
 
 
 
 
2327
 
2328
  set_cropbox_safely(page, original_cropbox)
 
 
2329
  page.clean_contents()
2330
 
2331
  # Handle dual page objects if we have a final page
2332
  if (
2333
  return_pdf_end_of_redaction
2334
  and return_pdf_for_review
2335
+ and hasattr(redact_page_with_pymupdf, "_applied_redaction_page")
2336
  ):
2337
+ applied_redaction_page_data = redact_page_with_pymupdf._applied_redaction_page
2338
  # Handle both tuple format (new) and single page format (backward compatibility)
2339
+ if isinstance(applied_redaction_page_data, tuple):
2340
+ applied_redaction_page, original_page_number = applied_redaction_page_data
2341
  else:
2342
+ applied_redaction_page = applied_redaction_page_data
2343
 
2344
+ # Apply redactions to applied redaction page only
2345
+ # Remove text. Graphic text is effectively removed by the overlapping rectangle shape that becomes an embedded part of the document.
2346
+ applied_redaction_page.apply_redactions(images=0, graphics=0, text=0)
2347
+
2348
+ set_cropbox_safely(applied_redaction_page, original_cropbox)
2349
+ applied_redaction_page.clean_contents()
2350
  # Clear the stored final page
2351
+ delattr(redact_page_with_pymupdf, "_applied_redaction_page")
2352
+ return (page, applied_redaction_page), out_annotation_boxes
2353
+
2354
  else:
2355
  return page, out_annotation_boxes
2356
 
 
3115
 
3116
  # Handle dual page objects if returned
3117
  if isinstance(redact_result[0], tuple):
3118
+ (pymupdf_page, pymupdf_applied_redaction_page), page_image_annotations = (
3119
  redact_result
3120
  )
3121
  # Store the final page with its original page number for later use
3122
+ if not hasattr(redact_image_pdf, "_applied_redaction_pages"):
3123
+ redact_image_pdf._applied_redaction_pages = list()
3124
+ redact_image_pdf._applied_redaction_pages.append(
3125
+ (pymupdf_applied_redaction_page, page_no)
3126
  )
3127
  else:
3128
  pymupdf_page, page_image_annotations = redact_result
 
4177
 
4178
  # Handle dual page objects if returned
4179
  if isinstance(redact_result[0], tuple):
4180
+ (pymupdf_page, pymupdf_applied_redaction_page), page_image_annotations = (
4181
  redact_result
4182
  )
4183
  # Store the final page with its original page number for later use
4184
+ if not hasattr(redact_text_pdf, "_applied_redaction_pages"):
4185
+ redact_text_pdf._applied_redaction_pages = list()
4186
+ redact_text_pdf._applied_redaction_pages.append(
4187
+ (pymupdf_applied_redaction_page, page_no)
4188
  )
4189
  else:
4190
  pymupdf_page, page_image_annotations = redact_result
 
4204
  # Else, user chose not to run redaction
4205
  else:
4206
  pass
 
4207
 
4208
  # Join extracted text outputs for all lines together
4209
  if not page_text_ocr_outputs.empty:
tools/find_duplicate_pages.py CHANGED
@@ -462,9 +462,6 @@ def combine_ocr_dataframes(
462
  output_files = list()
463
  if output_folder and output_filename:
464
  # Validate path safety before creating directories and files
465
- print(
466
- f"DEBUG: Validating output_folder='{output_folder}' against OUTPUT_FOLDER='{OUTPUT_FOLDER}'"
467
- )
468
  if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
469
  raise ValueError(f"Unsafe output folder path: {output_folder}")
470
  if not validate_path_safety(output_filename):
@@ -659,9 +656,6 @@ def save_results_and_redaction_lists(
659
  list: A list of paths to all generated files.
660
  """
661
  # Validate the output_folder path for security
662
- print(
663
- f"DEBUG: Validating output_folder='{output_folder}' against OUTPUT_FOLDER='{OUTPUT_FOLDER}'"
664
- )
665
  if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
666
  raise ValueError(f"Invalid or unsafe output folder path: {output_folder}")
667
 
@@ -671,9 +665,6 @@ def save_results_and_redaction_lists(
671
  try:
672
  output_folder_path = Path(output_folder).resolve()
673
  # Validate that the resolved path is within the trusted OUTPUT_FOLDER using robust containment check
674
- print(
675
- f"DEBUG: Validating resolved path='{output_folder_path}' against OUTPUT_FOLDER='{OUTPUT_FOLDER}'"
676
- )
677
  if not validate_folder_containment(str(output_folder_path), OUTPUT_FOLDER):
678
  raise ValueError(
679
  f"Output folder path {output_folder} is outside the trusted directory {OUTPUT_FOLDER}"
@@ -1092,7 +1083,7 @@ def run_duplicate_analysis(
1092
 
1093
  progress(0, desc="Combining input files...")
1094
  df_combined, _, full_out_ocr_df = combine_ocr_output_text(
1095
- files, combine_pages=combine_pages
1096
  )
1097
 
1098
  if df_combined.empty:
 
462
  output_files = list()
463
  if output_folder and output_filename:
464
  # Validate path safety before creating directories and files
 
 
 
465
  if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
466
  raise ValueError(f"Unsafe output folder path: {output_folder}")
467
  if not validate_path_safety(output_filename):
 
656
  list: A list of paths to all generated files.
657
  """
658
  # Validate the output_folder path for security
 
 
 
659
  if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
660
  raise ValueError(f"Invalid or unsafe output folder path: {output_folder}")
661
 
 
665
  try:
666
  output_folder_path = Path(output_folder).resolve()
667
  # Validate that the resolved path is within the trusted OUTPUT_FOLDER using robust containment check
 
 
 
668
  if not validate_folder_containment(str(output_folder_path), OUTPUT_FOLDER):
669
  raise ValueError(
670
  f"Output folder path {output_folder} is outside the trusted directory {OUTPUT_FOLDER}"
 
1083
 
1084
  progress(0, desc="Combining input files...")
1085
  df_combined, _, full_out_ocr_df = combine_ocr_output_text(
1086
+ files, combine_pages=combine_pages, output_folder=output_folder
1087
  )
1088
 
1089
  if df_combined.empty:
tools/secure_path_utils.py CHANGED
@@ -311,14 +311,6 @@ def validate_folder_containment(
311
  path_str = str(normalized_path).lower()
312
  base_str = str(normalized_base).lower()
313
 
314
- print(
315
- f"DEBUG: validate_folder_containment called with path='{path}' base_path='{base_path}'"
316
- )
317
- print(
318
- f"DEBUG: normalized_path='{normalized_path}' normalized_base='{normalized_base}'"
319
- )
320
- print(f"DEBUG: path_str='{path_str}' base_str='{base_str}'")
321
-
322
  # Check if this is a test scenario
323
  is_test_path = any(
324
  test_pattern in path_str
 
311
  path_str = str(normalized_path).lower()
312
  base_str = str(normalized_base).lower()
313
 
 
 
 
 
 
 
 
 
314
  # Check if this is a test scenario
315
  is_test_path = any(
316
  test_pattern in path_str