Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Oct 1

Commit

5086da0

1 Parent(s): 78403ba

Fixed whole page redactions being incorrectly positioned and missing IDs. Fixed duplicate pages output issue. Minor changes to output redaction box format and related code.

Browse files

Files changed (5) hide show

app.py +97 -39
tools/file_conversion.py +52 -59
tools/file_redaction.py +97 -99
tools/find_duplicate_pages.py +1 -10
tools/secure_path_utils.py +0 -8

app.py CHANGED Viewed

@@ -277,6 +277,47 @@ in_redact_comprehend_entities = gr.Dropdown(
     label="AWS Comprehend PII identification model (click empty space in box for full list)",
 )
 ## Deduplication examples
 in_duplicate_pages = gr.File(
     label="Upload one or multiple 'ocr_output.csv' files to find duplicate pages and subdocuments",
@@ -973,6 +1014,8 @@ with blocks:
                 "example_data/example_complaint_letter.jpg",
                 "example_data/graduate-job-example-cover-letter.pdf",
                 "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
             ]
             available_examples = list()
@@ -990,6 +1033,10 @@ with blocks:
                         CHOSEN_COMPREHEND_ENTITIES,
                         [example_files[0]],
                         example_files[0],
                     ]
                 )
                 example_labels.append("PDF with selectable text redaction")
@@ -1005,6 +1052,10 @@ with blocks:
                         CHOSEN_COMPREHEND_ENTITIES,
                         [example_files[1]],
                         example_files[1],
                     ]
                 )
                 example_labels.append("Image redaction with local OCR")
@@ -1020,6 +1071,10 @@ with blocks:
                         CHOSEN_COMPREHEND_ENTITIES,
                         [example_files[2]],
                         example_files[2],
                     ]
                 )
                 example_labels.append(
@@ -1038,11 +1093,37 @@ with blocks:
                             CHOSEN_COMPREHEND_ENTITIES,
                             [example_files[3]],
                             example_files[3],
                         ]
                     )
                     example_labels.append(
                         "PDF redaction with AWS services and signature detection"
-                    )
             # Only create examples if we have available files
             if available_examples:
@@ -1056,6 +1137,10 @@ with blocks:
                     in_redact_comprehend_entities,
                     prepared_pdf_state,
                     doc_full_file_name_textbox,
                 ):
                     gr.Info(
                         "Example data loaded. Now click on 'Extract text and redact document' below to run the example redaction."
@@ -1072,6 +1157,10 @@ with blocks:
                         in_redact_comprehend_entities,
                         prepared_pdf_state,
                         doc_full_file_name_textbox,
                     ],
                     example_labels=example_labels,
                     fn=show_info_box_on_click,
@@ -2091,19 +2180,11 @@ with blocks:
                     in_allow_list_text = gr.Textbox(
                         label="Custom allow list load status"
                     )
-                with gr.Column():
-                    in_deny_list = gr.File(
-                        label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.",
-                        file_count="multiple",
-                        height=FILE_INPUT_HEIGHT,
-                    )
                     in_deny_list_text = gr.Textbox(label="Custom deny list load status")
                 with gr.Column():
-                    in_fully_redacted_list = gr.File(
-                        label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.",
-                        file_count="multiple",
-                        height=FILE_INPUT_HEIGHT,
-                    )
                     in_fully_redacted_list_text = gr.Textbox(
                         label="Fully redacted page list load status"
                     )
@@ -2125,33 +2206,10 @@ with blocks:
                         show_copy_button=True,
                         wrap=True,
                     )
-                    in_deny_list_state = gr.Dataframe(
-                        value=pd.DataFrame(),
-                        headers=["deny_list"],
-                        col_count=(1, "fixed"),
-                        row_count=(0, "dynamic"),
-                        label="Deny list",
-                        visible=True,
-                        type="pandas",
-                        interactive=True,
-                        show_fullscreen_button=True,
-                        show_copy_button=True,
-                        wrap=True,
-                    )
-                    in_fully_redacted_list_state = gr.Dataframe(
-                        value=pd.DataFrame(),
-                        headers=["fully_redacted_pages_list"],
-                        col_count=(1, "fixed"),
-                        row_count=(0, "dynamic"),
-                        label="Fully redacted pages",
-                        visible=True,
-                        type="pandas",
-                        interactive=True,
-                        show_fullscreen_button=True,
-                        show_copy_button=True,
-                        datatype="number",
-                        wrap=True,
-                    )
                 with gr.Row():
                     with gr.Column(scale=2):
                         markdown_placeholder = gr.Markdown("")

     label="AWS Comprehend PII identification model (click empty space in box for full list)",
 )
+in_deny_list = gr.File(
+                        label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.",
+                        file_count="multiple",
+                        height=FILE_INPUT_HEIGHT,
+                    )
+in_deny_list_state = gr.Dataframe(
+                        value=pd.DataFrame(),
+                        headers=["deny_list"],
+                        col_count=(1, "fixed"),
+                        row_count=(0, "dynamic"),
+                        label="Deny list",
+                        visible=True,
+                        type="pandas",
+                        interactive=True,
+                        show_fullscreen_button=True,
+                        show_copy_button=True,
+                        wrap=True,
+                    )
+in_fully_redacted_list = gr.File(
+                        label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.",
+                        file_count="multiple",
+                        height=FILE_INPUT_HEIGHT,
+                    )
+in_fully_redacted_list_state = gr.Dataframe(
+                        value=pd.DataFrame(),
+                        headers=["fully_redacted_pages_list"],
+                        col_count=(1, "fixed"),
+                        row_count=(0, "dynamic"),
+                        label="Fully redacted pages",
+                        visible=True,
+                        type="pandas",
+                        interactive=True,
+                        show_fullscreen_button=True,
+                        show_copy_button=True,
+                        wrap=True,
+                    )
 ## Deduplication examples
 in_duplicate_pages = gr.File(
     label="Upload one or multiple 'ocr_output.csv' files to find duplicate pages and subdocuments",
                 "example_data/example_complaint_letter.jpg",
                 "example_data/graduate-job-example-cover-letter.pdf",
                 "example_data/Partnership-Agreement-Toolkit_0_0.pdf",
+                "example_data/partnership_toolkit_redact_custom_deny_list.csv",
+                "example_data/partnership_toolkit_redact_some_pages.csv",
             ]
             available_examples = list()
                         CHOSEN_COMPREHEND_ENTITIES,
                         [example_files[0]],
                         example_files[0],
+                        [],
+                        pd.DataFrame(),
+                        [],
+                        pd.DataFrame(),
                     ]
                 )
                 example_labels.append("PDF with selectable text redaction")
                         CHOSEN_COMPREHEND_ENTITIES,
                         [example_files[1]],
                         example_files[1],
+                        [],
+                        pd.DataFrame(),
+                        [],
+                        pd.DataFrame(),
                     ]
                 )
                 example_labels.append("Image redaction with local OCR")
                         CHOSEN_COMPREHEND_ENTITIES,
                         [example_files[2]],
                         example_files[2],
+                        [],
+                        pd.DataFrame(),
+                        [],
+                        pd.DataFrame(),
                     ]
                 )
                 example_labels.append(
                             CHOSEN_COMPREHEND_ENTITIES,
                             [example_files[3]],
                             example_files[3],
+                            [],
+                            pd.DataFrame(),
+                            [],
+                            pd.DataFrame(),
                         ]
                     )
                     example_labels.append(
                         "PDF redaction with AWS services and signature detection"
+                    )
+            # Add new example for custom deny list and whole page redaction
+            if os.path.exists(example_files[3]) and os.path.exists(example_files[4]) and os.path.exists(example_files[5]):
+                available_examples.append(
+                    [
+                        [example_files[3]],
+                        "Local OCR model - PDFs without selectable text",
+                        "Local",
+                        [],
+                        ["CUSTOM"],  # Use CUSTOM entity to enable deny list functionality
+                        CHOSEN_COMPREHEND_ENTITIES,
+                        [example_files[3]],
+                        example_files[3],
+                        [example_files[4]],
+                        pd.DataFrame(data={"deny_list": ["Sister", "Sister City", "Sister Cities", "Friendship City"]}),
+                        [example_files[5]],
+                        pd.DataFrame(data={"fully_redacted_pages_list": [2, 5]}),
+                    ]
+                )
+                example_labels.append(
+                    "PDF redaction with custom deny list and whole page redaction"
+                )
             # Only create examples if we have available files
             if available_examples:
                     in_redact_comprehend_entities,
                     prepared_pdf_state,
                     doc_full_file_name_textbox,
+                    in_deny_list,
+                    in_deny_list_state,
+                    in_fully_redacted_list,
+                    in_fully_redacted_list_state,
                 ):
                     gr.Info(
                         "Example data loaded. Now click on 'Extract text and redact document' below to run the example redaction."
                         in_redact_comprehend_entities,
                         prepared_pdf_state,
                         doc_full_file_name_textbox,
+                        in_deny_list,
+                        in_deny_list_state,
+                        in_fully_redacted_list,
+                        in_fully_redacted_list_state,
                     ],
                     example_labels=example_labels,
                     fn=show_info_box_on_click,
                     in_allow_list_text = gr.Textbox(
                         label="Custom allow list load status"
                     )
+                with gr.Column():
+                    in_deny_list.render() # Defined at beginning of file
                     in_deny_list_text = gr.Textbox(label="Custom deny list load status")
                 with gr.Column():
+                    in_fully_redacted_list.render() # Defined at beginning of file
                     in_fully_redacted_list_text = gr.Textbox(
                         label="Fully redacted page list load status"
                     )
                         show_copy_button=True,
                         wrap=True,
                     )
+                    in_deny_list_state.render() # Defined at beginning of file
+                    in_fully_redacted_list_state.render() # Defined at beginning of file
                 with gr.Row():
                     with gr.Column(scale=2):
                         markdown_placeholder = gr.Markdown("")

tools/file_conversion.py CHANGED Viewed

@@ -574,7 +574,7 @@ def redact_single_box(
     Returns:
         Page or Tuple[Page, Page]: If return_pdf_end_of_redaction is True and retain_text is True,
-                                  returns a tuple of (review_page, final_page). Otherwise returns a single Page.
     """
     pymupdf_x1 = pymupdf_rect[0]
@@ -582,31 +582,45 @@ def redact_single_box(
     pymupdf_x2 = pymupdf_rect[2]
     pymupdf_y2 = pymupdf_rect[3]
-    full_size_redaction_box = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2)
     out_colour = define_box_colour(
         custom_colours, img_annotation_box, CUSTOM_BOX_COLOUR
     )
     # Create a copy of the page for final redaction if needed
-    final_page = None
     if return_pdf_end_of_redaction and retain_text:
         # Create a deep copy of the page for final redaction
-        import fitz
-        final_page = fitz.open()
-        final_page.insert_pdf(
             pymupdf_page.parent,
             from_page=pymupdf_page.number,
             to_page=pymupdf_page.number,
         )
-        final_page = final_page[0]
-    # Handle review page (retain_text = True)
-    if retain_text is True:
-        img_annotation_box["text"] = img_annotation_box.get("text") or ""
-        img_annotation_box["label"] = img_annotation_box.get("label") or "Redaction"
         annot = pymupdf_page.add_redact_annot(full_size_redaction_box)
         annot.set_colors(stroke=out_colour, fill=out_colour, colors=out_colour)
@@ -620,51 +634,27 @@ def redact_single_box(
         )
         annot.update(opacity=0.5, cross_out=False)
-        # If we need both review and final pages, apply final redaction to the copy
-        if return_pdf_end_of_redaction and final_page is not None:
-            # Apply final redaction to the copy
-            redact_bottom_y = pymupdf_y1 + 2
-            redact_top_y = pymupdf_y2 - 2
-            # Calculate the middle y value and set a small height if default values are too close together
-            if (redact_top_y - redact_bottom_y) < 1:
-                middle_y = (pymupdf_y1 + pymupdf_y2) / 2
-                redact_bottom_y = middle_y - 1
-                redact_top_y = middle_y + 1
-            rect_small_pixel_height = Rect(
-                pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y
-            )  # Slightly smaller than outside box
             # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
-            final_page.add_redact_annot(rect_small_pixel_height)
             # Only create a box over the whole rect if we want to delete the text
-            shape = final_page.new_shape()
             shape.draw_rect(pymupdf_rect)
             # Use solid fill for normal redaction
             shape.finish(color=out_colour, fill=out_colour)
             shape.commit()
-            return pymupdf_page, final_page
         else:
             return pymupdf_page
-    else:
-        # Calculate area to actually remove text from the pdf (different from black box size)
-        redact_bottom_y = pymupdf_y1 + 2
-        redact_top_y = pymupdf_y2 - 2
-        # Calculate the middle y value and set a small height if default values are too close together
-        if (redact_top_y - redact_bottom_y) < 1:
-            middle_y = (pymupdf_y1 + pymupdf_y2) / 2
-            redact_bottom_y = middle_y - 1
-            redact_top_y = middle_y + 1
-        rect_small_pixel_height = Rect(
-            pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y
-        )  # Slightly smaller than outside box
         # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
         pymupdf_page.add_redact_annot(rect_small_pixel_height)
@@ -792,27 +782,30 @@ def redact_whole_pymupdf_page(
     """
     # Small border to page that remains white
-    # Define the coordinates for the Rect
     whole_page_x1, whole_page_y1 = 0 + border, 0 + border  # Bottom-left corner
-    # If border is a tiny value, assume that we want relative values
-    if border < 0.1:
-        whole_page_x2, whole_page_y2 = 1 - border, 1 - border  # Top-right corner
-    else:
-        whole_page_x2, whole_page_y2 = (
-            rect_width - border,
-            rect_height - border,
-        )  # Top-right corner
     # Create new image annotation element based on whole page coordinates
     whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
-    # Write whole page annotation to annotation boxes
     whole_page_img_annotation_box = dict()
-    whole_page_img_annotation_box["xmin"] = whole_page_x1
-    whole_page_img_annotation_box["ymin"] = whole_page_y1
-    whole_page_img_annotation_box["xmax"] = whole_page_x2
-    whole_page_img_annotation_box["ymax"] = whole_page_y2
     whole_page_img_annotation_box["color"] = (0, 0, 0)
     whole_page_img_annotation_box["label"] = "Whole page"

     Returns:
         Page or Tuple[Page, Page]: If return_pdf_end_of_redaction is True and retain_text is True,
+                                  returns a tuple of (review_page, applied_redaction_page). Otherwise returns a single Page.
     """
     pymupdf_x1 = pymupdf_rect[0]
     pymupdf_x2 = pymupdf_rect[2]
     pymupdf_y2 = pymupdf_rect[3]
+    # Full size redaction box for covering all the text of a word
+    full_size_redaction_box = Rect(pymupdf_x1-1, pymupdf_y1-1, pymupdf_x2+1, pymupdf_y2+1)
+    # Calculate tiny height redaction box so that it doesn't delete text from adjacent lines
+    redact_bottom_y = pymupdf_y1 + 2
+    redact_top_y = pymupdf_y2 - 2
+    # Calculate the middle y value and set a small height if default values are too close together
+    if (redact_top_y - redact_bottom_y) < 1:
+        middle_y = (pymupdf_y1 + pymupdf_y2) / 2
+        redact_bottom_y = middle_y - 1
+        redact_top_y = middle_y + 1
+    rect_small_pixel_height = Rect(
+        pymupdf_x1 + 2, redact_bottom_y, pymupdf_x2 - 2, redact_top_y
+    )  # Slightly smaller than outside box
     out_colour = define_box_colour(
         custom_colours, img_annotation_box, CUSTOM_BOX_COLOUR
     )
+    img_annotation_box["text"] = img_annotation_box.get("text") or ""
+    img_annotation_box["label"] = img_annotation_box.get("label") or "Redaction"
     # Create a copy of the page for final redaction if needed
+    applied_redaction_page = None
     if return_pdf_end_of_redaction and retain_text:
         # Create a deep copy of the page for final redaction
+        applied_redaction_page = pymupdf.open()
+        applied_redaction_page.insert_pdf(
             pymupdf_page.parent,
             from_page=pymupdf_page.number,
             to_page=pymupdf_page.number,
         )
+        applied_redaction_page = applied_redaction_page[0]
+    # Handle review page first, then deal with final redacted page (retain_text = True)
+    if retain_text is True:
         annot = pymupdf_page.add_redact_annot(full_size_redaction_box)
         annot.set_colors(stroke=out_colour, fill=out_colour, colors=out_colour)
         )
         annot.update(opacity=0.5, cross_out=False)
+        # If we need both review and final pages, and the applied redaction page has been prepared, apply final redaction to the copy
+        if return_pdf_end_of_redaction and applied_redaction_page is not None:
+            # Apply final redaction to the copy
             # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
+            applied_redaction_page.add_redact_annot(rect_small_pixel_height)
             # Only create a box over the whole rect if we want to delete the text
+            shape = applied_redaction_page.new_shape()
             shape.draw_rect(pymupdf_rect)
             # Use solid fill for normal redaction
             shape.finish(color=out_colour, fill=out_colour)
             shape.commit()
+            return pymupdf_page, applied_redaction_page
         else:
             return pymupdf_page
+    # If we don't need to retain the text, we only have one page which is the applied redaction page, so just apply the redaction to the page
+    else:
         # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
         pymupdf_page.add_redact_annot(rect_small_pixel_height)
     """
     # Small border to page that remains white
+    # Define the coordinates for the Rect (PDF coordinates for actual redaction)
     whole_page_x1, whole_page_y1 = 0 + border, 0 + border  # Bottom-left corner
+    whole_page_x2, whole_page_y2 = (
+        rect_width - border,
+        rect_height - border,
+    )  # Top-right corner
     # Create new image annotation element based on whole page coordinates
     whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
+    # Calculate relative coordinates for the annotation box (0-1 range)
+    # This ensures the coordinates are already in relative format for output files
+    relative_border = border / min(rect_width, rect_height)  # Scale border proportionally
+    relative_x1 = relative_border
+    relative_y1 = relative_border
+    relative_x2 = 1 - relative_border
+    relative_y2 = 1 - relative_border
+    # Write whole page annotation to annotation boxes using relative coordinates
     whole_page_img_annotation_box = dict()
+    whole_page_img_annotation_box["xmin"] = relative_x1
+    whole_page_img_annotation_box["ymin"] = relative_y1
+    whole_page_img_annotation_box["xmax"] = relative_x2
+    whole_page_img_annotation_box["ymax"] = relative_y2
     whole_page_img_annotation_box["color"] = (0, 0, 0)
     whole_page_img_annotation_box["label"] = "Whole page"

tools/file_redaction.py CHANGED Viewed

@@ -404,7 +404,7 @@ def choose_and_run_redactor(
     if prepared_pdf_file_paths:
         review_out_file_paths = [prepared_pdf_file_paths[0]]
     else:
-        review_out_file_paths = []
     # Choose the correct file to prepare
     if isinstance(file_paths, str):
@@ -1095,111 +1095,111 @@ def choose_and_run_redactor(
                     else:
                         # Check if we have dual PDF documents to save
-                        final_pymupdf_doc = None
                         if RETURN_PDF_FOR_REVIEW and RETURN_REDACTED_PDF:
                             if (
-                                hasattr(redact_image_pdf, "_final_pages")
-                                and redact_image_pdf._final_pages
                             ):
                                 # Create final document by copying the original document and replacing specific pages
-                                final_pymupdf_doc = pymupdf.open()
-                                final_pymupdf_doc.insert_pdf(pymupdf_doc)
                                 # Create a mapping of original page numbers to final pages
-                                final_pages_map = {}
-                                for final_page_data in redact_image_pdf._final_pages:
-                                    if isinstance(final_page_data, tuple):
-                                        final_page, original_page_number = (
-                                            final_page_data
                                         )
-                                        final_pages_map[original_page_number] = (
-                                            final_page
                                         )
                                     else:
-                                        final_page = final_page_data
-                                        final_pages_map[0] = (
-                                            final_page  # Default to page 0 if no original number
                                         )
                                 # Replace pages in the final document with their final versions
                                 for (
                                     original_page_number,
-                                    final_page,
-                                ) in final_pages_map.items():
                                     if (
                                         original_page_number
-                                        < final_pymupdf_doc.page_count
                                     ):
                                         # Remove the original page and insert the final page
-                                        final_pymupdf_doc.delete_page(
                                             original_page_number
                                         )
-                                        final_pymupdf_doc.insert_pdf(
-                                            final_page.parent,
-                                            from_page=final_page.number,
-                                            to_page=final_page.number,
                                             start_at=original_page_number,
                                         )
-                                        # Apply redactions to the final page
-                                        final_pymupdf_doc[
                                             original_page_number
-                                        ].apply_redactions(images=2, graphics=0, text=0)
                                 # Clear the stored final pages
-                                delattr(redact_image_pdf, "_final_pages")
                             elif (
-                                hasattr(redact_text_pdf, "_final_pages")
-                                and redact_text_pdf._final_pages
                             ):
                                 # Create final document by copying the original document and replacing specific pages
-                                final_pymupdf_doc = pymupdf.open()
-                                final_pymupdf_doc.insert_pdf(pymupdf_doc)
                                 # Create a mapping of original page numbers to final pages
-                                final_pages_map = {}
-                                for final_page_data in redact_text_pdf._final_pages:
-                                    if isinstance(final_page_data, tuple):
-                                        final_page, original_page_number = (
-                                            final_page_data
                                         )
-                                        final_pages_map[original_page_number] = (
-                                            final_page
                                         )
                                     else:
-                                        final_page = final_page_data
-                                        final_pages_map[0] = (
-                                            final_page  # Default to page 0 if no original number
                                         )
                                 # Replace pages in the final document with their final versions
                                 for (
                                     original_page_number,
-                                    final_page,
-                                ) in final_pages_map.items():
                                     if (
                                         original_page_number
-                                        < final_pymupdf_doc.page_count
                                     ):
                                         # Remove the original page and insert the final page
-                                        final_pymupdf_doc.delete_page(
                                             original_page_number
                                         )
-                                        final_pymupdf_doc.insert_pdf(
-                                            final_page.parent,
-                                            from_page=final_page.number,
-                                            to_page=final_page.number,
                                             start_at=original_page_number,
                                         )
-                                        # Apply redactions to the final page
-                                        final_pymupdf_doc[
                                             original_page_number
-                                        ].apply_redactions(images=2, graphics=0, text=0)
                                 # Clear the stored final pages
-                                delattr(redact_text_pdf, "_final_pages")
                         # Save final redacted PDF if we have dual outputs or if RETURN_PDF_FOR_REVIEW is False
-                        if RETURN_PDF_FOR_REVIEW is False or final_pymupdf_doc:
                             out_redacted_pdf_file_path = (
                                 output_folder
                                 + pdf_file_name_without_ext
@@ -1211,7 +1211,7 @@ def choose_and_run_redactor(
                             # Use final document if available, otherwise use main document
                             doc_to_save = (
-                                final_pymupdf_doc if final_pymupdf_doc else pymupdf_doc
                             )
                             if out_redacted_pdf_file_path:
@@ -2104,7 +2104,7 @@ def redact_page_with_pymupdf(
         Tuple[Page, dict] or Tuple[Tuple[Page, Page], dict]: A tuple containing:
             - page (Page or Tuple[Page, Page]): The PyMuPDF page object(s) with redactions applied.
                                                If return_pdf_end_of_redaction is True and return_pdf_for_review is True,
-                                               returns a tuple of (review_page, final_page).
             - out_annotation_boxes (dict): A dictionary containing the processed annotation boxes
                                            for the page, including the image path.
     """
@@ -2271,14 +2271,14 @@ def redact_page_with_pymupdf(
         # Handle dual page objects if returned
         if isinstance(redact_result, tuple):
-            page, final_page = redact_result
             # Store the final page for later use
-            if not hasattr(redact_page_with_pymupdf, "_final_page"):
-                redact_page_with_pymupdf._final_page = final_page
             else:
                 # If we already have a final page, we need to handle multiple pages
                 # For now, we'll use the last final page
-                redact_page_with_pymupdf._final_page = final_page
     # If whole page is to be redacted, do that here
     if redact_whole_page is True:
@@ -2286,72 +2286,71 @@ def redact_page_with_pymupdf(
         whole_page_img_annotation_box = redact_whole_pymupdf_page(
             rect_height, rect_width, page, custom_colours, border=5
         )
         all_image_annotation_boxes.append(whole_page_img_annotation_box)
         # Handle dual page objects for whole page redaction if needed
         if return_pdf_end_of_redaction and return_pdf_for_review:
             # Create a copy of the page for final redaction using the same approach as redact_single_box
-            final_page_doc = pymupdf.open()
-            final_page_doc.insert_pdf(
                 page.parent,
                 from_page=page.number,
                 to_page=page.number,
             )
-            final_page = final_page_doc[0]
             # Apply the whole page redaction to the final page as well
             redact_whole_pymupdf_page(
-                rect_height, rect_width, final_page, custom_colours, border=5
             )
             # Store the final page with its original page number for later use
-            if not hasattr(redact_page_with_pymupdf, "_final_page"):
-                redact_page_with_pymupdf._final_page = (final_page, page.number)
             else:
                 # If we already have a final page, we need to handle multiple pages
                 # For now, we'll use the last final page
-                redact_page_with_pymupdf._final_page = (final_page, page.number)
     out_annotation_boxes = {
         "image": image_path,  # Image.open(image_path), #image_path,
         "boxes": all_image_annotation_boxes,
     }
     if return_pdf_for_review is False:
-        # Remove text and all images
-        # page.apply_redactions(images=2, graphics=2)
-        page.apply_redactions(images=2, graphics=0, text=0)
-    # else:
-    #     # Just apply the box, don't remove images or text
-    #     page.apply_redactions(images=0, graphics=0, text=1)
     set_cropbox_safely(page, original_cropbox)
-    # page.set_cropbox(original_cropbox)
-    # Set CropBox to original size
     page.clean_contents()
     # Handle dual page objects if we have a final page
     if (
         return_pdf_end_of_redaction
         and return_pdf_for_review
-        and hasattr(redact_page_with_pymupdf, "_final_page")
     ):
-        final_page_data = redact_page_with_pymupdf._final_page
         # Handle both tuple format (new) and single page format (backward compatibility)
-        if isinstance(final_page_data, tuple):
-            final_page, original_page_number = final_page_data
         else:
-            final_page = final_page_data
-        # Apply redactions to final page
-        if return_pdf_for_review is False:
-            final_page.apply_redactions(images=2, graphics=0, text=0)
-        set_cropbox_safely(final_page, original_cropbox)
-        final_page.clean_contents()
         # Clear the stored final page
-        delattr(redact_page_with_pymupdf, "_final_page")
-        return (page, final_page), out_annotation_boxes
     else:
         return page, out_annotation_boxes
@@ -3116,14 +3115,14 @@ def redact_image_pdf(
                     # Handle dual page objects if returned
                     if isinstance(redact_result[0], tuple):
-                        (pymupdf_page, pymupdf_final_page), page_image_annotations = (
                             redact_result
                         )
                         # Store the final page with its original page number for later use
-                        if not hasattr(redact_image_pdf, "_final_pages"):
-                            redact_image_pdf._final_pages = []
-                        redact_image_pdf._final_pages.append(
-                            (pymupdf_final_page, page_no)
                         )
                     else:
                         pymupdf_page, page_image_annotations = redact_result
@@ -4178,14 +4177,14 @@ def redact_text_pdf(
                     # Handle dual page objects if returned
                     if isinstance(redact_result[0], tuple):
-                        (pymupdf_page, pymupdf_final_page), page_image_annotations = (
                             redact_result
                         )
                         # Store the final page with its original page number for later use
-                        if not hasattr(redact_text_pdf, "_final_pages"):
-                            redact_text_pdf._final_pages = []
-                        redact_text_pdf._final_pages.append(
-                            (pymupdf_final_page, page_no)
                         )
                     else:
                         pymupdf_page, page_image_annotations = redact_result
@@ -4205,7 +4204,6 @@ def redact_text_pdf(
                 # Else, user chose not to run redaction
                 else:
                     pass
-                    # print("Not redacting page:", page_no)
                 # Join extracted text outputs for all lines together
                 if not page_text_ocr_outputs.empty:

     if prepared_pdf_file_paths:
         review_out_file_paths = [prepared_pdf_file_paths[0]]
     else:
+        review_out_file_paths = list()
     # Choose the correct file to prepare
     if isinstance(file_paths, str):
                     else:
                         # Check if we have dual PDF documents to save
+                        applied_redaction_pymupdf_doc = None
                         if RETURN_PDF_FOR_REVIEW and RETURN_REDACTED_PDF:
                             if (
+                                hasattr(redact_image_pdf, "_applied_redaction_pages")
+                                and redact_image_pdf._applied_redaction_pages
                             ):
                                 # Create final document by copying the original document and replacing specific pages
+                                applied_redaction_pymupdf_doc = pymupdf.open()
+                                applied_redaction_pymupdf_doc.insert_pdf(pymupdf_doc)
                                 # Create a mapping of original page numbers to final pages
+                                applied_redaction_pages_map = {}
+                                for applied_redaction_page_data in redact_image_pdf._applied_redaction_pages:
+                                    if isinstance(applied_redaction_page_data, tuple):
+                                        applied_redaction_page, original_page_number = (
+                                            applied_redaction_page_data
                                         )
+                                        applied_redaction_pages_map[original_page_number] = (
+                                            applied_redaction_page
                                         )
                                     else:
+                                        applied_redaction_page = applied_redaction_page_data
+                                        applied_redaction_pages_map[0] = (
+                                            applied_redaction_page  # Default to page 0 if no original number
                                         )
                                 # Replace pages in the final document with their final versions
                                 for (
                                     original_page_number,
+                                    applied_redaction_page,
+                                ) in applied_redaction_pages_map.items():
                                     if (
                                         original_page_number
+                                        < applied_redaction_pymupdf_doc.page_count
                                     ):
                                         # Remove the original page and insert the final page
+                                        applied_redaction_pymupdf_doc.delete_page(
                                             original_page_number
                                         )
+                                        applied_redaction_pymupdf_doc.insert_pdf(
+                                            applied_redaction_page.parent,
+                                            from_page=applied_redaction_page.number,
+                                            to_page=applied_redaction_page.number,
                                             start_at=original_page_number,
                                         )
+                                        # Remove text. Graphic text is effectively removed by the overlapping rectangle shape that becomes an embedded part of the document.
+                                        applied_redaction_pymupdf_doc[
                                             original_page_number
+                                        ].apply_redactions(images=0, graphics=0, text=0)
                                 # Clear the stored final pages
+                                delattr(redact_image_pdf, "_applied_redaction_pages")
                             elif (
+                                hasattr(redact_text_pdf, "_applied_redaction_pages")
+                                and redact_text_pdf._applied_redaction_pages
                             ):
                                 # Create final document by copying the original document and replacing specific pages
+                                applied_redaction_pymupdf_doc = pymupdf.open()
+                                applied_redaction_pymupdf_doc.insert_pdf(pymupdf_doc)
                                 # Create a mapping of original page numbers to final pages
+                                applied_redaction_pages_map = {}
+                                for applied_redaction_page_data in redact_text_pdf._applied_redaction_pages:
+                                    if isinstance(applied_redaction_page_data, tuple):
+                                        applied_redaction_page, original_page_number = (
+                                            applied_redaction_page_data
                                         )
+                                        applied_redaction_pages_map[original_page_number] = (
+                                            applied_redaction_page
                                         )
                                     else:
+                                        applied_redaction_page = applied_redaction_page_data
+                                        applied_redaction_pages_map[0] = (
+                                            applied_redaction_page  # Default to page 0 if no original number
                                         )
                                 # Replace pages in the final document with their final versions
                                 for (
                                     original_page_number,
+                                    applied_redaction_page,
+                                ) in applied_redaction_pages_map.items():
                                     if (
                                         original_page_number
+                                        < applied_redaction_pymupdf_doc.page_count
                                     ):
                                         # Remove the original page and insert the final page
+                                        applied_redaction_pymupdf_doc.delete_page(
                                             original_page_number
                                         )
+                                        applied_redaction_pymupdf_doc.insert_pdf(
+                                            applied_redaction_page.parent,
+                                            from_page=applied_redaction_page.number,
+                                            to_page=applied_redaction_page.number,
                                             start_at=original_page_number,
                                         )
+                                        # Remove text. Graphic text is effectively removed by the overlapping rectangle shape that becomes an embedded part of the document.
+                                        applied_redaction_pymupdf_doc[
                                             original_page_number
+                                        ].apply_redactions(images=0, graphics=0, text=0)
                                 # Clear the stored final pages
+                                delattr(redact_text_pdf, "_applied_redaction_pages")
                         # Save final redacted PDF if we have dual outputs or if RETURN_PDF_FOR_REVIEW is False
+                        if RETURN_PDF_FOR_REVIEW is False or applied_redaction_pymupdf_doc:
                             out_redacted_pdf_file_path = (
                                 output_folder
                                 + pdf_file_name_without_ext
                             # Use final document if available, otherwise use main document
                             doc_to_save = (
+                                applied_redaction_pymupdf_doc if applied_redaction_pymupdf_doc else pymupdf_doc
                             )
                             if out_redacted_pdf_file_path:
         Tuple[Page, dict] or Tuple[Tuple[Page, Page], dict]: A tuple containing:
             - page (Page or Tuple[Page, Page]): The PyMuPDF page object(s) with redactions applied.
                                                If return_pdf_end_of_redaction is True and return_pdf_for_review is True,
+                                               returns a tuple of (review_page, applied_redaction_page).
             - out_annotation_boxes (dict): A dictionary containing the processed annotation boxes
                                            for the page, including the image path.
     """
         # Handle dual page objects if returned
         if isinstance(redact_result, tuple):
+            page, applied_redaction_page = redact_result
             # Store the final page for later use
+            if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
+                redact_page_with_pymupdf._applied_redaction_page = applied_redaction_page
             else:
                 # If we already have a final page, we need to handle multiple pages
                 # For now, we'll use the last final page
+                redact_page_with_pymupdf._applied_redaction_page = applied_redaction_page
     # If whole page is to be redacted, do that here
     if redact_whole_page is True:
         whole_page_img_annotation_box = redact_whole_pymupdf_page(
             rect_height, rect_width, page, custom_colours, border=5
         )
+        # Ensure the whole page annotation box has a unique ID
+        whole_page_img_annotation_box = fill_missing_box_ids(whole_page_img_annotation_box)
         all_image_annotation_boxes.append(whole_page_img_annotation_box)
         # Handle dual page objects for whole page redaction if needed
         if return_pdf_end_of_redaction and return_pdf_for_review:
             # Create a copy of the page for final redaction using the same approach as redact_single_box
+            applied_redaction_doc = pymupdf.open()
+            applied_redaction_doc.insert_pdf(
                 page.parent,
                 from_page=page.number,
                 to_page=page.number,
             )
+            applied_redaction_page = applied_redaction_doc[0]
             # Apply the whole page redaction to the final page as well
             redact_whole_pymupdf_page(
+                rect_height, rect_width, applied_redaction_page, custom_colours, border=5
             )
             # Store the final page with its original page number for later use
+            if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
+                redact_page_with_pymupdf._applied_redaction_page = (applied_redaction_page, page.number)
             else:
                 # If we already have a final page, we need to handle multiple pages
                 # For now, we'll use the last final page
+                redact_page_with_pymupdf._applied_redaction_page = (applied_redaction_page, page.number)
     out_annotation_boxes = {
         "image": image_path,  # Image.open(image_path), #image_path,
         "boxes": all_image_annotation_boxes,
     }
+    # If we are not returning the review page, can directly remove text and all images
     if return_pdf_for_review is False:
+        # Remove text. Graphic text is effectively removed by the overlapping rectangle shape that becomes an embedded part of the document.
+        page.apply_redactions(images=0, graphics=0, text=0)
     set_cropbox_safely(page, original_cropbox)
     page.clean_contents()
     # Handle dual page objects if we have a final page
     if (
         return_pdf_end_of_redaction
         and return_pdf_for_review
+        and hasattr(redact_page_with_pymupdf, "_applied_redaction_page")
     ):
+        applied_redaction_page_data = redact_page_with_pymupdf._applied_redaction_page
         # Handle both tuple format (new) and single page format (backward compatibility)
+        if isinstance(applied_redaction_page_data, tuple):
+            applied_redaction_page, original_page_number = applied_redaction_page_data
         else:
+            applied_redaction_page = applied_redaction_page_data
+        # Apply redactions to applied redaction page only
+        # Remove text. Graphic text is effectively removed by the overlapping rectangle shape that becomes an embedded part of the document.
+        applied_redaction_page.apply_redactions(images=0, graphics=0, text=0)
+        set_cropbox_safely(applied_redaction_page, original_cropbox)
+        applied_redaction_page.clean_contents()
         # Clear the stored final page
+        delattr(redact_page_with_pymupdf, "_applied_redaction_page")
+        return (page, applied_redaction_page), out_annotation_boxes
     else:
         return page, out_annotation_boxes
                     # Handle dual page objects if returned
                     if isinstance(redact_result[0], tuple):
+                        (pymupdf_page, pymupdf_applied_redaction_page), page_image_annotations = (
                             redact_result
                         )
                         # Store the final page with its original page number for later use
+                        if not hasattr(redact_image_pdf, "_applied_redaction_pages"):
+                            redact_image_pdf._applied_redaction_pages = list()
+                        redact_image_pdf._applied_redaction_pages.append(
+                            (pymupdf_applied_redaction_page, page_no)
                         )
                     else:
                         pymupdf_page, page_image_annotations = redact_result
                     # Handle dual page objects if returned
                     if isinstance(redact_result[0], tuple):
+                        (pymupdf_page, pymupdf_applied_redaction_page), page_image_annotations = (
                             redact_result
                         )
                         # Store the final page with its original page number for later use
+                        if not hasattr(redact_text_pdf, "_applied_redaction_pages"):
+                            redact_text_pdf._applied_redaction_pages = list()
+                        redact_text_pdf._applied_redaction_pages.append(
+                            (pymupdf_applied_redaction_page, page_no)
                         )
                     else:
                         pymupdf_page, page_image_annotations = redact_result
                 # Else, user chose not to run redaction
                 else:
                     pass
                 # Join extracted text outputs for all lines together
                 if not page_text_ocr_outputs.empty:

tools/find_duplicate_pages.py CHANGED Viewed

@@ -462,9 +462,6 @@ def combine_ocr_dataframes(
     output_files = list()
     if output_folder and output_filename:
         # Validate path safety before creating directories and files
-        print(
-            f"DEBUG: Validating output_folder='{output_folder}' against OUTPUT_FOLDER='{OUTPUT_FOLDER}'"
-        )
         if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
             raise ValueError(f"Unsafe output folder path: {output_folder}")
         if not validate_path_safety(output_filename):
@@ -659,9 +656,6 @@ def save_results_and_redaction_lists(
         list: A list of paths to all generated files.
     """
     # Validate the output_folder path for security
-    print(
-        f"DEBUG: Validating output_folder='{output_folder}' against OUTPUT_FOLDER='{OUTPUT_FOLDER}'"
-    )
     if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
         raise ValueError(f"Invalid or unsafe output folder path: {output_folder}")
@@ -671,9 +665,6 @@ def save_results_and_redaction_lists(
     try:
         output_folder_path = Path(output_folder).resolve()
         # Validate that the resolved path is within the trusted OUTPUT_FOLDER using robust containment check
-        print(
-            f"DEBUG: Validating resolved path='{output_folder_path}' against OUTPUT_FOLDER='{OUTPUT_FOLDER}'"
-        )
         if not validate_folder_containment(str(output_folder_path), OUTPUT_FOLDER):
             raise ValueError(
                 f"Output folder path {output_folder} is outside the trusted directory {OUTPUT_FOLDER}"
@@ -1092,7 +1083,7 @@ def run_duplicate_analysis(
     progress(0, desc="Combining input files...")
     df_combined, _, full_out_ocr_df = combine_ocr_output_text(
-        files, combine_pages=combine_pages
     )
     if df_combined.empty:

     output_files = list()
     if output_folder and output_filename:
         # Validate path safety before creating directories and files
         if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
             raise ValueError(f"Unsafe output folder path: {output_folder}")
         if not validate_path_safety(output_filename):
         list: A list of paths to all generated files.
     """
     # Validate the output_folder path for security
     if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
         raise ValueError(f"Invalid or unsafe output folder path: {output_folder}")
     try:
         output_folder_path = Path(output_folder).resolve()
         # Validate that the resolved path is within the trusted OUTPUT_FOLDER using robust containment check
         if not validate_folder_containment(str(output_folder_path), OUTPUT_FOLDER):
             raise ValueError(
                 f"Output folder path {output_folder} is outside the trusted directory {OUTPUT_FOLDER}"
     progress(0, desc="Combining input files...")
     df_combined, _, full_out_ocr_df = combine_ocr_output_text(
+        files, combine_pages=combine_pages, output_folder=output_folder
     )
     if df_combined.empty:

tools/secure_path_utils.py CHANGED Viewed

@@ -311,14 +311,6 @@ def validate_folder_containment(
         path_str = str(normalized_path).lower()
         base_str = str(normalized_base).lower()
-        print(
-            f"DEBUG: validate_folder_containment called with path='{path}' base_path='{base_path}'"
-        )
-        print(
-            f"DEBUG: normalized_path='{normalized_path}' normalized_base='{normalized_base}'"
-        )
-        print(f"DEBUG: path_str='{path_str}' base_str='{base_str}'")
         # Check if this is a test scenario
         is_test_path = any(
             test_pattern in path_str

         path_str = str(normalized_path).lower()
         base_str = str(normalized_base).lower()
         # Check if this is a test scenario
         is_test_path = any(
             test_pattern in path_str