Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Oct 2

Commit

150a8d9

1 Parent(s): 5086da0

Fixed linting issues

Browse files

Files changed (3) hide show

app.py +60 -45
tools/file_conversion.py +9 -5
tools/file_redaction.py +55 -24

app.py CHANGED Viewed

@@ -278,44 +278,44 @@ in_redact_comprehend_entities = gr.Dropdown(
 )
 in_deny_list = gr.File(
-                        label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.",
-                        file_count="multiple",
-                        height=FILE_INPUT_HEIGHT,
-                    )
 in_deny_list_state = gr.Dataframe(
-                        value=pd.DataFrame(),
-                        headers=["deny_list"],
-                        col_count=(1, "fixed"),
-                        row_count=(0, "dynamic"),
-                        label="Deny list",
-                        visible=True,
-                        type="pandas",
-                        interactive=True,
-                        show_fullscreen_button=True,
-                        show_copy_button=True,
-                        wrap=True,
-                    )
 in_fully_redacted_list = gr.File(
-                        label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.",
-                        file_count="multiple",
-                        height=FILE_INPUT_HEIGHT,
-                    )
 in_fully_redacted_list_state = gr.Dataframe(
-                        value=pd.DataFrame(),
-                        headers=["fully_redacted_pages_list"],
-                        col_count=(1, "fixed"),
-                        row_count=(0, "dynamic"),
-                        label="Fully redacted pages",
-                        visible=True,
-                        type="pandas",
-                        interactive=True,
-                        show_fullscreen_button=True,
-                        show_copy_button=True,
-                        wrap=True,
-                    )
 ## Deduplication examples
@@ -1101,22 +1101,37 @@ with blocks:
                     )
                     example_labels.append(
                         "PDF redaction with AWS services and signature detection"
-                    )
             # Add new example for custom deny list and whole page redaction
-            if os.path.exists(example_files[3]) and os.path.exists(example_files[4]) and os.path.exists(example_files[5]):
                 available_examples.append(
                     [
                         [example_files[3]],
                         "Local OCR model - PDFs without selectable text",
                         "Local",
                         [],
-                        ["CUSTOM"],  # Use CUSTOM entity to enable deny list functionality
                         CHOSEN_COMPREHEND_ENTITIES,
                         [example_files[3]],
                         example_files[3],
                         [example_files[4]],
-                        pd.DataFrame(data={"deny_list": ["Sister", "Sister City", "Sister Cities", "Friendship City"]}),
                         [example_files[5]],
                         pd.DataFrame(data={"fully_redacted_pages_list": [2, 5]}),
                     ]
@@ -1137,7 +1152,7 @@ with blocks:
                     in_redact_comprehend_entities,
                     prepared_pdf_state,
                     doc_full_file_name_textbox,
-                    in_deny_list,
                     in_deny_list_state,
                     in_fully_redacted_list,
                     in_fully_redacted_list_state,
@@ -2153,7 +2168,7 @@ with blocks:
             value="## Please give feedback", visible=False
         )
         data_feedback_radio = gr.Radio(
-            label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
             choices=["The results were good", "The results were not good"],
             visible=False,
             show_label=True,
@@ -2180,11 +2195,11 @@ with blocks:
                     in_allow_list_text = gr.Textbox(
                         label="Custom allow list load status"
                     )
-                with gr.Column():
-                    in_deny_list.render() # Defined at beginning of file
                     in_deny_list_text = gr.Textbox(label="Custom deny list load status")
                 with gr.Column():
-                    in_fully_redacted_list.render() # Defined at beginning of file
                     in_fully_redacted_list_text = gr.Textbox(
                         label="Fully redacted page list load status"
                     )
@@ -2206,10 +2221,10 @@ with blocks:
                         show_copy_button=True,
                         wrap=True,
                     )
-                    in_deny_list_state.render() # Defined at beginning of file
-                    in_fully_redacted_list_state.render() # Defined at beginning of file
                 with gr.Row():
                     with gr.Column(scale=2):
                         markdown_placeholder = gr.Markdown("")

 )
 in_deny_list = gr.File(
+    label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.",
+    file_count="multiple",
+    height=FILE_INPUT_HEIGHT,
+)
 in_deny_list_state = gr.Dataframe(
+    value=pd.DataFrame(),
+    headers=["deny_list"],
+    col_count=(1, "fixed"),
+    row_count=(0, "dynamic"),
+    label="Deny list",
+    visible=True,
+    type="pandas",
+    interactive=True,
+    show_fullscreen_button=True,
+    show_copy_button=True,
+    wrap=True,
+)
 in_fully_redacted_list = gr.File(
+    label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.",
+    file_count="multiple",
+    height=FILE_INPUT_HEIGHT,
+)
 in_fully_redacted_list_state = gr.Dataframe(
+    value=pd.DataFrame(),
+    headers=["fully_redacted_pages_list"],
+    col_count=(1, "fixed"),
+    row_count=(0, "dynamic"),
+    label="Fully redacted pages",
+    visible=True,
+    type="pandas",
+    interactive=True,
+    show_fullscreen_button=True,
+    show_copy_button=True,
+    wrap=True,
+)
 ## Deduplication examples
                     )
                     example_labels.append(
                         "PDF redaction with AWS services and signature detection"
+                    )
             # Add new example for custom deny list and whole page redaction
+            if (
+                os.path.exists(example_files[3])
+                and os.path.exists(example_files[4])
+                and os.path.exists(example_files[5])
+            ):
                 available_examples.append(
                     [
                         [example_files[3]],
                         "Local OCR model - PDFs without selectable text",
                         "Local",
                         [],
+                        [
+                            "CUSTOM"
+                        ],  # Use CUSTOM entity to enable deny list functionality
                         CHOSEN_COMPREHEND_ENTITIES,
                         [example_files[3]],
                         example_files[3],
                         [example_files[4]],
+                        pd.DataFrame(
+                            data={
+                                "deny_list": [
+                                    "Sister",
+                                    "Sister City",
+                                    "Sister Cities",
+                                    "Friendship City",
+                                ]
+                            }
+                        ),
                         [example_files[5]],
                         pd.DataFrame(data={"fully_redacted_pages_list": [2, 5]}),
                     ]
                     in_redact_comprehend_entities,
                     prepared_pdf_state,
                     doc_full_file_name_textbox,
+                    in_deny_list,
                     in_deny_list_state,
                     in_fully_redacted_list,
                     in_fully_redacted_list_state,
             value="## Please give feedback", visible=False
         )
         data_feedback_radio = gr.Radio(
+            label="Please give some feedback about the results of the redaction.",
             choices=["The results were good", "The results were not good"],
             visible=False,
             show_label=True,
                     in_allow_list_text = gr.Textbox(
                         label="Custom allow list load status"
                     )
+                with gr.Column():
+                    in_deny_list.render()  # Defined at beginning of file
                     in_deny_list_text = gr.Textbox(label="Custom deny list load status")
                 with gr.Column():
+                    in_fully_redacted_list.render()  # Defined at beginning of file
                     in_fully_redacted_list_text = gr.Textbox(
                         label="Fully redacted page list load status"
                     )
                         show_copy_button=True,
                         wrap=True,
                     )
+                    in_deny_list_state.render()  # Defined at beginning of file
+                    in_fully_redacted_list_state.render()  # Defined at beginning of file
                 with gr.Row():
                     with gr.Column(scale=2):
                         markdown_placeholder = gr.Markdown("")

tools/file_conversion.py CHANGED Viewed

@@ -583,7 +583,9 @@ def redact_single_box(
     pymupdf_y2 = pymupdf_rect[3]
     # Full size redaction box for covering all the text of a word
-    full_size_redaction_box = Rect(pymupdf_x1-1, pymupdf_y1-1, pymupdf_x2+1, pymupdf_y2+1)
     # Calculate tiny height redaction box so that it doesn't delete text from adjacent lines
     redact_bottom_y = pymupdf_y1 + 2
@@ -620,7 +622,7 @@ def redact_single_box(
         applied_redaction_page = applied_redaction_page[0]
     # Handle review page first, then deal with final redacted page (retain_text = True)
-    if retain_text is True:
         annot = pymupdf_page.add_redact_annot(full_size_redaction_box)
         annot.set_colors(stroke=out_colour, fill=out_colour, colors=out_colour)
@@ -636,7 +638,7 @@ def redact_single_box(
         # If we need both review and final pages, and the applied redaction page has been prepared, apply final redaction to the copy
         if return_pdf_end_of_redaction and applied_redaction_page is not None:
-            # Apply final redaction to the copy
             # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
             applied_redaction_page.add_redact_annot(rect_small_pixel_height)
@@ -654,7 +656,7 @@ def redact_single_box(
             return pymupdf_page
     # If we don't need to retain the text, we only have one page which is the applied redaction page, so just apply the redaction to the page
-    else:
         # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
         pymupdf_page.add_redact_annot(rect_small_pixel_height)
@@ -794,7 +796,9 @@ def redact_whole_pymupdf_page(
     # Calculate relative coordinates for the annotation box (0-1 range)
     # This ensures the coordinates are already in relative format for output files
-    relative_border = border / min(rect_width, rect_height)  # Scale border proportionally
     relative_x1 = relative_border
     relative_y1 = relative_border
     relative_x2 = 1 - relative_border

     pymupdf_y2 = pymupdf_rect[3]
     # Full size redaction box for covering all the text of a word
+    full_size_redaction_box = Rect(
+        pymupdf_x1 - 1, pymupdf_y1 - 1, pymupdf_x2 + 1, pymupdf_y2 + 1
+    )
     # Calculate tiny height redaction box so that it doesn't delete text from adjacent lines
     redact_bottom_y = pymupdf_y1 + 2
         applied_redaction_page = applied_redaction_page[0]
     # Handle review page first, then deal with final redacted page (retain_text = True)
+    if retain_text is True:
         annot = pymupdf_page.add_redact_annot(full_size_redaction_box)
         annot.set_colors(stroke=out_colour, fill=out_colour, colors=out_colour)
         # If we need both review and final pages, and the applied redaction page has been prepared, apply final redaction to the copy
         if return_pdf_end_of_redaction and applied_redaction_page is not None:
+            # Apply final redaction to the copy
             # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
             applied_redaction_page.add_redact_annot(rect_small_pixel_height)
             return pymupdf_page
     # If we don't need to retain the text, we only have one page which is the applied redaction page, so just apply the redaction to the page
+    else:
         # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
         pymupdf_page.add_redact_annot(rect_small_pixel_height)
     # Calculate relative coordinates for the annotation box (0-1 range)
     # This ensures the coordinates are already in relative format for output files
+    relative_border = border / min(
+        rect_width, rect_height
+    )  # Scale border proportionally
     relative_x1 = relative_border
     relative_y1 = relative_border
     relative_x2 = 1 - relative_border

tools/file_redaction.py CHANGED Viewed

@@ -1109,16 +1109,20 @@ def choose_and_run_redactor(
                                 # Create a mapping of original page numbers to final pages
                                 applied_redaction_pages_map = {}
-                                for applied_redaction_page_data in redact_image_pdf._applied_redaction_pages:
                                     if isinstance(applied_redaction_page_data, tuple):
                                         applied_redaction_page, original_page_number = (
                                             applied_redaction_page_data
                                         )
-                                        applied_redaction_pages_map[original_page_number] = (
-                                            applied_redaction_page
-                                        )
                                     else:
-                                        applied_redaction_page = applied_redaction_page_data
                                         applied_redaction_pages_map[0] = (
                                             applied_redaction_page  # Default to page 0 if no original number
                                         )
@@ -1158,16 +1162,20 @@ def choose_and_run_redactor(
                                 # Create a mapping of original page numbers to final pages
                                 applied_redaction_pages_map = {}
-                                for applied_redaction_page_data in redact_text_pdf._applied_redaction_pages:
                                     if isinstance(applied_redaction_page_data, tuple):
                                         applied_redaction_page, original_page_number = (
                                             applied_redaction_page_data
                                         )
-                                        applied_redaction_pages_map[original_page_number] = (
-                                            applied_redaction_page
-                                        )
                                     else:
-                                        applied_redaction_page = applied_redaction_page_data
                                         applied_redaction_pages_map[0] = (
                                             applied_redaction_page  # Default to page 0 if no original number
                                         )
@@ -1199,7 +1207,10 @@ def choose_and_run_redactor(
                                 delattr(redact_text_pdf, "_applied_redaction_pages")
                         # Save final redacted PDF if we have dual outputs or if RETURN_PDF_FOR_REVIEW is False
-                        if RETURN_PDF_FOR_REVIEW is False or applied_redaction_pymupdf_doc:
                             out_redacted_pdf_file_path = (
                                 output_folder
                                 + pdf_file_name_without_ext
@@ -1211,7 +1222,9 @@ def choose_and_run_redactor(
                             # Use final document if available, otherwise use main document
                             doc_to_save = (
-                                applied_redaction_pymupdf_doc if applied_redaction_pymupdf_doc else pymupdf_doc
                             )
                             if out_redacted_pdf_file_path:
@@ -2274,11 +2287,15 @@ def redact_page_with_pymupdf(
             page, applied_redaction_page = redact_result
             # Store the final page for later use
             if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
-                redact_page_with_pymupdf._applied_redaction_page = applied_redaction_page
             else:
                 # If we already have a final page, we need to handle multiple pages
                 # For now, we'll use the last final page
-                redact_page_with_pymupdf._applied_redaction_page = applied_redaction_page
     # If whole page is to be redacted, do that here
     if redact_whole_page is True:
@@ -2287,7 +2304,9 @@ def redact_page_with_pymupdf(
             rect_height, rect_width, page, custom_colours, border=5
         )
         # Ensure the whole page annotation box has a unique ID
-        whole_page_img_annotation_box = fill_missing_box_ids(whole_page_img_annotation_box)
         all_image_annotation_boxes.append(whole_page_img_annotation_box)
         # Handle dual page objects for whole page redaction if needed
@@ -2304,16 +2323,26 @@ def redact_page_with_pymupdf(
             # Apply the whole page redaction to the final page as well
             redact_whole_pymupdf_page(
-                rect_height, rect_width, applied_redaction_page, custom_colours, border=5
             )
             # Store the final page with its original page number for later use
             if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
-                redact_page_with_pymupdf._applied_redaction_page = (applied_redaction_page, page.number)
             else:
                 # If we already have a final page, we need to handle multiple pages
                 # For now, we'll use the last final page
-                redact_page_with_pymupdf._applied_redaction_page = (applied_redaction_page, page.number)
     out_annotation_boxes = {
         "image": image_path,  # Image.open(image_path), #image_path,
@@ -3115,9 +3144,10 @@ def redact_image_pdf(
                     # Handle dual page objects if returned
                     if isinstance(redact_result[0], tuple):
-                        (pymupdf_page, pymupdf_applied_redaction_page), page_image_annotations = (
-                            redact_result
-                        )
                         # Store the final page with its original page number for later use
                         if not hasattr(redact_image_pdf, "_applied_redaction_pages"):
                             redact_image_pdf._applied_redaction_pages = list()
@@ -4177,9 +4207,10 @@ def redact_text_pdf(
                     # Handle dual page objects if returned
                     if isinstance(redact_result[0], tuple):
-                        (pymupdf_page, pymupdf_applied_redaction_page), page_image_annotations = (
-                            redact_result
-                        )
                         # Store the final page with its original page number for later use
                         if not hasattr(redact_text_pdf, "_applied_redaction_pages"):
                             redact_text_pdf._applied_redaction_pages = list()

                                 # Create a mapping of original page numbers to final pages
                                 applied_redaction_pages_map = {}
+                                for (
+                                    applied_redaction_page_data
+                                ) in redact_image_pdf._applied_redaction_pages:
                                     if isinstance(applied_redaction_page_data, tuple):
                                         applied_redaction_page, original_page_number = (
                                             applied_redaction_page_data
                                         )
+                                        applied_redaction_pages_map[
+                                            original_page_number
+                                        ] = applied_redaction_page
                                     else:
+                                        applied_redaction_page = (
+                                            applied_redaction_page_data
+                                        )
                                         applied_redaction_pages_map[0] = (
                                             applied_redaction_page  # Default to page 0 if no original number
                                         )
                                 # Create a mapping of original page numbers to final pages
                                 applied_redaction_pages_map = {}
+                                for (
+                                    applied_redaction_page_data
+                                ) in redact_text_pdf._applied_redaction_pages:
                                     if isinstance(applied_redaction_page_data, tuple):
                                         applied_redaction_page, original_page_number = (
                                             applied_redaction_page_data
                                         )
+                                        applied_redaction_pages_map[
+                                            original_page_number
+                                        ] = applied_redaction_page
                                     else:
+                                        applied_redaction_page = (
+                                            applied_redaction_page_data
+                                        )
                                         applied_redaction_pages_map[0] = (
                                             applied_redaction_page  # Default to page 0 if no original number
                                         )
                                 delattr(redact_text_pdf, "_applied_redaction_pages")
                         # Save final redacted PDF if we have dual outputs or if RETURN_PDF_FOR_REVIEW is False
+                        if (
+                            RETURN_PDF_FOR_REVIEW is False
+                            or applied_redaction_pymupdf_doc
+                        ):
                             out_redacted_pdf_file_path = (
                                 output_folder
                                 + pdf_file_name_without_ext
                             # Use final document if available, otherwise use main document
                             doc_to_save = (
+                                applied_redaction_pymupdf_doc
+                                if applied_redaction_pymupdf_doc
+                                else pymupdf_doc
                             )
                             if out_redacted_pdf_file_path:
             page, applied_redaction_page = redact_result
             # Store the final page for later use
             if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
+                redact_page_with_pymupdf._applied_redaction_page = (
+                    applied_redaction_page
+                )
             else:
                 # If we already have a final page, we need to handle multiple pages
                 # For now, we'll use the last final page
+                redact_page_with_pymupdf._applied_redaction_page = (
+                    applied_redaction_page
+                )
     # If whole page is to be redacted, do that here
     if redact_whole_page is True:
             rect_height, rect_width, page, custom_colours, border=5
         )
         # Ensure the whole page annotation box has a unique ID
+        whole_page_img_annotation_box = fill_missing_box_ids(
+            whole_page_img_annotation_box
+        )
         all_image_annotation_boxes.append(whole_page_img_annotation_box)
         # Handle dual page objects for whole page redaction if needed
             # Apply the whole page redaction to the final page as well
             redact_whole_pymupdf_page(
+                rect_height,
+                rect_width,
+                applied_redaction_page,
+                custom_colours,
+                border=5,
             )
             # Store the final page with its original page number for later use
             if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
+                redact_page_with_pymupdf._applied_redaction_page = (
+                    applied_redaction_page,
+                    page.number,
+                )
             else:
                 # If we already have a final page, we need to handle multiple pages
                 # For now, we'll use the last final page
+                redact_page_with_pymupdf._applied_redaction_page = (
+                    applied_redaction_page,
+                    page.number,
+                )
     out_annotation_boxes = {
         "image": image_path,  # Image.open(image_path), #image_path,
                     # Handle dual page objects if returned
                     if isinstance(redact_result[0], tuple):
+                        (
+                            pymupdf_page,
+                            pymupdf_applied_redaction_page,
+                        ), page_image_annotations = redact_result
                         # Store the final page with its original page number for later use
                         if not hasattr(redact_image_pdf, "_applied_redaction_pages"):
                             redact_image_pdf._applied_redaction_pages = list()
                     # Handle dual page objects if returned
                     if isinstance(redact_result[0], tuple):
+                        (
+                            pymupdf_page,
+                            pymupdf_applied_redaction_page,
+                        ), page_image_annotations = redact_result
                         # Store the final page with its original page number for later use
                         if not hasattr(redact_text_pdf, "_applied_redaction_pages"):
                             redact_text_pdf._applied_redaction_pages = list()