Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Oct 2

Commit

409bdc5

1 Parent(s): 150a8d9

Input image creation during redaction should now respect input folders. Minor JSON output path change.

Browse files

Files changed (3) hide show

entrypoint.sh +1 -1
tools/file_redaction.py +20 -6
tools/redaction_review.py +8 -0

entrypoint.sh CHANGED Viewed

@@ -21,7 +21,7 @@ else
         GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT:-7860}
         # Start uvicorn server.
-        echo "Starting with Uvicorn on $GRADIO_SERVER_NAME:$GRADIO_SERVER_PORT with root path $ROOT_PATH"
         exec uvicorn app:app \
             --host $GRADIO_SERVER_NAME \
             --port $GRADIO_SERVER_PORT \

         GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT:-7860}
         # Start uvicorn server.
+        echo "Starting with Uvicorn on $GRADIO_SERVER_NAME:$GRADIO_SERVER_PORT"
         exec uvicorn app:app \
             --host $GRADIO_SERVER_NAME \
             --port $GRADIO_SERVER_PORT \

tools/file_redaction.py CHANGED Viewed

@@ -985,6 +985,7 @@ def choose_and_run_redactor(
                 log_files_output_paths=log_files_output_paths,
                 nlp_analyser=nlp_analyser,
                 output_folder=output_folder,
             )
             # This line creates a copy of out_file_paths to break potential links with log_files_output_paths
@@ -1040,6 +1041,7 @@ def choose_and_run_redactor(
                 document_cropboxes,
                 text_extraction_only,
                 output_folder=output_folder,
             )
         else:
             out_message = "No redaction method selected"
@@ -1372,9 +1374,6 @@ def choose_and_run_redactor(
                         log_files_output_paths.append(
                             all_page_line_level_ocr_results_with_words_json_file_path[0]
                         )
-                    log_files_output_paths.append(
-                        all_page_line_level_ocr_results_with_words_json_file_path
-                    )
                 if (
                     all_page_line_level_ocr_results_with_words_df_file_path
@@ -2083,6 +2082,7 @@ def redact_page_with_pymupdf(
     page_sizes_df: pd.DataFrame = pd.DataFrame(),
     return_pdf_for_review: bool = RETURN_PDF_FOR_REVIEW,
     return_pdf_end_of_redaction: bool = RETURN_REDACTED_PDF,
 ):
     """
     Applies redactions to a single PyMuPDF page based on provided annotations.
@@ -2160,8 +2160,17 @@ def redact_page_with_pymupdf(
     all_image_annotation_boxes = list()
     if isinstance(image, Image.Image):
-        image_path = move_page_info(str(page))
-        image.save(image_path)
     elif isinstance(image, str):
         # Normalize and validate path safety before checking existence
         normalized_path = os.path.normpath(os.path.abspath(image))
@@ -2620,6 +2629,7 @@ def redact_image_pdf(
     max_time: int = int(MAX_TIME_VALUE),
     nlp_analyser: AnalyzerEngine = nlp_analyser,
     output_folder: str = OUTPUT_FOLDER,
     progress=Progress(track_tqdm=True),
 ):
     """
@@ -2661,7 +2671,7 @@ def redact_image_pdf(
     - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
     - output_folder (str, optional): The folder for file outputs.
     - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
     The function returns a redacted PDF document along with processing output objects.
     """
@@ -3140,6 +3150,7 @@ def redact_image_pdf(
                         redact_whole_page=redact_whole_page,
                         original_cropbox=original_cropbox,
                         page_sizes_df=page_sizes_df,
                     )
                     # Handle dual page objects if returned
@@ -3901,6 +3912,7 @@ def redact_text_pdf(
     original_cropboxes: List[dict] = list(),
     text_extraction_only: bool = False,
     output_folder: str = OUTPUT_FOLDER,
     page_break_val: int = int(PAGE_BREAK_VALUE),  # Value for page break
     max_time: int = int(MAX_TIME_VALUE),
     nlp_analyser: AnalyzerEngine = nlp_analyser,
@@ -3936,6 +3948,7 @@ def redact_text_pdf(
     - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
     - language (str, optional): The language to do AWS Comprehend calls. Defaults to value of language if not provided.
     - output_folder (str, optional): The output folder for the function
     - page_break_val: Value for page break
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
@@ -4203,6 +4216,7 @@ def redact_text_pdf(
                         convert_pikepdf_to_pymupdf_coords=True,
                         original_cropbox=original_cropboxes[page_no],
                         page_sizes_df=page_sizes_df,
                     )
                     # Handle dual page objects if returned

                 log_files_output_paths=log_files_output_paths,
                 nlp_analyser=nlp_analyser,
                 output_folder=output_folder,
+                input_folder=input_folder,
             )
             # This line creates a copy of out_file_paths to break potential links with log_files_output_paths
                 document_cropboxes,
                 text_extraction_only,
                 output_folder=output_folder,
+                input_folder=input_folder,
             )
         else:
             out_message = "No redaction method selected"
                         log_files_output_paths.append(
                             all_page_line_level_ocr_results_with_words_json_file_path[0]
                         )
                 if (
                     all_page_line_level_ocr_results_with_words_df_file_path
     page_sizes_df: pd.DataFrame = pd.DataFrame(),
     return_pdf_for_review: bool = RETURN_PDF_FOR_REVIEW,
     return_pdf_end_of_redaction: bool = RETURN_REDACTED_PDF,
+    input_folder: str = INPUT_FOLDER,
 ):
     """
     Applies redactions to a single PyMuPDF page based on provided annotations.
     all_image_annotation_boxes = list()
     if isinstance(image, Image.Image):
+        # Create an image path using the input folder with PDF filename
+        # Get the PDF filename from the page's parent document
+        pdf_filename = (
+            os.path.basename(page.parent.name)
+            if hasattr(page.parent, "name") and page.parent.name
+            else "document"
+        )
+        # pdf_name_without_ext = os.path.splitext(pdf_filename)[0]
+        image_path = os.path.join(input_folder, f"{pdf_filename}_{page.number}.png")
+        if not os.path.exists(image_path):
+            image.save(image_path)
     elif isinstance(image, str):
         # Normalize and validate path safety before checking existence
         normalized_path = os.path.normpath(os.path.abspath(image))
     max_time: int = int(MAX_TIME_VALUE),
     nlp_analyser: AnalyzerEngine = nlp_analyser,
     output_folder: str = OUTPUT_FOLDER,
+    input_folder: str = INPUT_FOLDER,
     progress=Progress(track_tqdm=True),
 ):
     """
     - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
     - output_folder (str, optional): The folder for file outputs.
     - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
+    - input_folder (str, optional): The folder for file inputs.
     The function returns a redacted PDF document along with processing output objects.
     """
                         redact_whole_page=redact_whole_page,
                         original_cropbox=original_cropbox,
                         page_sizes_df=page_sizes_df,
+                        input_folder=input_folder,
                     )
                     # Handle dual page objects if returned
     original_cropboxes: List[dict] = list(),
     text_extraction_only: bool = False,
     output_folder: str = OUTPUT_FOLDER,
+    input_folder: str = INPUT_FOLDER,
     page_break_val: int = int(PAGE_BREAK_VALUE),  # Value for page break
     max_time: int = int(MAX_TIME_VALUE),
     nlp_analyser: AnalyzerEngine = nlp_analyser,
     - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
     - language (str, optional): The language to do AWS Comprehend calls. Defaults to value of language if not provided.
     - output_folder (str, optional): The output folder for the function
+    - input_folder (str, optional): The folder for file inputs.
     - page_break_val: Value for page break
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     - nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
                         convert_pikepdf_to_pymupdf_coords=True,
                         original_cropbox=original_cropboxes[page_no],
                         page_sizes_df=page_sizes_df,
+                        input_folder=input_folder,
                     )
                     # Handle dual page objects if returned

tools/redaction_review.py CHANGED Viewed

@@ -1594,6 +1594,7 @@ def apply_redactions_to_review_df_and_files(
     save_pdf: bool = True,
     page_sizes: List[dict] = list(),
     COMPRESS_REDACTED_PDF: bool = COMPRESS_REDACTED_PDF,
     progress=gr.Progress(track_tqdm=True),
 ):
     """
@@ -1619,6 +1620,8 @@ def apply_redactions_to_review_df_and_files(
                                            Defaults to an empty list.
         COMPRESS_REDACTED_PDF (bool, optional): If True, the output PDF will be compressed.
                                                 Defaults to COMPRESS_REDACTED_PDF.
         progress (gr.Progress, optional): Gradio progress object for tracking task progress.
                                           Defaults to gr.Progress(track_tqdm=True).
@@ -1636,6 +1639,9 @@ def apply_redactions_to_review_df_and_files(
     pdf_doc = list()
     review_df = review_file_state
     page_image_annotator_object = all_image_annotations[current_page - 1]
     # This replaces the numpy array image object with the image file path
@@ -1774,6 +1780,7 @@ def apply_redactions_to_review_df_and_files(
                             page_sizes_df=page_sizes_df,
                             return_pdf_for_review=True,
                             return_pdf_end_of_redaction=False,
                         )
                     # Apply redactions to final page (with text removed)
@@ -1785,6 +1792,7 @@ def apply_redactions_to_review_df_and_files(
                         page_sizes_df=page_sizes_df,
                         return_pdf_for_review=False,
                         return_pdf_end_of_redaction=False,
                     )
             else:
                 print("File type not recognised.")

     save_pdf: bool = True,
     page_sizes: List[dict] = list(),
     COMPRESS_REDACTED_PDF: bool = COMPRESS_REDACTED_PDF,
+    input_folder: str = INPUT_FOLDER,
     progress=gr.Progress(track_tqdm=True),
 ):
     """
                                            Defaults to an empty list.
         COMPRESS_REDACTED_PDF (bool, optional): If True, the output PDF will be compressed.
                                                 Defaults to COMPRESS_REDACTED_PDF.
+        input_folder (str, optional): The directory where input files are located and where
+                                     page images should be saved. Defaults to INPUT_FOLDER.
         progress (gr.Progress, optional): Gradio progress object for tracking task progress.
                                           Defaults to gr.Progress(track_tqdm=True).
     pdf_doc = list()
     review_df = review_file_state
+    # Always use the provided input_folder parameter
+    # This ensures images are created in the specified input folder, not in example_data
     page_image_annotator_object = all_image_annotations[current_page - 1]
     # This replaces the numpy array image object with the image file path
                             page_sizes_df=page_sizes_df,
                             return_pdf_for_review=True,
                             return_pdf_end_of_redaction=False,
+                            input_folder=input_folder,
                         )
                     # Apply redactions to final page (with text removed)
                         page_sizes_df=page_sizes_df,
                         return_pdf_for_review=False,
                         return_pdf_end_of_redaction=False,
+                        input_folder=input_folder,
                     )
             else:
                 print("File type not recognised.")