Commit
·
409bdc5
1
Parent(s):
150a8d9
Input image creation during redaction should now respect input folders. Minor JSON output path change.
Browse files
- entrypoint.sh +1 -1
- tools/file_redaction.py +20 -6
- tools/redaction_review.py +8 -0
entrypoint.sh
CHANGED
|
@@ -21,7 +21,7 @@ else
|
|
| 21 |
GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT:-7860}
|
| 22 |
|
| 23 |
# Start uvicorn server.
|
| 24 |
-
echo "Starting with Uvicorn on $GRADIO_SERVER_NAME:$GRADIO_SERVER_PORT
|
| 25 |
exec uvicorn app:app \
|
| 26 |
--host $GRADIO_SERVER_NAME \
|
| 27 |
--port $GRADIO_SERVER_PORT \
|
|
|
|
| 21 |
GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT:-7860}
|
| 22 |
|
| 23 |
# Start uvicorn server.
|
| 24 |
+
echo "Starting with Uvicorn on $GRADIO_SERVER_NAME:$GRADIO_SERVER_PORT"
|
| 25 |
exec uvicorn app:app \
|
| 26 |
--host $GRADIO_SERVER_NAME \
|
| 27 |
--port $GRADIO_SERVER_PORT \
|
tools/file_redaction.py
CHANGED
|
@@ -985,6 +985,7 @@ def choose_and_run_redactor(
|
|
| 985 |
log_files_output_paths=log_files_output_paths,
|
| 986 |
nlp_analyser=nlp_analyser,
|
| 987 |
output_folder=output_folder,
|
|
|
|
| 988 |
)
|
| 989 |
|
| 990 |
# This line creates a copy of out_file_paths to break potential links with log_files_output_paths
|
|
@@ -1040,6 +1041,7 @@ def choose_and_run_redactor(
|
|
| 1040 |
document_cropboxes,
|
| 1041 |
text_extraction_only,
|
| 1042 |
output_folder=output_folder,
|
|
|
|
| 1043 |
)
|
| 1044 |
else:
|
| 1045 |
out_message = "No redaction method selected"
|
|
@@ -1372,9 +1374,6 @@ def choose_and_run_redactor(
|
|
| 1372 |
log_files_output_paths.append(
|
| 1373 |
all_page_line_level_ocr_results_with_words_json_file_path[0]
|
| 1374 |
)
|
| 1375 |
-
log_files_output_paths.append(
|
| 1376 |
-
all_page_line_level_ocr_results_with_words_json_file_path
|
| 1377 |
-
)
|
| 1378 |
|
| 1379 |
if (
|
| 1380 |
all_page_line_level_ocr_results_with_words_df_file_path
|
|
@@ -2083,6 +2082,7 @@ def redact_page_with_pymupdf(
|
|
| 2083 |
page_sizes_df: pd.DataFrame = pd.DataFrame(),
|
| 2084 |
return_pdf_for_review: bool = RETURN_PDF_FOR_REVIEW,
|
| 2085 |
return_pdf_end_of_redaction: bool = RETURN_REDACTED_PDF,
|
|
|
|
| 2086 |
):
|
| 2087 |
"""
|
| 2088 |
Applies redactions to a single PyMuPDF page based on provided annotations.
|
|
@@ -2160,8 +2160,17 @@ def redact_page_with_pymupdf(
|
|
| 2160 |
all_image_annotation_boxes = list()
|
| 2161 |
|
| 2162 |
if isinstance(image, Image.Image):
|
| 2163 |
-
|
| 2164 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2165 |
elif isinstance(image, str):
|
| 2166 |
# Normalize and validate path safety before checking existence
|
| 2167 |
normalized_path = os.path.normpath(os.path.abspath(image))
|
|
@@ -2620,6 +2629,7 @@ def redact_image_pdf(
|
|
| 2620 |
max_time: int = int(MAX_TIME_VALUE),
|
| 2621 |
nlp_analyser: AnalyzerEngine = nlp_analyser,
|
| 2622 |
output_folder: str = OUTPUT_FOLDER,
|
|
|
|
| 2623 |
progress=Progress(track_tqdm=True),
|
| 2624 |
):
|
| 2625 |
"""
|
|
@@ -2661,7 +2671,7 @@ def redact_image_pdf(
|
|
| 2661 |
- nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
|
| 2662 |
- output_folder (str, optional): The folder for file outputs.
|
| 2663 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
| 2664 |
-
|
| 2665 |
The function returns a redacted PDF document along with processing output objects.
|
| 2666 |
"""
|
| 2667 |
|
|
@@ -3140,6 +3150,7 @@ def redact_image_pdf(
|
|
| 3140 |
redact_whole_page=redact_whole_page,
|
| 3141 |
original_cropbox=original_cropbox,
|
| 3142 |
page_sizes_df=page_sizes_df,
|
|
|
|
| 3143 |
)
|
| 3144 |
|
| 3145 |
# Handle dual page objects if returned
|
|
@@ -3901,6 +3912,7 @@ def redact_text_pdf(
|
|
| 3901 |
original_cropboxes: List[dict] = list(),
|
| 3902 |
text_extraction_only: bool = False,
|
| 3903 |
output_folder: str = OUTPUT_FOLDER,
|
|
|
|
| 3904 |
page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
|
| 3905 |
max_time: int = int(MAX_TIME_VALUE),
|
| 3906 |
nlp_analyser: AnalyzerEngine = nlp_analyser,
|
|
@@ -3936,6 +3948,7 @@ def redact_text_pdf(
|
|
| 3936 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
| 3937 |
- language (str, optional): The language to do AWS Comprehend calls. Defaults to value of language if not provided.
|
| 3938 |
- output_folder (str, optional): The output folder for the function
|
|
|
|
| 3939 |
- page_break_val: Value for page break
|
| 3940 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
| 3941 |
- nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
|
|
@@ -4203,6 +4216,7 @@ def redact_text_pdf(
|
|
| 4203 |
convert_pikepdf_to_pymupdf_coords=True,
|
| 4204 |
original_cropbox=original_cropboxes[page_no],
|
| 4205 |
page_sizes_df=page_sizes_df,
|
|
|
|
| 4206 |
)
|
| 4207 |
|
| 4208 |
# Handle dual page objects if returned
|
|
|
|
| 985 |
log_files_output_paths=log_files_output_paths,
|
| 986 |
nlp_analyser=nlp_analyser,
|
| 987 |
output_folder=output_folder,
|
| 988 |
+
input_folder=input_folder,
|
| 989 |
)
|
| 990 |
|
| 991 |
# This line creates a copy of out_file_paths to break potential links with log_files_output_paths
|
|
|
|
| 1041 |
document_cropboxes,
|
| 1042 |
text_extraction_only,
|
| 1043 |
output_folder=output_folder,
|
| 1044 |
+
input_folder=input_folder,
|
| 1045 |
)
|
| 1046 |
else:
|
| 1047 |
out_message = "No redaction method selected"
|
|
|
|
| 1374 |
log_files_output_paths.append(
|
| 1375 |
all_page_line_level_ocr_results_with_words_json_file_path[0]
|
| 1376 |
)
|
|
|
|
|
|
|
|
|
|
| 1377 |
|
| 1378 |
if (
|
| 1379 |
all_page_line_level_ocr_results_with_words_df_file_path
|
|
|
|
| 2082 |
page_sizes_df: pd.DataFrame = pd.DataFrame(),
|
| 2083 |
return_pdf_for_review: bool = RETURN_PDF_FOR_REVIEW,
|
| 2084 |
return_pdf_end_of_redaction: bool = RETURN_REDACTED_PDF,
|
| 2085 |
+
input_folder: str = INPUT_FOLDER,
|
| 2086 |
):
|
| 2087 |
"""
|
| 2088 |
Applies redactions to a single PyMuPDF page based on provided annotations.
|
|
|
|
| 2160 |
all_image_annotation_boxes = list()
|
| 2161 |
|
| 2162 |
if isinstance(image, Image.Image):
|
| 2163 |
+
# Create an image path using the input folder with PDF filename
|
| 2164 |
+
# Get the PDF filename from the page's parent document
|
| 2165 |
+
pdf_filename = (
|
| 2166 |
+
os.path.basename(page.parent.name)
|
| 2167 |
+
if hasattr(page.parent, "name") and page.parent.name
|
| 2168 |
+
else "document"
|
| 2169 |
+
)
|
| 2170 |
+
# pdf_name_without_ext = os.path.splitext(pdf_filename)[0]
|
| 2171 |
+
image_path = os.path.join(input_folder, f"{pdf_filename}_{page.number}.png")
|
| 2172 |
+
if not os.path.exists(image_path):
|
| 2173 |
+
image.save(image_path)
|
| 2174 |
elif isinstance(image, str):
|
| 2175 |
# Normalize and validate path safety before checking existence
|
| 2176 |
normalized_path = os.path.normpath(os.path.abspath(image))
|
|
|
|
| 2629 |
max_time: int = int(MAX_TIME_VALUE),
|
| 2630 |
nlp_analyser: AnalyzerEngine = nlp_analyser,
|
| 2631 |
output_folder: str = OUTPUT_FOLDER,
|
| 2632 |
+
input_folder: str = INPUT_FOLDER,
|
| 2633 |
progress=Progress(track_tqdm=True),
|
| 2634 |
):
|
| 2635 |
"""
|
|
|
|
| 2671 |
- nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
|
| 2672 |
- output_folder (str, optional): The folder for file outputs.
|
| 2673 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
| 2674 |
+
- input_folder (str, optional): The folder for file inputs.
|
| 2675 |
The function returns a redacted PDF document along with processing output objects.
|
| 2676 |
"""
|
| 2677 |
|
|
|
|
| 3150 |
redact_whole_page=redact_whole_page,
|
| 3151 |
original_cropbox=original_cropbox,
|
| 3152 |
page_sizes_df=page_sizes_df,
|
| 3153 |
+
input_folder=input_folder,
|
| 3154 |
)
|
| 3155 |
|
| 3156 |
# Handle dual page objects if returned
|
|
|
|
| 3912 |
original_cropboxes: List[dict] = list(),
|
| 3913 |
text_extraction_only: bool = False,
|
| 3914 |
output_folder: str = OUTPUT_FOLDER,
|
| 3915 |
+
input_folder: str = INPUT_FOLDER,
|
| 3916 |
page_break_val: int = int(PAGE_BREAK_VALUE), # Value for page break
|
| 3917 |
max_time: int = int(MAX_TIME_VALUE),
|
| 3918 |
nlp_analyser: AnalyzerEngine = nlp_analyser,
|
|
|
|
| 3948 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
| 3949 |
- language (str, optional): The language to do AWS Comprehend calls. Defaults to value of language if not provided.
|
| 3950 |
- output_folder (str, optional): The output folder for the function
|
| 3951 |
+
- input_folder (str, optional): The folder for file inputs.
|
| 3952 |
- page_break_val: Value for page break
|
| 3953 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
| 3954 |
- nlp_analyser (AnalyzerEngine, optional): The nlp_analyser object to use for entity detection. Defaults to nlp_analyser.
|
|
|
|
| 4216 |
convert_pikepdf_to_pymupdf_coords=True,
|
| 4217 |
original_cropbox=original_cropboxes[page_no],
|
| 4218 |
page_sizes_df=page_sizes_df,
|
| 4219 |
+
input_folder=input_folder,
|
| 4220 |
)
|
| 4221 |
|
| 4222 |
# Handle dual page objects if returned
|
tools/redaction_review.py
CHANGED
|
@@ -1594,6 +1594,7 @@ def apply_redactions_to_review_df_and_files(
|
|
| 1594 |
save_pdf: bool = True,
|
| 1595 |
page_sizes: List[dict] = list(),
|
| 1596 |
COMPRESS_REDACTED_PDF: bool = COMPRESS_REDACTED_PDF,
|
|
|
|
| 1597 |
progress=gr.Progress(track_tqdm=True),
|
| 1598 |
):
|
| 1599 |
"""
|
|
@@ -1619,6 +1620,8 @@ def apply_redactions_to_review_df_and_files(
|
|
| 1619 |
Defaults to an empty list.
|
| 1620 |
COMPRESS_REDACTED_PDF (bool, optional): If True, the output PDF will be compressed.
|
| 1621 |
Defaults to COMPRESS_REDACTED_PDF.
|
|
|
|
|
|
|
| 1622 |
progress (gr.Progress, optional): Gradio progress object for tracking task progress.
|
| 1623 |
Defaults to gr.Progress(track_tqdm=True).
|
| 1624 |
|
|
@@ -1636,6 +1639,9 @@ def apply_redactions_to_review_df_and_files(
|
|
| 1636 |
pdf_doc = list()
|
| 1637 |
review_df = review_file_state
|
| 1638 |
|
|
|
|
|
|
|
|
|
|
| 1639 |
page_image_annotator_object = all_image_annotations[current_page - 1]
|
| 1640 |
|
| 1641 |
# This replaces the numpy array image object with the image file path
|
|
@@ -1774,6 +1780,7 @@ def apply_redactions_to_review_df_and_files(
|
|
| 1774 |
page_sizes_df=page_sizes_df,
|
| 1775 |
return_pdf_for_review=True,
|
| 1776 |
return_pdf_end_of_redaction=False,
|
|
|
|
| 1777 |
)
|
| 1778 |
|
| 1779 |
# Apply redactions to final page (with text removed)
|
|
@@ -1785,6 +1792,7 @@ def apply_redactions_to_review_df_and_files(
|
|
| 1785 |
page_sizes_df=page_sizes_df,
|
| 1786 |
return_pdf_for_review=False,
|
| 1787 |
return_pdf_end_of_redaction=False,
|
|
|
|
| 1788 |
)
|
| 1789 |
else:
|
| 1790 |
print("File type not recognised.")
|
|
|
|
| 1594 |
save_pdf: bool = True,
|
| 1595 |
page_sizes: List[dict] = list(),
|
| 1596 |
COMPRESS_REDACTED_PDF: bool = COMPRESS_REDACTED_PDF,
|
| 1597 |
+
input_folder: str = INPUT_FOLDER,
|
| 1598 |
progress=gr.Progress(track_tqdm=True),
|
| 1599 |
):
|
| 1600 |
"""
|
|
|
|
| 1620 |
Defaults to an empty list.
|
| 1621 |
COMPRESS_REDACTED_PDF (bool, optional): If True, the output PDF will be compressed.
|
| 1622 |
Defaults to COMPRESS_REDACTED_PDF.
|
| 1623 |
+
input_folder (str, optional): The directory where input files are located and where
|
| 1624 |
+
page images should be saved. Defaults to INPUT_FOLDER.
|
| 1625 |
progress (gr.Progress, optional): Gradio progress object for tracking task progress.
|
| 1626 |
Defaults to gr.Progress(track_tqdm=True).
|
| 1627 |
|
|
|
|
| 1639 |
pdf_doc = list()
|
| 1640 |
review_df = review_file_state
|
| 1641 |
|
| 1642 |
+
# Always use the provided input_folder parameter
|
| 1643 |
+
# This ensures images are created in the specified input folder, not in example_data
|
| 1644 |
+
|
| 1645 |
page_image_annotator_object = all_image_annotations[current_page - 1]
|
| 1646 |
|
| 1647 |
# This replaces the numpy array image object with the image file path
|
|
|
|
| 1780 |
page_sizes_df=page_sizes_df,
|
| 1781 |
return_pdf_for_review=True,
|
| 1782 |
return_pdf_end_of_redaction=False,
|
| 1783 |
+
input_folder=input_folder,
|
| 1784 |
)
|
| 1785 |
|
| 1786 |
# Apply redactions to final page (with text removed)
|
|
|
|
| 1792 |
page_sizes_df=page_sizes_df,
|
| 1793 |
return_pdf_for_review=False,
|
| 1794 |
return_pdf_end_of_redaction=False,
|
| 1795 |
+
input_folder=input_folder,
|
| 1796 |
)
|
| 1797 |
else:
|
| 1798 |
print("File type not recognised.")
|