Commit
·
5086da0
1
Parent(s):
78403ba
Fixed whole page redactions being incorrectly positioned and lacking IDs. Fixed duplicate pages output issue. Minor changes to output redaction box format and related code.
Browse files- app.py +97 -39
- tools/file_conversion.py +52 -59
- tools/file_redaction.py +97 -99
- tools/find_duplicate_pages.py +1 -10
- tools/secure_path_utils.py +0 -8
app.py
CHANGED
|
@@ -277,6 +277,47 @@ in_redact_comprehend_entities = gr.Dropdown(
|
|
| 277 |
label="AWS Comprehend PII identification model (click empty space in box for full list)",
|
| 278 |
)
|
| 279 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
## Deduplication examples
|
| 281 |
in_duplicate_pages = gr.File(
|
| 282 |
label="Upload one or multiple 'ocr_output.csv' files to find duplicate pages and subdocuments",
|
|
@@ -973,6 +1014,8 @@ with blocks:
|
|
| 973 |
"example_data/example_complaint_letter.jpg",
|
| 974 |
"example_data/graduate-job-example-cover-letter.pdf",
|
| 975 |
"example_data/Partnership-Agreement-Toolkit_0_0.pdf",
|
|
|
|
|
|
|
| 976 |
]
|
| 977 |
|
| 978 |
available_examples = list()
|
|
@@ -990,6 +1033,10 @@ with blocks:
|
|
| 990 |
CHOSEN_COMPREHEND_ENTITIES,
|
| 991 |
[example_files[0]],
|
| 992 |
example_files[0],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 993 |
]
|
| 994 |
)
|
| 995 |
example_labels.append("PDF with selectable text redaction")
|
|
@@ -1005,6 +1052,10 @@ with blocks:
|
|
| 1005 |
CHOSEN_COMPREHEND_ENTITIES,
|
| 1006 |
[example_files[1]],
|
| 1007 |
example_files[1],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1008 |
]
|
| 1009 |
)
|
| 1010 |
example_labels.append("Image redaction with local OCR")
|
|
@@ -1020,6 +1071,10 @@ with blocks:
|
|
| 1020 |
CHOSEN_COMPREHEND_ENTITIES,
|
| 1021 |
[example_files[2]],
|
| 1022 |
example_files[2],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1023 |
]
|
| 1024 |
)
|
| 1025 |
example_labels.append(
|
|
@@ -1038,11 +1093,37 @@ with blocks:
|
|
| 1038 |
CHOSEN_COMPREHEND_ENTITIES,
|
| 1039 |
[example_files[3]],
|
| 1040 |
example_files[3],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1041 |
]
|
| 1042 |
)
|
| 1043 |
example_labels.append(
|
| 1044 |
"PDF redaction with AWS services and signature detection"
|
| 1045 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1046 |
|
| 1047 |
# Only create examples if we have available files
|
| 1048 |
if available_examples:
|
|
@@ -1056,6 +1137,10 @@ with blocks:
|
|
| 1056 |
in_redact_comprehend_entities,
|
| 1057 |
prepared_pdf_state,
|
| 1058 |
doc_full_file_name_textbox,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1059 |
):
|
| 1060 |
gr.Info(
|
| 1061 |
"Example data loaded. Now click on 'Extract text and redact document' below to run the example redaction."
|
|
@@ -1072,6 +1157,10 @@ with blocks:
|
|
| 1072 |
in_redact_comprehend_entities,
|
| 1073 |
prepared_pdf_state,
|
| 1074 |
doc_full_file_name_textbox,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1075 |
],
|
| 1076 |
example_labels=example_labels,
|
| 1077 |
fn=show_info_box_on_click,
|
|
@@ -2091,19 +2180,11 @@ with blocks:
|
|
| 2091 |
in_allow_list_text = gr.Textbox(
|
| 2092 |
label="Custom allow list load status"
|
| 2093 |
)
|
| 2094 |
-
with gr.Column():
|
| 2095 |
-
in_deny_list
|
| 2096 |
-
label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.",
|
| 2097 |
-
file_count="multiple",
|
| 2098 |
-
height=FILE_INPUT_HEIGHT,
|
| 2099 |
-
)
|
| 2100 |
in_deny_list_text = gr.Textbox(label="Custom deny list load status")
|
| 2101 |
with gr.Column():
|
| 2102 |
-
in_fully_redacted_list
|
| 2103 |
-
label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.",
|
| 2104 |
-
file_count="multiple",
|
| 2105 |
-
height=FILE_INPUT_HEIGHT,
|
| 2106 |
-
)
|
| 2107 |
in_fully_redacted_list_text = gr.Textbox(
|
| 2108 |
label="Fully redacted page list load status"
|
| 2109 |
)
|
|
@@ -2125,33 +2206,10 @@ with blocks:
|
|
| 2125 |
show_copy_button=True,
|
| 2126 |
wrap=True,
|
| 2127 |
)
|
| 2128 |
-
|
| 2129 |
-
|
| 2130 |
-
|
| 2131 |
-
|
| 2132 |
-
row_count=(0, "dynamic"),
|
| 2133 |
-
label="Deny list",
|
| 2134 |
-
visible=True,
|
| 2135 |
-
type="pandas",
|
| 2136 |
-
interactive=True,
|
| 2137 |
-
show_fullscreen_button=True,
|
| 2138 |
-
show_copy_button=True,
|
| 2139 |
-
wrap=True,
|
| 2140 |
-
)
|
| 2141 |
-
in_fully_redacted_list_state = gr.Dataframe(
|
| 2142 |
-
value=pd.DataFrame(),
|
| 2143 |
-
headers=["fully_redacted_pages_list"],
|
| 2144 |
-
col_count=(1, "fixed"),
|
| 2145 |
-
row_count=(0, "dynamic"),
|
| 2146 |
-
label="Fully redacted pages",
|
| 2147 |
-
visible=True,
|
| 2148 |
-
type="pandas",
|
| 2149 |
-
interactive=True,
|
| 2150 |
-
show_fullscreen_button=True,
|
| 2151 |
-
show_copy_button=True,
|
| 2152 |
-
datatype="number",
|
| 2153 |
-
wrap=True,
|
| 2154 |
-
)
|
| 2155 |
with gr.Row():
|
| 2156 |
with gr.Column(scale=2):
|
| 2157 |
markdown_placeholder = gr.Markdown("")
|
|
|
|
| 277 |
label="AWS Comprehend PII identification model (click empty space in box for full list)",
|
| 278 |
)
|
| 279 |
|
| 280 |
+
in_deny_list = gr.File(
|
| 281 |
+
label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.",
|
| 282 |
+
file_count="multiple",
|
| 283 |
+
height=FILE_INPUT_HEIGHT,
|
| 284 |
+
)
|
| 285 |
+
|
| 286 |
+
in_deny_list_state = gr.Dataframe(
|
| 287 |
+
value=pd.DataFrame(),
|
| 288 |
+
headers=["deny_list"],
|
| 289 |
+
col_count=(1, "fixed"),
|
| 290 |
+
row_count=(0, "dynamic"),
|
| 291 |
+
label="Deny list",
|
| 292 |
+
visible=True,
|
| 293 |
+
type="pandas",
|
| 294 |
+
interactive=True,
|
| 295 |
+
show_fullscreen_button=True,
|
| 296 |
+
show_copy_button=True,
|
| 297 |
+
wrap=True,
|
| 298 |
+
)
|
| 299 |
+
|
| 300 |
+
in_fully_redacted_list = gr.File(
|
| 301 |
+
label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.",
|
| 302 |
+
file_count="multiple",
|
| 303 |
+
height=FILE_INPUT_HEIGHT,
|
| 304 |
+
)
|
| 305 |
+
|
| 306 |
+
in_fully_redacted_list_state = gr.Dataframe(
|
| 307 |
+
value=pd.DataFrame(),
|
| 308 |
+
headers=["fully_redacted_pages_list"],
|
| 309 |
+
col_count=(1, "fixed"),
|
| 310 |
+
row_count=(0, "dynamic"),
|
| 311 |
+
label="Fully redacted pages",
|
| 312 |
+
visible=True,
|
| 313 |
+
type="pandas",
|
| 314 |
+
interactive=True,
|
| 315 |
+
show_fullscreen_button=True,
|
| 316 |
+
show_copy_button=True,
|
| 317 |
+
wrap=True,
|
| 318 |
+
)
|
| 319 |
+
|
| 320 |
+
|
| 321 |
## Deduplication examples
|
| 322 |
in_duplicate_pages = gr.File(
|
| 323 |
label="Upload one or multiple 'ocr_output.csv' files to find duplicate pages and subdocuments",
|
|
|
|
| 1014 |
"example_data/example_complaint_letter.jpg",
|
| 1015 |
"example_data/graduate-job-example-cover-letter.pdf",
|
| 1016 |
"example_data/Partnership-Agreement-Toolkit_0_0.pdf",
|
| 1017 |
+
"example_data/partnership_toolkit_redact_custom_deny_list.csv",
|
| 1018 |
+
"example_data/partnership_toolkit_redact_some_pages.csv",
|
| 1019 |
]
|
| 1020 |
|
| 1021 |
available_examples = list()
|
|
|
|
| 1033 |
CHOSEN_COMPREHEND_ENTITIES,
|
| 1034 |
[example_files[0]],
|
| 1035 |
example_files[0],
|
| 1036 |
+
[],
|
| 1037 |
+
pd.DataFrame(),
|
| 1038 |
+
[],
|
| 1039 |
+
pd.DataFrame(),
|
| 1040 |
]
|
| 1041 |
)
|
| 1042 |
example_labels.append("PDF with selectable text redaction")
|
|
|
|
| 1052 |
CHOSEN_COMPREHEND_ENTITIES,
|
| 1053 |
[example_files[1]],
|
| 1054 |
example_files[1],
|
| 1055 |
+
[],
|
| 1056 |
+
pd.DataFrame(),
|
| 1057 |
+
[],
|
| 1058 |
+
pd.DataFrame(),
|
| 1059 |
]
|
| 1060 |
)
|
| 1061 |
example_labels.append("Image redaction with local OCR")
|
|
|
|
| 1071 |
CHOSEN_COMPREHEND_ENTITIES,
|
| 1072 |
[example_files[2]],
|
| 1073 |
example_files[2],
|
| 1074 |
+
[],
|
| 1075 |
+
pd.DataFrame(),
|
| 1076 |
+
[],
|
| 1077 |
+
pd.DataFrame(),
|
| 1078 |
]
|
| 1079 |
)
|
| 1080 |
example_labels.append(
|
|
|
|
| 1093 |
CHOSEN_COMPREHEND_ENTITIES,
|
| 1094 |
[example_files[3]],
|
| 1095 |
example_files[3],
|
| 1096 |
+
[],
|
| 1097 |
+
pd.DataFrame(),
|
| 1098 |
+
[],
|
| 1099 |
+
pd.DataFrame(),
|
| 1100 |
]
|
| 1101 |
)
|
| 1102 |
example_labels.append(
|
| 1103 |
"PDF redaction with AWS services and signature detection"
|
| 1104 |
+
)
|
| 1105 |
+
|
| 1106 |
+
# Add new example for custom deny list and whole page redaction
|
| 1107 |
+
if os.path.exists(example_files[3]) and os.path.exists(example_files[4]) and os.path.exists(example_files[5]):
|
| 1108 |
+
available_examples.append(
|
| 1109 |
+
[
|
| 1110 |
+
[example_files[3]],
|
| 1111 |
+
"Local OCR model - PDFs without selectable text",
|
| 1112 |
+
"Local",
|
| 1113 |
+
[],
|
| 1114 |
+
["CUSTOM"], # Use CUSTOM entity to enable deny list functionality
|
| 1115 |
+
CHOSEN_COMPREHEND_ENTITIES,
|
| 1116 |
+
[example_files[3]],
|
| 1117 |
+
example_files[3],
|
| 1118 |
+
[example_files[4]],
|
| 1119 |
+
pd.DataFrame(data={"deny_list": ["Sister", "Sister City", "Sister Cities", "Friendship City"]}),
|
| 1120 |
+
[example_files[5]],
|
| 1121 |
+
pd.DataFrame(data={"fully_redacted_pages_list": [2, 5]}),
|
| 1122 |
+
]
|
| 1123 |
+
)
|
| 1124 |
+
example_labels.append(
|
| 1125 |
+
"PDF redaction with custom deny list and whole page redaction"
|
| 1126 |
+
)
|
| 1127 |
|
| 1128 |
# Only create examples if we have available files
|
| 1129 |
if available_examples:
|
|
|
|
| 1137 |
in_redact_comprehend_entities,
|
| 1138 |
prepared_pdf_state,
|
| 1139 |
doc_full_file_name_textbox,
|
| 1140 |
+
in_deny_list,
|
| 1141 |
+
in_deny_list_state,
|
| 1142 |
+
in_fully_redacted_list,
|
| 1143 |
+
in_fully_redacted_list_state,
|
| 1144 |
):
|
| 1145 |
gr.Info(
|
| 1146 |
"Example data loaded. Now click on 'Extract text and redact document' below to run the example redaction."
|
|
|
|
| 1157 |
in_redact_comprehend_entities,
|
| 1158 |
prepared_pdf_state,
|
| 1159 |
doc_full_file_name_textbox,
|
| 1160 |
+
in_deny_list,
|
| 1161 |
+
in_deny_list_state,
|
| 1162 |
+
in_fully_redacted_list,
|
| 1163 |
+
in_fully_redacted_list_state,
|
| 1164 |
],
|
| 1165 |
example_labels=example_labels,
|
| 1166 |
fn=show_info_box_on_click,
|
|
|
|
| 2180 |
in_allow_list_text = gr.Textbox(
|
| 2181 |
label="Custom allow list load status"
|
| 2182 |
)
|
| 2183 |
+
with gr.Column():
|
| 2184 |
+
in_deny_list.render() # Defined at beginning of file
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2185 |
in_deny_list_text = gr.Textbox(label="Custom deny list load status")
|
| 2186 |
with gr.Column():
|
| 2187 |
+
in_fully_redacted_list.render() # Defined at beginning of file
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2188 |
in_fully_redacted_list_text = gr.Textbox(
|
| 2189 |
label="Fully redacted page list load status"
|
| 2190 |
)
|
|
|
|
| 2206 |
show_copy_button=True,
|
| 2207 |
wrap=True,
|
| 2208 |
)
|
| 2209 |
+
|
| 2210 |
+
in_deny_list_state.render() # Defined at beginning of file
|
| 2211 |
+
|
| 2212 |
+
in_fully_redacted_list_state.render() # Defined at beginning of file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2213 |
with gr.Row():
|
| 2214 |
with gr.Column(scale=2):
|
| 2215 |
markdown_placeholder = gr.Markdown("")
|
tools/file_conversion.py
CHANGED
|
@@ -574,7 +574,7 @@ def redact_single_box(
|
|
| 574 |
|
| 575 |
Returns:
|
| 576 |
Page or Tuple[Page, Page]: If return_pdf_end_of_redaction is True and retain_text is True,
|
| 577 |
-
returns a tuple of (review_page,
|
| 578 |
"""
|
| 579 |
|
| 580 |
pymupdf_x1 = pymupdf_rect[0]
|
|
@@ -582,31 +582,45 @@ def redact_single_box(
|
|
| 582 |
pymupdf_x2 = pymupdf_rect[2]
|
| 583 |
pymupdf_y2 = pymupdf_rect[3]
|
| 584 |
|
| 585 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 586 |
|
| 587 |
out_colour = define_box_colour(
|
| 588 |
custom_colours, img_annotation_box, CUSTOM_BOX_COLOUR
|
| 589 |
)
|
| 590 |
|
|
|
|
|
|
|
|
|
|
| 591 |
# Create a copy of the page for final redaction if needed
|
| 592 |
-
|
| 593 |
if return_pdf_end_of_redaction and retain_text:
|
| 594 |
# Create a deep copy of the page for final redaction
|
| 595 |
-
import fitz
|
| 596 |
|
| 597 |
-
|
| 598 |
-
|
| 599 |
pymupdf_page.parent,
|
| 600 |
from_page=pymupdf_page.number,
|
| 601 |
to_page=pymupdf_page.number,
|
| 602 |
)
|
| 603 |
-
|
| 604 |
|
| 605 |
-
# Handle review page (retain_text = True)
|
| 606 |
-
if retain_text is True:
|
| 607 |
-
|
| 608 |
-
img_annotation_box["text"] = img_annotation_box.get("text") or ""
|
| 609 |
-
img_annotation_box["label"] = img_annotation_box.get("label") or "Redaction"
|
| 610 |
|
| 611 |
annot = pymupdf_page.add_redact_annot(full_size_redaction_box)
|
| 612 |
annot.set_colors(stroke=out_colour, fill=out_colour, colors=out_colour)
|
|
@@ -620,51 +634,27 @@ def redact_single_box(
|
|
| 620 |
)
|
| 621 |
annot.update(opacity=0.5, cross_out=False)
|
| 622 |
|
| 623 |
-
# If we need both review and final pages, apply final redaction to the copy
|
| 624 |
-
if return_pdf_end_of_redaction and
|
| 625 |
-
# Apply final redaction to the copy
|
| 626 |
-
redact_bottom_y = pymupdf_y1 + 2
|
| 627 |
-
redact_top_y = pymupdf_y2 - 2
|
| 628 |
-
|
| 629 |
-
# Calculate the middle y value and set a small height if default values are too close together
|
| 630 |
-
if (redact_top_y - redact_bottom_y) < 1:
|
| 631 |
-
middle_y = (pymupdf_y1 + pymupdf_y2) / 2
|
| 632 |
-
redact_bottom_y = middle_y - 1
|
| 633 |
-
redact_top_y = middle_y + 1
|
| 634 |
-
|
| 635 |
-
rect_small_pixel_height = Rect(
|
| 636 |
-
pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y
|
| 637 |
-
) # Slightly smaller than outside box
|
| 638 |
|
| 639 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
| 640 |
-
|
| 641 |
|
| 642 |
# Only create a box over the whole rect if we want to delete the text
|
| 643 |
-
shape =
|
| 644 |
shape.draw_rect(pymupdf_rect)
|
| 645 |
|
| 646 |
# Use solid fill for normal redaction
|
| 647 |
shape.finish(color=out_colour, fill=out_colour)
|
| 648 |
shape.commit()
|
| 649 |
|
| 650 |
-
return pymupdf_page,
|
| 651 |
else:
|
| 652 |
return pymupdf_page
|
| 653 |
-
else:
|
| 654 |
-
# Calculate area to actually remove text from the pdf (different from black box size)
|
| 655 |
-
redact_bottom_y = pymupdf_y1 + 2
|
| 656 |
-
redact_top_y = pymupdf_y2 - 2
|
| 657 |
-
|
| 658 |
-
# Calculate the middle y value and set a small height if default values are too close together
|
| 659 |
-
if (redact_top_y - redact_bottom_y) < 1:
|
| 660 |
-
middle_y = (pymupdf_y1 + pymupdf_y2) / 2
|
| 661 |
-
redact_bottom_y = middle_y - 1
|
| 662 |
-
redact_top_y = middle_y + 1
|
| 663 |
-
|
| 664 |
-
rect_small_pixel_height = Rect(
|
| 665 |
-
pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y
|
| 666 |
-
) # Slightly smaller than outside box
|
| 667 |
|
|
|
|
|
|
|
| 668 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
| 669 |
pymupdf_page.add_redact_annot(rect_small_pixel_height)
|
| 670 |
|
|
@@ -792,27 +782,30 @@ def redact_whole_pymupdf_page(
|
|
| 792 |
"""
|
| 793 |
# Small border to page that remains white
|
| 794 |
|
| 795 |
-
# Define the coordinates for the Rect
|
| 796 |
whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
else:
|
| 802 |
-
whole_page_x2, whole_page_y2 = (
|
| 803 |
-
rect_width - border,
|
| 804 |
-
rect_height - border,
|
| 805 |
-
) # Top-right corner
|
| 806 |
|
| 807 |
# Create new image annotation element based on whole page coordinates
|
| 808 |
whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
|
| 809 |
|
| 810 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 811 |
whole_page_img_annotation_box = dict()
|
| 812 |
-
whole_page_img_annotation_box["xmin"] =
|
| 813 |
-
whole_page_img_annotation_box["ymin"] =
|
| 814 |
-
whole_page_img_annotation_box["xmax"] =
|
| 815 |
-
whole_page_img_annotation_box["ymax"] =
|
| 816 |
whole_page_img_annotation_box["color"] = (0, 0, 0)
|
| 817 |
whole_page_img_annotation_box["label"] = "Whole page"
|
| 818 |
|
|
|
|
| 574 |
|
| 575 |
Returns:
|
| 576 |
Page or Tuple[Page, Page]: If return_pdf_end_of_redaction is True and retain_text is True,
|
| 577 |
+
returns a tuple of (review_page, applied_redaction_page). Otherwise returns a single Page.
|
| 578 |
"""
|
| 579 |
|
| 580 |
pymupdf_x1 = pymupdf_rect[0]
|
|
|
|
| 582 |
pymupdf_x2 = pymupdf_rect[2]
|
| 583 |
pymupdf_y2 = pymupdf_rect[3]
|
| 584 |
|
| 585 |
+
# Full size redaction box for covering all the text of a word
|
| 586 |
+
full_size_redaction_box = Rect(pymupdf_x1-1, pymupdf_y1-1, pymupdf_x2+1, pymupdf_y2+1)
|
| 587 |
+
|
| 588 |
+
# Calculate tiny height redaction box so that it doesn't delete text from adjacent lines
|
| 589 |
+
redact_bottom_y = pymupdf_y1 + 2
|
| 590 |
+
redact_top_y = pymupdf_y2 - 2
|
| 591 |
+
|
| 592 |
+
# Calculate the middle y value and set a small height if default values are too close together
|
| 593 |
+
if (redact_top_y - redact_bottom_y) < 1:
|
| 594 |
+
middle_y = (pymupdf_y1 + pymupdf_y2) / 2
|
| 595 |
+
redact_bottom_y = middle_y - 1
|
| 596 |
+
redact_top_y = middle_y + 1
|
| 597 |
+
|
| 598 |
+
rect_small_pixel_height = Rect(
|
| 599 |
+
pymupdf_x1 + 2, redact_bottom_y, pymupdf_x2 - 2, redact_top_y
|
| 600 |
+
) # Slightly smaller than outside box
|
| 601 |
|
| 602 |
out_colour = define_box_colour(
|
| 603 |
custom_colours, img_annotation_box, CUSTOM_BOX_COLOUR
|
| 604 |
)
|
| 605 |
|
| 606 |
+
img_annotation_box["text"] = img_annotation_box.get("text") or ""
|
| 607 |
+
img_annotation_box["label"] = img_annotation_box.get("label") or "Redaction"
|
| 608 |
+
|
| 609 |
# Create a copy of the page for final redaction if needed
|
| 610 |
+
applied_redaction_page = None
|
| 611 |
if return_pdf_end_of_redaction and retain_text:
|
| 612 |
# Create a deep copy of the page for final redaction
|
|
|
|
| 613 |
|
| 614 |
+
applied_redaction_page = pymupdf.open()
|
| 615 |
+
applied_redaction_page.insert_pdf(
|
| 616 |
pymupdf_page.parent,
|
| 617 |
from_page=pymupdf_page.number,
|
| 618 |
to_page=pymupdf_page.number,
|
| 619 |
)
|
| 620 |
+
applied_redaction_page = applied_redaction_page[0]
|
| 621 |
|
| 622 |
+
# Handle review page first, then deal with final redacted page (retain_text = True)
|
| 623 |
+
if retain_text is True:
|
|
|
|
|
|
|
|
|
|
| 624 |
|
| 625 |
annot = pymupdf_page.add_redact_annot(full_size_redaction_box)
|
| 626 |
annot.set_colors(stroke=out_colour, fill=out_colour, colors=out_colour)
|
|
|
|
| 634 |
)
|
| 635 |
annot.update(opacity=0.5, cross_out=False)
|
| 636 |
|
| 637 |
+
# If we need both review and final pages, and the applied redaction page has been prepared, apply final redaction to the copy
|
| 638 |
+
if return_pdf_end_of_redaction and applied_redaction_page is not None:
|
| 639 |
+
# Apply final redaction to the copy
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 640 |
|
| 641 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
| 642 |
+
applied_redaction_page.add_redact_annot(rect_small_pixel_height)
|
| 643 |
|
| 644 |
# Only create a box over the whole rect if we want to delete the text
|
| 645 |
+
shape = applied_redaction_page.new_shape()
|
| 646 |
shape.draw_rect(pymupdf_rect)
|
| 647 |
|
| 648 |
# Use solid fill for normal redaction
|
| 649 |
shape.finish(color=out_colour, fill=out_colour)
|
| 650 |
shape.commit()
|
| 651 |
|
| 652 |
+
return pymupdf_page, applied_redaction_page
|
| 653 |
else:
|
| 654 |
return pymupdf_page
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 655 |
|
| 656 |
+
# If we don't need to retain the text, we only have one page which is the applied redaction page, so just apply the redaction to the page
|
| 657 |
+
else:
|
| 658 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
| 659 |
pymupdf_page.add_redact_annot(rect_small_pixel_height)
|
| 660 |
|
|
|
|
| 782 |
"""
|
| 783 |
# Small border to page that remains white
|
| 784 |
|
| 785 |
+
# Define the coordinates for the Rect (PDF coordinates for actual redaction)
|
| 786 |
whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
|
| 787 |
+
whole_page_x2, whole_page_y2 = (
|
| 788 |
+
rect_width - border,
|
| 789 |
+
rect_height - border,
|
| 790 |
+
) # Top-right corner
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 791 |
|
| 792 |
# Create new image annotation element based on whole page coordinates
|
| 793 |
whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
|
| 794 |
|
| 795 |
+
# Calculate relative coordinates for the annotation box (0-1 range)
|
| 796 |
+
# This ensures the coordinates are already in relative format for output files
|
| 797 |
+
relative_border = border / min(rect_width, rect_height) # Scale border proportionally
|
| 798 |
+
relative_x1 = relative_border
|
| 799 |
+
relative_y1 = relative_border
|
| 800 |
+
relative_x2 = 1 - relative_border
|
| 801 |
+
relative_y2 = 1 - relative_border
|
| 802 |
+
|
| 803 |
+
# Write whole page annotation to annotation boxes using relative coordinates
|
| 804 |
whole_page_img_annotation_box = dict()
|
| 805 |
+
whole_page_img_annotation_box["xmin"] = relative_x1
|
| 806 |
+
whole_page_img_annotation_box["ymin"] = relative_y1
|
| 807 |
+
whole_page_img_annotation_box["xmax"] = relative_x2
|
| 808 |
+
whole_page_img_annotation_box["ymax"] = relative_y2
|
| 809 |
whole_page_img_annotation_box["color"] = (0, 0, 0)
|
| 810 |
whole_page_img_annotation_box["label"] = "Whole page"
|
| 811 |
|
tools/file_redaction.py
CHANGED
|
@@ -404,7 +404,7 @@ def choose_and_run_redactor(
|
|
| 404 |
if prepared_pdf_file_paths:
|
| 405 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
| 406 |
else:
|
| 407 |
-
review_out_file_paths =
|
| 408 |
|
| 409 |
# Choose the correct file to prepare
|
| 410 |
if isinstance(file_paths, str):
|
|
@@ -1095,111 +1095,111 @@ def choose_and_run_redactor(
|
|
| 1095 |
|
| 1096 |
else:
|
| 1097 |
# Check if we have dual PDF documents to save
|
| 1098 |
-
|
| 1099 |
|
| 1100 |
if RETURN_PDF_FOR_REVIEW and RETURN_REDACTED_PDF:
|
| 1101 |
if (
|
| 1102 |
-
hasattr(redact_image_pdf, "
|
| 1103 |
-
and redact_image_pdf.
|
| 1104 |
):
|
| 1105 |
|
| 1106 |
# Create final document by copying the original document and replacing specific pages
|
| 1107 |
-
|
| 1108 |
-
|
| 1109 |
|
| 1110 |
# Create a mapping of original page numbers to final pages
|
| 1111 |
-
|
| 1112 |
-
for
|
| 1113 |
-
if isinstance(
|
| 1114 |
-
|
| 1115 |
-
|
| 1116 |
)
|
| 1117 |
-
|
| 1118 |
-
|
| 1119 |
)
|
| 1120 |
else:
|
| 1121 |
-
|
| 1122 |
-
|
| 1123 |
-
|
| 1124 |
)
|
| 1125 |
|
| 1126 |
# Replace pages in the final document with their final versions
|
| 1127 |
for (
|
| 1128 |
original_page_number,
|
| 1129 |
-
|
| 1130 |
-
) in
|
| 1131 |
if (
|
| 1132 |
original_page_number
|
| 1133 |
-
<
|
| 1134 |
):
|
| 1135 |
# Remove the original page and insert the final page
|
| 1136 |
-
|
| 1137 |
original_page_number
|
| 1138 |
)
|
| 1139 |
-
|
| 1140 |
-
|
| 1141 |
-
from_page=
|
| 1142 |
-
to_page=
|
| 1143 |
start_at=original_page_number,
|
| 1144 |
)
|
| 1145 |
-
#
|
| 1146 |
-
|
| 1147 |
original_page_number
|
| 1148 |
-
].apply_redactions(images=
|
| 1149 |
# Clear the stored final pages
|
| 1150 |
-
delattr(redact_image_pdf, "
|
| 1151 |
elif (
|
| 1152 |
-
hasattr(redact_text_pdf, "
|
| 1153 |
-
and redact_text_pdf.
|
| 1154 |
):
|
| 1155 |
# Create final document by copying the original document and replacing specific pages
|
| 1156 |
-
|
| 1157 |
-
|
| 1158 |
|
| 1159 |
# Create a mapping of original page numbers to final pages
|
| 1160 |
-
|
| 1161 |
-
for
|
| 1162 |
-
if isinstance(
|
| 1163 |
-
|
| 1164 |
-
|
| 1165 |
)
|
| 1166 |
-
|
| 1167 |
-
|
| 1168 |
)
|
| 1169 |
else:
|
| 1170 |
-
|
| 1171 |
-
|
| 1172 |
-
|
| 1173 |
)
|
| 1174 |
|
| 1175 |
# Replace pages in the final document with their final versions
|
| 1176 |
for (
|
| 1177 |
original_page_number,
|
| 1178 |
-
|
| 1179 |
-
) in
|
| 1180 |
if (
|
| 1181 |
original_page_number
|
| 1182 |
-
<
|
| 1183 |
):
|
| 1184 |
# Remove the original page and insert the final page
|
| 1185 |
-
|
| 1186 |
original_page_number
|
| 1187 |
)
|
| 1188 |
-
|
| 1189 |
-
|
| 1190 |
-
from_page=
|
| 1191 |
-
to_page=
|
| 1192 |
start_at=original_page_number,
|
| 1193 |
)
|
| 1194 |
-
#
|
| 1195 |
-
|
| 1196 |
original_page_number
|
| 1197 |
-
].apply_redactions(images=
|
| 1198 |
# Clear the stored final pages
|
| 1199 |
-
delattr(redact_text_pdf, "
|
| 1200 |
|
| 1201 |
# Save final redacted PDF if we have dual outputs or if RETURN_PDF_FOR_REVIEW is False
|
| 1202 |
-
if RETURN_PDF_FOR_REVIEW is False or
|
| 1203 |
out_redacted_pdf_file_path = (
|
| 1204 |
output_folder
|
| 1205 |
+ pdf_file_name_without_ext
|
|
@@ -1211,7 +1211,7 @@ def choose_and_run_redactor(
|
|
| 1211 |
|
| 1212 |
# Use final document if available, otherwise use main document
|
| 1213 |
doc_to_save = (
|
| 1214 |
-
|
| 1215 |
)
|
| 1216 |
|
| 1217 |
if out_redacted_pdf_file_path:
|
|
@@ -2104,7 +2104,7 @@ def redact_page_with_pymupdf(
|
|
| 2104 |
Tuple[Page, dict] or Tuple[Tuple[Page, Page], dict]: A tuple containing:
|
| 2105 |
- page (Page or Tuple[Page, Page]): The PyMuPDF page object(s) with redactions applied.
|
| 2106 |
If return_pdf_end_of_redaction is True and return_pdf_for_review is True,
|
| 2107 |
-
returns a tuple of (review_page,
|
| 2108 |
- out_annotation_boxes (dict): A dictionary containing the processed annotation boxes
|
| 2109 |
for the page, including the image path.
|
| 2110 |
"""
|
|
@@ -2271,14 +2271,14 @@ def redact_page_with_pymupdf(
|
|
| 2271 |
|
| 2272 |
# Handle dual page objects if returned
|
| 2273 |
if isinstance(redact_result, tuple):
|
| 2274 |
-
page,
|
| 2275 |
# Store the final page for later use
|
| 2276 |
-
if not hasattr(redact_page_with_pymupdf, "
|
| 2277 |
-
redact_page_with_pymupdf.
|
| 2278 |
else:
|
| 2279 |
# If we already have a final page, we need to handle multiple pages
|
| 2280 |
# For now, we'll use the last final page
|
| 2281 |
-
redact_page_with_pymupdf.
|
| 2282 |
|
| 2283 |
# If whole page is to be redacted, do that here
|
| 2284 |
if redact_whole_page is True:
|
|
@@ -2286,72 +2286,71 @@ def redact_page_with_pymupdf(
|
|
| 2286 |
whole_page_img_annotation_box = redact_whole_pymupdf_page(
|
| 2287 |
rect_height, rect_width, page, custom_colours, border=5
|
| 2288 |
)
|
|
|
|
|
|
|
| 2289 |
all_image_annotation_boxes.append(whole_page_img_annotation_box)
|
| 2290 |
|
| 2291 |
# Handle dual page objects for whole page redaction if needed
|
| 2292 |
if return_pdf_end_of_redaction and return_pdf_for_review:
|
| 2293 |
# Create a copy of the page for final redaction using the same approach as redact_single_box
|
| 2294 |
|
| 2295 |
-
|
| 2296 |
-
|
| 2297 |
page.parent,
|
| 2298 |
from_page=page.number,
|
| 2299 |
to_page=page.number,
|
| 2300 |
)
|
| 2301 |
-
|
| 2302 |
|
| 2303 |
# Apply the whole page redaction to the final page as well
|
| 2304 |
redact_whole_pymupdf_page(
|
| 2305 |
-
rect_height, rect_width,
|
| 2306 |
)
|
| 2307 |
|
| 2308 |
# Store the final page with its original page number for later use
|
| 2309 |
-
if not hasattr(redact_page_with_pymupdf, "
|
| 2310 |
-
redact_page_with_pymupdf.
|
| 2311 |
else:
|
| 2312 |
# If we already have a final page, we need to handle multiple pages
|
| 2313 |
# For now, we'll use the last final page
|
| 2314 |
-
redact_page_with_pymupdf.
|
| 2315 |
|
| 2316 |
out_annotation_boxes = {
|
| 2317 |
"image": image_path, # Image.open(image_path), #image_path,
|
| 2318 |
"boxes": all_image_annotation_boxes,
|
| 2319 |
}
|
| 2320 |
|
|
|
|
| 2321 |
if return_pdf_for_review is False:
|
| 2322 |
-
# Remove text
|
| 2323 |
-
|
| 2324 |
-
page.apply_redactions(images=2, graphics=0, text=0)
|
| 2325 |
-
# else:
|
| 2326 |
-
# # Just apply the box, don't remove images or text
|
| 2327 |
-
# page.apply_redactions(images=0, graphics=0, text=1)
|
| 2328 |
|
| 2329 |
set_cropbox_safely(page, original_cropbox)
|
| 2330 |
-
# page.set_cropbox(original_cropbox)
|
| 2331 |
-
# Set CropBox to original size
|
| 2332 |
page.clean_contents()
|
| 2333 |
|
| 2334 |
# Handle dual page objects if we have a final page
|
| 2335 |
if (
|
| 2336 |
return_pdf_end_of_redaction
|
| 2337 |
and return_pdf_for_review
|
| 2338 |
-
and hasattr(redact_page_with_pymupdf, "
|
| 2339 |
):
|
| 2340 |
-
|
| 2341 |
# Handle both tuple format (new) and single page format (backward compatibility)
|
| 2342 |
-
if isinstance(
|
| 2343 |
-
|
| 2344 |
else:
|
| 2345 |
-
|
| 2346 |
|
| 2347 |
-
# Apply redactions to
|
| 2348 |
-
|
| 2349 |
-
|
| 2350 |
-
|
| 2351 |
-
|
|
|
|
| 2352 |
# Clear the stored final page
|
| 2353 |
-
delattr(redact_page_with_pymupdf, "
|
| 2354 |
-
return (page,
|
|
|
|
| 2355 |
else:
|
| 2356 |
return page, out_annotation_boxes
|
| 2357 |
|
|
@@ -3116,14 +3115,14 @@ def redact_image_pdf(
|
|
| 3116 |
|
| 3117 |
# Handle dual page objects if returned
|
| 3118 |
if isinstance(redact_result[0], tuple):
|
| 3119 |
-
(pymupdf_page,
|
| 3120 |
redact_result
|
| 3121 |
)
|
| 3122 |
# Store the final page with its original page number for later use
|
| 3123 |
-
if not hasattr(redact_image_pdf, "
|
| 3124 |
-
redact_image_pdf.
|
| 3125 |
-
redact_image_pdf.
|
| 3126 |
-
(
|
| 3127 |
)
|
| 3128 |
else:
|
| 3129 |
pymupdf_page, page_image_annotations = redact_result
|
|
@@ -4178,14 +4177,14 @@ def redact_text_pdf(
|
|
| 4178 |
|
| 4179 |
# Handle dual page objects if returned
|
| 4180 |
if isinstance(redact_result[0], tuple):
|
| 4181 |
-
(pymupdf_page,
|
| 4182 |
redact_result
|
| 4183 |
)
|
| 4184 |
# Store the final page with its original page number for later use
|
| 4185 |
-
if not hasattr(redact_text_pdf, "
|
| 4186 |
-
redact_text_pdf.
|
| 4187 |
-
redact_text_pdf.
|
| 4188 |
-
(
|
| 4189 |
)
|
| 4190 |
else:
|
| 4191 |
pymupdf_page, page_image_annotations = redact_result
|
|
@@ -4205,7 +4204,6 @@ def redact_text_pdf(
|
|
| 4205 |
# Else, user chose not to run redaction
|
| 4206 |
else:
|
| 4207 |
pass
|
| 4208 |
-
# print("Not redacting page:", page_no)
|
| 4209 |
|
| 4210 |
# Join extracted text outputs for all lines together
|
| 4211 |
if not page_text_ocr_outputs.empty:
|
|
|
|
| 404 |
if prepared_pdf_file_paths:
|
| 405 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
| 406 |
else:
|
| 407 |
+
review_out_file_paths = list()
|
| 408 |
|
| 409 |
# Choose the correct file to prepare
|
| 410 |
if isinstance(file_paths, str):
|
|
|
|
| 1095 |
|
| 1096 |
else:
|
| 1097 |
# Check if we have dual PDF documents to save
|
| 1098 |
+
applied_redaction_pymupdf_doc = None
|
| 1099 |
|
| 1100 |
if RETURN_PDF_FOR_REVIEW and RETURN_REDACTED_PDF:
|
| 1101 |
if (
|
| 1102 |
+
hasattr(redact_image_pdf, "_applied_redaction_pages")
|
| 1103 |
+
and redact_image_pdf._applied_redaction_pages
|
| 1104 |
):
|
| 1105 |
|
| 1106 |
# Create final document by copying the original document and replacing specific pages
|
| 1107 |
+
applied_redaction_pymupdf_doc = pymupdf.open()
|
| 1108 |
+
applied_redaction_pymupdf_doc.insert_pdf(pymupdf_doc)
|
| 1109 |
|
| 1110 |
# Create a mapping of original page numbers to final pages
|
| 1111 |
+
applied_redaction_pages_map = {}
|
| 1112 |
+
for applied_redaction_page_data in redact_image_pdf._applied_redaction_pages:
|
| 1113 |
+
if isinstance(applied_redaction_page_data, tuple):
|
| 1114 |
+
applied_redaction_page, original_page_number = (
|
| 1115 |
+
applied_redaction_page_data
|
| 1116 |
)
|
| 1117 |
+
applied_redaction_pages_map[original_page_number] = (
|
| 1118 |
+
applied_redaction_page
|
| 1119 |
)
|
| 1120 |
else:
|
| 1121 |
+
applied_redaction_page = applied_redaction_page_data
|
| 1122 |
+
applied_redaction_pages_map[0] = (
|
| 1123 |
+
applied_redaction_page # Default to page 0 if no original number
|
| 1124 |
)
|
| 1125 |
|
| 1126 |
# Replace pages in the final document with their final versions
|
| 1127 |
for (
|
| 1128 |
original_page_number,
|
| 1129 |
+
applied_redaction_page,
|
| 1130 |
+
) in applied_redaction_pages_map.items():
|
| 1131 |
if (
|
| 1132 |
original_page_number
|
| 1133 |
+
< applied_redaction_pymupdf_doc.page_count
|
| 1134 |
):
|
| 1135 |
# Remove the original page and insert the final page
|
| 1136 |
+
applied_redaction_pymupdf_doc.delete_page(
|
| 1137 |
original_page_number
|
| 1138 |
)
|
| 1139 |
+
applied_redaction_pymupdf_doc.insert_pdf(
|
| 1140 |
+
applied_redaction_page.parent,
|
| 1141 |
+
from_page=applied_redaction_page.number,
|
| 1142 |
+
to_page=applied_redaction_page.number,
|
| 1143 |
start_at=original_page_number,
|
| 1144 |
)
|
| 1145 |
+
# Remove text. Graphic text is effectively removed by the overlapping rectangle shape that becomes an embedded part of the document.
|
| 1146 |
+
applied_redaction_pymupdf_doc[
|
| 1147 |
original_page_number
|
| 1148 |
+
].apply_redactions(images=0, graphics=0, text=0)
|
| 1149 |
# Clear the stored final pages
|
| 1150 |
+
delattr(redact_image_pdf, "_applied_redaction_pages")
|
| 1151 |
elif (
|
| 1152 |
+
hasattr(redact_text_pdf, "_applied_redaction_pages")
|
| 1153 |
+
and redact_text_pdf._applied_redaction_pages
|
| 1154 |
):
|
| 1155 |
# Create final document by copying the original document and replacing specific pages
|
| 1156 |
+
applied_redaction_pymupdf_doc = pymupdf.open()
|
| 1157 |
+
applied_redaction_pymupdf_doc.insert_pdf(pymupdf_doc)
|
| 1158 |
|
| 1159 |
# Create a mapping of original page numbers to final pages
|
| 1160 |
+
applied_redaction_pages_map = {}
|
| 1161 |
+
for applied_redaction_page_data in redact_text_pdf._applied_redaction_pages:
|
| 1162 |
+
if isinstance(applied_redaction_page_data, tuple):
|
| 1163 |
+
applied_redaction_page, original_page_number = (
|
| 1164 |
+
applied_redaction_page_data
|
| 1165 |
)
|
| 1166 |
+
applied_redaction_pages_map[original_page_number] = (
|
| 1167 |
+
applied_redaction_page
|
| 1168 |
)
|
| 1169 |
else:
|
| 1170 |
+
applied_redaction_page = applied_redaction_page_data
|
| 1171 |
+
applied_redaction_pages_map[0] = (
|
| 1172 |
+
applied_redaction_page # Default to page 0 if no original number
|
| 1173 |
)
|
| 1174 |
|
| 1175 |
# Replace pages in the final document with their final versions
|
| 1176 |
for (
|
| 1177 |
original_page_number,
|
| 1178 |
+
applied_redaction_page,
|
| 1179 |
+
) in applied_redaction_pages_map.items():
|
| 1180 |
if (
|
| 1181 |
original_page_number
|
| 1182 |
+
< applied_redaction_pymupdf_doc.page_count
|
| 1183 |
):
|
| 1184 |
# Remove the original page and insert the final page
|
| 1185 |
+
applied_redaction_pymupdf_doc.delete_page(
|
| 1186 |
original_page_number
|
| 1187 |
)
|
| 1188 |
+
applied_redaction_pymupdf_doc.insert_pdf(
|
| 1189 |
+
applied_redaction_page.parent,
|
| 1190 |
+
from_page=applied_redaction_page.number,
|
| 1191 |
+
to_page=applied_redaction_page.number,
|
| 1192 |
start_at=original_page_number,
|
| 1193 |
)
|
| 1194 |
+
# Remove text. Graphic text is effectively removed by the overlapping rectangle shape that becomes an embedded part of the document.
|
| 1195 |
+
applied_redaction_pymupdf_doc[
|
| 1196 |
original_page_number
|
| 1197 |
+
].apply_redactions(images=0, graphics=0, text=0)
|
| 1198 |
# Clear the stored final pages
|
| 1199 |
+
delattr(redact_text_pdf, "_applied_redaction_pages")
|
| 1200 |
|
| 1201 |
# Save final redacted PDF if we have dual outputs or if RETURN_PDF_FOR_REVIEW is False
|
| 1202 |
+
if RETURN_PDF_FOR_REVIEW is False or applied_redaction_pymupdf_doc:
|
| 1203 |
out_redacted_pdf_file_path = (
|
| 1204 |
output_folder
|
| 1205 |
+ pdf_file_name_without_ext
|
|
|
|
| 1211 |
|
| 1212 |
# Use final document if available, otherwise use main document
|
| 1213 |
doc_to_save = (
|
| 1214 |
+
applied_redaction_pymupdf_doc if applied_redaction_pymupdf_doc else pymupdf_doc
|
| 1215 |
)
|
| 1216 |
|
| 1217 |
if out_redacted_pdf_file_path:
|
|
|
|
| 2104 |
Tuple[Page, dict] or Tuple[Tuple[Page, Page], dict]: A tuple containing:
|
| 2105 |
- page (Page or Tuple[Page, Page]): The PyMuPDF page object(s) with redactions applied.
|
| 2106 |
If return_pdf_end_of_redaction is True and return_pdf_for_review is True,
|
| 2107 |
+
returns a tuple of (review_page, applied_redaction_page).
|
| 2108 |
- out_annotation_boxes (dict): A dictionary containing the processed annotation boxes
|
| 2109 |
for the page, including the image path.
|
| 2110 |
"""
|
|
|
|
| 2271 |
|
| 2272 |
# Handle dual page objects if returned
|
| 2273 |
if isinstance(redact_result, tuple):
|
| 2274 |
+
page, applied_redaction_page = redact_result
|
| 2275 |
# Store the final page for later use
|
| 2276 |
+
if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
|
| 2277 |
+
redact_page_with_pymupdf._applied_redaction_page = applied_redaction_page
|
| 2278 |
else:
|
| 2279 |
# If we already have a final page, we need to handle multiple pages
|
| 2280 |
# For now, we'll use the last final page
|
| 2281 |
+
redact_page_with_pymupdf._applied_redaction_page = applied_redaction_page
|
| 2282 |
|
| 2283 |
# If whole page is to be redacted, do that here
|
| 2284 |
if redact_whole_page is True:
|
|
|
|
| 2286 |
whole_page_img_annotation_box = redact_whole_pymupdf_page(
|
| 2287 |
rect_height, rect_width, page, custom_colours, border=5
|
| 2288 |
)
|
| 2289 |
+
# Ensure the whole page annotation box has a unique ID
|
| 2290 |
+
whole_page_img_annotation_box = fill_missing_box_ids(whole_page_img_annotation_box)
|
| 2291 |
all_image_annotation_boxes.append(whole_page_img_annotation_box)
|
| 2292 |
|
| 2293 |
# Handle dual page objects for whole page redaction if needed
|
| 2294 |
if return_pdf_end_of_redaction and return_pdf_for_review:
|
| 2295 |
# Create a copy of the page for final redaction using the same approach as redact_single_box
|
| 2296 |
|
| 2297 |
+
applied_redaction_doc = pymupdf.open()
|
| 2298 |
+
applied_redaction_doc.insert_pdf(
|
| 2299 |
page.parent,
|
| 2300 |
from_page=page.number,
|
| 2301 |
to_page=page.number,
|
| 2302 |
)
|
| 2303 |
+
applied_redaction_page = applied_redaction_doc[0]
|
| 2304 |
|
| 2305 |
# Apply the whole page redaction to the final page as well
|
| 2306 |
redact_whole_pymupdf_page(
|
| 2307 |
+
rect_height, rect_width, applied_redaction_page, custom_colours, border=5
|
| 2308 |
)
|
| 2309 |
|
| 2310 |
# Store the final page with its original page number for later use
|
| 2311 |
+
if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
|
| 2312 |
+
redact_page_with_pymupdf._applied_redaction_page = (applied_redaction_page, page.number)
|
| 2313 |
else:
|
| 2314 |
# If we already have a final page, we need to handle multiple pages
|
| 2315 |
# For now, we'll use the last final page
|
| 2316 |
+
redact_page_with_pymupdf._applied_redaction_page = (applied_redaction_page, page.number)
|
| 2317 |
|
| 2318 |
out_annotation_boxes = {
|
| 2319 |
"image": image_path, # Image.open(image_path), #image_path,
|
| 2320 |
"boxes": all_image_annotation_boxes,
|
| 2321 |
}
|
| 2322 |
|
| 2323 |
+
# If we are not returning the review page, can directly remove text and all images
|
| 2324 |
if return_pdf_for_review is False:
|
| 2325 |
+
# Remove text. Graphic text is effectively removed by the overlapping rectangle shape that becomes an embedded part of the document.
|
| 2326 |
+
page.apply_redactions(images=0, graphics=0, text=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2327 |
|
| 2328 |
set_cropbox_safely(page, original_cropbox)
|
|
|
|
|
|
|
| 2329 |
page.clean_contents()
|
| 2330 |
|
| 2331 |
# Handle dual page objects if we have a final page
|
| 2332 |
if (
|
| 2333 |
return_pdf_end_of_redaction
|
| 2334 |
and return_pdf_for_review
|
| 2335 |
+
and hasattr(redact_page_with_pymupdf, "_applied_redaction_page")
|
| 2336 |
):
|
| 2337 |
+
applied_redaction_page_data = redact_page_with_pymupdf._applied_redaction_page
|
| 2338 |
# Handle both tuple format (new) and single page format (backward compatibility)
|
| 2339 |
+
if isinstance(applied_redaction_page_data, tuple):
|
| 2340 |
+
applied_redaction_page, original_page_number = applied_redaction_page_data
|
| 2341 |
else:
|
| 2342 |
+
applied_redaction_page = applied_redaction_page_data
|
| 2343 |
|
| 2344 |
+
# Apply redactions to applied redaction page only
|
| 2345 |
+
# Remove text. Graphic text is effectively removed by the overlapping rectangle shape that becomes an embedded part of the document.
|
| 2346 |
+
applied_redaction_page.apply_redactions(images=0, graphics=0, text=0)
|
| 2347 |
+
|
| 2348 |
+
set_cropbox_safely(applied_redaction_page, original_cropbox)
|
| 2349 |
+
applied_redaction_page.clean_contents()
|
| 2350 |
# Clear the stored final page
|
| 2351 |
+
delattr(redact_page_with_pymupdf, "_applied_redaction_page")
|
| 2352 |
+
return (page, applied_redaction_page), out_annotation_boxes
|
| 2353 |
+
|
| 2354 |
else:
|
| 2355 |
return page, out_annotation_boxes
|
| 2356 |
|
|
|
|
| 3115 |
|
| 3116 |
# Handle dual page objects if returned
|
| 3117 |
if isinstance(redact_result[0], tuple):
|
| 3118 |
+
(pymupdf_page, pymupdf_applied_redaction_page), page_image_annotations = (
|
| 3119 |
redact_result
|
| 3120 |
)
|
| 3121 |
# Store the final page with its original page number for later use
|
| 3122 |
+
if not hasattr(redact_image_pdf, "_applied_redaction_pages"):
|
| 3123 |
+
redact_image_pdf._applied_redaction_pages = list()
|
| 3124 |
+
redact_image_pdf._applied_redaction_pages.append(
|
| 3125 |
+
(pymupdf_applied_redaction_page, page_no)
|
| 3126 |
)
|
| 3127 |
else:
|
| 3128 |
pymupdf_page, page_image_annotations = redact_result
|
|
|
|
| 4177 |
|
| 4178 |
# Handle dual page objects if returned
|
| 4179 |
if isinstance(redact_result[0], tuple):
|
| 4180 |
+
(pymupdf_page, pymupdf_applied_redaction_page), page_image_annotations = (
|
| 4181 |
redact_result
|
| 4182 |
)
|
| 4183 |
# Store the final page with its original page number for later use
|
| 4184 |
+
if not hasattr(redact_text_pdf, "_applied_redaction_pages"):
|
| 4185 |
+
redact_text_pdf._applied_redaction_pages = list()
|
| 4186 |
+
redact_text_pdf._applied_redaction_pages.append(
|
| 4187 |
+
(pymupdf_applied_redaction_page, page_no)
|
| 4188 |
)
|
| 4189 |
else:
|
| 4190 |
pymupdf_page, page_image_annotations = redact_result
|
|
|
|
| 4204 |
# Else, user chose not to run redaction
|
| 4205 |
else:
|
| 4206 |
pass
|
|
|
|
| 4207 |
|
| 4208 |
# Join extracted text outputs for all lines together
|
| 4209 |
if not page_text_ocr_outputs.empty:
|
tools/find_duplicate_pages.py
CHANGED
|
@@ -462,9 +462,6 @@ def combine_ocr_dataframes(
|
|
| 462 |
output_files = list()
|
| 463 |
if output_folder and output_filename:
|
| 464 |
# Validate path safety before creating directories and files
|
| 465 |
-
print(
|
| 466 |
-
f"DEBUG: Validating output_folder='{output_folder}' against OUTPUT_FOLDER='{OUTPUT_FOLDER}'"
|
| 467 |
-
)
|
| 468 |
if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
|
| 469 |
raise ValueError(f"Unsafe output folder path: {output_folder}")
|
| 470 |
if not validate_path_safety(output_filename):
|
|
@@ -659,9 +656,6 @@ def save_results_and_redaction_lists(
|
|
| 659 |
list: A list of paths to all generated files.
|
| 660 |
"""
|
| 661 |
# Validate the output_folder path for security
|
| 662 |
-
print(
|
| 663 |
-
f"DEBUG: Validating output_folder='{output_folder}' against OUTPUT_FOLDER='{OUTPUT_FOLDER}'"
|
| 664 |
-
)
|
| 665 |
if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
|
| 666 |
raise ValueError(f"Invalid or unsafe output folder path: {output_folder}")
|
| 667 |
|
|
@@ -671,9 +665,6 @@ def save_results_and_redaction_lists(
|
|
| 671 |
try:
|
| 672 |
output_folder_path = Path(output_folder).resolve()
|
| 673 |
# Validate that the resolved path is within the trusted OUTPUT_FOLDER using robust containment check
|
| 674 |
-
print(
|
| 675 |
-
f"DEBUG: Validating resolved path='{output_folder_path}' against OUTPUT_FOLDER='{OUTPUT_FOLDER}'"
|
| 676 |
-
)
|
| 677 |
if not validate_folder_containment(str(output_folder_path), OUTPUT_FOLDER):
|
| 678 |
raise ValueError(
|
| 679 |
f"Output folder path {output_folder} is outside the trusted directory {OUTPUT_FOLDER}"
|
|
@@ -1092,7 +1083,7 @@ def run_duplicate_analysis(
|
|
| 1092 |
|
| 1093 |
progress(0, desc="Combining input files...")
|
| 1094 |
df_combined, _, full_out_ocr_df = combine_ocr_output_text(
|
| 1095 |
-
files, combine_pages=combine_pages
|
| 1096 |
)
|
| 1097 |
|
| 1098 |
if df_combined.empty:
|
|
|
|
| 462 |
output_files = list()
|
| 463 |
if output_folder and output_filename:
|
| 464 |
# Validate path safety before creating directories and files
|
|
|
|
|
|
|
|
|
|
| 465 |
if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
|
| 466 |
raise ValueError(f"Unsafe output folder path: {output_folder}")
|
| 467 |
if not validate_path_safety(output_filename):
|
|
|
|
| 656 |
list: A list of paths to all generated files.
|
| 657 |
"""
|
| 658 |
# Validate the output_folder path for security
|
|
|
|
|
|
|
|
|
|
| 659 |
if not validate_folder_containment(output_folder, OUTPUT_FOLDER):
|
| 660 |
raise ValueError(f"Invalid or unsafe output folder path: {output_folder}")
|
| 661 |
|
|
|
|
| 665 |
try:
|
| 666 |
output_folder_path = Path(output_folder).resolve()
|
| 667 |
# Validate that the resolved path is within the trusted OUTPUT_FOLDER using robust containment check
|
|
|
|
|
|
|
|
|
|
| 668 |
if not validate_folder_containment(str(output_folder_path), OUTPUT_FOLDER):
|
| 669 |
raise ValueError(
|
| 670 |
f"Output folder path {output_folder} is outside the trusted directory {OUTPUT_FOLDER}"
|
|
|
|
| 1083 |
|
| 1084 |
progress(0, desc="Combining input files...")
|
| 1085 |
df_combined, _, full_out_ocr_df = combine_ocr_output_text(
|
| 1086 |
+
files, combine_pages=combine_pages, output_folder=output_folder
|
| 1087 |
)
|
| 1088 |
|
| 1089 |
if df_combined.empty:
|
tools/secure_path_utils.py
CHANGED
|
@@ -311,14 +311,6 @@ def validate_folder_containment(
|
|
| 311 |
path_str = str(normalized_path).lower()
|
| 312 |
base_str = str(normalized_base).lower()
|
| 313 |
|
| 314 |
-
print(
|
| 315 |
-
f"DEBUG: validate_folder_containment called with path='{path}' base_path='{base_path}'"
|
| 316 |
-
)
|
| 317 |
-
print(
|
| 318 |
-
f"DEBUG: normalized_path='{normalized_path}' normalized_base='{normalized_base}'"
|
| 319 |
-
)
|
| 320 |
-
print(f"DEBUG: path_str='{path_str}' base_str='{base_str}'")
|
| 321 |
-
|
| 322 |
# Check if this is a test scenario
|
| 323 |
is_test_path = any(
|
| 324 |
test_pattern in path_str
|
|
|
|
| 311 |
path_str = str(normalized_path).lower()
|
| 312 |
base_str = str(normalized_base).lower()
|
| 313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
# Check if this is a test scenario
|
| 315 |
is_test_path = any(
|
| 316 |
test_pattern in path_str
|