Commit · 150a8d9
1 Parent(s): 5086da0
Fixed linting issues
Browse files
- app.py +60 -45
- tools/file_conversion.py +9 -5
- tools/file_redaction.py +55 -24
app.py
CHANGED
|
@@ -278,44 +278,44 @@ in_redact_comprehend_entities = gr.Dropdown(
|
|
| 278 |
)
|
| 279 |
|
| 280 |
in_deny_list = gr.File(
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
|
| 286 |
in_deny_list_state = gr.Dataframe(
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
|
| 300 |
in_fully_redacted_list = gr.File(
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
|
| 306 |
in_fully_redacted_list_state = gr.Dataframe(
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
|
| 320 |
|
| 321 |
## Deduplication examples
|
|
@@ -1101,22 +1101,37 @@ with blocks:
|
|
| 1101 |
)
|
| 1102 |
example_labels.append(
|
| 1103 |
"PDF redaction with AWS services and signature detection"
|
| 1104 |
-
)
|
| 1105 |
|
| 1106 |
# Add new example for custom deny list and whole page redaction
|
| 1107 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1108 |
available_examples.append(
|
| 1109 |
[
|
| 1110 |
[example_files[3]],
|
| 1111 |
"Local OCR model - PDFs without selectable text",
|
| 1112 |
"Local",
|
| 1113 |
[],
|
| 1114 |
-
[
|
|
|
|
|
|
|
| 1115 |
CHOSEN_COMPREHEND_ENTITIES,
|
| 1116 |
[example_files[3]],
|
| 1117 |
example_files[3],
|
| 1118 |
[example_files[4]],
|
| 1119 |
-
pd.DataFrame(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1120 |
[example_files[5]],
|
| 1121 |
pd.DataFrame(data={"fully_redacted_pages_list": [2, 5]}),
|
| 1122 |
]
|
|
@@ -1137,7 +1152,7 @@ with blocks:
|
|
| 1137 |
in_redact_comprehend_entities,
|
| 1138 |
prepared_pdf_state,
|
| 1139 |
doc_full_file_name_textbox,
|
| 1140 |
-
in_deny_list,
|
| 1141 |
in_deny_list_state,
|
| 1142 |
in_fully_redacted_list,
|
| 1143 |
in_fully_redacted_list_state,
|
|
@@ -2153,7 +2168,7 @@ with blocks:
|
|
| 2153 |
value="## Please give feedback", visible=False
|
| 2154 |
)
|
| 2155 |
data_feedback_radio = gr.Radio(
|
| 2156 |
-
label="Please give some feedback about the results of the redaction.
|
| 2157 |
choices=["The results were good", "The results were not good"],
|
| 2158 |
visible=False,
|
| 2159 |
show_label=True,
|
|
@@ -2180,11 +2195,11 @@ with blocks:
|
|
| 2180 |
in_allow_list_text = gr.Textbox(
|
| 2181 |
label="Custom allow list load status"
|
| 2182 |
)
|
| 2183 |
-
with gr.Column():
|
| 2184 |
-
in_deny_list.render()
|
| 2185 |
in_deny_list_text = gr.Textbox(label="Custom deny list load status")
|
| 2186 |
with gr.Column():
|
| 2187 |
-
in_fully_redacted_list.render()
|
| 2188 |
in_fully_redacted_list_text = gr.Textbox(
|
| 2189 |
label="Fully redacted page list load status"
|
| 2190 |
)
|
|
@@ -2206,10 +2221,10 @@ with blocks:
|
|
| 2206 |
show_copy_button=True,
|
| 2207 |
wrap=True,
|
| 2208 |
)
|
| 2209 |
-
|
| 2210 |
-
in_deny_list_state.render()
|
| 2211 |
-
|
| 2212 |
-
in_fully_redacted_list_state.render()
|
| 2213 |
with gr.Row():
|
| 2214 |
with gr.Column(scale=2):
|
| 2215 |
markdown_placeholder = gr.Markdown("")
|
|
|
|
| 278 |
)
|
| 279 |
|
| 280 |
in_deny_list = gr.File(
|
| 281 |
+
label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.",
|
| 282 |
+
file_count="multiple",
|
| 283 |
+
height=FILE_INPUT_HEIGHT,
|
| 284 |
+
)
|
| 285 |
|
| 286 |
in_deny_list_state = gr.Dataframe(
|
| 287 |
+
value=pd.DataFrame(),
|
| 288 |
+
headers=["deny_list"],
|
| 289 |
+
col_count=(1, "fixed"),
|
| 290 |
+
row_count=(0, "dynamic"),
|
| 291 |
+
label="Deny list",
|
| 292 |
+
visible=True,
|
| 293 |
+
type="pandas",
|
| 294 |
+
interactive=True,
|
| 295 |
+
show_fullscreen_button=True,
|
| 296 |
+
show_copy_button=True,
|
| 297 |
+
wrap=True,
|
| 298 |
+
)
|
| 299 |
|
| 300 |
in_fully_redacted_list = gr.File(
|
| 301 |
+
label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.",
|
| 302 |
+
file_count="multiple",
|
| 303 |
+
height=FILE_INPUT_HEIGHT,
|
| 304 |
+
)
|
| 305 |
|
| 306 |
in_fully_redacted_list_state = gr.Dataframe(
|
| 307 |
+
value=pd.DataFrame(),
|
| 308 |
+
headers=["fully_redacted_pages_list"],
|
| 309 |
+
col_count=(1, "fixed"),
|
| 310 |
+
row_count=(0, "dynamic"),
|
| 311 |
+
label="Fully redacted pages",
|
| 312 |
+
visible=True,
|
| 313 |
+
type="pandas",
|
| 314 |
+
interactive=True,
|
| 315 |
+
show_fullscreen_button=True,
|
| 316 |
+
show_copy_button=True,
|
| 317 |
+
wrap=True,
|
| 318 |
+
)
|
| 319 |
|
| 320 |
|
| 321 |
## Deduplication examples
|
|
|
|
| 1101 |
)
|
| 1102 |
example_labels.append(
|
| 1103 |
"PDF redaction with AWS services and signature detection"
|
| 1104 |
+
)
|
| 1105 |
|
| 1106 |
# Add new example for custom deny list and whole page redaction
|
| 1107 |
+
if (
|
| 1108 |
+
os.path.exists(example_files[3])
|
| 1109 |
+
and os.path.exists(example_files[4])
|
| 1110 |
+
and os.path.exists(example_files[5])
|
| 1111 |
+
):
|
| 1112 |
available_examples.append(
|
| 1113 |
[
|
| 1114 |
[example_files[3]],
|
| 1115 |
"Local OCR model - PDFs without selectable text",
|
| 1116 |
"Local",
|
| 1117 |
[],
|
| 1118 |
+
[
|
| 1119 |
+
"CUSTOM"
|
| 1120 |
+
], # Use CUSTOM entity to enable deny list functionality
|
| 1121 |
CHOSEN_COMPREHEND_ENTITIES,
|
| 1122 |
[example_files[3]],
|
| 1123 |
example_files[3],
|
| 1124 |
[example_files[4]],
|
| 1125 |
+
pd.DataFrame(
|
| 1126 |
+
data={
|
| 1127 |
+
"deny_list": [
|
| 1128 |
+
"Sister",
|
| 1129 |
+
"Sister City",
|
| 1130 |
+
"Sister Cities",
|
| 1131 |
+
"Friendship City",
|
| 1132 |
+
]
|
| 1133 |
+
}
|
| 1134 |
+
),
|
| 1135 |
[example_files[5]],
|
| 1136 |
pd.DataFrame(data={"fully_redacted_pages_list": [2, 5]}),
|
| 1137 |
]
|
|
|
|
| 1152 |
in_redact_comprehend_entities,
|
| 1153 |
prepared_pdf_state,
|
| 1154 |
doc_full_file_name_textbox,
|
| 1155 |
+
in_deny_list,
|
| 1156 |
in_deny_list_state,
|
| 1157 |
in_fully_redacted_list,
|
| 1158 |
in_fully_redacted_list_state,
|
|
|
|
| 2168 |
value="## Please give feedback", visible=False
|
| 2169 |
)
|
| 2170 |
data_feedback_radio = gr.Radio(
|
| 2171 |
+
label="Please give some feedback about the results of the redaction.",
|
| 2172 |
choices=["The results were good", "The results were not good"],
|
| 2173 |
visible=False,
|
| 2174 |
show_label=True,
|
|
|
|
| 2195 |
in_allow_list_text = gr.Textbox(
|
| 2196 |
label="Custom allow list load status"
|
| 2197 |
)
|
| 2198 |
+
with gr.Column():
|
| 2199 |
+
in_deny_list.render() # Defined at beginning of file
|
| 2200 |
in_deny_list_text = gr.Textbox(label="Custom deny list load status")
|
| 2201 |
with gr.Column():
|
| 2202 |
+
in_fully_redacted_list.render() # Defined at beginning of file
|
| 2203 |
in_fully_redacted_list_text = gr.Textbox(
|
| 2204 |
label="Fully redacted page list load status"
|
| 2205 |
)
|
|
|
|
| 2221 |
show_copy_button=True,
|
| 2222 |
wrap=True,
|
| 2223 |
)
|
| 2224 |
+
|
| 2225 |
+
in_deny_list_state.render() # Defined at beginning of file
|
| 2226 |
+
|
| 2227 |
+
in_fully_redacted_list_state.render() # Defined at beginning of file
|
| 2228 |
with gr.Row():
|
| 2229 |
with gr.Column(scale=2):
|
| 2230 |
markdown_placeholder = gr.Markdown("")
|
tools/file_conversion.py
CHANGED
|
@@ -583,7 +583,9 @@ def redact_single_box(
|
|
| 583 |
pymupdf_y2 = pymupdf_rect[3]
|
| 584 |
|
| 585 |
# Full size redaction box for covering all the text of a word
|
| 586 |
-
full_size_redaction_box = Rect(
|
|
|
|
|
|
|
| 587 |
|
| 588 |
# Calculate tiny height redaction box so that it doesn't delete text from adjacent lines
|
| 589 |
redact_bottom_y = pymupdf_y1 + 2
|
|
@@ -620,7 +622,7 @@ def redact_single_box(
|
|
| 620 |
applied_redaction_page = applied_redaction_page[0]
|
| 621 |
|
| 622 |
# Handle review page first, then deal with final redacted page (retain_text = True)
|
| 623 |
-
if retain_text is True:
|
| 624 |
|
| 625 |
annot = pymupdf_page.add_redact_annot(full_size_redaction_box)
|
| 626 |
annot.set_colors(stroke=out_colour, fill=out_colour, colors=out_colour)
|
|
@@ -636,7 +638,7 @@ def redact_single_box(
|
|
| 636 |
|
| 637 |
# If we need both review and final pages, and the applied redaction page has been prepared, apply final redaction to the copy
|
| 638 |
if return_pdf_end_of_redaction and applied_redaction_page is not None:
|
| 639 |
-
# Apply final redaction to the copy
|
| 640 |
|
| 641 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
| 642 |
applied_redaction_page.add_redact_annot(rect_small_pixel_height)
|
|
@@ -654,7 +656,7 @@ def redact_single_box(
|
|
| 654 |
return pymupdf_page
|
| 655 |
|
| 656 |
# If we don't need to retain the text, we only have one page which is the applied redaction page, so just apply the redaction to the page
|
| 657 |
-
else:
|
| 658 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
| 659 |
pymupdf_page.add_redact_annot(rect_small_pixel_height)
|
| 660 |
|
|
@@ -794,7 +796,9 @@ def redact_whole_pymupdf_page(
|
|
| 794 |
|
| 795 |
# Calculate relative coordinates for the annotation box (0-1 range)
|
| 796 |
# This ensures the coordinates are already in relative format for output files
|
| 797 |
-
relative_border = border / min(
|
|
|
|
|
|
|
| 798 |
relative_x1 = relative_border
|
| 799 |
relative_y1 = relative_border
|
| 800 |
relative_x2 = 1 - relative_border
|
|
|
|
| 583 |
pymupdf_y2 = pymupdf_rect[3]
|
| 584 |
|
| 585 |
# Full size redaction box for covering all the text of a word
|
| 586 |
+
full_size_redaction_box = Rect(
|
| 587 |
+
pymupdf_x1 - 1, pymupdf_y1 - 1, pymupdf_x2 + 1, pymupdf_y2 + 1
|
| 588 |
+
)
|
| 589 |
|
| 590 |
# Calculate tiny height redaction box so that it doesn't delete text from adjacent lines
|
| 591 |
redact_bottom_y = pymupdf_y1 + 2
|
|
|
|
| 622 |
applied_redaction_page = applied_redaction_page[0]
|
| 623 |
|
| 624 |
# Handle review page first, then deal with final redacted page (retain_text = True)
|
| 625 |
+
if retain_text is True:
|
| 626 |
|
| 627 |
annot = pymupdf_page.add_redact_annot(full_size_redaction_box)
|
| 628 |
annot.set_colors(stroke=out_colour, fill=out_colour, colors=out_colour)
|
|
|
|
| 638 |
|
| 639 |
# If we need both review and final pages, and the applied redaction page has been prepared, apply final redaction to the copy
|
| 640 |
if return_pdf_end_of_redaction and applied_redaction_page is not None:
|
| 641 |
+
# Apply final redaction to the copy
|
| 642 |
|
| 643 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
| 644 |
applied_redaction_page.add_redact_annot(rect_small_pixel_height)
|
|
|
|
| 656 |
return pymupdf_page
|
| 657 |
|
| 658 |
# If we don't need to retain the text, we only have one page which is the applied redaction page, so just apply the redaction to the page
|
| 659 |
+
else:
|
| 660 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
| 661 |
pymupdf_page.add_redact_annot(rect_small_pixel_height)
|
| 662 |
|
|
|
|
| 796 |
|
| 797 |
# Calculate relative coordinates for the annotation box (0-1 range)
|
| 798 |
# This ensures the coordinates are already in relative format for output files
|
| 799 |
+
relative_border = border / min(
|
| 800 |
+
rect_width, rect_height
|
| 801 |
+
) # Scale border proportionally
|
| 802 |
relative_x1 = relative_border
|
| 803 |
relative_y1 = relative_border
|
| 804 |
relative_x2 = 1 - relative_border
|
tools/file_redaction.py
CHANGED
|
@@ -1109,16 +1109,20 @@ def choose_and_run_redactor(
|
|
| 1109 |
|
| 1110 |
# Create a mapping of original page numbers to final pages
|
| 1111 |
applied_redaction_pages_map = {}
|
| 1112 |
-
for
|
|
|
|
|
|
|
| 1113 |
if isinstance(applied_redaction_page_data, tuple):
|
| 1114 |
applied_redaction_page, original_page_number = (
|
| 1115 |
applied_redaction_page_data
|
| 1116 |
)
|
| 1117 |
-
applied_redaction_pages_map[
|
| 1118 |
-
|
| 1119 |
-
|
| 1120 |
else:
|
| 1121 |
-
applied_redaction_page =
|
|
|
|
|
|
|
| 1122 |
applied_redaction_pages_map[0] = (
|
| 1123 |
applied_redaction_page # Default to page 0 if no original number
|
| 1124 |
)
|
|
@@ -1158,16 +1162,20 @@ def choose_and_run_redactor(
|
|
| 1158 |
|
| 1159 |
# Create a mapping of original page numbers to final pages
|
| 1160 |
applied_redaction_pages_map = {}
|
| 1161 |
-
for
|
|
|
|
|
|
|
| 1162 |
if isinstance(applied_redaction_page_data, tuple):
|
| 1163 |
applied_redaction_page, original_page_number = (
|
| 1164 |
applied_redaction_page_data
|
| 1165 |
)
|
| 1166 |
-
applied_redaction_pages_map[
|
| 1167 |
-
|
| 1168 |
-
|
| 1169 |
else:
|
| 1170 |
-
applied_redaction_page =
|
|
|
|
|
|
|
| 1171 |
applied_redaction_pages_map[0] = (
|
| 1172 |
applied_redaction_page # Default to page 0 if no original number
|
| 1173 |
)
|
|
@@ -1199,7 +1207,10 @@ def choose_and_run_redactor(
|
|
| 1199 |
delattr(redact_text_pdf, "_applied_redaction_pages")
|
| 1200 |
|
| 1201 |
# Save final redacted PDF if we have dual outputs or if RETURN_PDF_FOR_REVIEW is False
|
| 1202 |
-
if
|
|
|
|
|
|
|
|
|
|
| 1203 |
out_redacted_pdf_file_path = (
|
| 1204 |
output_folder
|
| 1205 |
+ pdf_file_name_without_ext
|
|
@@ -1211,7 +1222,9 @@ def choose_and_run_redactor(
|
|
| 1211 |
|
| 1212 |
# Use final document if available, otherwise use main document
|
| 1213 |
doc_to_save = (
|
| 1214 |
-
applied_redaction_pymupdf_doc
|
|
|
|
|
|
|
| 1215 |
)
|
| 1216 |
|
| 1217 |
if out_redacted_pdf_file_path:
|
|
@@ -2274,11 +2287,15 @@ def redact_page_with_pymupdf(
|
|
| 2274 |
page, applied_redaction_page = redact_result
|
| 2275 |
# Store the final page for later use
|
| 2276 |
if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
|
| 2277 |
-
redact_page_with_pymupdf._applied_redaction_page =
|
|
|
|
|
|
|
| 2278 |
else:
|
| 2279 |
# If we already have a final page, we need to handle multiple pages
|
| 2280 |
# For now, we'll use the last final page
|
| 2281 |
-
redact_page_with_pymupdf._applied_redaction_page =
|
|
|
|
|
|
|
| 2282 |
|
| 2283 |
# If whole page is to be redacted, do that here
|
| 2284 |
if redact_whole_page is True:
|
|
@@ -2287,7 +2304,9 @@ def redact_page_with_pymupdf(
|
|
| 2287 |
rect_height, rect_width, page, custom_colours, border=5
|
| 2288 |
)
|
| 2289 |
# Ensure the whole page annotation box has a unique ID
|
| 2290 |
-
whole_page_img_annotation_box = fill_missing_box_ids(
|
|
|
|
|
|
|
| 2291 |
all_image_annotation_boxes.append(whole_page_img_annotation_box)
|
| 2292 |
|
| 2293 |
# Handle dual page objects for whole page redaction if needed
|
|
@@ -2304,16 +2323,26 @@ def redact_page_with_pymupdf(
|
|
| 2304 |
|
| 2305 |
# Apply the whole page redaction to the final page as well
|
| 2306 |
redact_whole_pymupdf_page(
|
| 2307 |
-
rect_height,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2308 |
)
|
| 2309 |
|
| 2310 |
# Store the final page with its original page number for later use
|
| 2311 |
if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
|
| 2312 |
-
redact_page_with_pymupdf._applied_redaction_page = (
|
|
|
|
|
|
|
|
|
|
| 2313 |
else:
|
| 2314 |
# If we already have a final page, we need to handle multiple pages
|
| 2315 |
# For now, we'll use the last final page
|
| 2316 |
-
redact_page_with_pymupdf._applied_redaction_page = (
|
|
|
|
|
|
|
|
|
|
| 2317 |
|
| 2318 |
out_annotation_boxes = {
|
| 2319 |
"image": image_path, # Image.open(image_path), #image_path,
|
|
@@ -3115,9 +3144,10 @@ def redact_image_pdf(
|
|
| 3115 |
|
| 3116 |
# Handle dual page objects if returned
|
| 3117 |
if isinstance(redact_result[0], tuple):
|
| 3118 |
-
(
|
| 3119 |
-
|
| 3120 |
-
|
|
|
|
| 3121 |
# Store the final page with its original page number for later use
|
| 3122 |
if not hasattr(redact_image_pdf, "_applied_redaction_pages"):
|
| 3123 |
redact_image_pdf._applied_redaction_pages = list()
|
|
@@ -4177,9 +4207,10 @@ def redact_text_pdf(
|
|
| 4177 |
|
| 4178 |
# Handle dual page objects if returned
|
| 4179 |
if isinstance(redact_result[0], tuple):
|
| 4180 |
-
(
|
| 4181 |
-
|
| 4182 |
-
|
|
|
|
| 4183 |
# Store the final page with its original page number for later use
|
| 4184 |
if not hasattr(redact_text_pdf, "_applied_redaction_pages"):
|
| 4185 |
redact_text_pdf._applied_redaction_pages = list()
|
|
|
|
| 1109 |
|
| 1110 |
# Create a mapping of original page numbers to final pages
|
| 1111 |
applied_redaction_pages_map = {}
|
| 1112 |
+
for (
|
| 1113 |
+
applied_redaction_page_data
|
| 1114 |
+
) in redact_image_pdf._applied_redaction_pages:
|
| 1115 |
if isinstance(applied_redaction_page_data, tuple):
|
| 1116 |
applied_redaction_page, original_page_number = (
|
| 1117 |
applied_redaction_page_data
|
| 1118 |
)
|
| 1119 |
+
applied_redaction_pages_map[
|
| 1120 |
+
original_page_number
|
| 1121 |
+
] = applied_redaction_page
|
| 1122 |
else:
|
| 1123 |
+
applied_redaction_page = (
|
| 1124 |
+
applied_redaction_page_data
|
| 1125 |
+
)
|
| 1126 |
applied_redaction_pages_map[0] = (
|
| 1127 |
applied_redaction_page # Default to page 0 if no original number
|
| 1128 |
)
|
|
|
|
| 1162 |
|
| 1163 |
# Create a mapping of original page numbers to final pages
|
| 1164 |
applied_redaction_pages_map = {}
|
| 1165 |
+
for (
|
| 1166 |
+
applied_redaction_page_data
|
| 1167 |
+
) in redact_text_pdf._applied_redaction_pages:
|
| 1168 |
if isinstance(applied_redaction_page_data, tuple):
|
| 1169 |
applied_redaction_page, original_page_number = (
|
| 1170 |
applied_redaction_page_data
|
| 1171 |
)
|
| 1172 |
+
applied_redaction_pages_map[
|
| 1173 |
+
original_page_number
|
| 1174 |
+
] = applied_redaction_page
|
| 1175 |
else:
|
| 1176 |
+
applied_redaction_page = (
|
| 1177 |
+
applied_redaction_page_data
|
| 1178 |
+
)
|
| 1179 |
applied_redaction_pages_map[0] = (
|
| 1180 |
applied_redaction_page # Default to page 0 if no original number
|
| 1181 |
)
|
|
|
|
| 1207 |
delattr(redact_text_pdf, "_applied_redaction_pages")
|
| 1208 |
|
| 1209 |
# Save final redacted PDF if we have dual outputs or if RETURN_PDF_FOR_REVIEW is False
|
| 1210 |
+
if (
|
| 1211 |
+
RETURN_PDF_FOR_REVIEW is False
|
| 1212 |
+
or applied_redaction_pymupdf_doc
|
| 1213 |
+
):
|
| 1214 |
out_redacted_pdf_file_path = (
|
| 1215 |
output_folder
|
| 1216 |
+ pdf_file_name_without_ext
|
|
|
|
| 1222 |
|
| 1223 |
# Use final document if available, otherwise use main document
|
| 1224 |
doc_to_save = (
|
| 1225 |
+
applied_redaction_pymupdf_doc
|
| 1226 |
+
if applied_redaction_pymupdf_doc
|
| 1227 |
+
else pymupdf_doc
|
| 1228 |
)
|
| 1229 |
|
| 1230 |
if out_redacted_pdf_file_path:
|
|
|
|
| 2287 |
page, applied_redaction_page = redact_result
|
| 2288 |
# Store the final page for later use
|
| 2289 |
if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
|
| 2290 |
+
redact_page_with_pymupdf._applied_redaction_page = (
|
| 2291 |
+
applied_redaction_page
|
| 2292 |
+
)
|
| 2293 |
else:
|
| 2294 |
# If we already have a final page, we need to handle multiple pages
|
| 2295 |
# For now, we'll use the last final page
|
| 2296 |
+
redact_page_with_pymupdf._applied_redaction_page = (
|
| 2297 |
+
applied_redaction_page
|
| 2298 |
+
)
|
| 2299 |
|
| 2300 |
# If whole page is to be redacted, do that here
|
| 2301 |
if redact_whole_page is True:
|
|
|
|
| 2304 |
rect_height, rect_width, page, custom_colours, border=5
|
| 2305 |
)
|
| 2306 |
# Ensure the whole page annotation box has a unique ID
|
| 2307 |
+
whole_page_img_annotation_box = fill_missing_box_ids(
|
| 2308 |
+
whole_page_img_annotation_box
|
| 2309 |
+
)
|
| 2310 |
all_image_annotation_boxes.append(whole_page_img_annotation_box)
|
| 2311 |
|
| 2312 |
# Handle dual page objects for whole page redaction if needed
|
|
|
|
| 2323 |
|
| 2324 |
# Apply the whole page redaction to the final page as well
|
| 2325 |
redact_whole_pymupdf_page(
|
| 2326 |
+
rect_height,
|
| 2327 |
+
rect_width,
|
| 2328 |
+
applied_redaction_page,
|
| 2329 |
+
custom_colours,
|
| 2330 |
+
border=5,
|
| 2331 |
)
|
| 2332 |
|
| 2333 |
# Store the final page with its original page number for later use
|
| 2334 |
if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
|
| 2335 |
+
redact_page_with_pymupdf._applied_redaction_page = (
|
| 2336 |
+
applied_redaction_page,
|
| 2337 |
+
page.number,
|
| 2338 |
+
)
|
| 2339 |
else:
|
| 2340 |
# If we already have a final page, we need to handle multiple pages
|
| 2341 |
# For now, we'll use the last final page
|
| 2342 |
+
redact_page_with_pymupdf._applied_redaction_page = (
|
| 2343 |
+
applied_redaction_page,
|
| 2344 |
+
page.number,
|
| 2345 |
+
)
|
| 2346 |
|
| 2347 |
out_annotation_boxes = {
|
| 2348 |
"image": image_path, # Image.open(image_path), #image_path,
|
|
|
|
| 3144 |
|
| 3145 |
# Handle dual page objects if returned
|
| 3146 |
if isinstance(redact_result[0], tuple):
|
| 3147 |
+
(
|
| 3148 |
+
pymupdf_page,
|
| 3149 |
+
pymupdf_applied_redaction_page,
|
| 3150 |
+
), page_image_annotations = redact_result
|
| 3151 |
# Store the final page with its original page number for later use
|
| 3152 |
if not hasattr(redact_image_pdf, "_applied_redaction_pages"):
|
| 3153 |
redact_image_pdf._applied_redaction_pages = list()
|
|
|
|
| 4207 |
|
| 4208 |
# Handle dual page objects if returned
|
| 4209 |
if isinstance(redact_result[0], tuple):
|
| 4210 |
+
(
|
| 4211 |
+
pymupdf_page,
|
| 4212 |
+
pymupdf_applied_redaction_page,
|
| 4213 |
+
), page_image_annotations = redact_result
|
| 4214 |
# Store the final page with its original page number for later use
|
| 4215 |
if not hasattr(redact_text_pdf, "_applied_redaction_pages"):
|
| 4216 |
redact_text_pdf._applied_redaction_pages = list()
|