seanpedrickcase committed on
Commit
150a8d9
·
1 Parent(s): 5086da0

Fixed linting issues

Browse files
Files changed (3) hide show
  1. app.py +60 -45
  2. tools/file_conversion.py +9 -5
  3. tools/file_redaction.py +55 -24
app.py CHANGED
@@ -278,44 +278,44 @@ in_redact_comprehend_entities = gr.Dropdown(
278
  )
279
 
280
  in_deny_list = gr.File(
281
- label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.",
282
- file_count="multiple",
283
- height=FILE_INPUT_HEIGHT,
284
- )
285
 
286
  in_deny_list_state = gr.Dataframe(
287
- value=pd.DataFrame(),
288
- headers=["deny_list"],
289
- col_count=(1, "fixed"),
290
- row_count=(0, "dynamic"),
291
- label="Deny list",
292
- visible=True,
293
- type="pandas",
294
- interactive=True,
295
- show_fullscreen_button=True,
296
- show_copy_button=True,
297
- wrap=True,
298
- )
299
 
300
  in_fully_redacted_list = gr.File(
301
- label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.",
302
- file_count="multiple",
303
- height=FILE_INPUT_HEIGHT,
304
- )
305
 
306
  in_fully_redacted_list_state = gr.Dataframe(
307
- value=pd.DataFrame(),
308
- headers=["fully_redacted_pages_list"],
309
- col_count=(1, "fixed"),
310
- row_count=(0, "dynamic"),
311
- label="Fully redacted pages",
312
- visible=True,
313
- type="pandas",
314
- interactive=True,
315
- show_fullscreen_button=True,
316
- show_copy_button=True,
317
- wrap=True,
318
- )
319
 
320
 
321
  ## Deduplication examples
@@ -1101,22 +1101,37 @@ with blocks:
1101
  )
1102
  example_labels.append(
1103
  "PDF redaction with AWS services and signature detection"
1104
- )
1105
 
1106
  # Add new example for custom deny list and whole page redaction
1107
- if os.path.exists(example_files[3]) and os.path.exists(example_files[4]) and os.path.exists(example_files[5]):
 
 
 
 
1108
  available_examples.append(
1109
  [
1110
  [example_files[3]],
1111
  "Local OCR model - PDFs without selectable text",
1112
  "Local",
1113
  [],
1114
- ["CUSTOM"], # Use CUSTOM entity to enable deny list functionality
 
 
1115
  CHOSEN_COMPREHEND_ENTITIES,
1116
  [example_files[3]],
1117
  example_files[3],
1118
  [example_files[4]],
1119
- pd.DataFrame(data={"deny_list": ["Sister", "Sister City", "Sister Cities", "Friendship City"]}),
 
 
 
 
 
 
 
 
 
1120
  [example_files[5]],
1121
  pd.DataFrame(data={"fully_redacted_pages_list": [2, 5]}),
1122
  ]
@@ -1137,7 +1152,7 @@ with blocks:
1137
  in_redact_comprehend_entities,
1138
  prepared_pdf_state,
1139
  doc_full_file_name_textbox,
1140
- in_deny_list,
1141
  in_deny_list_state,
1142
  in_fully_redacted_list,
1143
  in_fully_redacted_list_state,
@@ -2153,7 +2168,7 @@ with blocks:
2153
  value="## Please give feedback", visible=False
2154
  )
2155
  data_feedback_radio = gr.Radio(
2156
- label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
2157
  choices=["The results were good", "The results were not good"],
2158
  visible=False,
2159
  show_label=True,
@@ -2180,11 +2195,11 @@ with blocks:
2180
  in_allow_list_text = gr.Textbox(
2181
  label="Custom allow list load status"
2182
  )
2183
- with gr.Column():
2184
- in_deny_list.render() # Defined at beginning of file
2185
  in_deny_list_text = gr.Textbox(label="Custom deny list load status")
2186
  with gr.Column():
2187
- in_fully_redacted_list.render() # Defined at beginning of file
2188
  in_fully_redacted_list_text = gr.Textbox(
2189
  label="Fully redacted page list load status"
2190
  )
@@ -2206,10 +2221,10 @@ with blocks:
2206
  show_copy_button=True,
2207
  wrap=True,
2208
  )
2209
-
2210
- in_deny_list_state.render() # Defined at beginning of file
2211
-
2212
- in_fully_redacted_list_state.render() # Defined at beginning of file
2213
  with gr.Row():
2214
  with gr.Column(scale=2):
2215
  markdown_placeholder = gr.Markdown("")
 
278
  )
279
 
280
  in_deny_list = gr.File(
281
+ label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.",
282
+ file_count="multiple",
283
+ height=FILE_INPUT_HEIGHT,
284
+ )
285
 
286
  in_deny_list_state = gr.Dataframe(
287
+ value=pd.DataFrame(),
288
+ headers=["deny_list"],
289
+ col_count=(1, "fixed"),
290
+ row_count=(0, "dynamic"),
291
+ label="Deny list",
292
+ visible=True,
293
+ type="pandas",
294
+ interactive=True,
295
+ show_fullscreen_button=True,
296
+ show_copy_button=True,
297
+ wrap=True,
298
+ )
299
 
300
  in_fully_redacted_list = gr.File(
301
+ label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.",
302
+ file_count="multiple",
303
+ height=FILE_INPUT_HEIGHT,
304
+ )
305
 
306
  in_fully_redacted_list_state = gr.Dataframe(
307
+ value=pd.DataFrame(),
308
+ headers=["fully_redacted_pages_list"],
309
+ col_count=(1, "fixed"),
310
+ row_count=(0, "dynamic"),
311
+ label="Fully redacted pages",
312
+ visible=True,
313
+ type="pandas",
314
+ interactive=True,
315
+ show_fullscreen_button=True,
316
+ show_copy_button=True,
317
+ wrap=True,
318
+ )
319
 
320
 
321
  ## Deduplication examples
 
1101
  )
1102
  example_labels.append(
1103
  "PDF redaction with AWS services and signature detection"
1104
+ )
1105
 
1106
  # Add new example for custom deny list and whole page redaction
1107
+ if (
1108
+ os.path.exists(example_files[3])
1109
+ and os.path.exists(example_files[4])
1110
+ and os.path.exists(example_files[5])
1111
+ ):
1112
  available_examples.append(
1113
  [
1114
  [example_files[3]],
1115
  "Local OCR model - PDFs without selectable text",
1116
  "Local",
1117
  [],
1118
+ [
1119
+ "CUSTOM"
1120
+ ], # Use CUSTOM entity to enable deny list functionality
1121
  CHOSEN_COMPREHEND_ENTITIES,
1122
  [example_files[3]],
1123
  example_files[3],
1124
  [example_files[4]],
1125
+ pd.DataFrame(
1126
+ data={
1127
+ "deny_list": [
1128
+ "Sister",
1129
+ "Sister City",
1130
+ "Sister Cities",
1131
+ "Friendship City",
1132
+ ]
1133
+ }
1134
+ ),
1135
  [example_files[5]],
1136
  pd.DataFrame(data={"fully_redacted_pages_list": [2, 5]}),
1137
  ]
 
1152
  in_redact_comprehend_entities,
1153
  prepared_pdf_state,
1154
  doc_full_file_name_textbox,
1155
+ in_deny_list,
1156
  in_deny_list_state,
1157
  in_fully_redacted_list,
1158
  in_fully_redacted_list_state,
 
2168
  value="## Please give feedback", visible=False
2169
  )
2170
  data_feedback_radio = gr.Radio(
2171
+ label="Please give some feedback about the results of the redaction.",
2172
  choices=["The results were good", "The results were not good"],
2173
  visible=False,
2174
  show_label=True,
 
2195
  in_allow_list_text = gr.Textbox(
2196
  label="Custom allow list load status"
2197
  )
2198
+ with gr.Column():
2199
+ in_deny_list.render() # Defined at beginning of file
2200
  in_deny_list_text = gr.Textbox(label="Custom deny list load status")
2201
  with gr.Column():
2202
+ in_fully_redacted_list.render() # Defined at beginning of file
2203
  in_fully_redacted_list_text = gr.Textbox(
2204
  label="Fully redacted page list load status"
2205
  )
 
2221
  show_copy_button=True,
2222
  wrap=True,
2223
  )
2224
+
2225
+ in_deny_list_state.render() # Defined at beginning of file
2226
+
2227
+ in_fully_redacted_list_state.render() # Defined at beginning of file
2228
  with gr.Row():
2229
  with gr.Column(scale=2):
2230
  markdown_placeholder = gr.Markdown("")
tools/file_conversion.py CHANGED
@@ -583,7 +583,9 @@ def redact_single_box(
583
  pymupdf_y2 = pymupdf_rect[3]
584
 
585
  # Full size redaction box for covering all the text of a word
586
- full_size_redaction_box = Rect(pymupdf_x1-1, pymupdf_y1-1, pymupdf_x2+1, pymupdf_y2+1)
 
 
587
 
588
  # Calculate tiny height redaction box so that it doesn't delete text from adjacent lines
589
  redact_bottom_y = pymupdf_y1 + 2
@@ -620,7 +622,7 @@ def redact_single_box(
620
  applied_redaction_page = applied_redaction_page[0]
621
 
622
  # Handle review page first, then deal with final redacted page (retain_text = True)
623
- if retain_text is True:
624
 
625
  annot = pymupdf_page.add_redact_annot(full_size_redaction_box)
626
  annot.set_colors(stroke=out_colour, fill=out_colour, colors=out_colour)
@@ -636,7 +638,7 @@ def redact_single_box(
636
 
637
  # If we need both review and final pages, and the applied redaction page has been prepared, apply final redaction to the copy
638
  if return_pdf_end_of_redaction and applied_redaction_page is not None:
639
- # Apply final redaction to the copy
640
 
641
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
642
  applied_redaction_page.add_redact_annot(rect_small_pixel_height)
@@ -654,7 +656,7 @@ def redact_single_box(
654
  return pymupdf_page
655
 
656
  # If we don't need to retain the text, we only have one page which is the applied redaction page, so just apply the redaction to the page
657
- else:
658
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
659
  pymupdf_page.add_redact_annot(rect_small_pixel_height)
660
 
@@ -794,7 +796,9 @@ def redact_whole_pymupdf_page(
794
 
795
  # Calculate relative coordinates for the annotation box (0-1 range)
796
  # This ensures the coordinates are already in relative format for output files
797
- relative_border = border / min(rect_width, rect_height) # Scale border proportionally
 
 
798
  relative_x1 = relative_border
799
  relative_y1 = relative_border
800
  relative_x2 = 1 - relative_border
 
583
  pymupdf_y2 = pymupdf_rect[3]
584
 
585
  # Full size redaction box for covering all the text of a word
586
+ full_size_redaction_box = Rect(
587
+ pymupdf_x1 - 1, pymupdf_y1 - 1, pymupdf_x2 + 1, pymupdf_y2 + 1
588
+ )
589
 
590
  # Calculate tiny height redaction box so that it doesn't delete text from adjacent lines
591
  redact_bottom_y = pymupdf_y1 + 2
 
622
  applied_redaction_page = applied_redaction_page[0]
623
 
624
  # Handle review page first, then deal with final redacted page (retain_text = True)
625
+ if retain_text is True:
626
 
627
  annot = pymupdf_page.add_redact_annot(full_size_redaction_box)
628
  annot.set_colors(stroke=out_colour, fill=out_colour, colors=out_colour)
 
638
 
639
  # If we need both review and final pages, and the applied redaction page has been prepared, apply final redaction to the copy
640
  if return_pdf_end_of_redaction and applied_redaction_page is not None:
641
+ # Apply final redaction to the copy
642
 
643
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
644
  applied_redaction_page.add_redact_annot(rect_small_pixel_height)
 
656
  return pymupdf_page
657
 
658
  # If we don't need to retain the text, we only have one page which is the applied redaction page, so just apply the redaction to the page
659
+ else:
660
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
661
  pymupdf_page.add_redact_annot(rect_small_pixel_height)
662
 
 
796
 
797
  # Calculate relative coordinates for the annotation box (0-1 range)
798
  # This ensures the coordinates are already in relative format for output files
799
+ relative_border = border / min(
800
+ rect_width, rect_height
801
+ ) # Scale border proportionally
802
  relative_x1 = relative_border
803
  relative_y1 = relative_border
804
  relative_x2 = 1 - relative_border
tools/file_redaction.py CHANGED
@@ -1109,16 +1109,20 @@ def choose_and_run_redactor(
1109
 
1110
  # Create a mapping of original page numbers to final pages
1111
  applied_redaction_pages_map = {}
1112
- for applied_redaction_page_data in redact_image_pdf._applied_redaction_pages:
 
 
1113
  if isinstance(applied_redaction_page_data, tuple):
1114
  applied_redaction_page, original_page_number = (
1115
  applied_redaction_page_data
1116
  )
1117
- applied_redaction_pages_map[original_page_number] = (
1118
- applied_redaction_page
1119
- )
1120
  else:
1121
- applied_redaction_page = applied_redaction_page_data
 
 
1122
  applied_redaction_pages_map[0] = (
1123
  applied_redaction_page # Default to page 0 if no original number
1124
  )
@@ -1158,16 +1162,20 @@ def choose_and_run_redactor(
1158
 
1159
  # Create a mapping of original page numbers to final pages
1160
  applied_redaction_pages_map = {}
1161
- for applied_redaction_page_data in redact_text_pdf._applied_redaction_pages:
 
 
1162
  if isinstance(applied_redaction_page_data, tuple):
1163
  applied_redaction_page, original_page_number = (
1164
  applied_redaction_page_data
1165
  )
1166
- applied_redaction_pages_map[original_page_number] = (
1167
- applied_redaction_page
1168
- )
1169
  else:
1170
- applied_redaction_page = applied_redaction_page_data
 
 
1171
  applied_redaction_pages_map[0] = (
1172
  applied_redaction_page # Default to page 0 if no original number
1173
  )
@@ -1199,7 +1207,10 @@ def choose_and_run_redactor(
1199
  delattr(redact_text_pdf, "_applied_redaction_pages")
1200
 
1201
  # Save final redacted PDF if we have dual outputs or if RETURN_PDF_FOR_REVIEW is False
1202
- if RETURN_PDF_FOR_REVIEW is False or applied_redaction_pymupdf_doc:
 
 
 
1203
  out_redacted_pdf_file_path = (
1204
  output_folder
1205
  + pdf_file_name_without_ext
@@ -1211,7 +1222,9 @@ def choose_and_run_redactor(
1211
 
1212
  # Use final document if available, otherwise use main document
1213
  doc_to_save = (
1214
- applied_redaction_pymupdf_doc if applied_redaction_pymupdf_doc else pymupdf_doc
 
 
1215
  )
1216
 
1217
  if out_redacted_pdf_file_path:
@@ -2274,11 +2287,15 @@ def redact_page_with_pymupdf(
2274
  page, applied_redaction_page = redact_result
2275
  # Store the final page for later use
2276
  if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
2277
- redact_page_with_pymupdf._applied_redaction_page = applied_redaction_page
 
 
2278
  else:
2279
  # If we already have a final page, we need to handle multiple pages
2280
  # For now, we'll use the last final page
2281
- redact_page_with_pymupdf._applied_redaction_page = applied_redaction_page
 
 
2282
 
2283
  # If whole page is to be redacted, do that here
2284
  if redact_whole_page is True:
@@ -2287,7 +2304,9 @@ def redact_page_with_pymupdf(
2287
  rect_height, rect_width, page, custom_colours, border=5
2288
  )
2289
  # Ensure the whole page annotation box has a unique ID
2290
- whole_page_img_annotation_box = fill_missing_box_ids(whole_page_img_annotation_box)
 
 
2291
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
2292
 
2293
  # Handle dual page objects for whole page redaction if needed
@@ -2304,16 +2323,26 @@ def redact_page_with_pymupdf(
2304
 
2305
  # Apply the whole page redaction to the final page as well
2306
  redact_whole_pymupdf_page(
2307
- rect_height, rect_width, applied_redaction_page, custom_colours, border=5
 
 
 
 
2308
  )
2309
 
2310
  # Store the final page with its original page number for later use
2311
  if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
2312
- redact_page_with_pymupdf._applied_redaction_page = (applied_redaction_page, page.number)
 
 
 
2313
  else:
2314
  # If we already have a final page, we need to handle multiple pages
2315
  # For now, we'll use the last final page
2316
- redact_page_with_pymupdf._applied_redaction_page = (applied_redaction_page, page.number)
 
 
 
2317
 
2318
  out_annotation_boxes = {
2319
  "image": image_path, # Image.open(image_path), #image_path,
@@ -3115,9 +3144,10 @@ def redact_image_pdf(
3115
 
3116
  # Handle dual page objects if returned
3117
  if isinstance(redact_result[0], tuple):
3118
- (pymupdf_page, pymupdf_applied_redaction_page), page_image_annotations = (
3119
- redact_result
3120
- )
 
3121
  # Store the final page with its original page number for later use
3122
  if not hasattr(redact_image_pdf, "_applied_redaction_pages"):
3123
  redact_image_pdf._applied_redaction_pages = list()
@@ -4177,9 +4207,10 @@ def redact_text_pdf(
4177
 
4178
  # Handle dual page objects if returned
4179
  if isinstance(redact_result[0], tuple):
4180
- (pymupdf_page, pymupdf_applied_redaction_page), page_image_annotations = (
4181
- redact_result
4182
- )
 
4183
  # Store the final page with its original page number for later use
4184
  if not hasattr(redact_text_pdf, "_applied_redaction_pages"):
4185
  redact_text_pdf._applied_redaction_pages = list()
 
1109
 
1110
  # Create a mapping of original page numbers to final pages
1111
  applied_redaction_pages_map = {}
1112
+ for (
1113
+ applied_redaction_page_data
1114
+ ) in redact_image_pdf._applied_redaction_pages:
1115
  if isinstance(applied_redaction_page_data, tuple):
1116
  applied_redaction_page, original_page_number = (
1117
  applied_redaction_page_data
1118
  )
1119
+ applied_redaction_pages_map[
1120
+ original_page_number
1121
+ ] = applied_redaction_page
1122
  else:
1123
+ applied_redaction_page = (
1124
+ applied_redaction_page_data
1125
+ )
1126
  applied_redaction_pages_map[0] = (
1127
  applied_redaction_page # Default to page 0 if no original number
1128
  )
 
1162
 
1163
  # Create a mapping of original page numbers to final pages
1164
  applied_redaction_pages_map = {}
1165
+ for (
1166
+ applied_redaction_page_data
1167
+ ) in redact_text_pdf._applied_redaction_pages:
1168
  if isinstance(applied_redaction_page_data, tuple):
1169
  applied_redaction_page, original_page_number = (
1170
  applied_redaction_page_data
1171
  )
1172
+ applied_redaction_pages_map[
1173
+ original_page_number
1174
+ ] = applied_redaction_page
1175
  else:
1176
+ applied_redaction_page = (
1177
+ applied_redaction_page_data
1178
+ )
1179
  applied_redaction_pages_map[0] = (
1180
  applied_redaction_page # Default to page 0 if no original number
1181
  )
 
1207
  delattr(redact_text_pdf, "_applied_redaction_pages")
1208
 
1209
  # Save final redacted PDF if we have dual outputs or if RETURN_PDF_FOR_REVIEW is False
1210
+ if (
1211
+ RETURN_PDF_FOR_REVIEW is False
1212
+ or applied_redaction_pymupdf_doc
1213
+ ):
1214
  out_redacted_pdf_file_path = (
1215
  output_folder
1216
  + pdf_file_name_without_ext
 
1222
 
1223
  # Use final document if available, otherwise use main document
1224
  doc_to_save = (
1225
+ applied_redaction_pymupdf_doc
1226
+ if applied_redaction_pymupdf_doc
1227
+ else pymupdf_doc
1228
  )
1229
 
1230
  if out_redacted_pdf_file_path:
 
2287
  page, applied_redaction_page = redact_result
2288
  # Store the final page for later use
2289
  if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
2290
+ redact_page_with_pymupdf._applied_redaction_page = (
2291
+ applied_redaction_page
2292
+ )
2293
  else:
2294
  # If we already have a final page, we need to handle multiple pages
2295
  # For now, we'll use the last final page
2296
+ redact_page_with_pymupdf._applied_redaction_page = (
2297
+ applied_redaction_page
2298
+ )
2299
 
2300
  # If whole page is to be redacted, do that here
2301
  if redact_whole_page is True:
 
2304
  rect_height, rect_width, page, custom_colours, border=5
2305
  )
2306
  # Ensure the whole page annotation box has a unique ID
2307
+ whole_page_img_annotation_box = fill_missing_box_ids(
2308
+ whole_page_img_annotation_box
2309
+ )
2310
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
2311
 
2312
  # Handle dual page objects for whole page redaction if needed
 
2323
 
2324
  # Apply the whole page redaction to the final page as well
2325
  redact_whole_pymupdf_page(
2326
+ rect_height,
2327
+ rect_width,
2328
+ applied_redaction_page,
2329
+ custom_colours,
2330
+ border=5,
2331
  )
2332
 
2333
  # Store the final page with its original page number for later use
2334
  if not hasattr(redact_page_with_pymupdf, "_applied_redaction_page"):
2335
+ redact_page_with_pymupdf._applied_redaction_page = (
2336
+ applied_redaction_page,
2337
+ page.number,
2338
+ )
2339
  else:
2340
  # If we already have a final page, we need to handle multiple pages
2341
  # For now, we'll use the last final page
2342
+ redact_page_with_pymupdf._applied_redaction_page = (
2343
+ applied_redaction_page,
2344
+ page.number,
2345
+ )
2346
 
2347
  out_annotation_boxes = {
2348
  "image": image_path, # Image.open(image_path), #image_path,
 
3144
 
3145
  # Handle dual page objects if returned
3146
  if isinstance(redact_result[0], tuple):
3147
+ (
3148
+ pymupdf_page,
3149
+ pymupdf_applied_redaction_page,
3150
+ ), page_image_annotations = redact_result
3151
  # Store the final page with its original page number for later use
3152
  if not hasattr(redact_image_pdf, "_applied_redaction_pages"):
3153
  redact_image_pdf._applied_redaction_pages = list()
 
4207
 
4208
  # Handle dual page objects if returned
4209
  if isinstance(redact_result[0], tuple):
4210
+ (
4211
+ pymupdf_page,
4212
+ pymupdf_applied_redaction_page,
4213
+ ), page_image_annotations = redact_result
4214
  # Store the final page with its original page number for later use
4215
  if not hasattr(redact_text_pdf, "_applied_redaction_pages"):
4216
  redact_text_pdf._applied_redaction_pages = list()