Commit
·
5fcccbe
1
Parent(s):
e27231e
Expanded checks for out of range page cropboxes
Browse files- .dockerignore +1 -0
- .gitignore +1 -0
- tools/config.py +2 -2
- tools/file_redaction.py +56 -8
- tools/textract_batch_call.py +1 -5
.dockerignore
CHANGED
|
@@ -19,4 +19,5 @@ logs/*
|
|
| 19 |
config/*
|
| 20 |
user_guide/*
|
| 21 |
cdk/*
|
|
|
|
| 22 |
web/*
|
|
|
|
| 19 |
config/*
|
| 20 |
user_guide/*
|
| 21 |
cdk/*
|
| 22 |
+
cdk/config/*
|
| 23 |
web/*
|
.gitignore
CHANGED
|
@@ -20,4 +20,5 @@ config/*
|
|
| 20 |
doc_redaction_amplify_app/*
|
| 21 |
user_guide/*
|
| 22 |
cdk/*
|
|
|
|
| 23 |
web/*
|
|
|
|
| 20 |
doc_redaction_amplify_app/*
|
| 21 |
user_guide/*
|
| 22 |
cdk/*
|
| 23 |
+
cdk/config/*
|
| 24 |
web/*
|
tools/config.py
CHANGED
|
@@ -212,8 +212,8 @@ if LOGGING == 'True':
|
|
| 212 |
###
|
| 213 |
|
| 214 |
# Create Tesseract and Poppler folders if you have installed them locally
|
| 215 |
-
TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
|
| 216 |
-
POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # e.g. poppler/poppler-24.02.0/Library/bin/
|
| 217 |
|
| 218 |
if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
|
| 219 |
if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
|
|
|
|
| 212 |
###
|
| 213 |
|
| 214 |
# Create Tesseract and Poppler folders if you have installed them locally
|
| 215 |
+
TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # If installing on Windows, install Tesseract 5.5.0 from https://github.com/UB-Mannheim/tesseract/wiki, then point this environment variable at the Tesseract folder, e.g. tesseract/
|
| 216 |
+
POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # If installing on Windows, install Poppler from https://github.com/oschwartz10612/poppler-windows, then point this environment variable at the Poppler bin folder, e.g. poppler/poppler-24.02.0/Library/bin/
|
| 217 |
|
| 218 |
if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
|
| 219 |
if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
|
tools/file_redaction.py
CHANGED
|
@@ -8,7 +8,7 @@ import copy
|
|
| 8 |
|
| 9 |
from tqdm import tqdm
|
| 10 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
| 11 |
-
from typing import List, Dict, Tuple
|
| 12 |
import pandas as pd
|
| 13 |
|
| 14 |
from pdfminer.high_level import extract_pages
|
|
@@ -932,22 +932,70 @@ def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict,
|
|
| 932 |
|
| 933 |
return img_annotation_box, rect
|
| 934 |
|
| 935 |
-
def set_cropbox_safely(page, original_cropbox):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 936 |
"""
|
| 937 |
-
Sets the cropbox of a page
|
| 938 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 939 |
|
| 940 |
Args:
|
| 941 |
-
page: The
|
| 942 |
-
original_cropbox: The
|
| 943 |
"""
|
| 944 |
mediabox = page.mediabox
|
| 945 |
-
|
| 946 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 947 |
page.set_cropbox(mediabox)
|
| 948 |
else:
|
| 949 |
page.set_cropbox(original_cropbox)
|
| 950 |
|
|
|
|
| 951 |
def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
|
| 952 |
|
| 953 |
rect_height = page.rect.height
|
|
|
|
| 8 |
|
| 9 |
from tqdm import tqdm
|
| 10 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
| 11 |
+
from typing import List, Dict, Tuple, Optional
|
| 12 |
import pandas as pd
|
| 13 |
|
| 14 |
from pdfminer.high_level import extract_pages
|
|
|
|
| 932 |
|
| 933 |
return img_annotation_box, rect
|
| 934 |
|
| 935 |
+
# def set_cropbox_safely(page, original_cropbox):
|
| 936 |
+
# """
|
| 937 |
+
# Sets the cropbox of a page, ensuring it's not larger than the mediabox.
|
| 938 |
+
# If the original cropbox is larger, the mediabox is used instead.
|
| 939 |
+
|
| 940 |
+
# Args:
|
| 941 |
+
# page: The PyMuPdf page object.
|
| 942 |
+
# original_cropbox: The fitz.Rect representing the desired cropbox.
|
| 943 |
+
# """
|
| 944 |
+
# mediabox = page.mediabox
|
| 945 |
+
# if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
|
| 946 |
+
# #print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
|
| 947 |
+
# page.set_cropbox(mediabox)
|
| 948 |
+
# else:
|
| 949 |
+
# page.set_cropbox(original_cropbox)
|
| 950 |
+
|
| 951 |
+
|
| 952 |
+
def set_cropbox_safely(page: "Page", original_cropbox: "Optional[Rect]"):
    """
    Sets the cropbox of a PyMuPDF page safely and defensively.

    If 'original_cropbox' is valid (i.e. a Rect instance, not None, not empty,
    not infinite, and fully contained within the page's mediabox), a normalised
    copy of it is set as the cropbox.

    Otherwise, the page's mediabox is used, and a warning is printed to explain why.

    The rectangle passed in by the caller is never modified: all validation is
    performed on a normalised copy.

    Args:
        page: The PyMuPDF page object.
        original_cropbox: The Rect representing the desired cropbox, or None.
    """
    mediabox = page.mediabox
    reason_for_defaulting = ""
    cropbox = None

    if original_cropbox is None:
        reason_for_defaulting = "the original cropbox is None."
    elif not isinstance(original_cropbox, Rect):
        reason_for_defaulting = f"the original cropbox is not a fitz.Rect instance (got {type(original_cropbox)})."
    else:
        # Rect.normalize() works in place, so normalise a copy (ensuring
        # x0 < x1 and y0 < y1) rather than silently rewriting the caller's object.
        cropbox = Rect(original_cropbox)
        cropbox.normalize()

        # Reject anything the page could not accept as a cropbox.
        if cropbox.is_empty:
            reason_for_defaulting = f"the provided original cropbox {cropbox} is empty."
        elif cropbox.is_infinite:
            reason_for_defaulting = f"the provided original cropbox {cropbox} is infinite."
        elif not mediabox.contains(cropbox):
            reason_for_defaulting = (
                f"the provided original cropbox {cropbox} is not fully contained "
                f"within the page's mediabox {mediabox}."
            )

    if reason_for_defaulting:
        print(
            f"Warning (Page {page.number}): Cannot use original cropbox because {reason_for_defaulting} "
            f"Defaulting to the page's mediabox as the cropbox."
        )
        page.set_cropbox(mediabox)
    else:
        page.set_cropbox(cropbox)
|
| 997 |
|
| 998 |
+
|
| 999 |
def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
|
| 1000 |
|
| 1001 |
rect_height = page.rect.height
|
tools/textract_batch_call.py
CHANGED
|
@@ -338,19 +338,15 @@ def load_pdf_job_file_from_s3(
|
|
| 338 |
RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS):
|
| 339 |
|
| 340 |
try:
|
| 341 |
-
print("load_s3_jobs_input_loc:", load_s3_jobs_input_loc)
|
| 342 |
pdf_file_location = ''
|
| 343 |
doc_file_name_no_extension_textbox = ''
|
| 344 |
|
| 345 |
s3_input_key_prefix = os.path.join(load_s3_jobs_input_loc, pdf_filename).replace("\\", "/")
|
| 346 |
s3_input_key_prefix = s3_input_key_prefix + ".pdf"
|
| 347 |
-
|
| 348 |
-
|
| 349 |
local_input_file_path = os.path.join(local_output_dir, pdf_filename)
|
| 350 |
local_input_file_path = local_input_file_path + ".pdf"
|
| 351 |
|
| 352 |
-
print("input to s3 download:", s3_bucket_name, s3_input_key_prefix, local_input_file_path)
|
| 353 |
-
|
| 354 |
download_file_from_s3(s3_bucket_name, s3_input_key_prefix, local_input_file_path, RUN_AWS_FUNCTIONS= RUN_AWS_FUNCTIONS)
|
| 355 |
|
| 356 |
pdf_file_location = [local_input_file_path]
|
|
|
|
| 338 |
RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS):
|
| 339 |
|
| 340 |
try:
|
|
|
|
| 341 |
pdf_file_location = ''
|
| 342 |
doc_file_name_no_extension_textbox = ''
|
| 343 |
|
| 344 |
s3_input_key_prefix = os.path.join(load_s3_jobs_input_loc, pdf_filename).replace("\\", "/")
|
| 345 |
s3_input_key_prefix = s3_input_key_prefix + ".pdf"
|
| 346 |
+
|
|
|
|
| 347 |
local_input_file_path = os.path.join(local_output_dir, pdf_filename)
|
| 348 |
local_input_file_path = local_input_file_path + ".pdf"
|
| 349 |
|
|
|
|
|
|
|
| 350 |
download_file_from_s3(s3_bucket_name, s3_input_key_prefix, local_input_file_path, RUN_AWS_FUNCTIONS= RUN_AWS_FUNCTIONS)
|
| 351 |
|
| 352 |
pdf_file_location = [local_input_file_path]
|