Commit
·
f5b280d
1
Parent(s):
f957846
Fixes to test suites. Minor default package changes (paddleocr not required)
Browse files
- .github/scripts/setup_test_data.py +9 -1
- .github/{README.md → workflow_README.md} +0 -0
- .github/workflows/ci.yml +3 -3
- .github/workflows/multi-os-test.yml +1 -1
- .github/workflows/test.yml +27 -16
- requirements.txt +3 -2
- test/demo_single_test.py +1 -1
.github/scripts/setup_test_data.py
CHANGED
|
@@ -8,6 +8,14 @@ import os
|
|
| 8 |
|
| 9 |
import pandas as pd
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def create_directories():
|
| 13 |
"""Create necessary directories."""
|
|
@@ -152,7 +160,7 @@ def create_ocr_output():
|
|
| 152 |
"top": [0.95, 0.92, 0.88],
|
| 153 |
"width": [0.05, 0.02, 0.02],
|
| 154 |
"height": [0.01, 0.02, 0.02],
|
| 155 |
-
"line": [1, 2, 3]
|
| 156 |
}
|
| 157 |
df = pd.DataFrame(ocr_data)
|
| 158 |
df.to_csv(
|
|
|
|
| 8 |
|
| 9 |
import pandas as pd
|
| 10 |
|
| 11 |
+
# Install reportlab if not available
|
| 12 |
+
try:
|
| 13 |
+
import reportlab
|
| 14 |
+
except ImportError:
|
| 15 |
+
import subprocess
|
| 16 |
+
subprocess.check_call(['pip', 'install', 'reportlab'])
|
| 17 |
+
import reportlab
|
| 18 |
+
|
| 19 |
|
| 20 |
def create_directories():
|
| 21 |
"""Create necessary directories."""
|
|
|
|
| 160 |
"top": [0.95, 0.92, 0.88],
|
| 161 |
"width": [0.05, 0.02, 0.02],
|
| 162 |
"height": [0.01, 0.02, 0.02],
|
| 163 |
+
"line": [1, 2, 3]
|
| 164 |
}
|
| 165 |
df = pd.DataFrame(ocr_data)
|
| 166 |
df.to_csv(
|
.github/{README.md → workflow_README.md}
RENAMED
|
File without changes
|
.github/workflows/ci.yml
CHANGED
|
@@ -186,9 +186,9 @@ jobs:
|
|
| 186 |
python -m pip install --upgrade pip
|
| 187 |
pip install safety bandit
|
| 188 |
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
|
| 193 |
- name: Run bandit security check
|
| 194 |
run: |
|
|
|
|
| 186 |
python -m pip install --upgrade pip
|
| 187 |
pip install safety bandit
|
| 188 |
|
| 189 |
+
#- name: Run safety scan - removed as now requires login
|
| 190 |
+
# run: |
|
| 191 |
+
# safety scan -r requirements.txt
|
| 192 |
|
| 193 |
- name: Run bandit security check
|
| 194 |
run: |
|
.github/workflows/multi-os-test.yml
CHANGED
|
@@ -16,7 +16,7 @@ jobs:
|
|
| 16 |
strategy:
|
| 17 |
matrix:
|
| 18 |
os: [ubuntu-latest, macos-latest] # windows-latest removed for now as I have not been able to install tesseract on Windows using this method
|
| 19 |
-
python-version: ["3.
|
| 20 |
exclude:
|
| 21 |
# Exclude some combinations to reduce CI time
|
| 22 |
#- os: windows-latest
|
|
|
|
| 16 |
strategy:
|
| 17 |
matrix:
|
| 18 |
os: [ubuntu-latest, macos-latest] # windows-latest removed for now as I have not been able to install tesseract on Windows using this method
|
| 19 |
+
python-version: ["3.11", "3.12", "3.13"]
|
| 20 |
exclude:
|
| 21 |
# Exclude some combinations to reduce CI time
|
| 22 |
#- os: windows-latest
|
.github/workflows/test.yml
CHANGED
|
@@ -16,7 +16,7 @@ jobs:
|
|
| 16 |
runs-on: ubuntu-latest
|
| 17 |
strategy:
|
| 18 |
matrix:
|
| 19 |
-
python-version: [3.
|
| 20 |
|
| 21 |
steps:
|
| 22 |
- uses: actions/checkout@v4
|
|
@@ -48,7 +48,7 @@ jobs:
|
|
| 48 |
run: |
|
| 49 |
python -m pip install --upgrade pip
|
| 50 |
pip install -r requirements.txt
|
| 51 |
-
pip install pytest pytest-cov pytest-html pytest-xdist
|
| 52 |
|
| 53 |
- name: Download spaCy model
|
| 54 |
run: |
|
|
@@ -64,16 +64,18 @@ jobs:
|
|
| 64 |
# Create dummy PDF for testing
|
| 65 |
python -c "
|
| 66 |
import os
|
| 67 |
-
|
| 68 |
-
from reportlab.lib.pagesizes import letter
|
| 69 |
-
|
| 70 |
# Install reportlab if not available
|
| 71 |
try:
|
| 72 |
-
|
| 73 |
except ImportError:
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
# Create dummy PDF
|
| 79 |
c = canvas.Canvas('example_data/example_of_emails_sent_to_a_professor_before_applying.pdf', pagesize=letter)
|
|
@@ -148,15 +150,24 @@ jobs:
|
|
| 148 |
|
| 149 |
# Create dummy OCR output
|
| 150 |
ocr_data = {
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
}
|
| 156 |
df = pd.DataFrame(ocr_data)
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
|
|
|
|
|
|
| 160 |
"
|
| 161 |
|
| 162 |
- name: Create dummy image for testing
|
|
|
|
| 16 |
runs-on: ubuntu-latest
|
| 17 |
strategy:
|
| 18 |
matrix:
|
| 19 |
+
python-version: [3.11, 3.12, 3.13]
|
| 20 |
|
| 21 |
steps:
|
| 22 |
- uses: actions/checkout@v4
|
|
|
|
| 48 |
run: |
|
| 49 |
python -m pip install --upgrade pip
|
| 50 |
pip install -r requirements.txt
|
| 51 |
+
pip install pytest pytest-cov pytest-html pytest-xdist reportlab pillow
|
| 52 |
|
| 53 |
- name: Download spaCy model
|
| 54 |
run: |
|
|
|
|
| 64 |
# Create dummy PDF for testing
|
| 65 |
python -c "
|
| 66 |
import os
|
| 67 |
+
|
|
|
|
|
|
|
| 68 |
# Install reportlab if not available
|
| 69 |
try:
|
| 70 |
+
import reportlab
|
| 71 |
except ImportError:
|
| 72 |
+
import subprocess
|
| 73 |
+
subprocess.check_call(['pip', 'install', 'reportlab'])
|
| 74 |
+
import reportlab
|
| 75 |
+
|
| 76 |
+
from reportlab.pdfgen import canvas
|
| 77 |
+
from reportlab.lib.pagesizes import letter
|
| 78 |
+
|
| 79 |
|
| 80 |
# Create dummy PDF
|
| 81 |
c = canvas.Canvas('example_data/example_of_emails_sent_to_a_professor_before_applying.pdf', pagesize=letter)
|
|
|
|
| 150 |
|
| 151 |
# Create dummy OCR output
|
| 152 |
ocr_data = {
|
| 153 |
+
"page": [1, 2, 3],
|
| 154 |
+
"text": [
|
| 155 |
+
"This is page 1 content with some text",
|
| 156 |
+
"This is page 2 content with different text",
|
| 157 |
+
"This is page 3 content with more text",
|
| 158 |
+
],
|
| 159 |
+
"left": [0.1, 0.3, 0.5],
|
| 160 |
+
"top": [0.95, 0.92, 0.88],
|
| 161 |
+
"width": [0.05, 0.02, 0.02],
|
| 162 |
+
"height": [0.01, 0.02, 0.02],
|
| 163 |
+
"line": [1, 2, 3]
|
| 164 |
}
|
| 165 |
df = pd.DataFrame(ocr_data)
|
| 166 |
+
df.to_csv(
|
| 167 |
+
"example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
|
| 168 |
+
index=False,
|
| 169 |
+
)
|
| 170 |
+
print("Created dummy OCR output CSV")
|
| 171 |
"
|
| 172 |
|
| 173 |
- name: Create dummy image for testing
|
requirements.txt
CHANGED
|
@@ -24,8 +24,9 @@ rapidfuzz==3.13.0
|
|
| 24 |
python-dotenv==1.0.1
|
| 25 |
awslambdaric==3.1.1
|
| 26 |
python-docx==1.2.0
|
| 27 |
-
|
| 28 |
-
|
|
|
|
| 29 |
|
| 30 |
# Test dependencies
|
| 31 |
pytest>=7.0.0
|
|
|
|
| 24 |
python-dotenv==1.0.1
|
| 25 |
awslambdaric==3.1.1
|
| 26 |
python-docx==1.2.0
|
| 27 |
+
# Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
|
| 28 |
+
# paddlepaddle==3.1.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
|
| 29 |
+
# paddleocr==3.1.1
|
| 30 |
|
| 31 |
# Test dependencies
|
| 32 |
pytest>=7.0.0
|
test/demo_single_test.py
CHANGED
|
@@ -14,7 +14,7 @@ import tempfile
|
|
| 14 |
# Add the parent directory to the path
|
| 15 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 16 |
|
| 17 |
-
from test
|
| 18 |
|
| 19 |
|
| 20 |
def demo_pdf_redaction():
|
|
|
|
| 14 |
# Add the parent directory to the path
|
| 15 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 16 |
|
| 17 |
+
from test import run_cli_redact
|
| 18 |
|
| 19 |
|
| 20 |
def demo_pdf_redaction():
|