seanpedrickcase committed on
Commit
f5b280d
·
1 Parent(s): f957846

Fixes to test suites. Minor default package changes (paddleocr not required)

Browse files
.github/scripts/setup_test_data.py CHANGED
@@ -8,6 +8,14 @@ import os
8
 
9
  import pandas as pd
10
 
 
 
 
 
 
 
 
 
11
 
12
  def create_directories():
13
  """Create necessary directories."""
@@ -152,7 +160,7 @@ def create_ocr_output():
152
  "top": [0.95, 0.92, 0.88],
153
  "width": [0.05, 0.02, 0.02],
154
  "height": [0.01, 0.02, 0.02],
155
- "line": [1, 2, 3],
156
  }
157
  df = pd.DataFrame(ocr_data)
158
  df.to_csv(
 
8
 
9
  import pandas as pd
10
 
11
+ # Install reportlab if not available
12
+ try:
13
+ import reportlab
14
+ except ImportError:
15
+ import subprocess
16
+ subprocess.check_call(['pip', 'install', 'reportlab'])
17
+ import reportlab
18
+
19
 
20
  def create_directories():
21
  """Create necessary directories."""
 
160
  "top": [0.95, 0.92, 0.88],
161
  "width": [0.05, 0.02, 0.02],
162
  "height": [0.01, 0.02, 0.02],
163
+ "line": [1, 2, 3]
164
  }
165
  df = pd.DataFrame(ocr_data)
166
  df.to_csv(
.github/{README.md → workflow_README.md} RENAMED
File without changes
.github/workflows/ci.yml CHANGED
@@ -186,9 +186,9 @@ jobs:
186
  python -m pip install --upgrade pip
187
  pip install safety bandit
188
 
189
- - name: Run safety scan
190
- run: |
191
- safety scan -r requirements.txt
192
 
193
  - name: Run bandit security check
194
  run: |
 
186
  python -m pip install --upgrade pip
187
  pip install safety bandit
188
 
189
+ #- name: Run safety scan - removed as now requires login
190
+ # run: |
191
+ # safety scan -r requirements.txt
192
 
193
  - name: Run bandit security check
194
  run: |
.github/workflows/multi-os-test.yml CHANGED
@@ -16,7 +16,7 @@ jobs:
16
  strategy:
17
  matrix:
18
  os: [ubuntu-latest, macos-latest] # windows-latest removed for now as I have not been able to install tesseract on Windows using this method
19
- python-version: ["3.10", "3.11", "3.12"]
20
  exclude:
21
  # Exclude some combinations to reduce CI time
22
  #- os: windows-latest
 
16
  strategy:
17
  matrix:
18
  os: [ubuntu-latest, macos-latest] # windows-latest removed for now as I have not been able to install tesseract on Windows using this method
19
+ python-version: ["3.11", "3.12", "3.13"]
20
  exclude:
21
  # Exclude some combinations to reduce CI time
22
  #- os: windows-latest
.github/workflows/test.yml CHANGED
@@ -16,7 +16,7 @@ jobs:
16
  runs-on: ubuntu-latest
17
  strategy:
18
  matrix:
19
- python-version: [3.10, 3.11, 3.12]
20
 
21
  steps:
22
  - uses: actions/checkout@v4
@@ -48,7 +48,7 @@ jobs:
48
  run: |
49
  python -m pip install --upgrade pip
50
  pip install -r requirements.txt
51
- pip install pytest pytest-cov pytest-html pytest-xdist
52
 
53
  - name: Download spaCy model
54
  run: |
@@ -64,16 +64,18 @@ jobs:
64
  # Create dummy PDF for testing
65
  python -c "
66
  import os
67
- from reportlab.pdfgen import canvas
68
- from reportlab.lib.pagesizes import letter
69
-
70
  # Install reportlab if not available
71
  try:
72
- import reportlab
73
  except ImportError:
74
- import subprocess
75
- subprocess.check_call(['pip', 'install', 'reportlab'])
76
- import reportlab
 
 
 
 
77
 
78
  # Create dummy PDF
79
  c = canvas.Canvas('example_data/example_of_emails_sent_to_a_professor_before_applying.pdf', pagesize=letter)
@@ -148,15 +150,24 @@ jobs:
148
 
149
  # Create dummy OCR output
150
  ocr_data = {
151
- 'file_name': ['test.pdf', 'test.pdf'],
152
- 'page_number': [1, 2],
153
- 'text': ['This is page 1 content', 'This is page 2 content'],
154
- 'confidence': [0.95, 0.92]
 
 
 
 
 
 
 
155
  }
156
  df = pd.DataFrame(ocr_data)
157
- os.makedirs('example_data/example_outputs', exist_ok=True)
158
- df.to_csv('example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv', index=False)
159
- print('Created dummy OCR output CSV')
 
 
160
  "
161
 
162
  - name: Create dummy image for testing
 
16
  runs-on: ubuntu-latest
17
  strategy:
18
  matrix:
19
+ python-version: [3.11, 3.12, 3.13]
20
 
21
  steps:
22
  - uses: actions/checkout@v4
 
48
  run: |
49
  python -m pip install --upgrade pip
50
  pip install -r requirements.txt
51
+ pip install pytest pytest-cov pytest-html pytest-xdist reportlab pillow
52
 
53
  - name: Download spaCy model
54
  run: |
 
64
  # Create dummy PDF for testing
65
  python -c "
66
  import os
67
+
 
 
68
  # Install reportlab if not available
69
  try:
70
+ import reportlab
71
  except ImportError:
72
+ import subprocess
73
+ subprocess.check_call(['pip', 'install', 'reportlab'])
74
+ import reportlab
75
+
76
+ from reportlab.pdfgen import canvas
77
+ from reportlab.lib.pagesizes import letter
78
+
79
 
80
  # Create dummy PDF
81
  c = canvas.Canvas('example_data/example_of_emails_sent_to_a_professor_before_applying.pdf', pagesize=letter)
 
150
 
151
  # Create dummy OCR output
152
  ocr_data = {
153
+ "page": [1, 2, 3],
154
+ "text": [
155
+ "This is page 1 content with some text",
156
+ "This is page 2 content with different text",
157
+ "This is page 3 content with more text",
158
+ ],
159
+ "left": [0.1, 0.3, 0.5],
160
+ "top": [0.95, 0.92, 0.88],
161
+ "width": [0.05, 0.02, 0.02],
162
+ "height": [0.01, 0.02, 0.02],
163
+ "line": [1, 2, 3]
164
  }
165
  df = pd.DataFrame(ocr_data)
166
+ df.to_csv(
167
+ "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
168
+ index=False,
169
+ )
170
+ print("Created dummy OCR output CSV")
171
  "
172
 
173
  - name: Create dummy image for testing
requirements.txt CHANGED
@@ -24,8 +24,9 @@ rapidfuzz==3.13.0
24
  python-dotenv==1.0.1
25
  awslambdaric==3.1.1
26
  python-docx==1.2.0
27
- paddlepaddle==3.1.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
28
- paddleocr==3.1.1
 
29
 
30
  # Test dependencies
31
  pytest>=7.0.0
 
24
  python-dotenv==1.0.1
25
  awslambdaric==3.1.1
26
  python-docx==1.2.0
27
+ # Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
28
+ # paddlepaddle==3.1.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
29
+ # paddleocr==3.1.1
30
 
31
  # Test dependencies
32
  pytest>=7.0.0
test/demo_single_test.py CHANGED
@@ -14,7 +14,7 @@ import tempfile
14
  # Add the parent directory to the path
15
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
 
17
- from test.test import run_cli_redact
18
 
19
 
20
  def demo_pdf_redaction():
 
14
  # Add the parent directory to the path
15
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
 
17
+ from test import run_cli_redact
18
 
19
 
20
  def demo_pdf_redaction():