Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Sep 24

Commit

f5b280d

1 Parent(s): f957846

Fixes to test suites. Minor default package changes (paddleocr not required)

Browse files

Files changed (7) hide show

.github/scripts/setup_test_data.py +9 -1
.github/{README.md → workflow_README.md} +0 -0
.github/workflows/ci.yml +3 -3
.github/workflows/multi-os-test.yml +1 -1
.github/workflows/test.yml +27 -16
requirements.txt +3 -2
test/demo_single_test.py +1 -1

.github/scripts/setup_test_data.py CHANGED Viewed

@@ -8,6 +8,14 @@ import os
 import pandas as pd
 def create_directories():
     """Create necessary directories."""
@@ -152,7 +160,7 @@ def create_ocr_output():
         "top": [0.95, 0.92, 0.88],
         "width": [0.05, 0.02, 0.02],
         "height": [0.01, 0.02, 0.02],
-        "line": [1, 2, 3],
     }
     df = pd.DataFrame(ocr_data)
     df.to_csv(

 import pandas as pd
+ # Install reportlab if not available
+try:
+      import reportlab
+except ImportError:
+      import subprocess
+      subprocess.check_call(['pip', 'install', 'reportlab'])
+      import reportlab
 def create_directories():
     """Create necessary directories."""
         "top": [0.95, 0.92, 0.88],
         "width": [0.05, 0.02, 0.02],
         "height": [0.01, 0.02, 0.02],
+        "line": [1, 2, 3]
     }
     df = pd.DataFrame(ocr_data)
     df.to_csv(

.github/{README.md → workflow_README.md} RENAMED Viewed

File without changes

.github/workflows/ci.yml CHANGED Viewed

@@ -186,9 +186,9 @@ jobs:
         python -m pip install --upgrade pip
         pip install safety bandit
-    - name: Run safety scan
-      run: |
-        safety scan -r requirements.txt
     - name: Run bandit security check
       run: |

         python -m pip install --upgrade pip
         pip install safety bandit
+    #- name: Run safety scan - removed as now requires login
+    #  run: |
+    #    safety scan -r requirements.txt
     - name: Run bandit security check
       run: |

.github/workflows/multi-os-test.yml CHANGED Viewed

@@ -16,7 +16,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest] # windows-latest removed for now as I have not been able to install tesseract on Windows using this method
-        python-version: ["3.10", "3.11", "3.12"]
         exclude:
           # Exclude some combinations to reduce CI time
           #- os: windows-latest

     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest] # windows-latest removed for now as I have not been able to install tesseract on Windows using this method
+        python-version: ["3.11", "3.12", "3.13"]
         exclude:
           # Exclude some combinations to reduce CI time
           #- os: windows-latest

.github/workflows/test.yml CHANGED Viewed

@@ -16,7 +16,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.10, 3.11, 3.12]
     steps:
     - uses: actions/checkout@v4
@@ -48,7 +48,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install -r requirements.txt
-        pip install pytest pytest-cov pytest-html pytest-xdist
     - name: Download spaCy model
       run: |
@@ -64,16 +64,18 @@ jobs:
         # Create dummy PDF for testing
         python -c "
         import os
-        from reportlab.pdfgen import canvas
-        from reportlab.lib.pagesizes import letter
         # Install reportlab if not available
         try:
-            import reportlab
         except ImportError:
-            import subprocess
-            subprocess.check_call(['pip', 'install', 'reportlab'])
-            import reportlab
         # Create dummy PDF
         c = canvas.Canvas('example_data/example_of_emails_sent_to_a_professor_before_applying.pdf', pagesize=letter)
@@ -148,15 +150,24 @@ jobs:
         # Create dummy OCR output
         ocr_data = {
-            'file_name': ['test.pdf', 'test.pdf'],
-            'page_number': [1, 2],
-            'text': ['This is page 1 content', 'This is page 2 content'],
-            'confidence': [0.95, 0.92]
         }
         df = pd.DataFrame(ocr_data)
-        os.makedirs('example_data/example_outputs', exist_ok=True)
-        df.to_csv('example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv', index=False)
-        print('Created dummy OCR output CSV')
         "
     - name: Create dummy image for testing

     runs-on: ubuntu-latest
     strategy:
       matrix:
+        python-version: [3.11, 3.12, 3.13]
     steps:
     - uses: actions/checkout@v4
       run: |
         python -m pip install --upgrade pip
         pip install -r requirements.txt
+        pip install pytest pytest-cov pytest-html pytest-xdist reportlab pillow
     - name: Download spaCy model
       run: |
         # Create dummy PDF for testing
         python -c "
         import os
         # Install reportlab if not available
         try:
+          import reportlab
         except ImportError:
+          import subprocess
+          subprocess.check_call(['pip', 'install', 'reportlab'])
+          import reportlab
+        from reportlab.pdfgen import canvas
+        from reportlab.lib.pagesizes import letter
         # Create dummy PDF
         c = canvas.Canvas('example_data/example_of_emails_sent_to_a_professor_before_applying.pdf', pagesize=letter)
         # Create dummy OCR output
         ocr_data = {
+        "page": [1, 2, 3],
+        "text": [
+            "This is page 1 content with some text",
+            "This is page 2 content with different text",
+            "This is page 3 content with more text",
+        ],
+        "left": [0.1, 0.3, 0.5],
+        "top": [0.95, 0.92, 0.88],
+        "width": [0.05, 0.02, 0.02],
+        "height": [0.01, 0.02, 0.02],
+        "line": [1, 2, 3]
         }
         df = pd.DataFrame(ocr_data)
+        df.to_csv(
+            "example_data/example_outputs/doubled_output_joined.pdf_ocr_output.csv",
+            index=False,
+        )
+        print("Created dummy OCR output CSV")
         "
     - name: Create dummy image for testing

requirements.txt CHANGED Viewed

@@ -24,8 +24,9 @@ rapidfuzz==3.13.0
 python-dotenv==1.0.1
 awslambdaric==3.1.1
 python-docx==1.2.0
-paddlepaddle==3.1.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
-paddleocr==3.1.1
 # Test dependencies
 pytest>=7.0.0

 python-dotenv==1.0.1
 awslambdaric==3.1.1
 python-docx==1.2.0
+# Optional: uncomment the two lines below to install PaddleOCR if you want to use hybrid text extraction (Tesseract plus PaddleOCR)
+# paddlepaddle==3.1.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
+# paddleocr==3.1.1
 # Test dependencies
 pytest>=7.0.0

test/demo_single_test.py CHANGED Viewed

@@ -14,7 +14,7 @@ import tempfile
 # Add the parent directory to the path
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.test import run_cli_redact
 def demo_pdf_redaction():

 # Add the parent directory to the path
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from test import run_cli_redact
 def demo_pdf_redaction():