S3 configuration done
Files changed:
- .gitignore       +1  -0
- app.py           +98 -44
- requirements.txt +2  -1
- s3_setup.py      +44 -0
- sample.py        +8  -7
.gitignore
CHANGED
@@ -19,3 +19,4 @@ test_images_folder
 uploads
 pause_space.py
 .DS_Store
+test_s3_client.py
app.py
CHANGED
@@ -4,6 +4,7 @@ from typing import Dict
 import os
 import shutil
 import logging
+from s3_setup import s3_client
 
 import torch
 from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
@@ -159,57 +160,98 @@ def perform_inference(file_paths: Dict[str, str]):
         "cheque_file": cheque_model,
         "gst_file": gst_model,
     }
-    …
-        attachemnt_num = 3
-
-    …
-        context = pan_model
-        processor = processor_pan
-        name = "pan"
-        attachemnt_num = 2
-
-    if doc_type == "gst_file":
-        context = gst_model
-        processor = processor_gst
-        name = "gst"
-        attachemnt_num = 4
-    …
-    result = handle(inference_batch, context,processor,name)
-    …
-    return inference_results
+    try:
+        # Dictionary to store results for each document type
+        inference_results = {}
+
+        # Loop through the file paths and perform inference
+        for doc_type, file_path in file_paths.items():
+            if doc_type in model_dirs:
+                print(f"Processing {doc_type} using model at {model_dirs[doc_type]}")
+
+                # Prepare batch for inference
+                processed_file_p = file_path.split("&&")[0]
+                unprocessed_file_path = file_path.split("&&")[1]
+
+                images_path = [processed_file_p]
+                inference_batch = prepare_batch_for_inference(images_path)
+
+                # Prepare context for the specific document type
+                # context = {"model_dir": model_dirs[doc_type]}
+                # initialize s3 client
+                client = s3_client()
+
+                local_file_path = unprocessed_file_path
+                bucket_name = "edgekycdocs"
+                file_name = unprocessed_file_path.split("/")[-1]
+
+                # context = aadhar_model
+                if doc_type == "aadhar_file":
+                    context = aadhar_model
+                    processor = processor_aadhar
+                    name = "aadhar"
+                    attachemnt_num = 3
+                    folder_name = "aadhardocs"
+
+                if doc_type == "pan_file":
+                    context = pan_model
+                    processor = processor_pan
+                    name = "pan"
+                    attachemnt_num = 2
+                    folder_name = "pandocs"
+
+                if doc_type == "gst_file":
+                    context = gst_model
+                    processor = processor_gst
+                    name = "gst"
+                    attachemnt_num = 4
+                    folder_name = "gstdocs"
+
+                if doc_type == "cheque_file":
+                    context = cheque_model
+                    processor = processor_cheque
+                    name = "cheque"
+                    attachemnt_num = 8
+                    folder_name = "bankchequedocs"
+
+                # upload the document to s3 bucket here
+                response = client.upload_file(local_file_path,bucket_name,folder_name,file_name)
+                print("The file has been uploaded to s3 bucket",response)
+
+                # Perform inference (replace `handle` with your actual function)
+                result = handle(inference_batch, context,processor,name)
+                # result["attachment_url": response["url"]]
+                result["attachment_url"] = response["url"]
+                result["detect"] = True
+
+                print("result required",result)
+
+                # if result[""]
+
+                # Store the result
+                inference_results["attachment_{}".format(attachemnt_num)] = result
+            else:
+                print(f"Model directory not found for {doc_type}. Skipping.")
+                # print(Javed)
+
+        return inference_results
+    except:
+        return {
+            "status": "error",
+            "message": "Text extraction failed."
+        }
 
 # Routes
 @app.get("/")
@@ -247,18 +289,30 @@ async def aadhar_ocr(
             f_path = value
             preprocessing = doc_processing(name,id_type,doc_type,f_path)
             response = preprocessing.process()
-            files[key] = response["output_p"]
+            files[key] = response["output_p"] + "&&" + f_path
+            # files["unprocessed_file_path"] = f_path
             print("response",response)
 
         # Perform inference
         result = perform_inference(files)
 
+        print("this is the result we got",result)
+        if "status" in list(result.keys()):
+            raise Exception("Custom error message")
+        # if result["status"] == "error":
+
         return {"status": "success", "result": result}
 
     except Exception as e:
         logging.error(f"Error processing files: {e}")
         # raise HTTPException(status_code=500, detail="Internal Server Error")
-        return {
+        return {
+            "status": 400,
+            "message": "Text extraction failed."
+        }
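
Note on the perform_inference changes above: each document type is mapped to a model, processor, short name, attachment number, and S3 folder through four separate if-blocks. The self-contained sketch below shows the same mapping as a single dispatch table; it is an illustration of an alternative structure, not part of the commit, and the LayoutLMv3 model/processor objects are omitted because they are loaded elsewhere in app.py.

# Hypothetical sketch, not part of the commit: the doc_type -> settings mapping
# that perform_inference() builds with repeated if-blocks, expressed as one
# dictionary. The values are taken from the diff above.
DOC_CONFIG = {
    "aadhar_file": {"name": "aadhar", "attachment_num": 3, "folder_name": "aadhardocs"},
    "pan_file":    {"name": "pan",    "attachment_num": 2, "folder_name": "pandocs"},
    "gst_file":    {"name": "gst",    "attachment_num": 4, "folder_name": "gstdocs"},
    "cheque_file": {"name": "cheque", "attachment_num": 8, "folder_name": "bankchequedocs"},
}

def settings_for(doc_type: str) -> dict:
    """Look up the per-document settings; raises KeyError for unknown types."""
    return DOC_CONFIG[doc_type]

if __name__ == "__main__":
    print(settings_for("pan_file"))  # {'name': 'pan', 'attachment_num': 2, 'folder_name': 'pandocs'}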
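
The new error handling returns plain dicts ({"status": "error", ...} from perform_inference, {"status": 400, ...} from the route) while the HTTPException line stays commented out, so a failed request still comes back as HTTP 200. A hedged sketch of the FastAPI-native alternative, assuming the same failure-dict shape, follows; it is a suggestion, not the committed behaviour.

# Hedged sketch only: surface extraction failures as an HTTP error instead of
# a 200 response carrying an error body.
from fastapi import FastAPI, HTTPException

app = FastAPI()

@app.get("/extraction-demo")
async def extraction_demo():
    # Stand-in for perform_inference(); assumed to return this dict on failure.
    result = {"status": "error", "message": "Text extraction failed."}
    if result.get("status") == "error":
        raise HTTPException(status_code=500, detail=result["message"])
    return {"status": "success", "result": result}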
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ pillow
 google-cloud-vision
 python-dotenv
 pymupdf
-pillow
+pillow
+boto3
s3_setup.py
ADDED
@@ -0,0 +1,44 @@
+import boto3
+# AWS credentials (if not set in environment variables or AWS CLI config)
+from dotenv import load_dotenv
+import os
+import sys
+
+from utils import doc_processing
+
+# Load .env file
+load_dotenv()
+# Access variables
+AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
+AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
+print("AWS_ACCESS_KEY_ID",AWS_ACCESS_KEY_ID)
+print("AWS_SECRET_ACCESS_KEY",AWS_SECRET_ACCESS_KEY)
+# Initialize S3 client
+
+class s3_client:
+    def __init__(self):
+        self.aws_access_key_id = AWS_ACCESS_KEY_ID
+        self.aws_secret_access_key = AWS_SECRET_ACCESS_KEY
+
+    def initialize(self):
+        return boto3.client(
+            's3',
+            aws_access_key_id=self.aws_access_key_id,
+            aws_secret_access_key=self.aws_secret_access_key
+        )
+
+    def upload_file(self,local_file_path, bucket_name,folder_name,file_name):
+        try:
+            client = self.initialize()
+            client.upload_file(local_file_path, bucket_name, f"{folder_name}/{file_name}")
+            print(f"File uploaded successfully to {bucket_name}/{folder_name}{file_name}")
+            url = f"https://edgekycdocs.s3.eu-north-1.amazonaws.com/{folder_name}/{file_name}"
+            print("file url",url)
+            return {"status": 200, "message":"file uploaded successfully" , "url" : url}
+        except Exception as e:
+            print("Error uploading file:", e)
+            return {"status": 400, "message":e}
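
A short usage sketch for the new s3_client wrapper is shown below. The local file path and file name are examples; the bucket name "edgekycdocs" and folder names such as "pandocs" are the ones used in app.py, and credentials are loaded from .env by s3_setup.py itself.

# Example usage of the s3_client wrapper added in this commit (file paths are
# illustrative only).
from s3_setup import s3_client

client = s3_client()
response = client.upload_file(
    "uploads/pan_sample.jpg",  # local_file_path (example)
    "edgekycdocs",             # bucket_name
    "pandocs",                 # folder_name
    "pan_sample.jpg",          # file_name (example)
)
if response["status"] == 200:
    print("Uploaded to:", response["url"])
else:
    print("Upload failed:", response["message"])

Note that upload_file builds the object URL by hand for the eu-north-1 bucket, so the returned link will only resolve if the uploaded object is readable by the caller (for example, via a public-read policy or a presigned URL).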
sample.py
CHANGED
@@ -2,17 +2,18 @@ import requests
 import sys
 
 # Define the API endpoint
-# url = "http://
-test_url = "http://
-
-response = requests.get(
-
-post_url = "http://127.0.0.1:7860/api/aadhar_ocr"
-print("Status Code:", response.status_code)
-print("Response Text:", response.text)
-
+# url = "http://localhost:7680/"
+# # test_url = "http://localhost:7860/"
+
+# response = requests.get(url)
+
+# print("Status Code:", response.status_code)
+# print("Response Text:", response.text)
+
+# sys.exit()
+
+post_url = "http://localhost:7680/api/aadhar_ocr"
 
 # response = requests.get(url)
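
sample.py now points post_url at /api/aadhar_ocr, but the actual POST call is not visible in this diff. A hedged sketch of how the endpoint might be exercised with requests follows; the multipart field name "pan_file" is an assumption based on the doc_type keys used in perform_inference, not something shown in the commit.

# Hedged sketch: the form field name "pan_file" and the sample image path are
# assumptions; the real aadhar_ocr request signature is not visible here.
import requests

post_url = "http://localhost:7680/api/aadhar_ocr"

with open("test_images_folder/pan_sample.jpg", "rb") as f:  # example path
    resp = requests.post(post_url, files={"pan_file": f})

print("Status Code:", resp.status_code)
print("Response Text:", resp.text)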