S3 configuration done
Files changed:
- .gitignore       +1  -0
- app.py           +98 -44
- requirements.txt +2  -1
- s3_setup.py      +44 -0
- sample.py        +8  -7
.gitignore
CHANGED
@@ -19,3 +19,4 @@ test_images_folder
 uploads
 pause_space.py
 .DS_Store
+test_s3_client.py
app.py
CHANGED
@@ -4,6 +4,7 @@ from typing import Dict
 import os
 import shutil
 import logging
+from s3_setup import s3_client
 
 import torch
 from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
@@ -159,57 +160,98 @@ def perform_inference(file_paths: Dict[str, str]):
         "cheque_file": cheque_model,
         "gst_file": gst_model,
     }
-    …
-        attachemnt_num = 3
-
-    …
-        context = pan_model
-        processor = processor_pan
-        name = "pan"
-        attachemnt_num = 2
-
-    if doc_type == "gst_file":
-        context = gst_model
-        processor = processor_gst
-        name = "gst"
-        attachemnt_num = 4
-    …
-    result = handle(inference_batch, context,processor,name)
-    …
-    return inference_results
+    try:
+        # Dictionary to store results for each document type
+        inference_results = {}
+
+        # Loop through the file paths and perform inference
+        for doc_type, file_path in file_paths.items():
+            if doc_type in model_dirs:
+                print(f"Processing {doc_type} using model at {model_dirs[doc_type]}")
+
+                # Prepare batch for inference
+                processed_file_p = file_path.split("&&")[0]
+                unprocessed_file_path = file_path.split("&&")[1]
+
+                images_path = [processed_file_p]
+                inference_batch = prepare_batch_for_inference(images_path)
+
+                # Prepare context for the specific document type
+                # context = {"model_dir": model_dirs[doc_type]}
+                # initialize s3 client
+                client = s3_client()
+
+                local_file_path = unprocessed_file_path
+                bucket_name = "edgekycdocs"
+                file_name = unprocessed_file_path.split("/")[-1]
+
+                # context = aadhar_model
+                if doc_type == "aadhar_file":
+                    context = aadhar_model
+                    processor = processor_aadhar
+                    name = "aadhar"
+                    attachemnt_num = 3
+                    folder_name = "aadhardocs"
+
+                if doc_type == "pan_file":
+                    context = pan_model
+                    processor = processor_pan
+                    name = "pan"
+                    attachemnt_num = 2
+                    folder_name = "pandocs"
+
+                if doc_type == "gst_file":
+                    context = gst_model
+                    processor = processor_gst
+                    name = "gst"
+                    attachemnt_num = 4
+                    folder_name = "gstdocs"
+
+                if doc_type == "cheque_file":
+                    context = cheque_model
+                    processor = processor_cheque
+                    name = "cheque"
+                    attachemnt_num = 8
+                    folder_name = "bankchequedocs"
+
+                # upload the document to s3 bucket here
+                response = client.upload_file(local_file_path,bucket_name,folder_name,file_name)
+                print("The file has been uploaded to s3 bucket",response)
+
+                # Perform inference (replace `handle` with your actual function)
+                result = handle(inference_batch, context,processor,name)
+                # result["attachment_url": response["url"]]
+                result["attachment_url"] = response["url"]
+                result["detect"] = True
+
+                print("result required",result)
+
+                # if result[""]
+
+                # Store the result
+                inference_results["attachment_{}".format(attachemnt_num)] = result
+            else:
+                print(f"Model directory not found for {doc_type}. Skipping.")
+                # print(Javed)
+
+        return inference_results
+    except:
+        return {
+            "status": "error",
+            "message": "Text extraction failed."
+        }
 
 # Routes
 @app.get("/")
@@ -247,18 +289,30 @@ async def aadhar_ocr(
             f_path = value
             preprocessing = doc_processing(name,id_type,doc_type,f_path)
             response = preprocessing.process()
-            files[key] = response["output_p"]
+            files[key] = response["output_p"] + "&&" + f_path
+            # files["unprocessed_file_path"] = f_path
             print("response",response)
 
         # Perform inference
         result = perform_inference(files)
 
+        print("this is the result we got",result)
+        if "status" in list(result.keys()):
+            raise Exception("Custom error message")
+        # if result["status"] == "error":
+
         return {"status": "success", "result": result}
 
     except Exception as e:
         logging.error(f"Error processing files: {e}")
         # raise HTTPException(status_code=500, detail="Internal Server Error")
-        return {
+        return {
+            "status": 400,
+            "message": "Text extraction failed."
+        }
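
Note on the perform_inference changes above: each document type is mapped to a model, processor, short name, attachment number, and S3 folder through four separate if-blocks. The self-contained sketch below shows the same mapping as a single dispatch table; it is an illustration of an alternative structure, not part of the commit, and the LayoutLMv3 model/processor objects are omitted because they are loaded elsewhere in app.py.

# Hypothetical sketch, not part of the commit: the doc_type -> settings mapping
# that perform_inference() builds with repeated if-blocks, expressed as one
# dictionary. The values are taken from the diff above.
DOC_CONFIG = {
    "aadhar_file": {"name": "aadhar", "attachment_num": 3, "folder_name": "aadhardocs"},
    "pan_file":    {"name": "pan",    "attachment_num": 2, "folder_name": "pandocs"},
    "gst_file":    {"name": "gst",    "attachment_num": 4, "folder_name": "gstdocs"},
    "cheque_file": {"name": "cheque", "attachment_num": 8, "folder_name": "bankchequedocs"},
}

def settings_for(doc_type: str) -> dict:
    """Look up the per-document settings; raises KeyError for unknown types."""
    return DOC_CONFIG[doc_type]

if __name__ == "__main__":
    print(settings_for("pan_file"))  # {'name': 'pan', 'attachment_num': 2, 'folder_name': 'pandocs'}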
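
The new error handling returns plain dicts ({"status": "error", ...} from perform_inference, {"status": 400, ...} from the route) while the HTTPException line stays commented out, so a failed request still comes back as HTTP 200. A hedged sketch of the FastAPI-native alternative, assuming the same failure-dict shape, follows; it is a suggestion, not the committed behaviour.

# Hedged sketch only: surface extraction failures as an HTTP error instead of
# a 200 response carrying an error body.
from fastapi import FastAPI, HTTPException

app = FastAPI()

@app.get("/extraction-demo")
async def extraction_demo():
    # Stand-in for perform_inference(); assumed to return this dict on failure.
    result = {"status": "error", "message": "Text extraction failed."}
    if result.get("status") == "error":
        raise HTTPException(status_code=500, detail=result["message"])
    return {"status": "success", "result": result}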
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ pillow
 google-cloud-vision
 python-dotenv
 pymupdf
-pillow
+pillow
+boto3
s3_setup.py
ADDED
@@ -0,0 +1,44 @@
+import boto3
+# AWS credentials (if not set in environment variables or AWS CLI config)
+from dotenv import load_dotenv
+import os
+import sys
+
+from utils import doc_processing
+
+# Load .env file
+load_dotenv()
+# Access variables
+AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
+AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
+print("AWS_ACCESS_KEY_ID",AWS_ACCESS_KEY_ID)
+print("AWS_SECRET_ACCESS_KEY",AWS_SECRET_ACCESS_KEY)
+# Initialize S3 client
+
+class s3_client:
+    def __init__(self):
+        self.aws_access_key_id = AWS_ACCESS_KEY_ID
+        self.aws_secret_access_key = AWS_SECRET_ACCESS_KEY
+
+    def initialize(self):
+        return boto3.client(
+            's3',
+            aws_access_key_id=self.aws_access_key_id,
+            aws_secret_access_key=self.aws_secret_access_key
+        )
+
+    def upload_file(self,local_file_path, bucket_name,folder_name,file_name):
+        try:
+            client = self.initialize()
+            client.upload_file(local_file_path, bucket_name, f"{folder_name}/{file_name}")
+            print(f"File uploaded successfully to {bucket_name}/{folder_name}{file_name}")
+            url = f"https://edgekycdocs.s3.eu-north-1.amazonaws.com/{folder_name}/{file_name}"
+            print("file url",url)
+            return {"status": 200, "message":"file uploaded successfully" , "url" : url}
+        except Exception as e:
+            print("Error uploading file:", e)
+            return {"status": 400, "message":e}
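
A short usage sketch for the new s3_client wrapper is shown below. The local file path and file name are examples; the bucket name "edgekycdocs" and folder names such as "pandocs" are the ones used in app.py, and credentials are loaded from .env by s3_setup.py itself.

# Example usage of the s3_client wrapper added in this commit (file paths are
# illustrative only).
from s3_setup import s3_client

client = s3_client()
response = client.upload_file(
    "uploads/pan_sample.jpg",  # local_file_path (example)
    "edgekycdocs",             # bucket_name
    "pandocs",                 # folder_name
    "pan_sample.jpg",          # file_name (example)
)
if response["status"] == 200:
    print("Uploaded to:", response["url"])
else:
    print("Upload failed:", response["message"])

Note that upload_file builds the object URL by hand for the eu-north-1 bucket, so the returned link will only resolve if the uploaded object is readable by the caller (for example, via a public-read policy or a presigned URL).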
sample.py
CHANGED
@@ -2,17 +2,18 @@ import requests
 import sys
 
 # Define the API endpoint
-# url = "http://
-test_url = "http://
-
-response = requests.get(
-
-post_url = "http://127.0.0.1:7860/api/aadhar_ocr"
-print("Status Code:", response.status_code)
-print("Response Text:", response.text)
-
+# url = "http://localhost:7680/"
+# # test_url = "http://localhost:7860/"
+
+# response = requests.get(url)
+
+# print("Status Code:", response.status_code)
+# print("Response Text:", response.text)
+
+# sys.exit()
+
+post_url = "http://localhost:7680/api/aadhar_ocr"
 
 # response = requests.get(url)
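
sample.py now points post_url at /api/aadhar_ocr, but the actual POST call is not visible in this diff. A hedged sketch of how the endpoint might be exercised with requests follows; the multipart field name "pan_file" is an assumption based on the doc_type keys used in perform_inference, not something shown in the commit.

# Hedged sketch: the form field name "pan_file" and the sample image path are
# assumptions; the real aadhar_ocr request signature is not visible here.
import requests

post_url = "http://localhost:7680/api/aadhar_ocr"

with open("test_images_folder/pan_sample.jpg", "rb") as f:  # example path
    resp = requests.post(post_url, files={"pan_file": f})

print("Status Code:", resp.status_code)
print("Response Text:", resp.text)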