DPT2

Sleeping

App Files Files Community

DPT2 / app.py

Seth0330

Update app.py

11d644c verified 23 days ago

raw

history blame

13.9 kB

	import io, os, re, json
	from typing import List, Tuple, Dict

	import numpy as np
	import pandas as pd
	from PIL import Image, ImageOps, ImageFilter

	import streamlit as st
	import torch
	import torchvision.transforms as T

	# --- word detector (Tesseract) ---
	import pytesseract
	from pytesseract import Output

	# --- PDF -> images ---
	from pdf2image import convert_from_bytes

	# ---- import the repo's models ----
	# Install via requirements.txt (git+https URL) OR copy repo files into root.
	# The repo defines model classes: Swin_CTC, VED
	import models as pdrt_models # from dparres/Pretrained-Document-Recognition-Transformers

	# Page-level Streamlit configuration (title and wide layout).
	st.set_page_config(page_title="Invoice OCR (ViT recognizer + Tesseract detector)", layout="wide")

	# ========================= UI SIDEBAR =========================
	# Every value below is read from a sidebar widget and used as a module-level
	# setting by the code further down.
	st.sidebar.header("Model")
	# Recognizer architecture; the two options match the branches in load_pdrt().
	arch = st.sidebar.selectbox("Architecture", ["Swin_CTC", "VED"], index=0)
	# Path of the weights file handed to torch.load() in load_pdrt().
	ckpt_path = st.sidebar.text_input("Checkpoint path (inside Space)", value="checkpoints/pdrt_weights.pth")
	# Class-id -> character table: index i in this string is the character for class i.
	# NOTE(review): presumably this must match the class order the checkpoint was
	# trained with — confirm against the checkpoint.
	alphabet = st.sidebar.text_input("Alphabet (ordered classes, exclude CTC blank)", value="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_/.,:;()[]{}#+*&%$@!?\"' ")
	# Size every word crop is resized to in build_transform() (min, max, default, step).
	img_h = st.sidebar.number_input("Recognizer input height", 64, 256, 128, 8)
	img_w = st.sidebar.number_input("Recognizer input width", 128, 2048, 512, 16)
	# Language string passed to Tesseract in detect_words().
	det_lang = st.sidebar.text_input("Tesseract lang(s) for detection only", value="eng")
	# When ticked, a table of the first predicted words is shown in the results column.
	show_boxes = st.sidebar.checkbox("Show word boxes", value=False)
	# Run the recognizer on the GPU when torch reports one, otherwise on the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# ========================= UTILITIES =========================
	def load_pages(file_bytes: bytes, name: str) -> List[Image.Image]:
	    """Turn the raw bytes of an uploaded file into a list of page images.

	    A file whose name ends in ``.pdf`` (case-insensitive) is rasterised with
	    ``convert_from_bytes`` at 300 dpi, giving one image per page. Any other
	    file is opened as a single image and converted to RGB. A missing
	    (``None``/empty) name is treated as "not a PDF".
	    """
	    is_pdf = (name or "").lower().endswith(".pdf")
	    if not is_pdf:
	        single_page = Image.open(io.BytesIO(file_bytes)).convert("RGB")
	        return [single_page]
	    return convert_from_bytes(file_bytes, dpi=300)

	def preprocess_for_detection(img: Image.Image) -> Image.Image:
	    """Return a grayscale, auto-contrasted and sharpened copy of ``img``.

	    The steps run in this order: grayscale, autocontrast, then an unsharp
	    mask (radius 1, 150 %, threshold 3).
	    """
	    sharpen = ImageFilter.UnsharpMask(radius=1, percent=150, threshold=3)
	    return ImageOps.autocontrast(ImageOps.grayscale(img)).filter(sharpen)

	@st.cache_resource
	def load_pdrt(arch_name: str, ckpt: str, num_classes: int):
	    """Build the recognizer, load its weights and return it in eval mode.

	    Args:
	        arch_name: ``"Swin_CTC"`` or ``"VED"``.
	        ckpt: path of the checkpoint file passed to ``torch.load``.
	        num_classes: forwarded to the model constructor as ``num_classes``.

	    Returns:
	        The model, switched to eval mode and moved to the module-level ``device``.

	    Raises:
	        ValueError: if ``arch_name`` is not one of the two known architectures.
	    """
	    import warnings

	    if arch_name == "Swin_CTC":
	        model = pdrt_models.Swin_CTC(num_classes=num_classes)
	    elif arch_name == "VED":
	        model = pdrt_models.VED(num_classes=num_classes)
	    else:
	        raise ValueError("Unknown model")

	    # NOTE(review): torch.load unpickles the file, so only point this at
	    # checkpoints from a trusted source.
	    state = torch.load(ckpt, map_location="cpu")

	    # Some training scripts save {"state_dict": {...}} or {"model": {...}}
	    # instead of the bare state dict; with strict=False such a file would
	    # otherwise load *nothing* without any error.
	    if isinstance(state, dict):
	        for wrapper_key in ("state_dict", "model"):
	            inner = state.get(wrapper_key)
	            if isinstance(inner, dict):
	                state = inner
	                break

	    # strict=False tolerates key mismatches, so report them instead of
	    # silently running with partly (or fully) uninitialised weights.
	    result = model.load_state_dict(state, strict=False)
	    missing = list(getattr(result, "missing_keys", []) or [])
	    unexpected = list(getattr(result, "unexpected_keys", []) or [])
	    if missing or unexpected:
	        warnings.warn(
	            f"Checkpoint {ckpt!r} does not fully match {arch_name}: "
	            f"{len(missing)} missing key(s), {len(unexpected)} unexpected key(s).",
	            stacklevel=2,
	        )

	    model.eval().to(device)
	    return model

	def build_transform(img_h: int, img_w: int):
	    """Compose the preprocessing applied to each word crop before recognition.

	    The pipeline converts to 3-channel grayscale (kept at three channels in
	    case the encoder expects RGB), resizes to ``(img_h, img_w)``, converts to
	    a tensor and normalises each channel with mean 0.5 and std 0.5.
	    """
	    steps = [
	        T.Grayscale(num_output_channels=3),
	        T.Resize((img_h, img_w)),
	        T.ToTensor(),
	        T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
	    ]
	    return T.Compose(steps)

	def greedy_ctc_decode(logits: torch.Tensor, alphabet: str) -> str:
	    """Greedy (best-path) CTC decoding of a single sequence.

	    Takes the arg-max class per time step, collapses consecutive repeats and
	    drops the blank class. The blank id is assumed to be ``len(alphabet)``.

	    Args:
	        logits: per-step class scores shaped ``(T, C)``, ``(1, T, C)`` or
	            ``(T, 1, C)``.
	        alphabet: characters for class ids ``0 .. len(alphabet) - 1``.

	    Returns:
	        The decoded string. Ids beyond the blank id are ignored.

	    Raises:
	        ValueError: if ``logits`` is not 2-D, or is 3-D with a batch larger
	            than one.
	    """
	    if logits.dim() == 3:
	        if logits.shape[0] == 1:
	            logits = logits[0]  # (1, T, C) -> (T, C)
	        elif logits.shape[1] == 1:
	            logits = logits[:, 0, :]  # (T, 1, C) -> (T, C)
	        else:
	            raise ValueError(
	                f"greedy_ctc_decode handles one sequence at a time, got shape {tuple(logits.shape)}"
	            )
	    elif logits.dim() != 2:
	        raise ValueError(f"expected 2-D or 3-D logits, got shape {tuple(logits.shape)}")

	    # Only 3-D inputs are squeezed above: a 2-D (1, C) input is a one-step
	    # sequence and must stay 2-D so that .tolist() yields a list of ids.
	    # Softmax is monotonic, so the arg-max is taken on the raw scores.
	    ids = logits.argmax(dim=-1).tolist()

	    blank_id = len(alphabet)
	    chars = []
	    prev = None
	    for idx in ids:
	        # idx < blank_id excludes both the blank and any out-of-range id.
	        if idx != prev and idx < blank_id:
	            chars.append(alphabet[idx])
	        prev = idx
	    return "".join(chars)

	def recognize_word_crops(model, crops: List[Image.Image], tfm, arch_name: str, alphabet: str) -> List[str]:
	    """Run the recognizer on each crop, one at a time, and return the decoded strings.

	    Each crop is transformed with ``tfm``, given a leading batch dimension and
	    moved to the module-level ``device``. For ``"Swin_CTC"`` the output is
	    reduced to a single sequence and decoded with ``greedy_ctc_decode``. For
	    any other architecture a fallback is used: arg-max over the last
	    dimension, ids mapped straight onto ``alphabet`` (no blank), result
	    stripped. The returned list has one entry per crop, in order.
	    """
	    is_ctc = arch_name == "Swin_CTC"
	    n_chars = len(alphabet)
	    results = []
	    with torch.no_grad():
	        for crop in crops:
	            batch = tfm(crop).unsqueeze(0).to(device)
	            out = model(batch)
	            is_3d = out.dim() == 3

	            if is_ctc:
	                # expect CTC logits [B, T, C] or [T, B, C]
	                if is_3d and out.shape[0] == 1:
	                    out = out[0]  # [1, T, C] -> [T, C]
	                elif is_3d and out.shape[1] == 1:
	                    out = out[:, 0, :]  # [T, 1, C] -> [T, C]
	                results.append(greedy_ctc_decode(out, alphabet))
	                continue

	            # VED: if the model returns token ids/logits, plug the repo's
	            # decoding in here. Until then use the arg-max fallback.
	            if is_3d and out.shape[0] == 1:
	                out = out[0]
	            token_ids = out.argmax(-1).tolist()
	            decoded = "".join(alphabet[i] if i < n_chars else "" for i in token_ids)
	            results.append(decoded.strip())
	    return results

	def detect_words(img: Image.Image, lang="eng") -> pd.DataFrame:
	    """Run Tesseract on ``img`` and return one row per usable word box.

	    The result keeps the columns of ``pytesseract.image_to_data`` and adds
	    ``x2``/``y2`` (right and bottom edge of each box). A row is kept only if

	    * its ``text`` is present and not whitespace-only,
	    * its ``conf`` is greater than -1, and
	    * its box has a positive width and height.

	    Whitespace-only rows are removed here because ``crop_words`` skips them,
	    which would otherwise leave fewer crops (and predictions) than rows.
	    The returned frame has a fresh 0..n-1 index.
	    """
	    df = pytesseract.image_to_data(img, lang=lang, output_type=Output.DATAFRAME)
	    df = df.dropna(subset=["text"])
	    usable = (
	        (df["conf"] > -1)
	        & (df["text"].astype(str).str.strip() != "")
	        & (df["width"] > 0)
	        & (df["height"] > 0)
	    )
	    df = df[usable].reset_index(drop=True)
	    df["x2"] = df["left"] + df["width"]
	    df["y2"] = df["top"] + df["height"]
	    return df

	def crop_words(img: Image.Image, df: pd.DataFrame) -> Tuple[List[Image.Image], List[Dict]]:
	    """Cut one crop out of ``img`` for every non-blank word row in ``df``.

	    Args:
	        img: the page image the boxes refer to.
	        df: word rows with ``text``, ``left``, ``top``, ``x2`` and ``y2`` columns.

	    Returns:
	        A pair ``(crops, metas)`` of equally long lists: the cropped images
	        and, for each, a dict ``{"box": (left, top, x2, y2)}``.

	    Rows whose text is empty or whitespace-only are skipped, so the lists can
	    be shorter than ``df``.
	    """
	    crops: List[Image.Image] = []
	    metas: List[Dict] = []
	    for _, row in df.iterrows():
	        if not str(row["text"]).strip():
	            continue
	        box = (int(row["left"]), int(row["top"]), int(row["x2"]), int(row["y2"]))
	        crops.append(img.crop(box))
	        metas.append({"box": box})
	    return crops, metas

	# ---------------- key fields & table (same logic as earlier Tesseract app) ----------------
	# Building blocks for the key-field regexes. Alternatives are separated by a
	# bare "|" (an escaped "\|" would only match a literal pipe character).
	CURRENCY = r"(?P<curr>USD|CAD|EUR|GBP|\$|C\$|€|£)?"
	# Amount: either comma-grouped thousands ("1,234") or a plain digit run
	# ("1200"), with optional two-digit cents. The plain branch keeps "1200.00"
	# from being truncated to "120".
	MONEY = rf"{CURRENCY}\s?(?P<amt>(?:\d{{1,3}}(?:,\d{{3}})+|\d+)(?:\.\d{{2}})?)"
	DATE = r"(?P<date>(?:\d{4}[-/]\d{1,2}[-/]\d{1,2})|(?:\d{1,2}[-/]\d{1,2}[-/]\d{2,4})|(?:[A-Za-z]{3,9}\s+\d{1,2},\s*\d{2,4}))"
	# The identifier must contain a digit, otherwise "Invoice Date" would be read
	# as invoice number "Date"; \b keeps "po" from matching inside other words.
	INV_PAT = r"(?:\binvoice\s*(?:no\.?|#|number)?\s*[:\-]?\s*(?P<inv>(?=[A-Z0-9\-_/]*\d)[A-Z0-9\-_/]{4,}))"
	PO_PAT = r"(?:\bpo\s*(?:no\.?|#|number)?\s*[:\-]?\s*(?P<po>(?=[A-Z0-9\-_/]*\d)[A-Z0-9\-_/]{3,}))"
	# The look-behind stops the "Total" of "Sub Total" from being taken as the total.
	TOTAL_PAT = rf"(?:\b(?<!sub[\s\-])(total(?:\s*amount)?|amount\s*due|grand\s*total)\b.*?{MONEY})"
	SUBTOTAL_PAT = rf"(?:\bsub[\s\-]*total\b.*?{MONEY})"
	TAX_PAT = rf"(?:\b(tax|gst|vat|hst)\b.*?{MONEY})"

	def parse_fields(fulltext: str):
	    """Pull the invoice key fields out of the recognised page text.

	    Args:
	        fulltext: the page text, one recognised line per text line.

	    Returns:
	        A dict with the keys ``invoice_number``, ``invoice_date``,
	        ``po_number``, ``subtotal``, ``tax``, ``total`` and ``currency``.
	        A field that was not found is ``None``. Amounts are strings with the
	        thousands separators removed. Currency symbols are mapped to their
	        three-letter code, and other codes are upper-cased.

	    All searches are case-insensitive and take the first match in the text.
	    The currency reported is the one attached to the last matched amount
	    (subtotal, then tax, then total) that carried one.
	    """
	    # Collapse runs of spaces/tabs and blank lines so the patterns see clean text.
	    t = re.sub(r"[ \t]+", " ", fulltext or "")
	    t = re.sub(r"\n{2,}", "\n", t)
	    out = {
	        "invoice_number": None,
	        "invoice_date": None,
	        "po_number": None,
	        "subtotal": None,
	        "tax": None,
	        "total": None,
	        "currency": None,
	    }

	    m = re.search(INV_PAT, t, re.I)
	    if m:
	        out["invoice_number"] = m.group("inv")
	    m = re.search(PO_PAT, t, re.I)
	    if m:
	        out["po_number"] = m.group("po")

	    # Prefer a date labelled "invoice date"; otherwise fall back to the first
	    # date anywhere in the text.
	    m = re.search(rf"invoice\s*date[:\-\s]*{DATE}", t, re.I) or re.search(DATE, t, re.I)
	    if m:
	        out["invoice_date"] = m.group("date")

	    for field, pattern in (("subtotal", SUBTOTAL_PAT), ("tax", TAX_PAT), ("total", TOTAL_PAT)):
	        # re.S lets ".*?" bridge a line break between the label and its amount.
	        m = re.search(pattern, t, re.I | re.S)
	        if m:
	            out[field] = m.group("amt").replace(",", "")
	            out["currency"] = m.group("curr") or out["currency"]

	    currency = out["currency"]
	    if currency is not None:
	        symbols = {"$": "USD", "C$": "CAD", "€": "EUR", "£": "GBP"}
	        # Matching is case-insensitive, so normalise e.g. "usd" -> "USD".
	        out["currency"] = symbols.get(currency.upper(), currency.upper())
	    return out

	HEAD_CANDIDATES = ["description","item","qty","quantity","price","unit","rate","amount","total"]

	def items_from_wordgrid(df: pd.DataFrame) -> pd.DataFrame:
	    """Rebuild the line-item table from word boxes with a geometry heuristic.

	    Args:
	        df: one row per word with the columns ``block_num``, ``par_num``,
	            ``line_num``, ``left``, ``top``, ``width``, ``height`` and ``text``.

	    Returns:
	        One row per item line and one column per header word. Columns are
	        named ``description``, ``quantity``, ``unit_price``, ``line_total``
	        or ``col_<i>``; a repeated name gets a numeric suffix
	        (``unit_price_2``) so that column names stay unique. An empty
	        DataFrame is returned when no header line or no item rows are found.

	    How it works:
	        1. Words are grouped into text lines by (block, paragraph, line).
	        2. The header is the line containing the most ``HEAD_CANDIDATES``
	           keywords (at least two); ties go to the topmost line.
	        3. The left edge of every non-blank header word defines a column.
	        4. Lines below the header, up to the first totals-like word, are
	           split into cells by assigning each word to the nearest column.
	    """
	    line_keys = ["block_num", "par_num", "line_num"]
	    df = df.copy()
	    df["text"] = df["text"].fillna("").astype(str)

	    # 1) one summary record per text line
	    lines = []
	    for (b, p, l), g in df.groupby(line_keys):
	        text = " ".join(t for t in g["text"] if t.strip())
	        if not text.strip():
	            continue
	        lines.append({
	            "block_num": b, "par_num": p, "line_num": l,
	            "text": text.lower(),
	            "top": g["top"].min(),
	            "bottom": (g["top"] + g["height"]).max(),
	        })
	    if not lines:
	        return pd.DataFrame()

	    # 2) pick the header line
	    L = pd.DataFrame(lines)
	    L["score"] = L["text"].apply(lambda s: sum(1 for h in HEAD_CANDIDATES if h in s))
	    headers = L[L["score"] >= 2].sort_values(["score", "top"], ascending=[False, True])
	    if headers.empty:
	        return pd.DataFrame()
	    H = headers.iloc[0]
	    header_y = H["bottom"] + 4

	    # 3) column anchors = left edge of each non-blank word inside the header
	    #    band (the header line's extent with a 5 px tolerance)
	    in_band = (df["top"] >= H["top"] - 5) & ((df["top"] + df["height"]) <= H["bottom"] + 5)
	    header_band = df[in_band & (df["text"].str.strip() != "")].sort_values("left")
	    col_x = header_band["left"].tolist()
	    if len(col_x) < 2:
	        return pd.DataFrame()
	    xs = np.array(col_x)

	    # 4) region below the header, cut off at the first totals-like word
	    below = df[df["top"] > header_y].copy()
	    totals_mask = below["text"].str.lower().str.contains(
	        r"(?:sub\s*total|amount\s*due|total|grand\s*total|balance)", regex=True, na=False
	    )
	    if totals_mask.any():
	        stop_y = below.loc[totals_mask, "top"].min()
	        below = below[below["top"] < stop_y - 4]

	    rows = []
	    for _, g in below.groupby(line_keys):
	        g = g.sort_values("left")
	        buckets = [[] for _ in col_x]
	        for left, word in zip(g["left"], g["text"]):
	            if not word.strip():
	                continue
	            # a word belongs to the column whose header starts nearest to it
	            buckets[int(np.abs(xs - left).argmin())].append(word)
	        if any(buckets):
	            rows.append([" ".join(cell).strip() for cell in buckets])
	    if not rows:
	        return pd.DataFrame()

	    # name the columns after the header words
	    names = []
	    for i, w in enumerate(header_band["text"].tolist()):
	        wl = w.strip().lower()
	        if "desc" in wl or wl in ["item", "description"]:
	            names.append("description")
	        elif wl in ["qty", "quantity"]:
	            names.append("quantity")
	        elif "unit" in wl or "rate" in wl or "price" in wl:
	            names.append("unit_price")
	        elif "amount" in wl or "total" in wl:
	            names.append("line_total")
	        else:
	            names.append(f"col_{i}")

	    # A multi-word header such as "Unit Price" yields the same name twice;
	    # duplicate column names lose data in to_dict(orient="records").
	    seen = {}
	    unique_names = []
	    for n in names:
	        seen[n] = seen.get(n, 0) + 1
	        unique_names.append(n if seen[n] == 1 else f"{n}_{seen[n]}")

	    df_rows = pd.DataFrame(rows, columns=unique_names).fillna("")
	    # drop lines whose cells are all blank
	    has_content = df_rows.apply(lambda r: "".join(r.values), axis=1).str.strip() != ""
	    return df_rows[has_content].reset_index(drop=True)

	# ========================= APP =========================
	# Entry point of the page: upload, page rasterisation, model loading and
	# page selection. Each early exit uses st.stop() so nothing below it runs.
	st.title("Invoice Extraction — ViT recognizer (dparres) + Tesseract detector")

	up = st.file_uploader("Upload an invoice (PDF/JPG/PNG)", type=["pdf","png","jpg","jpeg"])
	if not up:
	    st.info("Upload a scanned invoice to begin.")
	    st.stop()

	pages = load_pages(up.read(), up.name)
	if not pages:
	    # e.g. a PDF that rasterised to zero pages; pages[0] below would raise
	    st.error("No pages could be read from the uploaded file.")
	    st.stop()

	# load model once
	num_classes = len(alphabet) + (1 if arch == "Swin_CTC" else 0)  # add CTC blank for Swin_CTC
	# Validate the user-supplied path with a visible error instead of an assert:
	# asserts are stripped under "python -O" and surface as a raw traceback.
	if not os.path.exists(ckpt_path):
	    st.error(f"Checkpoint not found: {ckpt_path}")
	    st.stop()
	model = load_pdrt(arch, ckpt_path, num_classes)
	tfm = build_transform(img_h, img_w)

	# The page picker is only shown for multi-page files; it is 1-based in the UI.
	page_idx = 0
	if len(pages) > 1:
	    page_idx = int(st.number_input("Page", 1, len(pages), 1)) - 1
	img = pages[page_idx]

	col1, col2 = st.columns([1.1,1.3], gap="large")

	# The detection image is also used by the results column, so build it up
	# front; the call renders nothing, so the element order in the preview
	# column is unchanged.
	det_img = preprocess_for_detection(img)

	# Left column: the page as uploaded, plus the preprocessed image that is
	# handed to the word detector (collapsed by default).
	with col1:
	    st.subheader("Preview")
	    st.image(img, use_column_width=True)
	    with st.expander("Detection view"):
	        st.image(det_img, use_column_width=True)

	# Right column: detect words, recognise each crop, rebuild the text, then
	# show key fields, line items and the download buttons.
	with col2:
	    st.subheader("OCR & Extraction")
	    # 1) detect words (boxes only)
	    det_df = detect_words(det_img, lang=det_lang)
	    # crop_words() skips rows with blank text; drop them first so that crops
	    # and predictions line up row-for-row with det_df (otherwise the "pred"
	    # assignment below fails with a length mismatch).
	    det_df = det_df[det_df["text"].astype(str).str.strip() != ""].reset_index(drop=True)

	    # 2) crop & recognize each word via ViT recognizer
	    crops, _ = crop_words(det_img, det_df)
	    texts = recognize_word_crops(model, crops, tfm, arch, alphabet)

	    # 3) stitch line-by-line using tesseract line indices
	    det_df["pred"] = texts
	    lines = []
	    for _, g in det_df.groupby(["block_num","par_num","line_num"]):
	        # words of one line, left to right, skipping empty predictions
	        ordered = g.sort_values("left")["pred"].tolist()
	        lines.append(" ".join(t for t in ordered if t))
	    full_text = "\n".join(ln for ln in lines if ln.strip())

	    if show_boxes:
	        st.caption("First 15 predicted words")
	        st.write(det_df[["left","top","width","height","text","pred"]].head(15))

	    # 4) key fields
	    key_fields = parse_fields(full_text)
	    k1,k2,k3 = st.columns(3)
	    with k1:
	        st.write(f"Invoice #: {key_fields.get('invoice_number') or '—'}")
	        st.write(f"Invoice Date: {key_fields.get('invoice_date') or '—'}")
	    with k2:
	        st.write(f"PO #: {key_fields.get('po_number') or '—'}")
	        st.write(f"Subtotal: {key_fields.get('subtotal') or '—'}")
	    with k3:
	        st.write(f"Tax: {key_fields.get('tax') or '—'}")
	        tot = key_fields.get('total') or '—'
	        cur = key_fields.get('currency') or ''
	        st.write(f"Total: {tot} {cur}".strip())

	    # 5) line items (geometry heuristic), run on the recognizer's text
	    #    rather than Tesseract's
	    items = items_from_wordgrid(det_df.assign(text=det_df["pred"]))
	    st.markdown("Line Items")
	    if items.empty:
	        st.caption("No line items confidently detected.")
	    else:
	        st.dataframe(items, use_container_width=True)

	    # 6) downloads
	    result = {
	        "file": up.name, "page": page_idx+1,
	        "key_fields": key_fields,
	        "items": items.to_dict(orient="records") if not items.empty else [],
	        "full_text": full_text
	    }
	    st.download_button("Download JSON", data=json.dumps(result, indent=2), file_name="invoice_extraction.json", mime="application/json")
	    if not items.empty:
	        st.download_button("Download Items CSV", data=items.to_csv(index=False), file_name="invoice_items.csv", mime="text/csv")