import os, re, time, traceback, warnings, requests
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg") # 👈 headless backend for HF Spaces
import matplotlib.pyplot as plt
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, average_precision_score
from sklearn.metrics.pairwise import cosine_similarity
from xgboost import XGBRegressor
warnings.filterwarnings("ignore")
# -----------------------------
# Config
# -----------------------------
DATA_CANDIDATES = [
os.getenv("TEM1_DATA_PATH", "tem1_clean.csv"),
"data/tem1_clean.csv",
"/data/tem1_clean.csv",
]
UNIPROT_ID = "P62593" # TEM-1 beta-lactamase
PAFF_BINDER_THRESHOLD = 6.0 # pAff >= 6 corresponds to Kd <= 1 µM
# -----------------------------
# Small helpers
# -----------------------------
def pAff_to_nM(p):
# p = -log10(Kd M) -> Kd (nM) = 10**(9-p)
return 10.0 ** (9.0 - float(p))
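# Example: pAff 6.0 -> Kd = 10**(9 - 6.0) = 1000 nM = 1 µM; pAff 9.0 -> 1 nM.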
def fmt_conc(nM):
if nM < 1e-3: return f"{nM*1e3:.2f} pM"
if nM < 1: return f"{nM:.2f} nM"
if nM < 1e3: return f"{nM/1e3:.2f} µM"
return f"{nM/1e6:.2f} mM"
def conf_label(p):
if p >= 0.80: return "Likely"
if p >= 0.60: return "Uncertain"
return "Unlikely"
def conf_emoji(p):
if p >= 0.80: return "🟢"
if p >= 0.60: return "🟡"
return "🔴"
def _parse_smiles_block(text, limit=100):
items = [s.strip() for s in re.split(r'[\n,;]+', str(text or "")) if s.strip()]
return items[:limit]
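# Example: _parse_smiles_block("CCO, c1ccccc1; CC#N") -> ["CCO", "c1ccccc1", "CC#N"]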
# -----------------------------
# Load TEM-1 protein and embed
# -----------------------------
print("[boot] Fetching TEM-1 (UniProt %s)" % UNIPROT_ID)
resp = requests.get(f"https://rest.uniprot.org/uniprotkb/{UNIPROT_ID}.fasta", timeout=30)
resp.raise_for_status()
TEM1_SEQ = "".join(line.strip() for line in resp.text.splitlines() if not line.startswith(">"))
TEM1_SEQ = re.sub(r"[^ACDEFGHIKLMNPQRSTVWY]", "", TEM1_SEQ.upper())
print("[boot] TEM-1 length:", len(TEM1_SEQ))
device = "cuda" if torch.cuda.is_available() else "cpu"
print("[boot] Using device:", device)
print("[boot] Loading ESM-2 35M ...")
tok_p = AutoTokenizer.from_pretrained("facebook/esm2_t12_35M_UR50D")
mdl_p = AutoModel.from_pretrained("facebook/esm2_t12_35M_UR50D").to(device).eval()
print("[boot] Loading ChemBERTa ...")
tok_l = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
mdl_l = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1").to(device).eval()
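# Embed the full TEM-1 sequence once: mean-pool the per-residue hidden states,
# dropping the BOS/EOS special tokens the ESM-2 tokenizer adds (hence [0, 1:-1, :]).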
with torch.inference_mode():
toks = tok_p(TEM1_SEQ, return_tensors="pt", add_special_tokens=True).to(device)
rep = mdl_p(**toks).last_hidden_state[0, 1:-1, :].mean(dim=0).cpu().numpy()
prot_vec = rep.astype(np.float32) # ~480-D
print("[boot] Protein embedding:", prot_vec.shape)
def _embed_ligands(smiles_list, batch_size=64, max_length=256):
vecs = []
for i in range(0, len(smiles_list), batch_size):
batch = smiles_list[i:i+batch_size]
enc = tok_l(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)
with torch.inference_mode():
out = mdl_l(**enc).last_hidden_state
cls = out[:, 0, :].detach().cpu().numpy().astype(np.float32)
vecs.append(cls)
return np.vstack(vecs) if vecs else np.zeros((0, mdl_l.config.hidden_size), dtype=np.float32)
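# Usage sketch (output width depends on the ligand model's hidden size):
#   vecs = _embed_ligands(["CCO", "c1ccccc1"])  # -> float32 array of shape (2, hidden_size)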
# -----------------------------
# Try to load training data
# -----------------------------
df = None
for p in DATA_CANDIDATES:
if os.path.exists(p):
        try:
            cand = pd.read_csv(p)
            if {"smiles", "pAff"}.issubset(cand.columns):
                df = cand
                print(f"[boot] Loaded dataset: {p} -> {cand.shape}")
                break
            print(f"[boot] Skipping {p}: missing 'smiles'/'pAff' columns")
        except Exception as e:
            print("[boot] Failed reading", p, e)
have_data = df is not None
# Placeholders initialized below
reg = None
clf = None
clf_cal = None
bins = None
q90_table = None
lig_tr = None
metrics_md = "*(Train a model or upload tem1_clean.csv to populate metrics here.)*"
def _train_models_from_df(df):
global reg, clf, clf_cal, bins, q90_table, lig_tr, metrics_md
df = df.dropna(subset=["smiles","pAff"]).reset_index(drop=True)
# Ligand embeddings
t0 = time.time()
lig_X = _embed_ligands(df["smiles"].tolist())
print(f"[train] Ligand embed {lig_X.shape} in {time.time()-t0:.1f}s")
# Joint features with protein
prot_X = np.repeat(prot_vec.reshape(1, -1), len(df), axis=0)
X = np.hstack([prot_X, lig_X]).astype(np.float32)
# Targets
y = df["pAff"].astype(np.float32).values
y_bin = (y >= PAFF_BINDER_THRESHOLD).astype(int)
    # Group-wise split over k-means clusters of the ligand embeddings
    # (a cheap stand-in for a scaffold split)
    k = min(max(5, min(50, len(df) // 50)), len(df))  # keep n_clusters <= n_samples
    km = KMeans(n_clusters=k, random_state=7, n_init=10)
groups = km.fit_predict(lig_X)
# custom split that holds out whole clusters
def groupwise_split(groups, test_frac=0.2, seed=7):
rng = np.random.default_rng(seed)
keys = list(set(groups))
rng.shuffle(keys)
N = len(groups)
target = int(N*test_frac)
taken, test_idx = 0, []
for key in keys:
idx = np.where(groups==key)[0].tolist()
test_idx.extend(idx)
taken += len(idx)
if taken >= target:
break
train_idx = sorted(set(range(N)) - set(test_idx))
return np.array(train_idx), np.array(test_idx)
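    # Holding out whole clusters keeps near-duplicate ligands from straddling
    # the train/test boundary, so the eval numbers below are less optimistic
    # than a random row-wise split would be.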
tr_idx, te_idx = groupwise_split(groups, test_frac=0.2, seed=7)
X_tr, X_te = X[tr_idx], X[te_idx]
y_tr, y_te = y[tr_idx], y[te_idx]
yb_tr, yb_te = y_bin[tr_idx], y_bin[te_idx]
# Heads
reg = XGBRegressor(
n_estimators=600, max_depth=6, learning_rate=0.05,
subsample=0.8, colsample_bytree=0.8, n_jobs=-1
).fit(X_tr, y_tr)
clf = LogisticRegression(max_iter=2000).fit(X_tr, yb_tr)
# Metrics
pred = reg.predict(X_te)
try:
rmse = mean_squared_error(y_te, pred, squared=False)
except TypeError:
rmse = mean_squared_error(y_te, pred) ** 0.5
r2 = r2_score(y_te, pred)
p_bin = clf.predict_proba(X_te)[:, 1]
roc = roc_auc_score(yb_te, p_bin)
pr = average_precision_score(yb_te, p_bin)
# conditional q90 by predicted bin
bins = np.linspace(float(pred.min()), float(pred.max()), 8)
bin_idx = np.digitize(pred, bins)
abs_err = np.abs(y_te - pred)
q90_table = np.zeros(len(bins)+1, dtype=np.float32)
for i in range(len(q90_table)):
vals = abs_err[bin_idx==i]
q90_table[i] = np.quantile(vals, 0.90) if len(vals)>0 else float(np.quantile(abs_err, 0.90))
# calibration & similarity
clf_cal = CalibratedClassifierCV(clf, method="isotonic", cv=3).fit(X_tr, yb_tr)
lig_tr = lig_X[tr_idx]
metrics_md = (
f"**Eval (held-out)** — RMSE: {rmse:.2f} pAff (≈×{10**rmse:.1f}), "
f"R²: {r2:.2f}, ROC-AUC: {roc:.2f}, PR-AUC: {pr:.2f}"
)
print("[train] done.")
def q90_for(p):
i = int(np.digitize([p], bins)[0]) if bins is not None else 0
i = max(0, min(i, len(q90_table)-1)) if q90_table is not None else 0
return q90_table[i] if q90_table is not None else 0.75 # conservative fallback
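# q90_for yields a conditional interval half-width: the 90th percentile of
# held-out absolute error within the same predicted-pAff bin, so intervals
# widen where the regressor was less accurate.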
# Try real training; otherwise install heuristic heads
if have_data:
_train_models_from_df(df)
else:
print("[boot] No dataset found — using heuristic heads (demo mode).")
    def _heuristic_sim(X):
        # X: [B, Dp+Dl]; take the ligand half and score it against the protein
        # vector over their overlapping dimensions (ESM-2 35M is 480-d,
        # ChemBERTa-base is 768-d, so both are truncated to the shorter length).
        Dp = prot_vec.shape[0]
        lig = X[:, Dp:]
        d = min(Dp, lig.shape[1])
        pv = prot_vec[:d]
        pv = pv / (np.linalg.norm(pv) + 1e-8)
        lig_d = lig[:, :d]
        lig_n = lig_d / (np.linalg.norm(lig_d, axis=1, keepdims=True) + 1e-8)
        return lig_n @ pv
    class HeuristicReg:
        def predict(self, X):
            sim = _heuristic_sim(X)
            return 5.5 + 2.0 * (sim.clip(-1, 1) + 1) / 2.0  # maps to [5.5, 7.5]
    class HeuristicClf:
        def predict_proba(self, X):
            sim = _heuristic_sim(X)
            z = (sim - sim.min()) / (sim.max() - sim.min() + 1e-8)
            p = 1 / (1 + np.exp(-4 * (z - 0.5)))
            return np.vstack([1 - p, p]).T
reg = HeuristicReg()
clf = HeuristicClf()
clf_cal = clf
bins = np.linspace(4.0, 8.0, 8)
q90_table = np.full(len(bins)+1, 0.75, dtype=np.float32)
lig_tr = np.zeros((1, mdl_l.config.hidden_size), dtype=np.float32)
metrics_md = "*(Demo mode — upload tem1_clean.csv to train real heads.)*"
# -----------------------------
# Prediction helpers
# -----------------------------
def train_similarity(smiles):
enc = tok_l([smiles], padding=True, truncation=True, max_length=256, return_tensors="pt").to(device)
with torch.inference_mode():
lig = mdl_l(**enc).last_hidden_state[:,0,:].cpu().numpy().astype(np.float32)
if lig_tr is None or lig_tr.shape[0]==0:
return 0.0
sim = cosine_similarity(lig, lig_tr)[0]
return float(sim.max())
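# Max cosine similarity to the training ligands is a cheap applicability-domain
# check: low values flag chemotypes the heads never saw, where predictions
# deserve extra skepticism.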
def _blank_fig(width=3.6, height=0.6):
fig = plt.figure(figsize=(width, height))
plt.axis("off")
return fig
def predict_smiles(smiles: str):
try:
# Empty input → friendly message + blank fig
        if not smiles or not smiles.strip():
            return "Please enter a SMILES string.", _blank_fig()
# 1) ligand embedding
enc = tok_l([smiles], padding=True, truncation=True, max_length=256, return_tensors="pt").to(device)
with torch.inference_mode():
out = mdl_l(**enc).last_hidden_state
lig = out[:, 0, :].detach().cpu().numpy().astype(np.float32)
# 2) joint feature
fx = np.hstack([prot_vec.reshape(1, -1), lig]).astype(np.float32)
# 3) regression + interval
p_aff = float(reg.predict(fx)[0])
q90 = q90_for(p_aff)
p_lo, p_hi = p_aff - q90, p_aff + q90
nM_center = pAff_to_nM(p_aff)
nM_hi, nM_lo = pAff_to_nM(p_hi), pAff_to_nM(p_lo)
        # 4) calibrated binder probability (fall back to the raw head if calibration is unavailable)
        try:
            p_cal = float(clf_cal.predict_proba(fx)[0, 1])
        except Exception:
            p_cal = float(clf.predict_proba(fx)[0, 1])
label = conf_label(p_cal); mark = conf_emoji(p_cal)
badge = " (≤1 µM)" if p_aff >= PAFF_BINDER_THRESHOLD else ""
# 5) similarity
sim = train_similarity(smiles)
sim_note = (f"\nNearest-set similarity: {sim:.2f}"
if sim >= 0.60 else
f"\n⚠️ Low similarity to training set: {sim:.2f}")
md = (
f"**Predicted pAff:** {p_aff:.2f} (−log10 M){badge} → **Kd ≈ {fmt_conc(nM_center)}**\n\n"
f"**90% interval:** {p_lo:.2f}{p_hi:.2f} (≈ {fmt_conc(nM_hi)} to {fmt_conc(nM_lo)})\n\n"
f"**Binder confidence:** {mark} {label} ({p_cal:.2f}){sim_note}\n"
)
# Mini bar to visualize P(binder)
fig = plt.figure(figsize=(3.6, 0.6))
ax = fig.add_axes([0.07, 0.35, 0.86, 0.35])
ax.barh([0], [p_cal], height=0.6)
ax.set_xlim(0, 1)
ax.set_yticks([])
ax.set_xticks([0, 0.5, 1.0])
ax.set_title("P(binder)")
for spine in ax.spines.values():
spine.set_visible(False)
return md, fig
except Exception as e:
# Show the error inline so we can debug without checking logs
tb = traceback.format_exc(limit=5)
msg = f"❌ **Error:** {e}\n\n```\n{tb}\n```"
return msg, _blank_fig()
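# Usage sketch (aspirin SMILES shown as an arbitrary example):
#   md, fig = predict_smiles("CC(=O)Oc1ccccc1C(=O)O")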
def batch_predict(smiles_text):
smi = _parse_smiles_block(smiles_text)
if not smi:
return [], np.array([]), np.array([])
lig = _embed_ligands(smi) # (L, Dl)
P = np.repeat(prot_vec.reshape(1, -1), len(smi), 0) # (L, Dp)
X = np.hstack([P, lig]).astype(np.float32) # (L, Dp+Dl)
    p_aff = reg.predict(X)
    try:
        p_bind = clf_cal.predict_proba(X)[:, 1]  # calibrated, matching the plot labels
    except Exception:
        p_bind = clf.predict_proba(X)[:, 1]
return smi, p_aff, p_bind
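# batch_predict returns parallel sequences: SMILES strings, predicted pAff
# values, and calibrated P(binder), consumed by the plotting helpers below.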
def plot_paff_bars(names, paff, paff_thr=PAFF_BINDER_THRESHOLD):
names = list(names); paff = np.array(paff, dtype=float)
fig, ax = plt.subplots(figsize=(max(6, len(names)*0.6), 3.2))
ax.bar(range(len(names)), paff)
ax.axhline(paff_thr, linestyle="--")
ax.set_xticks(range(len(names)))
ax.set_xticklabels([n[:16]+("…" if len(n)>16 else "") for n in names], rotation=45, ha="right")
ax.set_ylabel("Predicted pAff (−log10 M)"); ax.set_title("Batch predictions — pAff")
plt.tight_layout()
return fig
def plot_paff_vs_pbind(names, paff, pbind, hi=0.80, mid=0.60, paff_thr=PAFF_BINDER_THRESHOLD):
names = list(names); paff = np.array(paff, dtype=float); pbind = np.array(pbind, dtype=float)
fig, ax = plt.subplots(figsize=(5.8, 4.2))
ax.scatter(paff, pbind, s=36)
ax.axvline(paff_thr, linestyle="--"); ax.axhline(hi, linestyle="--"); ax.axhline(mid, linestyle="--")
top = np.argsort(-(paff + pbind))[:10]
for i in top:
lbl = names[i][:18] + ("…" if len(names[i]) > 18 else "")
ax.annotate(lbl, (paff[i], pbind[i]), xytext=(4, 4), textcoords="offset points")
ax.set_xlabel("Predicted pAff (−log10 M)"); ax.set_ylabel("Calibrated P(binder)")
ax.set_title("Batch predictions"); plt.tight_layout()
return fig
def heatmap_predict(smiles_block):
smi_list = _parse_smiles_block(smiles_block)
if not smi_list:
fig = plt.figure(figsize=(4, 2))
plt.axis("off")
plt.text(0.5, 0.5, "No SMILES provided", ha="center", va="center")
return fig
# Embed ligands
ligs = _embed_ligands(smi_list)
# Joint features (protein + ligands)
pv_rep = np.repeat(prot_vec.reshape(1, -1), len(smi_list), axis=0)
fx = np.hstack([pv_rep, ligs]).astype(np.float32)
# Predict pAff (single protein row)
p_affs = reg.predict(fx) # shape (L,)
M = p_affs.reshape(1, -1) # 1 x L
fig, ax = plt.subplots(figsize=(max(6, len(smi_list)*0.8), 2.8))
im = ax.imshow(M, aspect="auto")
ax.set_xticks(range(len(smi_list)))
ax.set_xticklabels([s[:14] + ("…" if len(s) > 14 else "") for s in smi_list],
rotation=45, ha="right")
ax.set_yticks([0]); ax.set_yticklabels(["TEM-1 (WT)"])
cbar = fig.colorbar(im, ax=ax); cbar.set_label("Predicted pAff")
# Mark predicted binders (>= threshold)
for j in range(M.shape[1]):
if M[0, j] >= PAFF_BINDER_THRESHOLD:
ax.text(j, 0, "★", ha="center", va="center", color="white", fontsize=12)
ax.set_title("Heatmap — predicted pAff (higher is better)")
plt.tight_layout()
return fig
# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks(title="Antibiotic Resistance Target Finder — TEM-1") as demo:
gr.Markdown("""\
# Antibiotic Resistance Target Finder — TEM-1
**Goal:** Predict how tightly a small molecule binds **TEM-1 β-lactamase** variants.
**How to use (2 steps):**
1) Paste a **SMILES** string and click **Submit** to get a prediction.
2) (Optional) Paste multiple SMILES for batch plots and a heatmap.
*Protein embeddings:* ESM-2 (35M) · *Ligand embeddings:* ChemBERTa · *Models:* XGBoost + LogisticRegression
""")
with gr.Row():
smi_in = gr.Textbox(label="SMILES", placeholder="e.g., CC1=CC(=O)C=CC1=O", lines=1)
btn = gr.Button("Submit", variant="primary")
out_md = gr.Markdown()
out_plot = gr.Plot()
btn.click(fn=predict_smiles, inputs=smi_in, outputs=[out_md, out_plot])
gr.Markdown("""---
### Batch mode (paste 1–100 SMILES separated by newlines, commas, or semicolons)
""")
smi_batch = gr.Textbox(label="Batch SMILES", lines=6, placeholder="SMILES per line ...")
with gr.Row():
btn_bars = gr.Button("Bar chart (pAff)")
btn_scatter = gr.Button("Scatter (pAff vs P(binder))")
btn_heat = gr.Button("Heatmap")
plot1 = gr.Plot()
plot2 = gr.Plot()
plot3 = gr.Plot()
def _bars(smiblock):
names, paff, pbind = batch_predict(smiblock)
return plot_paff_bars(names, paff)
def _scatter(smiblock):
names, paff, pbind = batch_predict(smiblock)
return plot_paff_vs_pbind(names, paff, pbind)
def _heat(smiblock):
return heatmap_predict(smiblock)
btn_bars.click(_bars, inputs=smi_batch, outputs=plot1)
btn_scatter.click(_scatter, inputs=smi_batch, outputs=plot2)
btn_heat.click(_heat, inputs=smi_batch, outputs=plot3)
with gr.Accordion("Model card: assumptions, metrics & limits", open=False):
gr.Markdown("""\
**Compute footprint:** small (≤50M-parameter embedding models + lightweight heads). Runs on CPU in Spaces.
%s
**Assumptions / caveats**
- Trained on **TEM-1** datasets; predictions for very dissimilar chemotypes are less certain.
- Reported “confidence” is **calibrated** (isotonic regression, 3-fold CV on the training split); it is not a substitute for wet-lab validation.
- Use as a **ranking/triage** tool, not as a definitive activity claim.
**pAff** is −log10(Kd in molar). Bigger is better. Example: 1 µM → pAff=6; 100 nM → 7; 10 nM → 8.
""" % metrics_md)
demo.launch()