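"""Billy AI: a Gradio chat app built around Llama-3.1-8B-Instruct.

Features: optional 4-bit loading via bitsandbytes, DuckDuckGo web search for
knowledge ingestion, and lightweight RAG with sentence-transformers backed by
ChromaDB (with an in-memory fallback when Chroma is unavailable).
"""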
import hashlib
import os
import time
from typing import List, Dict, Any, Tuple, Optional
import torch
import gradio as gr
# Optional deps (web search + vector store)
ddg = None
DDGS = None
try:
from duckduckgo_search import ddg as _ddg
ddg = _ddg
except Exception:
try:
from duckduckgo_search import DDGS as _DDGS
DDGS = _DDGS
except Exception:
ddg = None
DDGS = None
try:
import chromadb
except Exception:
chromadb = None
from sentence_transformers import SentenceTransformer
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
)
# Optional quantization (4-bit on GPU). Check for bitsandbytes itself:
# BitsAndBytesConfig imports fine even when the package is not installed.
BITSANDBYTES_AVAILABLE = False
try:
    import bitsandbytes  # noqa: F401
    from transformers import BitsAndBytesConfig
    BITSANDBYTES_AVAILABLE = True
except Exception:
    BITSANDBYTES_AVAILABLE = False
# ===============================
# 1) Model Setup (Llama-3.1-8B-Instruct)
# ===============================
MODEL_ID = os.getenv("MODEL_ID", "meta-llama/Meta-Llama-3.1-8B-Instruct")
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_HUB_TOKEN")
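# The checkpoint is configurable via the MODEL_ID env var; gated Llama repos also
# need an HF token (HF_TOKEN or HUGGINGFACE_HUB_TOKEN) with access granted on the Hub.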
print("🚀 Loading Billy AI model...")
# Tokenizer
try:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
except TypeError:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_auth_token=HF_TOKEN)
if tokenizer.pad_token_id is None:
# Fallback to eos as pad if not set
tokenizer.pad_token_id = tokenizer.eos_token_id
def _gpu_bf16_supported() -> bool:
try:
return torch.cuda.is_available() and torch.cuda.is_bf16_supported()
except Exception:
return False
def _model_device(m) -> torch.device:
try:
return next(m.parameters()).device
except Exception:
return torch.device("cpu")
load_kwargs: Dict[str, Any] = {}
if torch.cuda.is_available():
if BITSANDBYTES_AVAILABLE:
print("⚙️ Using 4-bit quantization (bitsandbytes).")
compute_dtype = torch.bfloat16 if _gpu_bf16_supported() else torch.float16
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=compute_dtype,
)
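        # NF4 stores the 8B weights in 4-bit blocks (roughly a quarter of fp16 memory),
        # while matmuls still run in bf16/fp16 via bnb_4bit_compute_dtype.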
load_kwargs.update(dict(device_map="auto", quantization_config=bnb_config, token=HF_TOKEN))
else:
print("⚙️ No bitsandbytes: loading in half precision on GPU.")
load_kwargs.update(dict(device_map="auto",
torch_dtype=torch.bfloat16 if _gpu_bf16_supported() else torch.float16,
token=HF_TOKEN))
else:
print("⚠️ No GPU detected: CPU load (slow). Consider a smaller model or enable GPU runtime.")
load_kwargs.update(dict(torch_dtype=torch.float32, token=HF_TOKEN))
# Load model with fallbacks for auth kwarg differences
try:
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
except TypeError:
load_kwargs.pop("token", None)
try:
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
except TypeError:
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, use_auth_token=HF_TOKEN, **load_kwargs)
MODEL_DEVICE = _model_device(model)
print(f"✅ Model loaded on: {MODEL_DEVICE}")
# ===============================
# 2) Lightweight RAG (Embeddings + Optional Chroma + In-Memory Fallback)
# ===============================
try:
embedder = SentenceTransformer("all-MiniLM-L6-v2")
print("✅ Embedding model loaded.")
except Exception as e:
raise RuntimeError(f"Embedding model load failed: {e}")
# Optional Chroma persistent store; fallback to in-memory store if unavailable.
chroma_client = None
collection = None
if chromadb is not None:
try:
chroma_client = chromadb.PersistentClient(path="./billy_rag_db")
try:
collection = chroma_client.get_collection("billy_rag")
except Exception:
collection = chroma_client.create_collection("billy_rag")
print("✅ ChromaDB ready.")
except Exception as e:
print(f"⚠️ ChromaDB init failed: {e}; falling back to in-memory store.")
# In-memory store: list of dicts {text, embedding}
memory_store: List[Dict[str, Any]] = []
def _stable_id(text: str) -> str:
return hashlib.sha1(text.encode("utf-8")).hexdigest()
def search_web(query: str, max_results: int = 3) -> List[str]:
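    """Return up to `max_results` DuckDuckGo text snippets, or [] if web search is unavailable."""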
# Try legacy ddg function
try:
if ddg is not None:
try:
results = ddg(query, max_results=max_results)
except TypeError:
results = ddg(keywords=query, max_results=max_results)
snippets = []
for r in results or []:
if not r:
continue
snippets.append(r.get("body") or r.get("snippet") or r.get("title") or "")
return [s for s in snippets if s and s.strip()]
except Exception:
pass
# Try modern DDGS client
try:
if DDGS is not None:
with DDGS() as d:
results = list(d.text(query, max_results=max_results))
snippets = []
for r in results or []:
if not r:
continue
# r keys differ slightly in DDGS()
snippets.append(r.get("body") or r.get("snippet") or r.get("title") or r.get("href") or "")
return [s for s in snippets if s and s.strip()]
except Exception:
pass
return []
def store_knowledge(text: str):
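    """Embed `text` and persist it to the Chroma collection if available, else to the in-memory store."""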
if not text or not text.strip():
return
try:
vec = embedder.encode(text).tolist()
except Exception:
return
if collection is not None:
try:
collection.add(
documents=[text],
embeddings=[vec],
ids=[_stable_id(text)],
metadatas=[{"source": "web_or_local"}],
)
return
except Exception:
pass
# Fallback: in-memory
memory_store.append({"text": text, "embedding": vec})
def _cosine(a: List[float], b: List[float]) -> float:
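    """Cosine similarity of two equal-length vectors; zero-norm vectors are treated as norm 1."""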
s = 0.0
na = 0.0
nb = 0.0
for x, y in zip(a, b):
s += x * y
na += x * x
nb += y * y
na = na ** 0.5 or 1.0
nb = nb ** 0.5 or 1.0
return s / (na * nb)
def retrieve_knowledge(query: str, k: int = 5) -> str:
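    """Return the k stored snippets most similar to `query`, joined into one context string."""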
try:
qvec = embedder.encode(query).tolist()
except Exception:
return ""
# Prefer Chroma if available
if collection is not None:
try:
res = collection.query(query_embeddings=[qvec], n_results=k)
docs = res.get("documents", [])
if docs and docs[0]:
return " ".join(docs[0])
except Exception:
pass
# In-memory cosine top-k
if not memory_store:
return ""
scored: List[Tuple[str, float]] = []
for item in memory_store:
scored.append((item["text"], _cosine(qvec, item["embedding"])))
scored.sort(key=lambda x: x[1], reverse=True)
return " ".join([t for t, _ in scored[:k]])
# ===============================
# 3) Generation Utilities
# ===============================
def build_messages(system_prompt: str, chat_history: List[Tuple[str, str]], user_prompt: str) -> List[Dict[str, str]]:
messages: List[Dict[str, str]] = [{"role": "system", "content": system_prompt}]
# chat_history is a list of (user, assistant) tuples
for u, a in chat_history or []:
if u:
messages.append({"role": "user", "content": u})
if a:
messages.append({"role": "assistant", "content": a})
messages.append({"role": "user", "content": user_prompt})
return messages
def apply_chat_template_from_messages(messages: List[Dict[str, str]]) -> str:
try:
return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
except Exception:
# Fallback to simple instruct style if no template provided
sys = ""
user = ""
# Extract the last system and user message for a minimal fallback
for m in messages:
if m["role"] == "system":
sys = m["content"]
elif m["role"] == "user":
user = m["content"]
sys = (sys or "").strip()
user = (user or "").strip()
prefix = f"{sys}\n\n" if sys else ""
return f"{prefix}User: {user}\nAssistant:"
def _get_eos_token_id():
eos_id = getattr(tokenizer, "eos_token_id", None)
if isinstance(eos_id, list) and eos_id:
return eos_id[0]
return eos_id
def generate_text(prompt_text: str,
max_tokens: int = 600,
temperature: float = 0.6,
top_p: float = 0.9) -> str:
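    """Sample a completion for an already chat-templated prompt and return only the new text."""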
inputs = tokenizer(prompt_text, return_tensors="pt")
inputs = {k: v.to(MODEL_DEVICE) for k, v in inputs.items()}
output_ids = model.generate(
**inputs,
max_new_tokens=min(max_tokens, 2048),
do_sample=True,
temperature=temperature,
top_p=top_p,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=_get_eos_token_id(),
)
    # Decode only the newly generated tokens so the prompt is never echoed back
    gen_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
def summarize_text(text: str) -> str:
system = "You are Billy AI — a precise, helpful summarizer."
user = f"Summarize the following text in simple, clear bullet points (max 6 bullets):\n\n{text}"
messages = build_messages(system, [], user)
return generate_text(apply_chat_template_from_messages(messages), max_tokens=220, temperature=0.3, top_p=0.9)
def translate_text(text: str, lang: str) -> str:
system = "You are Billy AI — an expert translator."
user = f"Translate the following text to {lang} while preserving meaning and tone:\n\n{text}"
messages = build_messages(system, [], user)
return generate_text(apply_chat_template_from_messages(messages), max_tokens=220, temperature=0.3, top_p=0.9)
def explain_code(code: str) -> str:
system = "You are Billy AI — an expert software engineer and teacher."
user = ("Explain the following code step by step for a mid-level developer. "
"Include what it does, complexity, pitfalls, and an improved version if relevant.\n\n"
f"{code}")
messages = build_messages(system, [], user)
return generate_text(apply_chat_template_from_messages(messages), max_tokens=400, temperature=0.5, top_p=0.9)
# ===============================
# 4) Chat Orchestration
# ===============================
def make_system_prompt(local_knowledge: str) -> str:
base = ("You are Billy AI — a helpful, witty, and precise assistant. "
"You tend to outperform GPT-3.5 on reasoning, explanation, and coding tasks. "
"Be concise but thorough; use bullet points for clarity; cite assumptions; avoid hallucinations.")
if local_knowledge:
base += f"\nUseful context: {local_knowledge[:3000]}"
return base
def _ingest_search(query: str, max_results: int = 3) -> int:
snips = search_web(query, max_results=max_results)
for s in snips:
store_knowledge(s)
return len(snips)
def _parse_translate_command(cmd: str) -> Tuple[Optional[str], Optional[str]]:
# Supports patterns:
# /translate <lang>: <text>
# /translate <lang> | <text>
# /translate <lang> <text>
rest = cmd[len("/translate"):].strip()
if not rest:
return None, None
# Try separators
for sep in [":", "|"]:
if sep in rest:
lang, text = rest.split(sep, 1)
return lang.strip(), text.strip()
parts = rest.split(None, 1)
if len(parts) == 2:
return parts[0].strip(), parts[1].strip()
return None, None
def handle_message(message: str, chat_history: List[Tuple[str, str]]) -> str:
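    """Dispatch slash commands (/summarize, /explain, /translate, /search, /remember)
    or answer normally, prepending retrieved RAG context to the system prompt."""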
msg = (message or "").strip()
if not msg:
return "Please send a non-empty message."
# Slash commands
low = msg.lower()
if low.startswith("/summarize "):
return summarize_text(msg[len("/summarize "):].strip() or "Nothing to summarize.")
if low.startswith("/explain "):
return explain_code(message[len("/explain "):].strip())
if low.startswith("/translate"):
        lang, txt = _parse_translate_command(msg)
if not lang or not txt:
return "Usage: /translate <lang>: <text>"
return translate_text(txt, lang)
if low.startswith("/search "):
        q = msg[len("/search "):].strip()
if not q:
return "Usage: /search <query>"
n = _ingest_search(q, max_results=5)
ctx = retrieve_knowledge(q, k=5)
if n == 0 and not ctx:
return "No results found or web search unavailable."
return f"Ingested {n} snippet(s). Context now includes:\n\n{ctx[:1000]}"
if low.startswith("/remember "):
        t = msg[len("/remember "):].strip()
if not t:
return "Usage: /remember <text>"
store_knowledge(t)
return "Saved to knowledge base."
# RAG: retrieve related knowledge
local_knowledge = retrieve_knowledge(msg, k=5)
system_prompt = make_system_prompt(local_knowledge)
messages = build_messages(system_prompt, chat_history, msg)
prompt = apply_chat_template_from_messages(messages)
return generate_text(prompt, max_tokens=600, temperature=0.6, top_p=0.9)
# ===============================
# 5) Gradio UI
# ===============================
def respond(message, history):
# history is a list of [user, assistant] pairs
# Convert history to list of tuples[str, str]
tuples: List[Tuple[str, str]] = []
for turn in history or []:
if isinstance(turn, (list, tuple)) and len(turn) == 2:
u = turn[0] if turn[0] is not None else ""
a = turn[1] if turn[1] is not None else ""
tuples.append((str(u), str(a)))
try:
return handle_message(message, tuples)
except Exception as e:
return f"Error: {e}"
with gr.Blocks(title="Billy AI") as demo:
gr.Markdown("## Billy AI")
gr.Markdown(
"Commands: /summarize <text>, /explain <code>, /translate <lang>: <text>, /search <query>, /remember <text>"
)
chat = gr.ChatInterface(
fn=respond,
title="Billy AI",
theme="soft",
cache_examples=False,
)
if __name__ == "__main__":
# Share=False by default; set to True if you want a public link
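    # e.g. demo.launch(share=True) for a temporary public URL, or
    # demo.launch(server_name="0.0.0.0", server_port=7860) inside a container.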
demo.launch()