from sentence_transformers import SentenceTransformer, CrossEncoder  # SentenceTransformer -> embedding model, CrossEncoder -> re-ranker
from ctransformers import AutoModelForCausalLM
from torch import Tensor
from google import genai
from google.genai import types
from app.chunks import Chunk
import os
import numpy as np  # used only for type hints
from app.settings import device, local_llm_config, local_generation_config, gemini_generation_config

class Embedder:
    def __init__(self, model: str = "BAAI/bge-m3"):
        self.device: str = device
        self.model_name: str = model
        self.model: SentenceTransformer = SentenceTransformer(model, device=self.device)

    def encode(self, text: str | list[str]) -> Tensor | list[Tensor]:
        '''
        Encodes a string (or a list of strings) into dense vector(s).
        '''
        return self.model.encode(sentences=text, show_progress_bar=False, batch_size=32)

    def get_vector_dimensionality(self) -> int | None:
        '''
        Returns the dimensionality of the dense vectors produced by the model.
        '''
        return self.model.get_sentence_embedding_dimension()
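
# Example usage of Embedder (a minimal sketch; "BAAI/bge-m3" is the default model above,
# and the texts would normally come from the app's chunks):
#     embedder = Embedder()
#     vectors = embedder.encode(["What is attention?", "How does retrieval work?"])
#     dim = embedder.get_vector_dimensionality()  # vector size, e.g. for configuring the index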

class Reranker:
    def __init__(self, model: str = "cross-encoder/ms-marco-MiniLM-L6-v2"):
        self.device: str = device
        self.model_name: str = model
        self.model: CrossEncoder = CrossEncoder(model, device=self.device)

    def rank(self, query: str, chunks: list[Chunk]) -> list[dict[str, int]]:
        '''
        Returns a list of dicts re-sorted by relevance; we only need the 'corpus_id' key,
        since it is the position of the chunk in the original list.
        '''
        return self.model.rank(query, [chunk.get_raw_text() for chunk in chunks])
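
# Example usage of Reranker (a sketch; assumes `chunks` is a list of Chunk objects
# retrieved elsewhere in the app):
#     reranker = Reranker()
#     hits = reranker.rank("what is attention?", chunks)
#     ordered = [chunks[hit["corpus_id"]] for hit in hits]  # most relevant chunk first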

# TODO: add model parameters to the global config file
# TODO: add exception handling for when the response has more tokens than was configured
# TODO: find a way to keep the model from producing overly long answers
class LocalLLM:
    def __init__(self):
        self.model = AutoModelForCausalLM.from_pretrained(**local_llm_config)

    def get_response(self, prompt: str, stream: bool = True, logging: bool = True,
                     use_default_config: bool = True) -> str:
        '''
        Produces the response to the user's prompt.
        stream -> flag that determines whether we wait until the whole response is ready
                  or emit it token by token.
        TODO: find a way to actually stream the answer (as the return value)
        '''
        with open("prompt.txt", "w", encoding="utf-8", errors="replace") as f:
            f.write(prompt)
        generated_text = ""
        tokenized_text: list[int] = self.model.tokenize(text=prompt)
        response: list[int] = self.model.generate(tokens=tokenized_text, **local_generation_config)
        if logging:
            print(response)
        if not stream:
            return self.model.detokenize(response)
        for token in response:
            chunk = self.model.detokenize([token])
            generated_text += chunk
            if logging:
                print(chunk, end="", flush=True)  # flush -> write the chunk out immediately instead of buffering
        return generated_text
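
# Example usage of LocalLLM (a sketch; the checkpoint and generation settings come from
# local_llm_config / local_generation_config in app.settings):
#     llm = LocalLLM()
#     answer = llm.get_response("Summarize the retrieved context: ...", stream=False, logging=False)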

class Gemini:
    def __init__(self, model="gemini-2.0-flash"):
        self.client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])
        self.model = model

    def get_response(self, prompt: str, stream: bool = True, logging: bool = True,
                     use_default_config: bool = False) -> str:
        with open("prompt.txt", "w", encoding="utf-8", errors="replace") as f:
            f.write(prompt)
        response = self.client.models.generate_content(
            model=self.model,
            contents=prompt,
            config=types.GenerateContentConfig(**gemini_generation_config) if use_default_config else None
        )
        return response.text
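
# Example usage of Gemini (a sketch; requires the GEMINI_API_KEY environment variable):
#     gemini = Gemini()
#     answer = gemini.get_response("Answer using the context above: ...", use_default_config=True)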