"""RAG building blocks for a medical Q&A assistant: load the
ChatDoctor-HealthCareMagic-100k dataset, index it in a FAISS vector store,
wrap a Hugging Face causal LM for LangChain, and define the answer prompt."""

from datasets import load_dataset
import pandas as pd
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate

def load_raw_dataset():
    """Load the ChatDoctor-HealthCareMagic-100k Q&A pairs as LangChain Documents."""
    dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")
    df = pd.DataFrame(dataset["train"])
    # Concatenate the patient question and doctor answer so both are embedded together.
    df["combined"] = df["input"] + " " + df["output"]
    docs = [
        Document(
            page_content=row["combined"],
            metadata={"question": row["input"], "answer": row["output"]},
        )
        for _, row in df.iterrows()
    ]
    return docs

def create_vector_database(docs, model_name):
    """Embed the documents with the given Hugging Face model and index them in FAISS."""
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    vectorstore = FAISS.from_documents(docs, embedding_model)
    return vectorstore

def get_llm(model_name):
    """Load a causal LM and expose it to LangChain via a text-generation pipeline."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype="auto", device_map="auto"
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
    )
    llm = HuggingFacePipeline(pipeline=pipe)
    return llm

def get_prompt_template():
    """Prompt that asks the model to answer using the retrieved reference cases."""
    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template="""Based on the following references and your medical knowledge, provide a detailed response.

References:
{context}

Question: {question}

Consider:
1. The key medical concepts in the question.
2. How the reference cases relate to this question.
3. What medical principles should be applied.
4. Any potential complications or considerations.

Give the final response:
""",
    )
    return prompt_template
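
# Minimal usage sketch: wires the helpers above into a retrieve-then-generate loop.
# The embedding and generator checkpoints, the sample question, and k=3 are
# illustrative assumptions, not values taken from this file.
if __name__ == "__main__":
    docs = load_raw_dataset()
    vectorstore = create_vector_database(
        docs, model_name="sentence-transformers/all-MiniLM-L6-v2"  # assumed embedding model
    )
    llm = get_llm(model_name="Qwen/Qwen2.5-1.5B-Instruct")  # assumed generator model
    prompt = get_prompt_template()

    question = "What can cause persistent lower back pain in a 30-year-old?"
    # Retrieve the most similar patient Q&A pairs and stuff them into the prompt.
    retrieved = vectorstore.similarity_search(question, k=3)
    context = "\n\n".join(doc.page_content for doc in retrieved)
    # On older LangChain releases, llm(formatted_prompt) may be needed instead of invoke().
    answer = llm.invoke(prompt.format(context=context, question=question))
    print(answer)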