from datasets import load_dataset
import pandas as pd
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate


def load_raw_dataset() -> list:
    """Load the ChatDoctor HealthCareMagic data as LangChain documents.

    Reads the ``train`` split of ``lavita/ChatDoctor-HealthCareMagic-100k``
    through ``datasets.load_dataset`` and converts every row into one
    ``Document``.

    Returns:
        A list of ``Document`` objects, one per row. ``page_content`` is the
        row's ``input`` and ``output`` joined by a single space; ``metadata``
        carries the two parts separately under ``"question"`` and
        ``"answer"``.
    """
    dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")
    frame = pd.DataFrame(dataset["train"])

    questions = frame["input"]
    answers = frame["output"]
    # Vectorised concatenation; kept as a standalone Series so the frame is
    # not mutated with a throwaway column.
    combined = questions + " " + answers

    # Iterate the columns in lockstep instead of DataFrame.iterrows(), which
    # builds a Series object per row and is needlessly slow at this size.
    return [
        Document(
            page_content=text,
            metadata={"question": question, "answer": answer},
        )
        for text, question, answer in zip(combined, questions, answers)
    ]


def create_vector_database(docs: list, model_name: str) -> FAISS:
    """Embed ``docs`` and index them in a FAISS vector store.

    Args:
        docs: Documents to index (as produced by ``load_raw_dataset``).
        model_name: Name passed to ``HuggingFaceEmbeddings`` to select the
            embedding model.

    Returns:
        The ``FAISS`` store built from ``docs`` with that embedding model.
    """
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    return FAISS.from_documents(docs, embedder)


def get_llm(model_name: str) -> HuggingFacePipeline:
    """Build a LangChain LLM backed by a local text-generation pipeline.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the causal language model.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``"text-generation"`` pipeline
        configured for sampling (``temperature=0.7``) with at most 512 new
        tokens per call.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # "auto" lets the library choose the dtype and the device placement.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
    )
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.7,
        do_sample=True,
    )
    return HuggingFacePipeline(pipeline=generator)


def get_prompt_template() -> PromptTemplate:
    """Return the prompt used to answer a question from retrieved references.

    Returns:
        A ``PromptTemplate`` with two input variables, ``context`` (the
        reference text) and ``question``.
    """
    # NOTE(review): the template text is reproduced exactly as it appears in
    # the source, i.e. as one space-separated line with a trailing space.
    # Confirm whether line breaks between the sections were intended.
    template_text = (
        "Based on the following references and your medical knowledge, "
        "provide a detailed response: "
        "References: {context} "
        "Question: {question} "
        "By considering: "
        "1. The key medical concepts in the question. "
        "2. How the reference cases relate to this question. "
        "3. What medical principles should be applied. "
        "4. Any potential complications or considerations. "
        "Give the final response: "
    )
    return PromptTemplate(
        input_variables=["context", "question"],
        template=template_text,
    )