Aman
/

selfrag-zh_baichuan2_7b_chat

@@ -11,27 +11,54 @@ This model is a 7B Chinese version of [Self-RAG](https://huggingface.co/selfrag/
 It is trained on Baichuan2-7B-Chat with a sample of [belle](https://github.com/LianjiaTech/BELLE) SFT data, accompanied by interleaved passages from zhwiki. The reflection tokens are aligned with those of the original (English) version, so the usage is the same. Hope you enjoy it.
 ### Usage
 ```
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from vllm import LLM, SamplingParams
-model = LLM(YOUR_MODEL_PATH, dtype="half")
-sampling_params = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=100, skip_special_tokens=False)
 def format_prompt(input, paragraph=None):
-  prompt = "### Instruction:\n{0}\n\n### Response:\n".format(input)
-  if paragraph is not None:
-    prompt += "[Retrieval]<paragraph>{0}</paragraph>".format(paragraph)
-return prompt
-query_1 = "你好呀"
-query_2 = "故宫三大殿是哪些？"
-queries = [query_1, query_2]
-preds = model.generate([format_prompt(query) for query in queries], sampling_params)
-for pred in preds:
-  print("Model prediction: {0}".format(pred.outputs[0].text))
 # Model prediction: [No Retrieval] 你好！有什么我可以帮你解答的问题吗？ [Utility:5] </s>
 # Model prediction: [Retrieval] <paragraph> ... (this query requires factual grounding, call a retriever) </paragraph> [Relevant] 太和殿、中和殿、保和殿 [Utility:5] </s>
 ```

 It is trained on Baichuan2-7B-Chat with a sample of [belle](https://github.com/LianjiaTech/BELLE) SFT data, accompanied by interleaved passages from zhwiki. The reflection tokens are aligned with those of the original (English) version, so the usage is the same. Hope you enjoy it.
 ### Usage
+I encountered some output errors when using vLLM to accelerate generation, and I am not sure whether they are caused by precision issues.
+They may stem from vLLM's implementation, so the example below uses the original `generate` method from transformers instead.
 ```
+import os, torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained(YOUR_TOKENIZER_PATH)
+model = AutoModelForCausalLM.from_pretrained(
+        YOUR_MODEL_PATH,
+        torch_dtype=torch.bfloat16,
+        device_map="cuda",
+    )
+### set your retriever if necessary
+retriever = setup_retriever(YOUR_RETRIEVER_PATH)
 def format_prompt(input, paragraph=None):
+    prompt = "### Instruction:\n{0}\n\n### Response:".format(input)
+    if paragraph is not None:
+        prompt += "[Retrieval]<paragraph>{0}</paragraph>".format(paragraph)
+    return prompt
+while True:
+    query = input("[Human]: ")
+    prompt = format_prompt(query)
+    sequences = model.generate(
+        **tokenizer(prompt, return_tensors='pt').to(model.device),
+        do_sample=False,
+        num_beams=5,
+        # top_k=10,
+        # top_p=0.8,
+        temperature=0.9,
+        num_return_sequences=1,
+        eos_token_id=tokenizer.eos_token_id,
+        max_new_tokens=1024,
+        min_new_tokens=1,
+        repetition_penalty=1.5,
+    )
+    for seq in sequences:
+        print(f"[Model]: {tokenizer.decode(seq, skip_special_tokens=False)}")
+        print("-"*50)
+    print("="*50)
+# query_1 = "你好呀"
 # Model prediction: [No Retrieval] 你好！有什么我可以帮你解答的问题吗？ [Utility:5] </s>
+# query_2 = "故宫三大殿是哪些？"
 # Model prediction: [Retrieval] <paragraph> ... (this query requires factual grounding, call a retriever) </paragraph> [Relevant] 太和殿、中和殿、保和殿 [Utility:5] </s>
 ```