Update README.md
Browse files
README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
pipeline_tag:
|
| 3 |
datasets:
|
| 4 |
- openbmb/RLAIF-V-Dataset
|
| 5 |
library_name: transformers
|
|
@@ -13,6 +13,10 @@ tags:
|
|
| 13 |
- multi-image
|
| 14 |
- video
|
| 15 |
- custom_code
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
---
|
| 17 |
|
| 18 |
<h1>A GPT-4o Level MLLM for Vision, Speech and Multimodal Live Streaming on Your Phone</h1>
|
|
@@ -1217,7 +1221,7 @@ msgs = [{'role': 'user', 'content': [task_prompt]}] # you can also try to ask th
|
|
| 1217 |
# sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
|
| 1218 |
# text_prompt = f"Please read the text below."
|
| 1219 |
# user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
|
| 1220 |
-
# user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice
|
| 1221 |
# msgs = [sys_prompt, user_question]
|
| 1222 |
|
| 1223 |
res = model.chat(
|
|
@@ -1386,4 +1390,4 @@ If you find our work helpful, please consider citing our papers 📝 and liking
|
|
| 1386 |
journal={arXiv preprint arXiv:2408.01800},
|
| 1387 |
year={2024}
|
| 1388 |
}
|
| 1389 |
-
```
|
|
|
|
| 1 |
---
|
| 2 |
+
pipeline_tag: any-to-any
|
| 3 |
datasets:
|
| 4 |
- openbmb/RLAIF-V-Dataset
|
| 5 |
library_name: transformers
|
|
|
|
| 13 |
- multi-image
|
| 14 |
- video
|
| 15 |
- custom_code
|
| 16 |
+
- audio
|
| 17 |
+
- speech
|
| 18 |
+
- asr
|
| 19 |
+
- tts
|
| 20 |
---
|
| 21 |
|
| 22 |
<h1>A GPT-4o Level MLLM for Vision, Speech and Multimodal Live Streaming on Your Phone</h1>
|
|
|
|
| 1221 |
# sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
|
| 1222 |
# text_prompt = f"Please read the text below."
|
| 1223 |
# user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
|
| 1224 |
+
# user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice Conversion)
|
| 1225 |
# msgs = [sys_prompt, user_question]
|
| 1226 |
|
| 1227 |
res = model.chat(
|
|
|
|
| 1390 |
journal={arXiv preprint arXiv:2408.01800},
|
| 1391 |
year={2024}
|
| 1392 |
}
|
| 1393 |
+
```
|