Update cosyvoice/cli/model.py
cosyvoice/cli/model.py  CHANGED  (+5, -4)
--- a/cosyvoice/cli/model.py
+++ b/cosyvoice/cli/model.py
@@ -323,10 +323,11 @@ class CosyVoice2Model(CosyVoiceModel):
             self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
             self.hift_cache_dict[this_uuid] = None
         if source_speech_token.shape[1] == 0:
-            p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
+            # p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
+            self.llm_job(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid)
         else:
             p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
-        p.start()
+        # p.start()
         if stream is True:
             token_offset = 0
             prompt_token_pad = int(np.ceil(flow_prompt_speech_token.shape[1] / self.token_hop_len) * self.token_hop_len - flow_prompt_speech_token.shape[1])
@@ -347,7 +348,7 @@ class CosyVoice2Model(CosyVoiceModel):
                 yield {'tts_speech': this_tts_speech.cpu()}
                 if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < this_token_hop_len + self.flow.pre_lookahead_len:
                     break
-            p.join()
+            # p.join()
             # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
             this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
             this_tts_speech = self.token2wav(token=this_tts_speech_token,
@@ -360,7 +361,7 @@ class CosyVoice2Model(CosyVoiceModel):
             yield {'tts_speech': this_tts_speech.cpu()}
         else:
             # deal with all tokens
-            p.join()
+            # p.join()
             this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
             this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                              prompt_token=flow_prompt_speech_token,
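Net effect of the commit: the LLM token-generation job now runs synchronously on the calling thread instead of on a background threading.Thread, and the p.start()/p.join() lifecycle is commented out. A plausible motivation (an assumption; the commit message doesn't say) is that this Space runs on ZeroGPU, where the GPU is only attached inside the @spaces.GPU-decorated request context, so CUDA work launched from a worker thread may not see the device. Below is a minimal, self-contained sketch of the before/after pattern; generate_tokens and the (tokens, done) lists are illustrative stand-ins for llm_job and the tts_speech_token_dict/llm_end_dict state, not the real CosyVoice API.

import threading
import time

# Illustrative sketch only: generate_tokens stands in for CosyVoice2Model.llm_job,
# and the (tokens, done) lists play the role of tts_speech_token_dict[this_uuid]
# and llm_end_dict[this_uuid]. None of these names are the real CosyVoice API.

def generate_tokens(text, tokens, done):
    for ch in text:          # pretend each step is GPU-bound LLM decoding
        time.sleep(0.01)
        tokens.append(ord(ch))
    done.append(True)        # mirrors llm_end_dict[this_uuid] = True

def tts_threaded(text):
    # Before the commit: the producer runs on a background thread so the
    # caller can stream partial tokens, then joins once generation ends.
    tokens, done = [], []
    p = threading.Thread(target=generate_tokens, args=(text, tokens, done))
    p.start()
    while not done:
        time.sleep(0.02)     # a real consumer would vocode tokens[offset:] here
    p.join()
    return tokens

def tts_synchronous(text):
    # After the commit: the producer is called inline, keeping all work on
    # the calling thread (e.g. the one holding the ZeroGPU allocation), at
    # the cost of no longer overlapping generation with vocoding.
    tokens, done = [], []
    generate_tokens(text, tokens, done)
    return tokens

if __name__ == "__main__":
    assert tts_threaded("hello") == tts_synchronous("hello")
    print("both variants produce identical tokens")

The trade-off: in the streaming branch, token generation no longer overlaps with token2wav vocoding, so the first chunk is yielded only after llm_job finishes. Note also that the else branch still constructs the vc_job thread, but with p.start() commented out it is never started, so that branch would produce no tokens as committed unless vc_job is invoked some other way.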