aluminumbox committed on
Commit f8133f8 · verified · 1 Parent(s): 9e50aa5

Update cosyvoice/cli/model.py

Files changed (1)
  1. cosyvoice/cli/model.py +5 -4
cosyvoice/cli/model.py CHANGED
@@ -323,10 +323,11 @@ class CosyVoice2Model(CosyVoiceModel):
             self.tts_speech_token_dict[this_uuid], self.llm_end_dict[this_uuid] = [], False
             self.hift_cache_dict[this_uuid] = None
         if source_speech_token.shape[1] == 0:
-            p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
+            # p = threading.Thread(target=self.llm_job, args=(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid))
+            self.llm_job(text, prompt_text, llm_prompt_speech_token, llm_embedding, this_uuid)
         else:
             p = threading.Thread(target=self.vc_job, args=(source_speech_token, this_uuid))
-        p.start()
+        # p.start()
         if stream is True:
             token_offset = 0
             prompt_token_pad = int(np.ceil(flow_prompt_speech_token.shape[1] / self.token_hop_len) * self.token_hop_len - flow_prompt_speech_token.shape[1])
@@ -347,7 +348,7 @@ class CosyVoice2Model(CosyVoiceModel):
                     yield {'tts_speech': this_tts_speech.cpu()}
                 if self.llm_end_dict[this_uuid] is True and len(self.tts_speech_token_dict[this_uuid]) - token_offset < this_token_hop_len + self.flow.pre_lookahead_len:
                     break
-            p.join()
+            # p.join()
             # deal with remain tokens, make sure inference remain token len equals token_hop_len when cache_speech is not None
             this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
             this_tts_speech = self.token2wav(token=this_tts_speech_token,
@@ -360,7 +361,7 @@ class CosyVoice2Model(CosyVoiceModel):
             yield {'tts_speech': this_tts_speech.cpu()}
         else:
             # deal with all tokens
-            p.join()
+            # p.join()
             this_tts_speech_token = torch.tensor(self.tts_speech_token_dict[this_uuid]).unsqueeze(dim=0)
             this_tts_speech = self.token2wav(token=this_tts_speech_token,
                                              prompt_token=flow_prompt_speech_token,
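
For readers skimming the diff: the commit replaces the background thread that ran self.llm_job with a direct, blocking call, and comments out the matching p.start()/p.join(). The snippet below is a minimal illustrative sketch, not CosyVoice code; produce_tokens and consume_tokens are hypothetical stand-ins for llm_job and the token2wav consumer loop, and it only contrasts the two call patterns the diff switches between.

# Illustrative sketch only; names other than threading are hypothetical.
import threading

def produce_tokens(buf):
    # stand-in for llm_job: fills a shared token buffer
    buf.extend(range(10))

def consume_tokens(buf):
    # stand-in for the synthesis loop: reads whatever is in the buffer
    return list(buf)

# Threaded pattern (before the commit): the producer runs in the background so
# a streaming consumer can start reading early; join() waits for it to finish.
buf = []
p = threading.Thread(target=produce_tokens, args=(buf,))
p.start()
# ... a streaming consumer could poll buf here while the producer runs ...
p.join()
print(consume_tokens(buf))

# Synchronous pattern (after the commit): all tokens are produced up front,
# so no start()/join() is needed before consuming.
buf = []
produce_tokens(buf)
print(consume_tokens(buf))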