diff --git a/doc/source/models/model_abilities/audio.rst b/doc/source/models/model_abilities/audio.rst
index d6731913d8..368b45e2a1 100644
--- a/doc/source/models/model_abilities/audio.rst
+++ b/doc/source/models/model_abilities/audio.rst
@@ -19,7 +19,7 @@ The Audio API provides three methods for interacting with audio:
 
 * The speech endpoint generates audio from the input text.
 
-.. list-table:: 
+.. list-table::
    :widths: 25 50
    :header-rows: 1
 
@@ -91,7 +91,7 @@ We can try Transcription API out either via cURL, OpenAI Client, or Xinference's
     import openai
 
     client = openai.Client(
-        api_key="cannot be empty", 
+        api_key="cannot be empty",
         base_url="http://<XINFERENCE_HOST>:<XINFERENCE_PORT>/v1"
     )
     with open("speech.mp3", "rb") as audio_file:
@@ -270,6 +270,7 @@ CosyVoice
 Usage
 ~~~~~~~~~~~~~~~
 
 Basic usage, launch model ``CosyVoice-300M-SFT``.
+If you have persisted ``.pt`` files for cloned voices, set the environment variable ``COSYVOICE_PT_PATH`` to the folder containing those ``.pt`` files; each ``<name>.pt`` file can then be requested as the voice ``<name>``.
 
 .. tabs::
@@ -282,7 +283,7 @@ Basic usage, launch model ``CosyVoice-300M-SFT``.
       -d '{
         "model": "<MODEL_UID>",
         "input": "<The text to generate audio for>",
-        # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女']
+        # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女', or any voice you put in ``COSYVOICE_PT_PATH``]
         "voice": "中文女"
       }'
 
@@ -297,7 +298,7 @@ Basic usage, launch model ``CosyVoice-300M-SFT``.
     response = client.audio.speech.create(
         model=<MODEL_UID>,
         input=<The text to generate audio for>,
-        # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女']
+        # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女', or any voice you put in ``COSYVOICE_PT_PATH``]
         voice="中文女",
     )
     response.stream_to_file('1.mp3')
@@ -311,7 +312,7 @@ Basic usage, launch model ``CosyVoice-300M-SFT``.
     model = client.get_model("<MODEL_UID>")
     speech_bytes = model.speech(
         input=<The text to generate audio for>,
-        # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女']
+        # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女', or any voice you put in ``COSYVOICE_PT_PATH``]
         voice="中文女"
     )
     with open('1.mp3', 'wb') as f:
diff --git a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py
index 49fe15f6c7..bcd42d53b4 100644
--- a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py
+++ b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py
@@ -18,10 +18,14 @@
 from cosyvoice.cli.frontend import CosyVoiceFrontEnd
 from cosyvoice.cli.model import CosyVoiceModel
 from cosyvoice.utils.file_utils import logging
+import torch
 
 
 class CosyVoice:
     def __init__(self, model_dir, load_jit=True):
+        self.default_voices = ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女']
+        self.pt_cache = {}
+
         instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
@@ -49,8 +53,33 @@ def list_avaliable_spks(self):
         return spks
 
     def inference_sft(self, tts_text, spk_id, stream=False):
+        # Voices outside the default list are loaded (and cached) from
+        # <COSYVOICE_PT_PATH>/<spk_id>.pt; otherwise fall back to the stock SFT path.
+        newspk = None
+        if spk_id not in self.default_voices and os.environ.get("COSYVOICE_PT_PATH"):
+            if spk_id not in self.pt_cache:
+                self.pt_cache[spk_id] = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt')
+            newspk = self.pt_cache[spk_id]
         for i in self.frontend.text_normalize(tts_text, split=True):
-            model_input = self.frontend.frontend_sft(i, spk_id)
+            if newspk is not None:
+                # Build inputs with a stock voice, then swap in the cloned speaker's tensors.
+                model_input = self.frontend.frontend_sft(i, "中文女")
+
+                model_input["flow_embedding"] = newspk["flow_embedding"]
+                model_input["llm_embedding"] = newspk["llm_embedding"]
+
+                model_input["llm_prompt_speech_token"] = newspk["llm_prompt_speech_token"]
+                model_input["llm_prompt_speech_token_len"] = newspk["llm_prompt_speech_token_len"]
+
+                model_input["flow_prompt_speech_token"] = newspk["flow_prompt_speech_token"]
+                model_input["flow_prompt_speech_token_len"] = newspk["flow_prompt_speech_token_len"]
+
+                model_input["prompt_speech_feat_len"] = newspk["prompt_speech_feat_len"]
+                model_input["prompt_speech_feat"] = newspk["prompt_speech_feat"]
+                model_input["prompt_text"] = newspk["prompt_text"]
+                model_input["prompt_text_len"] = newspk["prompt_text_len"]
+            else:
+                model_input = self.frontend.frontend_sft(i, spk_id)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
             for model_output in self.model.inference(**model_input, stream=stream):
@@ -87,9 +116,28 @@ def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False):
     def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False):
         if self.frontend.instruct is False:
             raise ValueError('{} do not support instruct inference'.format(self.model_dir))
+        # Same cloned-voice lookup as inference_sft: only consult COSYVOICE_PT_PATH
+        # for non-default voices, and only when the variable is set.
+        newspk = None
+        if spk_id not in self.default_voices and os.environ.get("COSYVOICE_PT_PATH"):
+            if spk_id not in self.pt_cache:
+                self.pt_cache[spk_id] = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt')
+            newspk = self.pt_cache[spk_id]
         instruct_text = self.frontend.text_normalize(instruct_text, split=False)
         for i in self.frontend.text_normalize(tts_text, split=True):
-            model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
+            if newspk is not None:
+                model_input = self.frontend.frontend_instruct(i, "中文女", instruct_text)
+
+                model_input["flow_embedding"] = newspk["flow_embedding"]
+                model_input["llm_embedding"] = newspk["llm_embedding"]
+
+                model_input["llm_prompt_speech_token"] = newspk["llm_prompt_speech_token"]
+                model_input["llm_prompt_speech_token_len"] = newspk["llm_prompt_speech_token_len"]
+
+                model_input["flow_prompt_speech_token"] = newspk["flow_prompt_speech_token"]
+                model_input["flow_prompt_speech_token_len"] = newspk["flow_prompt_speech_token_len"]
+            else:
+                model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
             for model_output in self.model.inference(**model_input, stream=stream):