xorbitsai · Minamiyama · Oct 12, 2024 · Oct 12, 2024 · Oct 12, 2024 · Oct 12, 2024
diff --git a/doc/source/models/model_abilities/audio.rst b/doc/source/models/model_abilities/audio.rst
@@ -19,7 +19,7 @@ The Audio API provides three methods for interacting with audio:
 * The speech endpoint generates audio from the input text.
 
 
-.. list-table:: 
+.. list-table::
    :widths: 25  50
    :header-rows: 1
 
@@ -91,7 +91,7 @@ We can try Transcription API out either via cURL, OpenAI Client, or Xinference's
     import openai
 
     client = openai.Client(
-        api_key="cannot be empty", 
+        api_key="cannot be empty",
         base_url="http://<XINFERENCE_HOST>:<XINFERENCE_PORT>/v1"
     )
     with open("speech.mp3", "rb") as audio_file:
@@ -270,6 +270,7 @@ CosyVoice Usage
 ~~~~~~~~~~~~~~~
 
 Basic usage, launch model ``CosyVoice-300M-SFT``.
+Note: if you have additional persisted ``.pt`` files for cloned voices, set the environment variable ``COSYVOICE_PT_PATH`` to the path of the folder that contains those ``.pt`` files.
 
 .. tabs::
 
@@ -282,7 +283,7 @@ Basic usage, launch model ``CosyVoice-300M-SFT``.
       -d '{
         "model": "<MODEL_UID>",
         "input": "<The text to generate audio for>",
-        # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女']
+        # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女', or any other voice whose ``.pt`` file you put in ``COSYVOICE_PT_PATH``]
         "voice": "中文女"
       }'
 
@@ -297,7 +298,7 @@ Basic usage, launch model ``CosyVoice-300M-SFT``.
     response = client.audio.speech.create(
         model=<MODEL_UID>,
         input=<The text to generate audio for>,
-        # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女']
+        # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女', or any other voice whose ``.pt`` file you put in ``COSYVOICE_PT_PATH``]
         voice="中文女",
     )
     response.stream_to_file('1.mp3')
@@ -311,7 +312,7 @@ Basic usage, launch model ``CosyVoice-300M-SFT``.
     model = client.get_model("<MODEL_UID>")
     speech_bytes = model.speech(
         input=<The text to generate audio for>,
-        # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女']
+        # ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女', or any other voice whose ``.pt`` file you put in ``COSYVOICE_PT_PATH``]
         voice="中文女"
     )
     with open('1.mp3', 'wb') as f:

diff --git a/xinference/thirdparty/cosyvoice/cli/cosyvoice.py b/xinference/thirdparty/cosyvoice/cli/cosyvoice.py
@@ -18,10 +18,14 @@
 from cosyvoice.cli.frontend import CosyVoiceFrontEnd
 from cosyvoice.cli.model import CosyVoiceModel
 from cosyvoice.utils.file_utils import logging
+import torch
 
 class CosyVoice:
 
     def __init__(self, model_dir, load_jit=True):
+        self.default_voices = ['中文女', '中文男', '日语男', '粤语女', '英文女', '英文男', '韩语女']
+        self.pt_cache = {}
+
         instruct = True if '-Instruct' in model_dir else False
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
@@ -49,8 +53,29 @@ def list_avaliable_spks(self):
         return spks
 
     def inference_sft(self, tts_text, spk_id, stream=False):
+        if spk_id not in self.default_voices and os.environ["COSYVOICE_PT_PATH"] is not None:
+            if spk_id not in self.pt_cache:
+                self.pt_cache[spk_id] = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt')
+            newspk = self.pt_cache[spk_id]
         for i in self.frontend.text_normalize(tts_text, split=True):
-            model_input = self.frontend.frontend_sft(i, spk_id)
+            if newspk is not None:
+                model_input = self.frontend.frontend_sft(i, "中文女")
+
+                model_input["flow_embedding"] = newspk["flow_embedding"]
+                model_input["llm_embedding"] = newspk["llm_embedding"]
+
+                model_input["llm_prompt_speech_token"] = newspk["llm_prompt_speech_token"]
+                model_input["llm_prompt_speech_token_len"] = newspk["llm_prompt_speech_token_len"]
+
+                model_input["flow_prompt_speech_token"] = newspk["flow_prompt_speech_token"]
+                model_input["flow_prompt_speech_token_len"] = newspk["flow_prompt_speech_token_len"]
+
+                model_input["prompt_speech_feat_len"] = newspk["prompt_speech_feat_len"]
+                model_input["prompt_speech_feat"] = newspk["prompt_speech_feat"]
+                model_input["prompt_text"] = newspk["prompt_text"]
+                model_input["prompt_text_len"] = newspk["prompt_text_len"]
+            else:
+                model_input = self.frontend.frontend_sft(i, spk_id)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
             for model_output in self.model.inference(**model_input, stream=stream):
@@ -87,9 +112,24 @@ def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False):
     def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False):
         if self.frontend.instruct is False:
             raise ValueError('{} do not support instruct inference'.format(self.model_dir))
+        if spk_id not in self.pt_cache:
+            self.pt_cache[spk_id] = torch.load(f'{os.environ["COSYVOICE_PT_PATH"]}/{spk_id}.pt')
+        newspk = self.pt_cache[spk_id]
         instruct_text = self.frontend.text_normalize(instruct_text, split=False)
         for i in self.frontend.text_normalize(tts_text, split=True):
-            model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
+            if newspk is not None:
+                model_input = self.frontend.frontend_instruct(i, "中文女", instruct_text)
+
+                model_input["flow_embedding"] = newspk["flow_embedding"]
+                model_input["llm_embedding"] = newspk["llm_embedding"]
+
+                model_input["llm_prompt_speech_token"] = newspk["llm_prompt_speech_token"]
+                model_input["llm_prompt_speech_token_len"] = newspk["llm_prompt_speech_token_len"]
+
+                model_input["flow_prompt_speech_token"] = newspk["flow_prompt_speech_token"]
+                model_input["flow_prompt_speech_token_len"] = newspk["flow_prompt_speech_token_len"]
+            else:
+                model_input = self.frontend.frontend_instruct(i, spk_id, instruct_text)
             start_time = time.time()
             logging.info('synthesis text {}'.format(i))
             for model_output in self.model.inference(**model_input, stream=stream):