TTS re-write (#72)

* [WIP] Add bark and parler multi support * Add config files for other models to easily test across models * Use model loading wrapper function for download_models.py * Make sure transformers>4.31.0 (required for bark model) * Add parler dependency * Use TTSModelWrapper for demo code * Use TTSModelWrapper for cli * Add outetts_language attribute * Add TTSModelWrapper * Update text_to_speech.py * Pass model-specific variables as **kwargs * Rename TTSModelWrapper to TTSInterface * Update language argument to kwargs * Remove parler from dependencies Co-authored-by: David de la Iglesia Castro <[email protected]> * Separate inference from TTSModel * Make sure config model is properly registered * Decouple loading & inference of TTS model * Decouple loading & inference of TTS model * Enable user to exit podcast generation gracefully * Add Q2 Oute version to TTS_LOADERS * Add comment for support in TTS_INFERENCE * Update test_model_loaders.py * Update test_text_to_speech.py * Remove extra "use case" examples * Add bark to readme & note about multilingual support * Reference a repo that showcases multilingual use cases * Change default model to 500M * Remove support for bark and parler models * Update docs * Remove unused code * Remove parler dep from tests * Update docs * Lint * Lint * Lint * Remove transformers dependency * Remove parler reference from docs --------- Co-authored-by: David de la Iglesia Castro <[email protected]>
mozilla-ai · Jan 10, 2025 · 528bb24 · 528bb24
1 parent 8809feb
commit 528bb24
Show file tree

Hide file tree

Showing 15 changed files with 152 additions and 184 deletions.
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -30,9 +30,6 @@ jobs:
       - name: Install test dependencies
         run: pip install -e '.[tests]'
 
-      - name: Install parler dependency
-        run: pip install git+https://github.com/huggingface/parler-tts.git
-
       - name: Run Unit Tests
         run: pytest -v tests/unit
 

diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ It is designed to work on most local setups or with [GitHub Codespaces](https://
 ### Built with
 - Python 3.10+ (use Python 3.12 for Apple M1/2/3 chips)
 - [Llama-cpp](https://github.com/abetlen/llama-cpp-python) (text-to-text, i.e script generation)
-- [OuteAI](https://github.com/edwko/OuteTTS) / [Parler_tts](https://github.com/huggingface/parler-tts) (text-to-speech, i.e audio generation)
+- [OuteAI](https://github.com/edwko/OuteTTS) (text-to-speech, i.e audio generation)
 - [Streamlit](https://streamlit.io/) (UI demo)
 
 
@@ -106,10 +106,10 @@ For the complete list of models supported out-of-the-box, visit this [link](http
 
 ### text-to-speech
 
-We support models from the [OuteAI](https://github.com/edwko/OuteTTS) and [Parler_tts](https://github.com/huggingface/parler-tts) packages. The default text-to-speech model in this repo is [OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M). Note that the `0.1-350M` version has a `CC-By-4.0` (permissive) license, whereas the newer / better `0.2-500M` version has a `CC-By-NC-4.0` (non-commercial) license.
-For a complete list of models visit [Oute HF](https://huggingface.co/collections/OuteAI) (only the GGUF versions) and [Parler HF](https://huggingface.co/collections/parler-tts).
+We support models from the [OuteAI](https://github.com/edwko/OuteTTS) package. The default text-to-speech model in this repo is [OuteTTS-0.2-500M](https://huggingface.co/OuteAI/OuteTTS-0.2-500M). Note that the `0.1-350M` version has a `CC-By-4.0` (permissive) license, whereas the newer / better `0.2-500M` version has a `CC-By-NC-4.0` (non-commercial) license.
+For a complete list of models visit [Oute HF](https://huggingface.co/collections/OuteAI) (only the GGUF versions).
 
-**Important note:** In order to keep the package dependencies as lightweight as possible, only the Oute interface is installed by default. If you want to use the parler models, please also follow the instructions at https://github.com/huggingface/parler-tts.
+In this [repo](https://github.com/Kostis-S-Z/document-to-podcast) you can see examples of using different TTS models with minimal code changes.
 
 ## Pre-requisites
 

diff --git a/demo/app.py b/demo/app.py
@@ -6,13 +6,13 @@
 import soundfile as sf
 import streamlit as st
 
+from document_to_podcast.inference.text_to_speech import text_to_speech
 from document_to_podcast.preprocessing import DATA_LOADERS, DATA_CLEANERS
 from document_to_podcast.inference.model_loaders import (
     load_llama_cpp_model,
-    load_outetts_model,
+    load_tts_model,
 )
 from document_to_podcast.config import DEFAULT_PROMPT, DEFAULT_SPEAKERS, Speaker
-from document_to_podcast.inference.text_to_speech import text_to_speech
 from document_to_podcast.inference.text_to_text import text_to_text_stream
 from document_to_podcast.utils import stack_audio_segments
 
@@ -26,7 +26,7 @@ def load_text_to_text_model():
 
 @st.cache_resource
 def load_text_to_speech_model():
-    return load_outetts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
+    return load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
 
 
 script = "script"
@@ -167,7 +167,8 @@ def gen_button_clicked():
                             speech_model,
                             voice_profile,
                         )
-                    st.audio(speech, sample_rate=speech_model.audio_codec.sr)
+                    st.audio(speech, sample_rate=speech_model.sample_rate)
+
                     st.session_state.audio.append(speech)
                     text = ""
 
@@ -179,7 +180,7 @@ def gen_button_clicked():
             sf.write(
                 "podcast.wav",
                 st.session_state.audio,
-                samplerate=speech_model.audio_codec.sr,
+                samplerate=speech_model.sample_rate,
             )
             st.markdown("Podcast saved to disk!")
 

diff --git a/demo/download_models.py b/demo/download_models.py
@@ -4,10 +4,10 @@
 
 from document_to_podcast.inference.model_loaders import (
     load_llama_cpp_model,
-    load_outetts_model,
+    load_tts_model,
 )
 
 load_llama_cpp_model(
     "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf"
 )
-load_outetts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
+load_tts_model("OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf")
diff --git a/docs/customization.md b/docs/customization.md
@@ -74,6 +74,7 @@ Looking for inspiration? Check out these examples of how others have customized
 
 - **[Radio Drama Generator](https://github.com/stefanfrench/radio-drama-generator)**: A creative adaptation that generates radio dramas by customizing ng the Blueprint parameters.
 - **[Readme-to-Podcast](https://github.com/alexmeckes/readme-to-podcast)**: This project transforms GitHub README files into podcast-style audio, showcasing the Blueprint’s ability to handle diverse text inputs.
+- **[Multilingual Podcast](https://github.com/Kostis-S-Z/document-to-podcast/)**: A repo that showcases how to use this package in other languages, like Hindi, Polish, Korean and many more.
 
 ## 🤝 **Contributing to the Blueprint**
 

diff --git a/docs/future-features-contributions.md b/docs/future-features-contributions.md
@@ -15,7 +15,6 @@ The Document-to-Podcast Blueprint is an evolving project designed to grow with t
 This Blueprint is designed to be a foundation you can build upon. By extending its capabilities, you can open the door to new applications, improve user experience, and adapt the Blueprint to address other use cases. Here are a few ideas for how you can expand its potential:
 
 
-- **Multi-language podcast generation:** Add support for multi-language podcast generation to expand the reach of this Blueprint.
 - **New modalities input:** Add support to the Blueprint to be able to handle different input modalities, like audio or images, enabling more flexibility in podcast generation.
 - **Improved audio quality:** Explore and integrate more advanced open-source TTS frameworks to enhance the quality of generated audio, making podcasts sound more natural.
 

diff --git a/docs/getting-started.md b/docs/getting-started.md
@@ -36,11 +36,3 @@ pip install -e .
 ```bash
 python -m streamlit run demo/app.py
 ```
-
-
-### [Optional]: Use Parler models for text-to-speech
-
-If you want to use the [parler tts](https://github.com/huggingface/parler-tts) models, you will need to **additionally** install an optional dependency by running:
-```bash
-pip install -e '.[parler]'
-```
diff --git a/docs/index.md b/docs/index.md
@@ -11,7 +11,7 @@ These docs are your companion to mastering the **Document-to-Podcast Blueprint**
 ### Built with
 - Python 3.10+
 - [Llama-cpp](https://github.com/abetlen/llama-cpp-python) (text-to-text, i.e script generation)
-- [OuteAI](https://github.com/edwko/OuteTTS) / [Parler_tts](https://github.com/huggingface/parler-tts) (text-to-speech, i.e audio generation)
+- [OuteAI](https://github.com/edwko/OuteTTS) (text-to-speech, i.e audio generation)
 - [Streamlit](https://streamlit.io/) (UI demo)
 
 

diff --git a/docs/step-by-step-guide.md b/docs/step-by-step-guide.md
@@ -160,17 +160,16 @@ In this final step, the generated podcast transcript is brought to life as an au
 
  **1 - Model Loading**
 
-   - The [`model_loader.py`](api.md/#document_to_podcast.inference.model_loaders) module is responsible for loading the `text-to-speech` models using the `outetts` and `parler_tts` libraries.
+   - The [`model_loader.py`](api.md/#document_to_podcast.inference.model_loaders) module is responsible for loading the `text-to-text` and `text-to-speech` models.
 
    - The function `load_outetts_model` takes a model ID in the format `{org}/{repo}/{filename}` and loads the specified model, either on CPU or GPU, based on the `device` parameter. The parameter `language` also enables to swap between the languages the Oute package supports (as of Dec 2024: `en, zh, ja, ko`)
 
-   - The function `load_parler_tts_model_and_tokenizer` takes a model ID in the format `{repo}/{filename}` and loads the specified model and tokenizer, either on CPU or GPU, based on the `device` parameter.
 
 **2 - Text-to-Speech Audio Generation**
 
    - The [`text_to_speech.py`](api.md/#document_to_podcast.inference.text_to_speech) script converts text into audio using a specified TTS model.
 
-   - A **speaker profile** defines the voice characteristics (e.g., tone, speed, clarity) for each speaker. This is specific to each TTS package. Oute models require one of the IDs specified [here](https://github.com/edwko/OuteTTS/tree/main/outetts/version/v1/default_speakers). Parler requires natural language description of the speaker's voice and you have to use a pre-defined name (see [here](https://github.com/huggingface/parler-tts/blob/main/INFERENCE.md#speaker-consistency))
+   - A **speaker profile** defines the voice characteristics (e.g., tone, speed, clarity) for each speaker. This is specific to each TTS package. Oute models require one of the IDs specified [here](https://github.com/edwko/OuteTTS/tree/main/outetts/version/v1/default_speakers).
 
    - The function `text_to_speech` takes the input text (e.g. podcast script) and speaker profile, generating a waveform (audio data in a numpy array) that represents the spoken version of the text.
 

diff --git a/src/document_to_podcast/cli.py b/src/document_to_podcast/cli.py
@@ -11,15 +11,14 @@
     Speaker,
     DEFAULT_PROMPT,
     DEFAULT_SPEAKERS,
-    SUPPORTED_TTS_MODELS,
+    TTS_LOADERS,
 )
 from document_to_podcast.inference.model_loaders import (
     load_llama_cpp_model,
-    load_outetts_model,
-    load_parler_tts_model_and_tokenizer,
+    load_tts_model,
 )
-from document_to_podcast.inference.text_to_text import text_to_text_stream
 from document_to_podcast.inference.text_to_speech import text_to_speech
+from document_to_podcast.inference.text_to_text import text_to_text_stream
 from document_to_podcast.preprocessing import DATA_CLEANERS, DATA_LOADERS
 from document_to_podcast.utils import stack_audio_segments
 
@@ -30,8 +29,9 @@ def document_to_podcast(
     output_folder: str | None = None,
     text_to_text_model: str = "allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf",
     text_to_text_prompt: str = DEFAULT_PROMPT,
-    text_to_speech_model: SUPPORTED_TTS_MODELS = "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf",
+    text_to_speech_model: TTS_LOADERS = "OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf",
     speakers: list[Speaker] | None = None,
+    outetts_language: str = "en",  # Only applicable to OuteTTS models
     from_config: str | None = None,
 ):
     """
@@ -70,8 +70,10 @@ def document_to_podcast(
         speakers (list[Speaker] | None, optional): The speakers for the podcast.
             Defaults to DEFAULT_SPEAKERS.
 
-        from_config (str, optional): The path to the config file. Defaults to None.
+        outetts_language (str): For OuteTTS models we need to specify which language to use.
+            Supported languages in 0.2-500M: en, zh, ja, ko. More info: https://github.com/edwko/OuteTTS
 
+        from_config (str, optional): The path to the config file. Defaults to None.
 
             If provided, all other arguments will be ignored.
     """
@@ -86,6 +88,7 @@ def document_to_podcast(
             text_to_text_prompt=text_to_text_prompt,
             text_to_speech_model=text_to_speech_model,
             speakers=[Speaker.model_validate(speaker) for speaker in speakers],
+            outetts_language=outetts_language,
         )
 
     output_folder = Path(config.output_folder)
@@ -106,15 +109,9 @@ def document_to_podcast(
     text_model = load_llama_cpp_model(model_id=config.text_to_text_model)
 
     logger.info(f"Loading {config.text_to_speech_model}")
-    if "oute" in config.text_to_speech_model.lower():
-        speech_model = load_outetts_model(model_id=config.text_to_speech_model)
-        speech_tokenizer = None
-        sample_rate = speech_model.audio_codec.sr
-    else:
-        speech_model, speech_tokenizer = load_parler_tts_model_and_tokenizer(
-            model_id=config.text_to_speech_model
-        )
-        sample_rate = speech_model.config.sampling_rate
+    speech_model = load_tts_model(
+        model_id=config.text_to_speech_model, outetts_language=outetts_language
+    )
 
     # ~4 characters per token is considered a reasonable default.
     max_characters = text_model.n_ctx() * 4
@@ -151,22 +148,21 @@ def document_to_podcast(
                     text.split(f'"Speaker {speaker_id}":')[-1],
                     speech_model,
                     voice_profile,
-                    tokenizer=speech_tokenizer,  # Applicable only for parler models
                 )
                 podcast_audio.append(speech)
                 text = ""
+
     except KeyboardInterrupt:
         logger.warning("Podcast generation stopped by user.")
-
     logger.info("Saving Podcast...")
     complete_audio = stack_audio_segments(
-        podcast_audio, sample_rate=sample_rate, silence_pad=1.0
+        podcast_audio, sample_rate=speech_model.sample_rate, silence_pad=1.0
     )
 
     sf.write(
         str(output_folder / "podcast.wav"),
         complete_audio,
-        samplerate=sample_rate,
+        samplerate=speech_model.sample_rate,
     )
     (output_folder / "podcast.txt").write_text(podcast_script)
     logger.success("Done!")

diff --git a/src/document_to_podcast/config.py b/src/document_to_podcast/config.py
@@ -1,10 +1,11 @@
 from pathlib import Path
-from typing import Literal
 from typing_extensions import Annotated
 
 from pydantic import BaseModel, FilePath
 from pydantic.functional_validators import AfterValidator
 
+from document_to_podcast.inference.model_loaders import TTS_LOADERS
+from document_to_podcast.inference.text_to_speech import TTS_INFERENCE
 from document_to_podcast.preprocessing import DATA_LOADERS
 
 
@@ -41,14 +42,6 @@
     },
 ]
 
-SUPPORTED_TTS_MODELS = Literal[
-    "OuteAI/OuteTTS-0.1-350M-GGUF/OuteTTS-0.1-350M-FP16.gguf",
-    "OuteAI/OuteTTS-0.2-500M-GGUF/OuteTTS-0.2-500M-FP16.gguf",
-    "parler-tts/parler-tts-large-v1",
-    "parler-tts/parler-tts-mini-v1",
-    "parler-tts/parler-tts-mini-v1.1",
-]
-
 
 def validate_input_file(value):
     if Path(value).suffix not in DATA_LOADERS:
@@ -73,6 +66,18 @@ def validate_text_to_text_prompt(value):
     return value
 
 
+def validate_text_to_speech_model(value):
+    if value not in TTS_LOADERS:
+        raise ValueError(
+            f"Model {value} is missing a loading function. Please define it under model_loaders.py"
+        )
+    if value not in TTS_INFERENCE:
+        raise ValueError(
+            f"Model {value} is missing an inference function. Please define it under text_to_speech.py"
+        )
+    return value
+
+
 class Speaker(BaseModel):
     id: int
     name: str
@@ -88,5 +93,6 @@ class Config(BaseModel):
     output_folder: str
     text_to_text_model: Annotated[str, AfterValidator(validate_text_to_text_model)]
     text_to_text_prompt: Annotated[str, AfterValidator(validate_text_to_text_prompt)]
-    text_to_speech_model: SUPPORTED_TTS_MODELS
+    text_to_speech_model: Annotated[str, AfterValidator(validate_text_to_speech_model)]
     speakers: list[Speaker]
+    outetts_language: str = "en"