diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml
index c913c233d8..a146213f7c 100644
--- a/.github/workflows/style_check.yml
+++ b/.github/workflows/style_check.yml
@@ -13,17 +13,15 @@ jobs:
       fail-fast: false
       matrix:
         python-version: [3.9]
-        experimental: [false]
     steps:
       - uses: actions/checkout@v4
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
         with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: Install/upgrade dev dependencies
-        run: python3 -m pip install -r requirements.dev.txt
+          version: "0.4.27"
+          enable-cache: true
+          cache-dependency-glob: "**/pyproject.toml"
+      - name: Set up Python ${{ matrix.python-version }}
+        run: uv python install ${{ matrix.python-version }}
       - name: Lint check
         run: make lint
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index cdb30ea0e0..be3f1b740b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -16,17 +16,14 @@ jobs:
         subset: ["data_tests", "inference_tests", "test_aux", "test_text", "test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]
     steps:
       - uses: actions/checkout@v4
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
         with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: export TRAINER_TELEMETRY=0
+          version: "0.4.27"
+          enable-cache: true
+          cache-dependency-glob: "**/pyproject.toml"
+      - name: Set up Python ${{ matrix.python-version }}
+        run: uv python install ${{ matrix.python-version }}
       - name: Install Espeak
         if: contains(fromJSON('["inference_tests", "test_text", "test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset)
         run: |
@@ -37,21 +34,17 @@
           sudo apt-get update
           sudo apt-get install -y --no-install-recommends git make gcc
           make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel uv
       - name: Replace scarf urls
         if: contains(fromJSON('["data_tests", "inference_tests", "test_aux", "test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset)
         run: |
           sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
+      - name: Unit tests
         run: |
           resolution=highest
           if [ "${{ matrix.python-version }}" == "3.9" ]; then
             resolution=lowest-direct
           fi
-          python3 -m uv pip install --resolution=$resolution --system "coqui-tts[dev,server,languages] @ ."
-      - name: Unit tests
-        run: make ${{ matrix.subset }}
+          uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }}
       - name: Upload coverage data
         uses: actions/upload-artifact@v4
         with:
@@ -65,18 +58,17 @@
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
         with:
-          python-version: "3.12"
+          version: "0.4.27"
      - uses: actions/download-artifact@v4
        with:
          pattern: coverage-data-*
          merge-multiple: true
      - name: Combine coverage
        run: |
-          python -Im pip install --upgrade coverage[toml]
-
-          python -Im coverage combine
-          python -Im coverage html --skip-covered --skip-empty
-
-          python -Im coverage report --format=markdown >> $GITHUB_STEP_SUMMARY
+          uv python install
+          uvx coverage combine
+          uvx coverage html --skip-covered --skip-empty
+          uvx coverage report --format=markdown >> $GITHUB_STEP_SUMMARY
diff --git a/.gitignore b/.gitignore
index f9708961e2..d9f992275c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+uv.lock
+
 WadaSNR/
 .idea/
 *.pyc
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f96f6f38ac..92f6f3ab3c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: "https://github.com/pre-commit/pre-commit-hooks"
-    rev: v4.5.0
+    rev: v5.0.0
     hooks:
       - id: check-yaml
       - id: end-of-file-fixer
@@ -11,14 +11,7 @@ repos:
       - id: black
         language_version: python3
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.0
+    rev: v0.7.0
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
-  - repo: local
-    hooks:
-      - id: generate_requirements.py
-        name: generate_requirements.py
-        language: system
-        entry: python scripts/generate_requirements.py
-        files: "pyproject.toml|requirements.*\\.txt|tools/generate_requirements.py"
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e93858f27d..d4a8cf0090 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -44,29 +44,37 @@ If you have a new feature, a model to implement, or a bug to squash, go ahead an
 
 Please use the following steps to send a ✨**PR**✨. Let us know if you encounter a problem along the way.
 
-The following steps are tested on an Ubuntu system.
+The following steps are tested on an Ubuntu system and require
+[uv](https://docs.astral.sh/uv/) for virtual environment management. Choose your
+preferred [installation
+method](https://docs.astral.sh/uv/getting-started/installation/), e.g. the
+standalone installer:
+
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```
 
 1. Fork 🐸TTS[https://github.com/idiap/coqui-ai-TTS] by clicking the fork button at the top right corner of the project page.
 
 2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
 
    ```bash
-   $ git clone git@github.com:/coqui-ai-TTS.git
-   $ cd coqui-ai-TTS
-   $ git remote add upstream https://github.com/idiap/coqui-ai-TTS.git
+   git clone git@github.com:/coqui-ai-TTS.git
+   cd coqui-ai-TTS
+   git remote add upstream https://github.com/idiap/coqui-ai-TTS.git
    ```
 
 3. Install 🐸TTS for development.
 
    ```bash
-   $ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
-   $ make install_dev
+   make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
+   make install_dev
    ```
 
 4. Create a new branch with an informative name for your goal.
 
   ```bash
-   $ git checkout -b an_informative_name_for_my_branch
+   git checkout -b an_informative_name_for_my_branch
   ```
 
 5. Implement your changes on your new branch.
@@ -75,39 +83,42 @@ The following steps are tested on an Ubuntu system. 7. Add your tests to our test suite under ```tests``` folder. It is important to show that your code works, edge cases are considered, and inform others about the intended use. -8. Run the tests to see how your updates work with the rest of the project. You can repeat this step multiple times as you implement your changes to make sure you are on the right direction. +8. Run the tests to see how your updates work with the rest of the project. You + can repeat this step multiple times as you implement your changes to make + sure you are on the right direction. **NB: running all tests takes a long time, + it is better to leave this to the CI.** ```bash - $ make test # stop at the first error - $ make test_all # run all the tests, report all the errors + uv run make test # stop at the first error + uv run make test_all # run all the tests, report all the errors ``` 9. Format your code. We use ```black``` for code formatting. ```bash - $ make style + make style ``` 10. Run the linter and correct the issues raised. We use ```ruff``` for linting. It helps to enforce a coding standard, offers simple refactoring suggestions. ```bash - $ make lint + make lint ``` 11. When things are good, add new files and commit your changes. ```bash - $ git add my_file1.py my_file2.py ... - $ git commit + git add my_file1.py my_file2.py ... + git commit ``` It's a good practice to regularly sync your local copy of the project with the upstream code to keep up with the recent updates. ```bash - $ git fetch upstream - $ git rebase upstream/main + git fetch upstream + git rebase upstream/main # or for the development version - $ git rebase upstream/dev + git rebase upstream/dev ``` 12. Send a PR to ```dev``` branch. @@ -115,7 +126,7 @@ The following steps are tested on an Ubuntu system. Push your branch to your fork. ```bash - $ git push -u origin an_informative_name_for_my_branch + git push -u origin an_informative_name_for_my_branch ``` Then go to your fork's Github page and click on 'Pull request' to send your ✨**PR**✨. @@ -137,9 +148,9 @@ If you prefer working within a Docker container as your development environment, 2. Clone 🐸TTS and add the main repo as a new remote named ```upsteam```. ```bash - $ git clone git@github.com:/coqui-ai-TTS.git - $ cd coqui-ai-TTS - $ git remote add upstream https://github.com/idiap/coqui-ai-TTS.git + git clone git@github.com:/coqui-ai-TTS.git + cd coqui-ai-TTS + git remote add upstream https://github.com/idiap/coqui-ai-TTS.git ``` 3. Build the Docker Image as your development environment (it installs all of the dependencies for you): diff --git a/Dockerfile b/Dockerfile index 05c37d78fa..e9d331bc41 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ RUN rm -rf /root/.cache/pip WORKDIR /root COPY . /root -RUN make install +RUN pip3 install -e .[all] ENTRYPOINT ["tts"] CMD ["--help"] diff --git a/Makefile b/Makefile index 077b4b23e5..1d6867f5e8 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ .DEFAULT_GOAL := help -.PHONY: test system-deps dev-deps style lint install install_dev help docs +.PHONY: test system-deps style lint install install_dev help docs help: @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' @@ -50,27 +50,24 @@ test_failed: ## only run tests failed the last time. coverage run -m nose2 -F -v -B tests style: ## update code style. 
- black ${target_dirs} + uv run --only-dev black ${target_dirs} lint: ## run linters. - ruff check ${target_dirs} - black ${target_dirs} --check + uv run --only-dev ruff check ${target_dirs} + uv run --only-dev black ${target_dirs} --check system-deps: ## install linux system deps sudo apt-get install -y libsndfile1-dev -dev-deps: ## install development deps - pip install -r requirements.dev.txt - build-docs: ## build the docs cd docs && make clean && make build install: ## install 🐸 TTS - pip install -e .[all] + uv sync --all-extras install_dev: ## install 🐸 TTS for development. - pip install -e .[all,dev] - pre-commit install + uv sync --all-extras + uv run pre-commit install docs: ## build the docs $(MAKE) -C docs clean && $(MAKE) -C docs html diff --git a/README.md b/README.md index c6a1db4fff..507cce9298 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,13 @@ ## 🐸Coqui TTS News - 📣 Fork of the [original, unmaintained repository](https://github.com/coqui-ai/TTS). New PyPI package: [coqui-tts](https://pypi.org/project/coqui-tts) +- 📣 Prebuilt wheels are now also published for Mac and Windows (in addition to Linux as before) for easier installation across platforms. - 📣 ⓍTTSv2 is here with 16 languages and better performance across the board. - 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech). - 📣 ⓍTTS can now stream with <200ms latency. - 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://coqui-tts.readthedocs.io/en/latest/models/xtts.html) - 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/bark.html) - 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. -- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/tortoise.html) - -
- ## @@ -27,7 +24,6 @@ ______________________________________________________________________ [![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) [![License]()](https://opensource.org/licenses/MPL-2.0) [![PyPI version](https://badge.fury.io/py/coqui-tts.svg)](https://badge.fury.io/py/coqui-tts) -[![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/idiap/coqui-ai-TTS/blob/main/CODE_OF_CONDUCT.md) [![Downloads](https://pepy.tech/badge/coqui-tts)](https://pepy.tech/project/coqui-tts) [![DOI](https://zenodo.org/badge/265612440.svg)](https://zenodo.org/badge/latestdoi/265612440) @@ -43,12 +39,11 @@ ______________________________________________________________________ ## 💬 Where to ask questions Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly so that more people can benefit from it. -| Type | Platforms | -| ------------------------------- | --------------------------------------- | -| 🚨 **Bug Reports** | [GitHub Issue Tracker] | -| 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] | -| 👩‍💻 **Usage Questions** | [GitHub Discussions] | -| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] | +| Type | Platforms | +| -------------------------------------------- | ----------------------------------- | +| 🚨 **Bug Reports, Feature Requests & Ideas** | [GitHub Issue Tracker] | +| 👩‍💻 **Usage Questions** | [GitHub Discussions] | +| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] | [github issue tracker]: https://github.com/idiap/coqui-ai-TTS/issues [github discussions]: https://github.com/idiap/coqui-ai-TTS/discussions @@ -66,15 +61,10 @@ repository are also still a useful source of information. | 💼 **Documentation** | [ReadTheDocs](https://coqui-tts.readthedocs.io/en/latest/) | 💾 **Installation** | [TTS/README.md](https://github.com/idiap/coqui-ai-TTS/tree/dev#installation)| | 👩‍💻 **Contributing** | [CONTRIBUTING.md](https://github.com/idiap/coqui-ai-TTS/blob/main/CONTRIBUTING.md)| -| 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378) | 🚀 **Released Models** | [Standard models](https://github.com/idiap/coqui-ai-TTS/blob/dev/TTS/.models.json) and [Fairseq models in ~1100 languages](https://github.com/idiap/coqui-ai-TTS#example-text-to-speech-using-fairseq-models-in-1100-languages-)| -| 📰 **Papers** | [TTS Papers](https://github.com/erogol/TTS-papers)| ## Features -- High-performance Deep Learning models for Text2Speech tasks. - - Text2Spec models (Tacotron, Tacotron2, Glow-TTS, SpeedySpeech). - - Speaker Encoder to compute speaker embeddings efficiently. - - Vocoder models (MelGAN, Multiband-MelGAN, GAN-TTS, ParallelWaveGAN, WaveGrad, WaveRNN) +- High-performance Deep Learning models for Text2Speech tasks. See lists of models below. - Fast and efficient model training. - Detailed training logs on the terminal and Tensorboard. - Support for Multi-speaker TTS. @@ -180,8 +170,8 @@ pip install -e .[server,ja] If you are on Ubuntu (Debian), you can also run following commands for installation. ```bash -$ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS. 
-$ make install +make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS. +make install ``` If you are on Windows, 👑@GuyPaddock wrote installation instructions diff --git a/TTS/__init__.py b/TTS/__init__.py index 9e87bca4be..8e93c9b5db 100644 --- a/TTS/__init__.py +++ b/TTS/__init__.py @@ -1,3 +1,33 @@ import importlib.metadata +from TTS.utils.generic_utils import is_pytorch_at_least_2_4 + __version__ = importlib.metadata.version("coqui-tts") + + +if is_pytorch_at_least_2_4(): + import _codecs + from collections import defaultdict + + import numpy as np + import torch + + from TTS.config.shared_configs import BaseDatasetConfig + from TTS.tts.configs.xtts_config import XttsConfig + from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig + from TTS.utils.radam import RAdam + + torch.serialization.add_safe_globals([dict, defaultdict, RAdam]) + + # Bark + torch.serialization.add_safe_globals( + [ + np.core.multiarray.scalar, + np.dtype, + np.dtypes.Float64DType, + _codecs.encode, # TODO: safe by default from Pytorch 2.5 + ] + ) + + # XTTS + torch.serialization.add_safe_globals([BaseDatasetConfig, XttsConfig, XttsAudioConfig, XttsArgs]) diff --git a/TTS/tts/layers/bark/load_model.py b/TTS/tts/layers/bark/load_model.py index ce6b757f05..72eca30ac6 100644 --- a/TTS/tts/layers/bark/load_model.py +++ b/TTS/tts/layers/bark/load_model.py @@ -10,6 +10,7 @@ from TTS.tts.layers.bark.model import GPT, GPTConfig from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig +from TTS.utils.generic_utils import is_pytorch_at_least_2_4 if ( torch.cuda.is_available() @@ -118,7 +119,7 @@ def load_model(ckpt_path, device, config, model_type="text"): logger.info(f"{model_type} model not found, downloading...") _download(config.REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path, config.CACHE_DIR) - checkpoint = torch.load(ckpt_path, map_location=device) + checkpoint = torch.load(ckpt_path, map_location=device, weights_only=is_pytorch_at_least_2_4()) # this is a hack model_args = checkpoint["model_args"] if "input_vocab_size" not in model_args: diff --git a/TTS/tts/layers/tortoise/arch_utils.py b/TTS/tts/layers/tortoise/arch_utils.py index c79ef31b0c..52c2526695 100644 --- a/TTS/tts/layers/tortoise/arch_utils.py +++ b/TTS/tts/layers/tortoise/arch_utils.py @@ -9,6 +9,7 @@ from transformers import LogitsWarper from TTS.tts.layers.tortoise.xtransformers import ContinuousTransformerWrapper, RelativePositionBias +from TTS.utils.generic_utils import is_pytorch_at_least_2_4 def zero_module(module): @@ -332,7 +333,7 @@ def __init__( self.mel_norm_file = mel_norm_file if self.mel_norm_file is not None: with fsspec.open(self.mel_norm_file) as f: - self.mel_norms = torch.load(f) + self.mel_norms = torch.load(f, weights_only=is_pytorch_at_least_2_4()) else: self.mel_norms = None diff --git a/TTS/tts/layers/tortoise/audio_utils.py b/TTS/tts/layers/tortoise/audio_utils.py index 0b8701227b..4f299a8fd9 100644 --- a/TTS/tts/layers/tortoise/audio_utils.py +++ b/TTS/tts/layers/tortoise/audio_utils.py @@ -10,6 +10,7 @@ from scipy.io.wavfile import read from TTS.utils.audio.torch_transforms import TorchSTFT +from TTS.utils.generic_utils import is_pytorch_at_least_2_4 logger = logging.getLogger(__name__) @@ -124,7 +125,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []): voices = get_voices(extra_voice_dirs) paths = voices[voice] if len(paths) == 1 and paths[0].endswith(".pth"): - return None, torch.load(paths[0]) + return None, torch.load(paths[0], 
weights_only=is_pytorch_at_least_2_4()) else: conds = [] for cond_path in paths: diff --git a/TTS/tts/layers/tortoise/autoregressive.py b/TTS/tts/layers/tortoise/autoregressive.py index 14d881bc10..aaae695516 100644 --- a/TTS/tts/layers/tortoise/autoregressive.py +++ b/TTS/tts/layers/tortoise/autoregressive.py @@ -1,14 +1,22 @@ # AGPL: a notification must be added stating that changes have been made to that file. import functools +from typing import Optional import torch import torch.nn as nn import torch.nn.functional as F +import transformers +from packaging.version import Version from transformers import GPT2Config, GPT2PreTrainedModel, LogitsProcessorList from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions from TTS.tts.layers.tortoise.arch_utils import AttentionBlock, TypicalLogitsWarper +if Version(transformers.__version__) >= Version("4.45"): + isin = transformers.pytorch_utils.isin_mps_friendly +else: + isin = torch.isin + def null_position_embeddings(range, dim): return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device) @@ -596,6 +604,8 @@ def inference_speech( max_length = ( trunc_index + self.max_mel_tokens - 1 if max_generate_length is None else trunc_index + max_generate_length ) + stop_token_tensor = torch.tensor(self.stop_mel_token, device=inputs.device, dtype=torch.long) + attention_mask = _prepare_attention_mask_for_generation(inputs, stop_token_tensor, stop_token_tensor) gen = self.inference_model.generate( inputs, bos_token_id=self.start_mel_token, @@ -604,11 +614,39 @@ def inference_speech( max_length=max_length, logits_processor=logits_processor, num_return_sequences=num_return_sequences, + attention_mask=attention_mask, **hf_generate_kwargs, ) return gen[:, trunc_index:] +def _prepare_attention_mask_for_generation( + inputs: torch.Tensor, + pad_token_id: Optional[torch.Tensor], + eos_token_id: Optional[torch.Tensor], +) -> torch.LongTensor: + # No information for attention mask inference -> return default attention mask + default_attention_mask = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device) + if pad_token_id is None: + return default_attention_mask + + is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [torch.int, torch.long] + if not is_input_ids: + return default_attention_mask + + is_pad_token_in_inputs = (pad_token_id is not None) and (isin(elements=inputs, test_elements=pad_token_id).any()) + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~( + isin(elements=eos_token_id, test_elements=pad_token_id).any() + ) + can_infer_attention_mask = is_pad_token_in_inputs * is_pad_token_not_equal_to_eos_token_id + attention_mask_from_padding = inputs.ne(pad_token_id).long() + + attention_mask = ( + attention_mask_from_padding * can_infer_attention_mask + default_attention_mask * ~can_infer_attention_mask + ) + return attention_mask + + if __name__ == "__main__": gpt = UnifiedVoice( model_dim=256, diff --git a/TTS/tts/layers/xtts/dvae.py b/TTS/tts/layers/xtts/dvae.py index 4a37307e74..73970fb0bf 100644 --- a/TTS/tts/layers/xtts/dvae.py +++ b/TTS/tts/layers/xtts/dvae.py @@ -9,6 +9,8 @@ import torchaudio from einops import rearrange +from TTS.utils.generic_utils import is_pytorch_at_least_2_4 + logger = logging.getLogger(__name__) @@ -46,7 +48,7 @@ def dvae_wav_to_mel( mel = mel_stft(wav) mel = torch.log(torch.clamp(mel, min=1e-5)) if mel_norms is None: - mel_norms = torch.load(mel_norms_file, map_location=device) + mel_norms = torch.load(mel_norms_file, map_location=device, 
weights_only=is_pytorch_at_least_2_4()) mel = mel / mel_norms.unsqueeze(0).unsqueeze(-1) return mel diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py index b55b84d90e..b3c3b31b47 100644 --- a/TTS/tts/layers/xtts/gpt.py +++ b/TTS/tts/layers/xtts/gpt.py @@ -8,6 +8,7 @@ import torch.nn.functional as F from transformers import GPT2Config +from TTS.tts.layers.tortoise.autoregressive import _prepare_attention_mask_for_generation from TTS.tts.layers.xtts.gpt_inference import GPT2InferenceModel from TTS.tts.layers.xtts.latent_encoder import ConditioningEncoder from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler @@ -586,12 +587,15 @@ def generate( **hf_generate_kwargs, ): gpt_inputs = self.compute_embeddings(cond_latents, text_inputs) + stop_token_tensor = torch.tensor(self.stop_audio_token, device=gpt_inputs.device, dtype=torch.long) + attention_mask = _prepare_attention_mask_for_generation(gpt_inputs, stop_token_tensor, stop_token_tensor) gen = self.gpt_inference.generate( gpt_inputs, bos_token_id=self.start_audio_token, pad_token_id=self.stop_audio_token, eos_token_id=self.stop_audio_token, max_length=self.max_gen_mel_tokens + gpt_inputs.shape[-1], + attention_mask=attention_mask, **hf_generate_kwargs, ) if "return_dict_in_generate" in hf_generate_kwargs: diff --git a/TTS/tts/layers/xtts/gpt_inference.py b/TTS/tts/layers/xtts/gpt_inference.py index 4625ae1ba9..e94683524a 100644 --- a/TTS/tts/layers/xtts/gpt_inference.py +++ b/TTS/tts/layers/xtts/gpt_inference.py @@ -1,10 +1,12 @@ import torch from torch import nn -from transformers import GPT2PreTrainedModel +from transformers import GenerationMixin, GPT2PreTrainedModel from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions +from TTS.tts.layers.xtts.stream_generator import StreamGenerationConfig -class GPT2InferenceModel(GPT2PreTrainedModel): + +class GPT2InferenceModel(GPT2PreTrainedModel, GenerationMixin): """Override GPT2LMHeadModel to allow for prefix conditioning.""" def __init__(self, config, gpt, pos_emb, embeddings, norm, linear, kv_cache): @@ -15,6 +17,7 @@ def __init__(self, config, gpt, pos_emb, embeddings, norm, linear, kv_cache): self.final_norm = norm self.lm_head = nn.Sequential(norm, linear) self.kv_cache = kv_cache + self.generation_config = StreamGenerationConfig.from_model_config(config) if self.can_generate() else None def store_prefix_emb(self, prefix_emb): self.cached_prefix_emb = prefix_emb diff --git a/TTS/tts/layers/xtts/hifigan_decoder.py b/TTS/tts/layers/xtts/hifigan_decoder.py index b6032e5584..5ef0030b8b 100644 --- a/TTS/tts/layers/xtts/hifigan_decoder.py +++ b/TTS/tts/layers/xtts/hifigan_decoder.py @@ -9,6 +9,7 @@ from torch.nn.utils.parametrize import remove_parametrizations from trainer.io import load_fsspec +from TTS.utils.generic_utils import is_pytorch_at_least_2_4 from TTS.vocoder.models.hifigan_generator import get_padding logger = logging.getLogger(__name__) @@ -328,7 +329,7 @@ def remove_weight_norm(self): def load_checkpoint( self, config, checkpoint_path, eval=False, cache=False ): # pylint: disable=unused-argument, redefined-builtin - state = torch.load(checkpoint_path, map_location=torch.device("cpu")) + state = torch.load(checkpoint_path, map_location=torch.device("cpu"), weights_only=is_pytorch_at_least_2_4()) self.load_state_dict(state["model"]) if eval: self.eval() diff --git a/TTS/tts/layers/xtts/stream_generator.py b/TTS/tts/layers/xtts/stream_generator.py index efc92a04ef..44cf940c69 100644 --- a/TTS/tts/layers/xtts/stream_generator.py +++ 
b/TTS/tts/layers/xtts/stream_generator.py @@ -20,8 +20,10 @@ PhrasalConstraint, PreTrainedModel, StoppingCriteriaList, + TemperatureLogitsWarper, + TopKLogitsWarper, + TopPLogitsWarper, ) -from transformers.generation.stopping_criteria import validate_stopping_criteria from transformers.generation.utils import GenerateOutput, SampleOutput, logger @@ -152,7 +154,18 @@ def generate( # noqa: PLR0911 # 2. Set generation parameters if not already defined logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None + + if generation_config.pad_token_id is None and generation_config.eos_token_id is not None: + if model_kwargs.get("attention_mask", None) is None: + logger.warning( + "The attention mask and the pad token id were not set. As a consequence, you may observe " + "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results." + ) + eos_token_id = generation_config.eos_token_id + if isinstance(eos_token_id, list): + eos_token_id = eos_token_id[0] + logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.") + generation_config.pad_token_id = eos_token_id # 3. Define model inputs # inputs_tensor has to be defined @@ -164,22 +177,38 @@ def generate( # noqa: PLR0911 ) batch_size = inputs_tensor.shape[0] - device = inputs_tensor.device - self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=device) - # 4. Define other model kwargs model_kwargs["output_attentions"] = generation_config.output_attentions model_kwargs["output_hidden_states"] = generation_config.output_hidden_states model_kwargs["use_cache"] = generation_config.use_cache + model_kwargs["cache_position"] = torch.Tensor([0]).to(inputs_tensor.device) accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys()) requires_attention_mask = "encoder_outputs" not in model_kwargs - if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: + if model_kwargs.get("attention_mask", None) is None and requires_attention_mask and accepts_attention_mask: + setattr( + generation_config, + "_pad_token_tensor", + torch.full( + (inputs_tensor.shape[0], inputs_tensor.shape[1]), + generation_config.pad_token_id, + device=inputs_tensor.device, + ), + ) + setattr( + generation_config, + "_eos_token_tensor", + torch.full( + (inputs_tensor.shape[0], inputs_tensor.shape[1]), + generation_config.eos_token_id, + device=inputs_tensor.device, + ), + ) model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( inputs_tensor, - generation_config.pad_token_id, - generation_config.eos_token_id, + generation_config._pad_token_tensor, + generation_config._eos_token_tensor, ) # decoder-only models should use left-padding for generation @@ -202,15 +231,16 @@ def generate( # noqa: PLR0911 # 5. 
Prepare `input_ids` which will be used for auto-regressive generation if self.config.is_encoder_decoder: - input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( - batch_size=batch_size, - model_input_name=model_input_name, - model_kwargs=model_kwargs, + input_ids = self._prepare_decoder_input_ids_for_generation( + batch_size, decoder_start_token_id=generation_config.decoder_start_token_id, + bos_token_id=generation_config.bos_token_id, + model_kwargs=model_kwargs, device=inputs_tensor.device, ) else: - input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") + # if decoder-only then inputs_tensor has to be `input_ids` + input_ids = inputs_tensor # 6. Prepare `max_length` depending on other stopping criteria. input_ids_seq_length = input_ids.shape[-1] @@ -376,7 +406,7 @@ def generate( # noqa: PLR0911 elif is_sample_gen_mode: # 11. prepare logits warper - logits_warper = self._get_logits_warper(generation_config, inputs_tensor.device) + logits_warper = _get_logits_warper(generation_config) # 12. expand input_ids with `num_return_sequences` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( @@ -401,7 +431,7 @@ def generate( # noqa: PLR0911 ) elif is_sample_gen_stream_mode: # 11. prepare logits warper - logits_warper = self._get_logits_warper(generation_config, inputs_tensor.device) + logits_warper = _get_logits_warper(generation_config) # 12. expand input_ids with `num_return_sequences` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( @@ -463,7 +493,7 @@ def generate( # noqa: PLR0911 elif is_beam_sample_gen_mode: # 11. prepare logits warper - logits_warper = self._get_logits_warper(generation_config, inputs_tensor.device) + logits_warper = _get_logits_warper(generation_config) if stopping_criteria.max_length is None: raise ValueError("`max_length` needs to be a stopping_criteria for now.") @@ -877,10 +907,10 @@ def init_stream_support(): if __name__ == "__main__": - from transformers import AutoModelForCausalLM, AutoTokenizer - - init_stream_support() + from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel + PreTrainedModel.generate = NewGenerationMixin.generate + PreTrainedModel.sample_stream = NewGenerationMixin.sample_stream model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", torch_dtype=torch.float16) tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m") @@ -920,3 +950,17 @@ def init_stream_support(): chunk = tokenizer.decode(x, skip_special_tokens=True) stream_result += chunk print(stream_result) + + +def _get_logits_warper(generation_config: GenerationConfig) -> LogitsProcessorList: + + warpers = LogitsProcessorList() + + if generation_config.temperature is not None and generation_config.temperature != 1.0: + warpers.append(TemperatureLogitsWarper(generation_config.temperature)) + if generation_config.top_k is not None and generation_config.top_k != 0: + warpers.append(TopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=1)) + if generation_config.top_p is not None and generation_config.top_p < 1.0: + warpers.append(TopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=1)) + + return warpers diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py index 04d123778b..9d9edd5758 100644 --- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -19,6 +19,7 @@ from 
TTS.tts.layers.xtts.trainer.dataset import XTTSDataset from TTS.tts.models.base_tts import BaseTTS from TTS.tts.models.xtts import Xtts, XttsArgs, XttsAudioConfig +from TTS.utils.generic_utils import is_pytorch_at_least_2_4 logger = logging.getLogger(__name__) @@ -91,7 +92,9 @@ def __init__(self, config: Coqpit): # load GPT if available if self.args.gpt_checkpoint: - gpt_checkpoint = torch.load(self.args.gpt_checkpoint, map_location=torch.device("cpu")) + gpt_checkpoint = torch.load( + self.args.gpt_checkpoint, map_location=torch.device("cpu"), weights_only=is_pytorch_at_least_2_4() + ) # deal with coqui Trainer exported model if "model" in gpt_checkpoint.keys() and "config" in gpt_checkpoint.keys(): logger.info("Coqui Trainer checkpoint detected! Converting it!") @@ -184,7 +187,9 @@ def __init__(self, config: Coqpit): self.dvae.eval() if self.args.dvae_checkpoint: - dvae_checkpoint = torch.load(self.args.dvae_checkpoint, map_location=torch.device("cpu")) + dvae_checkpoint = torch.load( + self.args.dvae_checkpoint, map_location=torch.device("cpu"), weights_only=is_pytorch_at_least_2_4() + ) self.dvae.load_state_dict(dvae_checkpoint, strict=False) logger.info("DVAE weights restored from: %s", self.args.dvae_checkpoint) else: diff --git a/TTS/tts/layers/xtts/xtts_manager.py b/TTS/tts/layers/xtts/xtts_manager.py index 5560e87687..8156b35f0d 100644 --- a/TTS/tts/layers/xtts/xtts_manager.py +++ b/TTS/tts/layers/xtts/xtts_manager.py @@ -1,9 +1,11 @@ import torch +from TTS.utils.generic_utils import is_pytorch_at_least_2_4 + class SpeakerManager: def __init__(self, speaker_file_path=None): - self.speakers = torch.load(speaker_file_path) + self.speakers = torch.load(speaker_file_path, weights_only=is_pytorch_at_least_2_4()) @property def name_to_id(self): diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py index 277369e644..de5401aac7 100644 --- a/TTS/tts/models/neuralhmm_tts.py +++ b/TTS/tts/models/neuralhmm_tts.py @@ -18,7 +18,7 @@ from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.generic_utils import format_aux_input +from TTS.utils.generic_utils import format_aux_input, is_pytorch_at_least_2_4 logger = logging.getLogger(__name__) @@ -107,7 +107,7 @@ def update_mean_std(self, statistics_dict: Dict): def preprocess_batch(self, text, text_len, mels, mel_len): if self.mean.item() == 0 or self.std.item() == 1: - statistics_dict = torch.load(self.mel_statistics_parameter_path) + statistics_dict = torch.load(self.mel_statistics_parameter_path, weights_only=is_pytorch_at_least_2_4()) self.update_mean_std(statistics_dict) mels = self.normalize(mels) @@ -292,7 +292,9 @@ def on_init_start(self, trainer): "Data parameters found for: %s. 
Loading mel normalization parameters...", trainer.config.mel_statistics_parameter_path, ) - statistics = torch.load(trainer.config.mel_statistics_parameter_path) + statistics = torch.load( + trainer.config.mel_statistics_parameter_path, weights_only=is_pytorch_at_least_2_4() + ) data_mean, data_std, init_transition_prob = ( statistics["mean"], statistics["std"], diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py index b05b75009b..b72f4877cf 100644 --- a/TTS/tts/models/overflow.py +++ b/TTS/tts/models/overflow.py @@ -19,7 +19,7 @@ from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from TTS.utils.generic_utils import format_aux_input +from TTS.utils.generic_utils import format_aux_input, is_pytorch_at_least_2_4 logger = logging.getLogger(__name__) @@ -120,7 +120,7 @@ def update_mean_std(self, statistics_dict: Dict): def preprocess_batch(self, text, text_len, mels, mel_len): if self.mean.item() == 0 or self.std.item() == 1: - statistics_dict = torch.load(self.mel_statistics_parameter_path) + statistics_dict = torch.load(self.mel_statistics_parameter_path, weights_only=is_pytorch_at_least_2_4()) self.update_mean_std(statistics_dict) mels = self.normalize(mels) @@ -308,7 +308,9 @@ def on_init_start(self, trainer): "Data parameters found for: %s. Loading mel normalization parameters...", trainer.config.mel_statistics_parameter_path, ) - statistics = torch.load(trainer.config.mel_statistics_parameter_path) + statistics = torch.load( + trainer.config.mel_statistics_parameter_path, weights_only=is_pytorch_at_least_2_4() + ) data_mean, data_std, init_transition_prob = ( statistics["mean"], statistics["std"], diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py index 17303c69f7..01629b5d2a 100644 --- a/TTS/tts/models/tortoise.py +++ b/TTS/tts/models/tortoise.py @@ -23,6 +23,7 @@ from TTS.tts.layers.tortoise.vocoder import VocConf, VocType from TTS.tts.layers.tortoise.wav2vec_alignment import Wav2VecAlignment from TTS.tts.models.base_tts import BaseTTS +from TTS.utils.generic_utils import is_pytorch_at_least_2_4 logger = logging.getLogger(__name__) @@ -170,7 +171,13 @@ def classify_audio_clip(clip, model_dir): kernel_size=5, distribute_zero_label=False, ) - classifier.load_state_dict(torch.load(os.path.join(model_dir, "classifier.pth"), map_location=torch.device("cpu"))) + classifier.load_state_dict( + torch.load( + os.path.join(model_dir, "classifier.pth"), + map_location=torch.device("cpu"), + weights_only=is_pytorch_at_least_2_4(), + ) + ) clip = clip.cpu().unsqueeze(0) results = F.softmax(classifier(clip), dim=-1) return results[0][0] @@ -488,6 +495,7 @@ def get_random_conditioning_latents(self): torch.load( os.path.join(self.models_dir, "rlg_auto.pth"), map_location=torch.device("cpu"), + weights_only=is_pytorch_at_least_2_4(), ) ) self.rlg_diffusion = RandomLatentConverter(2048).eval() @@ -495,6 +503,7 @@ def get_random_conditioning_latents(self): torch.load( os.path.join(self.models_dir, "rlg_diffuser.pth"), map_location=torch.device("cpu"), + weights_only=is_pytorch_at_least_2_4(), ) ) with torch.no_grad(): @@ -881,17 +890,17 @@ def load_checkpoint( if os.path.exists(ar_path): # remove keys from the checkpoint that are not in the model - checkpoint = torch.load(ar_path, map_location=torch.device("cpu")) + checkpoint = torch.load(ar_path, map_location=torch.device("cpu"), weights_only=is_pytorch_at_least_2_4()) # strict set False # due to 
removed `bias` and `masked_bias` changes in Transformers self.autoregressive.load_state_dict(checkpoint, strict=False) if os.path.exists(diff_path): - self.diffusion.load_state_dict(torch.load(diff_path), strict=strict) + self.diffusion.load_state_dict(torch.load(diff_path, weights_only=is_pytorch_at_least_2_4()), strict=strict) if os.path.exists(clvp_path): - self.clvp.load_state_dict(torch.load(clvp_path), strict=strict) + self.clvp.load_state_dict(torch.load(clvp_path, weights_only=is_pytorch_at_least_2_4()), strict=strict) if os.path.exists(vocoder_checkpoint_path): self.vocoder.load_state_dict( @@ -899,6 +908,7 @@ def load_checkpoint( torch.load( vocoder_checkpoint_path, map_location=torch.device("cpu"), + weights_only=is_pytorch_at_least_2_4(), ) ) ) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index ef09344217..7c4a76ad7d 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -16,6 +16,7 @@ from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence from TTS.tts.layers.xtts.xtts_manager import LanguageManager, SpeakerManager from TTS.tts.models.base_tts import BaseTTS +from TTS.utils.generic_utils import is_pytorch_at_least_2_4 logger = logging.getLogger(__name__) @@ -65,7 +66,7 @@ def wav_to_mel_cloning( mel = mel_stft(wav) mel = torch.log(torch.clamp(mel, min=1e-5)) if mel_norms is None: - mel_norms = torch.load(mel_norms_file, map_location=device) + mel_norms = torch.load(mel_norms_file, map_location=device, weights_only=is_pytorch_at_least_2_4()) mel = mel / mel_norms.unsqueeze(0).unsqueeze(-1) return mel @@ -667,6 +668,7 @@ def inference_stream( repetition_penalty=float(repetition_penalty), output_attentions=False, output_hidden_states=True, + return_dict_in_generate=True, **hf_generate_kwargs, ) diff --git a/TTS/tts/utils/fairseq.py b/TTS/tts/utils/fairseq.py index 3d8eec2b4e..20907a0532 100644 --- a/TTS/tts/utils/fairseq.py +++ b/TTS/tts/utils/fairseq.py @@ -1,8 +1,10 @@ import torch +from TTS.utils.generic_utils import is_pytorch_at_least_2_4 + def rehash_fairseq_vits_checkpoint(checkpoint_file): - chk = torch.load(checkpoint_file, map_location=torch.device("cpu"))["model"] + chk = torch.load(checkpoint_file, map_location=torch.device("cpu"), weights_only=is_pytorch_at_least_2_4())["model"] new_chk = {} for k, v in chk.items(): if "enc_p." 
in k: diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py index 23aa52a8a2..6a2f7df67b 100644 --- a/TTS/tts/utils/managers.py +++ b/TTS/tts/utils/managers.py @@ -9,6 +9,7 @@ from TTS.config import load_config from TTS.encoder.utils.generic_utils import setup_encoder_model from TTS.utils.audio import AudioProcessor +from TTS.utils.generic_utils import is_pytorch_at_least_2_4 def load_file(path: str): @@ -17,7 +18,7 @@ def load_file(path: str): return json.load(f) elif path.endswith(".pth"): with fsspec.open(path, "rb") as f: - return torch.load(f, map_location="cpu") + return torch.load(f, map_location="cpu", weights_only=is_pytorch_at_least_2_4()) else: raise ValueError("Unsupported file type") diff --git a/TTS/tts/utils/text/characters.py b/TTS/tts/utils/text/characters.py index c622b93c59..4bf9bf6bd5 100644 --- a/TTS/tts/utils/text/characters.py +++ b/TTS/tts/utils/text/characters.py @@ -34,8 +34,8 @@ def parse_symbols(): _pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ" _suprasegmentals = "ˈˌːˑ" _other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ" -_diacrilics = "ɚ˞ɫ" -_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics +_diacritics = "̃ɚ˞ɫ" +_phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacritics class BaseVocabulary: diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 91f8844262..3ee285232f 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -6,6 +6,9 @@ from pathlib import Path from typing import Dict, Optional +import torch +from packaging.version import Version + logger = logging.getLogger(__name__) @@ -131,3 +134,8 @@ def setup_logger( sh = logging.StreamHandler() sh.setFormatter(formatter) lg.addHandler(sh) + + +def is_pytorch_at_least_2_4() -> bool: + """Check if the installed Pytorch version is 2.4 or higher.""" + return Version(torch.__version__) >= Version("2.4") diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 50a7893047..90af4f48f9 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -12,9 +12,6 @@ from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.models import setup_model as setup_tts_model from TTS.tts.models.vits import Vits - -# pylint: disable=unused-wildcard-import -# pylint: disable=wildcard-import from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import save_wav diff --git a/TTS/vc/modules/freevc/wavlm/__init__.py b/TTS/vc/modules/freevc/wavlm/__init__.py index 03b2f5827b..4046e137f5 100644 --- a/TTS/vc/modules/freevc/wavlm/__init__.py +++ b/TTS/vc/modules/freevc/wavlm/__init__.py @@ -5,6 +5,7 @@ import torch from trainer.io import get_user_data_dir +from TTS.utils.generic_utils import is_pytorch_at_least_2_4 from TTS.vc.modules.freevc.wavlm.wavlm import WavLM, WavLMConfig logger = logging.getLogger(__name__) @@ -26,7 +27,7 @@ def get_wavlm(device="cpu"): logger.info("Downloading WavLM model to %s ...", output_path) urllib.request.urlretrieve(model_uri, output_path) - checkpoint = torch.load(output_path, map_location=torch.device(device)) + checkpoint = torch.load(output_path, map_location=torch.device(device), weights_only=is_pytorch_at_least_2_4()) cfg = WavLMConfig(checkpoint["cfg"]) wavlm = WavLM(cfg).to(device) wavlm.load_state_dict(checkpoint["model"]) diff --git a/dockerfiles/Dockerfile.dev 
b/dockerfiles/Dockerfile.dev index af0d3fc0cd..b61bc4de94 100644 --- a/dockerfiles/Dockerfile.dev +++ b/dockerfiles/Dockerfile.dev @@ -20,4 +20,4 @@ RUN rm -rf /root/.cache/pip WORKDIR /root COPY . /root -RUN make install +RUN pip3 install -e .[all,dev] diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index d85ca1035a..f52fa028e5 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -119,9 +119,9 @@ "\n", "# load model state\n", "if use_cuda:\n", - " cp = torch.load(MODEL_PATH)\n", + " cp = torch.load(MODEL_PATH, weights_only=True)\n", "else:\n", - " cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)\n", + " cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage, weights_only=True)\n", "\n", "# load the model\n", "model.load_state_dict(cp['model'])\n", diff --git a/pyproject.toml b/pyproject.toml index f87751b998..389c0c66b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ include = ["TTS*"] [project] name = "coqui-tts" -version = "0.24.2" +version = "0.24.3" description = "Deep learning for Text to Speech." readme = "README.md" requires-python = ">=3.9, <3.13" @@ -44,11 +44,11 @@ classifiers = [ ] dependencies = [ # Core - "numpy>=1.25.2", + "numpy>=1.25.2,<2.0", "cython>=3.0.0", "scipy>=1.11.2", "torch>=2.1", - "torchaudio", + "torchaudio>=2.1.0", "soundfile>=0.12.0", "librosa>=0.10.1", "inflect>=5.6.0", @@ -62,13 +62,13 @@ dependencies = [ # Training "matplotlib>=3.7.0", # Coqui stack - "coqui-tts-trainer==0.1.4", + "coqui-tts-trainer>=0.1.4,<0.2.0", "coqpit>=0.0.16", # Gruut + supported languages "gruut[de,es,fr]>=2.4.0", # Tortoise "einops>=0.6.0", - "transformers>=4.42.0,<4.43.0", + "transformers>=4.43.0", # Bark "encodec>=0.1.1", # XTTS @@ -77,15 +77,6 @@ dependencies = [ ] [project.optional-dependencies] -# Development dependencies -dev = [ - "black==24.2.0", - "coverage[toml]>=7", - "nose2>=0.15", - "pre-commit>=3", - "ruff==0.4.9", - "tomli>=2; python_version < '3.11'", -] # Dependencies for building the documentation docs = [ "furo>=2023.5.20", @@ -115,6 +106,7 @@ ko = [ "hangul_romanize>=0.1.0", "jamo>=0.4.1", "g2pkk>=0.1.1", + "pip>=22.2", ] # Japanese ja = [ @@ -136,6 +128,15 @@ all = [ "coqui-tts[notebooks,server,bn,ja,ko,zh]", ] +[dependency-groups] +dev = [ + "black==24.2.0", + "coverage[toml]>=7", + "nose2>=0.15", + "pre-commit>=3", + "ruff==0.7.0", +] + [project.urls] Homepage = "https://github.com/idiap/coqui-ai-TTS" Documentation = "https://coqui-tts.readthedocs.io" @@ -147,14 +148,16 @@ Discussions = "https://github.com/idiap/coqui-ai-TTS/discussions" tts = "TTS.bin.synthesize:main" tts-server = "TTS.server.server:main" +[tool.uv] +constraint-dependencies = ["numba>0.58.0"] + [tool.ruff] -target-version = "py39" line-length = 120 +extend-exclude = ["*.ipynb"] lint.extend-select = [ "B033", # duplicate-value "C416", # unnecessary-comprehension "D419", # empty-docstring - "E999", # syntax-error "F401", # unused-import "F704", # yield-outside-function "F706", # return-outside-function diff --git a/requirements.dev.txt b/requirements.dev.txt deleted file mode 100644 index 74ec0cd80c..0000000000 --- a/requirements.dev.txt +++ /dev/null @@ -1,8 +0,0 @@ -# Generated via scripts/generate_requirements.py and pre-commit hook. -# Do not edit this file; modify pyproject.toml instead. 
-black==24.2.0
-coverage[toml]>=7
-nose2>=0.15
-pre-commit>=3
-ruff==0.4.9
-tomli>=2; python_version < '3.11'
diff --git a/scripts/generate_requirements.py b/scripts/generate_requirements.py
deleted file mode 100644
index bbd32bafd2..0000000000
--- a/scripts/generate_requirements.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env python
-"""Generate requirements/*.txt files from pyproject.toml.
-
-Adapted from:
-https://github.com/numpy/numpydoc/blob/e7c6baf00f5f73a4a8f8318d0cb4e04949c9a5d1/tools/generate_requirements.py
-"""
-
-import sys
-from pathlib import Path
-
-try:  # standard module since Python 3.11
-    import tomllib as toml
-except ImportError:
-    try:  # available for older Python via pip
-        import tomli as toml
-    except ImportError:
-        sys.exit("Please install `tomli` first: `pip install tomli`")
-
-script_pth = Path(__file__)
-repo_dir = script_pth.parent.parent
-script_relpth = script_pth.relative_to(repo_dir)
-header = [
-    f"# Generated via {script_relpth.as_posix()} and pre-commit hook.",
-    "# Do not edit this file; modify pyproject.toml instead.",
-]
-
-
-def generate_requirement_file(name: str, req_list: list[str]) -> None:
-    req_fname = repo_dir / f"requirements.{name}.txt"
-    req_fname.write_text("\n".join(header + req_list) + "\n")
-
-
-def main() -> None:
-    pyproject = toml.loads((repo_dir / "pyproject.toml").read_text())
-    generate_requirement_file("dev", pyproject["project"]["optional-dependencies"]["dev"])
-
-
-if __name__ == "__main__":
-    main()
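A pattern repeated throughout this patch is gating `torch.load(..., weights_only=...)` on `is_pytorch_at_least_2_4()` while `TTS/__init__.py` registers the project's config classes with `torch.serialization.add_safe_globals`. Below is a minimal, self-contained sketch of why the two pieces go together on PyTorch >= 2.4; `MyConfig` and `ckpt.pth` are hypothetical stand-ins, not names from this repository:

```python
import torch


class MyConfig:
    """Hypothetical stand-in for a pickled config class such as XttsConfig."""

    def __init__(self, sample_rate: int = 22050):
        self.sample_rate = sample_rate


# A checkpoint that pickles a custom class alongside the weights.
torch.save({"config": MyConfig(), "model": {}}, "ckpt.pth")

try:
    # With weights_only=True the restricted unpickler rejects classes
    # that have not been allowlisted.
    torch.load("ckpt.pth", weights_only=True)
except Exception as err:
    print(f"Blocked by weights_only unpickler: {err}")

# Allowlisting the class (available from PyTorch 2.4) makes the safe load succeed.
torch.serialization.add_safe_globals([MyConfig])
print(torch.load("ckpt.pth", weights_only=True))
```

On PyTorch older than 2.4, `add_safe_globals` does not exist, which is presumably why the patch guards both the registration in `TTS/__init__.py` and every `weights_only` argument behind the same version check.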