diff --git a/.github/actions/setup-uv/action.yml b/.github/actions/setup-uv/action.yml new file mode 100644 index 0000000000..c7dd4f5f99 --- /dev/null +++ b/.github/actions/setup-uv/action.yml @@ -0,0 +1,11 @@ +name: Setup uv + +runs: + using: 'composite' + steps: + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "0.5.4" + enable-cache: true + cache-dependency-glob: "**/pyproject.toml" diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 822990e967..1b7f44654c 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -7,11 +7,12 @@ defaults: shell: bash jobs: - build-sdist: - name: Build source distribution + build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - name: Setup uv + uses: ./.github/actions/setup-uv - name: Verify tag matches version run: | set -ex @@ -20,37 +21,23 @@ jobs: if [[ "v$version" != "$tag" ]]; then exit 1 fi - - uses: actions/setup-python@v5 - with: - python-version: 3.9 - - run: | - python -m pip install -U pip setuptools build - - run: | - python -m build - - run: | - pip install dist/*.tar.gz - - uses: actions/upload-artifact@v4 - with: - name: build-sdist - path: dist/*.tar.gz - build-wheels: - name: Build wheels on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - steps: - - uses: actions/checkout@v4 - - name: Build wheels - uses: pypa/cibuildwheel@v2.21.1 + - name: Set up Python + run: uv python install 3.12 + - name: Build sdist and wheel + run: uv build + - name: Test installation of sdist and wheel + run: | + uv venv --no-project + uv pip install dist/*.tar.gz + uv pip install dist/*.whl - uses: actions/upload-artifact@v4 with: - name: build-wheels-${{ matrix.os }} - path: ./wheelhouse/*.whl + name: build + path: dist/* publish-artifacts: name: Publish to PyPI runs-on: ubuntu-latest - needs: [build-sdist, build-wheels] + needs: [build] environment: name: release url: https://pypi.org/p/coqui-tts @@ -60,8 +47,7 @@ jobs: - uses: actions/download-artifact@v4 with: path: dist - pattern: build-* - merge-multiple: true + pattern: build - run: | ls -lh dist/ - name: Publish package distributions to PyPI diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml index a146213f7c..d1060f6be2 100644 --- a/.github/workflows/style_check.yml +++ b/.github/workflows/style_check.yml @@ -7,7 +7,7 @@ on: pull_request: types: [opened, synchronize, reopened] jobs: - test: + lint: runs-on: ubuntu-latest strategy: fail-fast: false @@ -15,12 +15,8 @@ jobs: python-version: [3.9] steps: - uses: actions/checkout@v4 - - name: Install uv - uses: astral-sh/setup-uv@v3 - with: - version: "0.4.27" - enable-cache: true - cache-dependency-glob: "**/pyproject.toml" + - name: Setup uv + uses: ./.github/actions/setup-uv - name: Set up Python ${{ matrix.python-version }} run: uv python install ${{ matrix.python-version }} - name: Lint check diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index be3f1b740b..8d639d5dee 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,4 @@ -name: tests +name: test on: push: @@ -6,26 +6,32 @@ on: - main pull_request: types: [opened, synchronize, reopened] + workflow_dispatch: + inputs: + trainer_branch: + description: "Branch of Trainer to test" + required: false + default: "main" + coqpit_branch: + description: "Branch of Coqpit to test" + required: false + default: "main" jobs: - test: + unit: runs-on: 
ubuntu-latest strategy: fail-fast: false matrix: python-version: [3.9, "3.10", "3.11", "3.12"] - subset: ["data_tests", "inference_tests", "test_aux", "test_text", "test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"] + subset: ["data_tests", "inference_tests", "test_aux", "test_text"] steps: - uses: actions/checkout@v4 - - name: Install uv - uses: astral-sh/setup-uv@v3 - with: - version: "0.4.27" - enable-cache: true - cache-dependency-glob: "**/pyproject.toml" + - name: Setup uv + uses: ./.github/actions/setup-uv - name: Set up Python ${{ matrix.python-version }} run: uv python install ${{ matrix.python-version }} - name: Install Espeak - if: contains(fromJSON('["inference_tests", "test_text", "test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset) + if: contains(fromJSON('["inference_tests", "test_text"]'), matrix.subset) run: | sudo apt-get update sudo apt-get install espeak espeak-ng @@ -34,10 +40,14 @@ jobs: sudo apt-get update sudo apt-get install -y --no-install-recommends git make gcc make system-deps - - name: Replace scarf urls - if: contains(fromJSON('["data_tests", "inference_tests", "test_aux", "test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset) + - name: Install custom Trainer and/or Coqpit if requested run: | - sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json + if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }} + fi + if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }} + fi - name: Unit tests run: | resolution=highest @@ -52,16 +62,59 @@ jobs: name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }} path: .coverage.* if-no-files-found: ignore + integration: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.12"] + subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"] + steps: + - uses: actions/checkout@v4 + - name: Setup uv + uses: ./.github/actions/setup-uv + - name: Set up Python ${{ matrix.python-version }} + run: uv python install ${{ matrix.python-version }} + - name: Install Espeak + if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset) + run: | + sudo apt-get update + sudo apt-get install espeak espeak-ng + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y --no-install-recommends git make gcc + make system-deps + - name: Install custom Trainer and/or Coqpit if requested + run: | + if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }} + fi + if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then + uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }} + fi + - name: Integration tests + run: | + resolution=highest + if [ "${{ matrix.python-version }}" == "3.9" ]; then + resolution=lowest-direct + fi + uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }} + - name: Upload coverage data + uses: actions/upload-artifact@v4 + with: + include-hidden-files: 
true + name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }} + path: .coverage.* + if-no-files-found: ignore coverage: if: always() - needs: test + needs: [unit, integration] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - name: Install uv - uses: astral-sh/setup-uv@v3 - with: - version: "0.4.27" + - name: Setup uv + uses: ./.github/actions/setup-uv - uses: actions/download-artifact@v4 with: pattern: coverage-data-* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 92f6f3ab3c..62420e9958 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,6 +2,8 @@ repos: - repo: "https://github.com/pre-commit/pre-commit-hooks" rev: v5.0.0 hooks: + - id: check-json + files: "TTS/.models.json" - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace diff --git a/.readthedocs.yml b/.readthedocs.yml index e19a4dccb7..355e3485e7 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -9,14 +9,13 @@ version: 2 build: os: ubuntu-22.04 tools: - python: "3.11" - -# Optionally set the version of Python and requirements required to build your docs -python: - install: - - path: . - extra_requirements: - - docs + python: "3.12" + commands: + - asdf plugin add uv + - asdf install uv latest + - asdf global uv latest + - uv sync --group docs + - uv run -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs/source $READTHEDOCS_OUTPUT/html # Build documentation in the docs/ directory with Sphinx sphinx: diff --git a/CODE_OWNERS.rst b/CODE_OWNERS.rst deleted file mode 100644 index 768b573911..0000000000 --- a/CODE_OWNERS.rst +++ /dev/null @@ -1,75 +0,0 @@ -TTS code owners / governance system -========================================== - -TTS is run under a governance system inspired (and partially copied from) by the `Mozilla module ownership system `_. The project is roughly divided into modules, and each module has its owners, which are responsible for reviewing pull requests and deciding on technical direction for their modules. Module ownership authority is given to people who have worked extensively on areas of the project. - -Module owners also have the authority of naming other module owners or appointing module peers, which are people with authority to review pull requests in that module. They can also sub-divide their module into sub-modules with their owners. - -Module owners are not tyrants. They are chartered to make decisions with input from the community and in the best interest of the community. Module owners are not required to make code changes or additions solely because the community wants them to do so. (Like anyone else, the module owners may write code because they want to, because their employers want them to, because the community wants them to, or for some other reason.) Module owners do need to pay attention to patches submitted to that module. However “pay attention” does not mean agreeing to every patch. Some patches may not make sense for the WebThings project; some may be poorly implemented. Module owners have the authority to decline a patch; this is a necessary part of the role. We ask the module owners to describe in the relevant issue their reasons for wanting changes to a patch, for declining it altogether, or for postponing review for some period. We don’t ask or expect them to rewrite patches to make them acceptable. Similarly, module owners may need to delay review of a promising patch due to an upcoming deadline. For example, a patch may be of interest, but not for the next milestone. 
In such a case it may make sense for the module owner to postpone review of a patch until after matters needed for a milestone have been finalized. Again, we expect this to be described in the relevant issue. And of course, it shouldn’t go on very often or for very long or escalation and review is likely. - -The work of the various module owners and peers is overseen by the global owners, which are responsible for making final decisions in case there's conflict between owners as well as set the direction for the project as a whole. - -This file describes module owners who are active on the project and which parts of the code they have expertise on (and interest in). If you're making changes to the code and are wondering who's an appropriate person to talk to, this list will tell you who to ping. - -There's overlap in the areas of expertise of each owner, and in particular when looking at which files are covered by each area, there is a lot of overlap. Don't worry about getting it exactly right when requesting review, any code owner will be happy to redirect the request to a more appropriate person. - -Global owners ----------------- - -These are people who have worked on the project extensively and are familiar with all or most parts of it. Their expertise and review guidance is trusted by other code owners to cover their own areas of expertise. In case of conflicting opinions from other owners, global owners will make a final decision. - -- Eren Gölge (@erogol) -- Reuben Morais (@reuben) - -Training, feeding ------------------ - -- Eren Gölge (@erogol) - -Model exporting ---------------- - -- Eren Gölge (@erogol) - -Multi-Speaker TTS ------------------ - -- Eren Gölge (@erogol) -- Edresson Casanova (@edresson) - -TTS ---- - -- Eren Gölge (@erogol) - -Vocoders --------- - -- Eren Gölge (@erogol) - -Speaker Encoder ---------------- - -- Eren Gölge (@erogol) - -Testing & CI ------------- - -- Eren Gölge (@erogol) -- Reuben Morais (@reuben) - -Python bindings ---------------- - -- Eren Gölge (@erogol) -- Reuben Morais (@reuben) - -Documentation -------------- - -- Eren Gölge (@erogol) - -Third party bindings --------------------- - -Owned by the author. 
diff --git a/Dockerfile b/Dockerfile index e9d331bc41..9ce5c63989 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,20 @@ ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04 FROM ${BASE} -RUN apt-get update && apt-get upgrade -y -RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/* -RUN pip3 install -U pip setuptools +RUN apt-get update && \ + apt-get upgrade -y +RUN apt-get install -y --no-install-recommends \ + gcc g++ make python3 python3-dev python3-pip \ + python3-venv python3-wheel espeak-ng \ + libsndfile1-dev libc-dev curl && \ + rm -rf /var/lib/apt/lists/* + +# Install Rust compiler (to build sudachipy for Mac) +RUN curl --proto '=https' --tlsv1.2 -sSf "https://sh.rustup.rs" | sh -s -- -y +ENV PATH="/root/.cargo/bin:${PATH}" + +RUN pip3 install -U pip setuptools wheel +RUN pip3 install -U "spacy[ja]<3.8" RUN pip3 install llvmlite --ignore-installed # Install Dependencies: diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 8d092ceff2..0000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,10 +0,0 @@ -include README.md -include LICENSE.txt -include *.cff -recursive-include TTS *.json -recursive-include TTS *.html -recursive-include TTS *.png -recursive-include TTS *.md -recursive-include TTS *.py -recursive-include TTS *.pyx -recursive-include images *.png diff --git a/README.md b/README.md index 507cce9298..7dddf3a37b 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,12 @@ ## 🐸Coqui TTS News - 📣 Fork of the [original, unmaintained repository](https://github.com/coqui-ai/TTS). New PyPI package: [coqui-tts](https://pypi.org/project/coqui-tts) +- 📣 [OpenVoice](https://github.com/myshell-ai/OpenVoice) models now available for voice conversion. - 📣 Prebuilt wheels are now also published for Mac and Windows (in addition to Linux as before) for easier installation across platforms. -- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board. +- 📣 ⓍTTSv2 is here with 17 languages and better performance across the board. ⓍTTS can stream with <200ms latency. - 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech). -- 📣 ⓍTTS can now stream with <200ms latency. -- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://coqui-tts.readthedocs.io/en/latest/models/xtts.html) - 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/bark.html) -- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. +- 📣 You can use [Fairseq models in ~1100 languages](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. ## @@ -121,6 +120,7 @@ repository are also still a useful source of information. ### Voice Conversion - FreeVC: [paper](https://arxiv.org/abs/2210.15418) +- OpenVoice: [technical report](https://arxiv.org/abs/2312.01479) You can also help us implement more models. 
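For context on the ⓍTTSv2 news item above, here is a minimal usage sketch through the Python API. The model name and call pattern follow the project's documented `TTS.api` interface rather than anything added in this diff, and `speaker.wav` plus the output path are placeholders. An OpenVoice voice-conversion sketch follows the `TTS/api.py` changes later in this diff.

```python
from TTS.api import TTS

# Load the multilingual XTTS-v2 model (downloaded on first use).
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")

# Clone the voice in the (placeholder) reference clip and synthesize English speech.
tts.tts_to_file(
    text="Hello, this is a quick XTTS-v2 test.",
    speaker_wav="speaker.wav",
    language="en",
    file_path="xtts_output.wav",
)
```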
@@ -147,9 +147,7 @@ The following extras allow the installation of optional dependencies: | Name | Description | |------|-------------| -| `all` | All optional dependencies, except `dev` and `docs` | -| `dev` | Development dependencies | -| `docs` | Dependencies for building the documentation | +| `all` | All optional dependencies | | `notebooks` | Dependencies only used in notebooks | | `server` | Dependencies to run the TTS server | | `bn` | Bangla G2P | @@ -246,8 +244,14 @@ tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progr tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav") ``` -#### Example voice cloning together with the voice conversion model. -This way, you can clone voices by using any model in 🐸TTS. +Other available voice conversion models: +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v1` +- `voice_conversion_models/multilingual/multi-dataset/openvoice_v2` + +#### Example voice cloning together with the default voice conversion model. + +This way, you can clone voices by using any model in 🐸TTS. The FreeVC model is +used for voice conversion after synthesizing speech. ```python @@ -414,4 +418,6 @@ $ tts --out_path output/path/speech.wav --model_name "// https://github.com/mobassir94/comprehensive-bangla-tts", @@ -737,7 +737,7 @@ "license": "Apache 2.0" }, "vits-female": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.3_models/tts_models--bn--custom--vits_female.zip", "default_vocoder": null, "commit": null, "description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts", @@ -750,7 +750,7 @@ "common-voice": { "glow-tts":{ "description": "Belarusian GlowTTS model created by @alex73 (Github).", - "github_rls_url":"https://coqui.gateway.scarf.sh/v0.16.6/tts_models--be--common-voice--glow-tts.zip", + "github_rls_url":"https://github.com/coqui-ai/TTS/releases/download/v0.16.6/tts_models--be--common-voice--glow-tts.zip", "default_vocoder": "vocoder_models/be/common-voice/hifigan", "commit": "c0aabb85", "license": "CC-BY-SA 4.0", @@ -763,14 +763,14 @@ "universal": { "libri-tts": { "wavegrad": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip", "commit": "ea976b0", "author": "Eren Gölge @erogol", "license": "MPL", "contact": "egolge@coqui.com" }, "fullband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip", "commit": "4132240", "author": "Eren Gölge @erogol", "license": "MPL", @@ -782,14 +782,14 @@ "ek1": { "wavegrad": { "description": "EK1 en-rp wavegrad by NMStoker", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip", "commit": "c802255", "license": "apache 2.0" } }, "ljspeech": { "multiband-melgan": { - "github_rls_url": 
"https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip", "commit": "ea976b0", "author": "Eren Gölge @erogol", "license": "MPL", @@ -797,7 +797,7 @@ }, "hifigan_v2": { "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip", "commit": "bae2ad0f", "author": "@erogol", "license": "apache 2.0", @@ -805,7 +805,7 @@ }, "univnet": { "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip", "commit": "4581e3d", "author": "Eren @erogol", "license": "apache 2.0", @@ -815,7 +815,7 @@ "blizzard2013": { "hifigan_v2": { "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip", "commit": "d6284e7", "author": "Adam Froghyar @a-froghyar", "license": "apache 2.0", @@ -825,7 +825,7 @@ "vctk": { "hifigan_v2": { "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip", "commit": "2f07160", "author": "Edresson Casanova", "license": "apache 2.0", @@ -835,7 +835,7 @@ "sam": { "hifigan_v2": { "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip", "commit": "2f07160", "author": "Eren Gölge @erogol", "license": "apache 2.0", @@ -846,7 +846,7 @@ "nl": { "mai": { "parallel-wavegan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip", "author": "@r-dh", "license": "apache 2.0", "commit": "unknown" @@ -856,19 +856,19 @@ "de": { "thorsten": { "wavegrad": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip", "author": "@thorstenMueller", "license": "apache 2.0", "commit": "unknown" }, "fullband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip", + "github_rls_url": 
"https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip", "author": "@thorstenMueller", "license": "apache 2.0", "commit": "unknown" }, "hifigan_v1": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip", "description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model", "author": "@thorstenMueller", "license": "apache 2.0", @@ -879,7 +879,7 @@ "ja": { "kokoro": { "hifigan_v1": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip", "description": "HifiGAN model trained for kokoro dataset by @kaiidams", "author": "@kaiidams", "license": "apache 2.0", @@ -890,7 +890,7 @@ "uk": { "mai": { "multiband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip", "author": "@robinhad", "commit": "bdab788d", "license": "MIT", @@ -901,7 +901,7 @@ "tr": { "common-voice": { "hifigan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip", "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.", "author": "Fatih Akademi", "license": "MIT", @@ -912,7 +912,7 @@ "be": { "common-voice": { "hifigan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.16.6/vocoder_models--be--common-voice--hifigan.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.16.6/vocoder_models--be--common-voice--hifigan.zip", "description": "Belarusian HiFiGAN model created by @alex73 (Github).", "author": "@alex73", "license": "CC-BY-SA 4.0", @@ -925,12 +925,34 @@ "multilingual": { "vctk": { "freevc24": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip", + "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip", "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC", "author": "Jing-Yi Li @OlaWod", "license": "MIT", "commit": null } + }, + "multi-dataset": { + "openvoice_v1": { + "hf_url": [ + "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/config.json", + "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth" + ], + "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2", + "author": "MyShell.ai", + "license": "MIT", + "commit": null + }, + "openvoice_v2": { + "hf_url": [ + "https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/config.json", + "https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/checkpoint.pth" + ], + "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2", + "author": "MyShell.ai", + "license": "MIT", + "commit": null + } } } } diff --git 
a/TTS/api.py b/TTS/api.py index 250ed1a0d9..ed82825007 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -155,8 +155,10 @@ def load_vc_model_by_name(self, model_name: str, gpu: bool = False): gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ self.model_name = model_name - model_path, config_path, _, _, _ = self.download_model_by_name(model_name) - self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu) + model_path, config_path, _, _, model_dir = self.download_model_by_name(model_name) + self.voice_converter = Synthesizer( + vc_checkpoint=model_path, vc_config=config_path, model_dir=model_dir, use_cuda=gpu + ) def load_tts_model_by_name(self, model_name: str, gpu: bool = False): """Load one of 🐸TTS models by name. @@ -355,15 +357,17 @@ def voice_conversion( target_wav (str):` Path to the target wav file. """ - wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav) - return wav + if self.voice_converter is None: + msg = "The selected model does not support voice conversion." + raise RuntimeError(msg) + return self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav) def voice_conversion_to_file( self, source_wav: str, target_wav: str, file_path: str = "output.wav", - ): + ) -> str: """Voice conversion with FreeVC. Convert source wav to target speaker. Args: diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index 127199186b..535182d214 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -80,7 +80,7 @@ num_chars = len(phonemes) if C.use_phonemes else len(symbols) # TODO: handle multi-speaker model = setup_model(C) - model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True) + model, _ = load_checkpoint(model, args.model_path, use_cuda=args.use_cuda, eval=True) # data loader preprocessor = importlib.import_module("TTS.tts.datasets.formatters") diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index bc01ffd595..454f528ab4 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -407,18 +407,18 @@ def main(): # load models synthesizer = Synthesizer( - tts_path, - tts_config_path, - speakers_file_path, - language_ids_file_path, - vocoder_path, - vocoder_config_path, - encoder_path, - encoder_config_path, - vc_path, - vc_config_path, - model_dir, - args.voice_dir, + tts_checkpoint=tts_path, + tts_config_path=tts_config_path, + tts_speakers_file=speakers_file_path, + tts_languages_file=language_ids_file_path, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, + encoder_checkpoint=encoder_path, + encoder_config=encoder_config_path, + vc_checkpoint=vc_path, + vc_config=vc_config_path, + model_dir=model_dir, + voice_dir=args.voice_dir, ).to(device) # query speaker ids of a multi-speaker model. @@ -429,7 +429,7 @@ def main(): logger.info( "Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) - logger.info(synthesizer.tts_model.speaker_manager.name_to_id) + logger.info(list(synthesizer.tts_model.speaker_manager.name_to_id.keys())) return # query langauge ids of a multi-lingual model. 
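The `TTS/api.py` changes above forward `model_dir` to `Synthesizer`, which is what lets the Hugging-Face-hosted OpenVoice converters registered in `.models.json` earlier in this diff be loaded, and `voice_conversion()` now raises an explicit `RuntimeError` when the loaded model has no voice converter. A minimal sketch of the resulting usage; the wav paths are placeholders:

```python
from TTS.api import TTS

# Voice conversion with one of the newly registered OpenVoice models.
tts = TTS("voice_conversion_models/multilingual/multi-dataset/openvoice_v2")
tts.voice_conversion_to_file(
    source_wav="my/source.wav",  # speech whose content is kept
    target_wav="my/target.wav",  # speaker whose voice is cloned
    file_path="openvoice_output.wav",
)

# Models loaded without a voice converter now raise:
# RuntimeError("The selected model does not support voice conversion.")
```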
diff --git a/TTS/demos/xtts_ft_demo/utils/gpt_train.py b/TTS/demos/xtts_ft_demo/utils/gpt_train.py index 7b41966b8f..f838297af3 100644 --- a/TTS/demos/xtts_ft_demo/utils/gpt_train.py +++ b/TTS/demos/xtts_ft_demo/utils/gpt_train.py @@ -43,8 +43,8 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True) # DVAE files - DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth" - MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth" + DVAE_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/dvae.pth" + MEL_NORM_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/mel_stats.pth" # Set the path to the downloaded files DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK)) @@ -58,9 +58,9 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, ) # Download XTTS v2.0 checkpoint if needed - TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json" - XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth" - XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json" + TOKENIZER_FILE_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/vocab.json" + XTTS_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/model.pth" + XTTS_CONFIG_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/config.json" # XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK)) # vocab.json file diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py index f7137c2186..2082019aad 100644 --- a/TTS/encoder/models/base_encoder.py +++ b/TTS/encoder/models/base_encoder.py @@ -5,10 +5,10 @@ import torchaudio from coqpit import Coqpit from torch import nn +from trainer.generic_utils import set_partial_state_dict from trainer.io import load_fsspec from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss -from TTS.utils.generic_utils import set_init_dict logger = logging.getLogger(__name__) @@ -130,7 +130,7 @@ def load_checkpoint( logger.info("Partial model initialization.") model_dict = self.state_dict() - model_dict = set_init_dict(model_dict, state["model"], c) + model_dict = set_partial_state_dict(model_dict, state["model"], config) self.load_state_dict(model_dict) del model_dict diff --git a/TTS/encoder/models/lstm.py b/TTS/encoder/models/lstm.py index 51852b5b82..4e0a7523aa 100644 --- a/TTS/encoder/models/lstm.py +++ b/TTS/encoder/models/lstm.py @@ -86,7 +86,7 @@ def forward(self, x, l2_norm=True): - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` """ with torch.no_grad(): - with torch.cuda.amp.autocast(enabled=False): + with torch.autocast("cuda", enabled=False): if self.use_torch_spec: x.squeeze_(1) x = self.torch_spec(x) diff --git a/TTS/encoder/utils/training.py b/TTS/encoder/utils/training.py index cc3a78b084..48629c7a57 100644 --- a/TTS/encoder/utils/training.py +++ b/TTS/encoder/utils/training.py @@ -2,9 +2,9 @@ from dataclasses import dataclass, field from coqpit import Coqpit -from trainer import TrainerArgs, get_last_checkpoint +from trainer import TrainerArgs from trainer.generic_utils import get_experiment_folder_path, get_git_branch -from trainer.io import 
copy_model_files +from trainer.io import copy_model_files, get_last_checkpoint from trainer.logging import logger_factory from trainer.logging.console_logger import ConsoleLogger diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index f9f2cb2e37..d1a37da4c1 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -166,6 +166,11 @@ def load_attention_mask_meta_data(metafile_path): def _get_formatter_by_name(name): """Returns the respective preprocessing function.""" thismodule = sys.modules[__name__] + if not hasattr(thismodule, name.lower()): + msg = ( + f"{name} formatter not found. If it is a custom formatter, pass the function to load_tts_samples() instead." + ) + raise ValueError(msg) return getattr(thismodule, name.lower()) diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 37e3a1779d..5f629f32a9 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -63,6 +63,31 @@ def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int: raise RuntimeError(msg) from e +def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: Optional[dict] = None): + """Create inverse frequency weights for balancing the dataset. + + Use `multi_dict` to scale relative weights.""" + attr_names_samples = np.array([item[attr_name] for item in items]) + unique_attr_names = np.unique(attr_names_samples).tolist() + attr_idx = [unique_attr_names.index(l) for l in attr_names_samples] + attr_count = np.array([len(np.where(attr_names_samples == l)[0]) for l in unique_attr_names]) + weight_attr = 1.0 / attr_count + dataset_samples_weight = np.array([weight_attr[l] for l in attr_idx]) + dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) + if multi_dict is not None: + # check if all keys are in the multi_dict + for k in multi_dict: + assert k in unique_attr_names, f"{k} not in {unique_attr_names}" + # scale weights + multiplier_samples = np.array([multi_dict.get(item[attr_name], 1.0) for item in items]) + dataset_samples_weight *= multiplier_samples + return ( + torch.from_numpy(dataset_samples_weight).float(), + unique_attr_names, + np.unique(dataset_samples_weight).tolist(), + ) + + class TTSDataset(Dataset): def __init__( self, diff --git a/TTS/tts/layers/bark/hubert/kmeans_hubert.py b/TTS/tts/layers/bark/hubert/kmeans_hubert.py index 58a614cb87..ade84794eb 100644 --- a/TTS/tts/layers/bark/hubert/kmeans_hubert.py +++ b/TTS/tts/layers/bark/hubert/kmeans_hubert.py @@ -14,6 +14,8 @@ from torchaudio.functional import resample from transformers import HubertModel +from TTS.utils.generic_utils import exists + def round_down_nearest_multiple(num, divisor): return num // divisor * divisor @@ -26,14 +28,6 @@ def curtail_to_multiple(t, mult, from_left=False): return t[..., seq_slice] -def exists(val): - return val is not None - - -def default(val, d): - return val if exists(val) else d - - class CustomHubert(nn.Module): """ checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert diff --git a/TTS/tts/layers/bark/load_model.py b/TTS/tts/layers/bark/load_model.py index 72eca30ac6..6b7caab916 100644 --- a/TTS/tts/layers/bark/load_model.py +++ b/TTS/tts/layers/bark/load_model.py @@ -12,13 +12,8 @@ from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig from TTS.utils.generic_utils import is_pytorch_at_least_2_4 -if ( - torch.cuda.is_available() - and hasattr(torch.cuda, "amp") - and hasattr(torch.cuda.amp, 
"autocast") - and torch.cuda.is_bf16_supported() -): - autocast = functools.partial(torch.cuda.amp.autocast, dtype=torch.bfloat16) +if torch.cuda.is_available() and torch.cuda.is_bf16_supported(): + autocast = functools.partial(torch.autocast, device_type="cuda", dtype=torch.bfloat16) else: @contextlib.contextmanager diff --git a/TTS/tts/layers/bark/model.py b/TTS/tts/layers/bark/model.py index 68c50dbdbd..54a9cecec0 100644 --- a/TTS/tts/layers/bark/model.py +++ b/TTS/tts/layers/bark/model.py @@ -12,18 +12,6 @@ from torch.nn import functional as F -class LayerNorm(nn.Module): - """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False""" - - def __init__(self, ndim, bias): - super().__init__() - self.weight = nn.Parameter(torch.ones(ndim)) - self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None - - def forward(self, x): - return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5) - - class CausalSelfAttention(nn.Module): def __init__(self, config): super().__init__() @@ -119,9 +107,9 @@ def forward(self, x): class Block(nn.Module): def __init__(self, config, layer_idx): super().__init__() - self.ln_1 = LayerNorm(config.n_embd, bias=config.bias) + self.ln_1 = nn.LayerNorm(config.n_embd, bias=config.bias) self.attn = CausalSelfAttention(config) - self.ln_2 = LayerNorm(config.n_embd, bias=config.bias) + self.ln_2 = nn.LayerNorm(config.n_embd, bias=config.bias) self.mlp = MLP(config) self.layer_idx = layer_idx @@ -158,7 +146,7 @@ def __init__(self, config): wpe=nn.Embedding(config.block_size, config.n_embd), drop=nn.Dropout(config.dropout), h=nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]), - ln_f=LayerNorm(config.n_embd, bias=config.bias), + ln_f=nn.LayerNorm(config.n_embd, bias=config.bias), ) ) self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False) diff --git a/TTS/tts/layers/delightful_tts/acoustic_model.py b/TTS/tts/layers/delightful_tts/acoustic_model.py index 83989f9ba4..981d6cdb1f 100644 --- a/TTS/tts/layers/delightful_tts/acoustic_model.py +++ b/TTS/tts/layers/delightful_tts/acoustic_model.py @@ -5,13 +5,13 @@ import torch import torch.nn.functional as F from coqpit import Coqpit +from monotonic_alignment_search import maximum_path from torch import nn from TTS.tts.layers.delightful_tts.conformer import Conformer from TTS.tts.layers.delightful_tts.encoders import ( PhonemeLevelProsodyEncoder, UtteranceLevelProsodyEncoder, - get_mask_from_lengths, ) from TTS.tts.layers.delightful_tts.energy_adaptor import EnergyAdaptor from TTS.tts.layers.delightful_tts.networks import EmbeddingPadded, positional_encoding @@ -19,7 +19,7 @@ from TTS.tts.layers.delightful_tts.pitch_adaptor import PitchAdaptor from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor from TTS.tts.layers.generic.aligner import AlignmentNetwork -from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask +from TTS.tts.utils.helpers import expand_encoder_outputs, generate_attention, sequence_mask logger = logging.getLogger(__name__) @@ -230,42 +230,6 @@ def _init_d_vector(self): raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.") self.embedded_speaker_dim = self.args.d_vector_dim - @staticmethod - def generate_attn(dr, x_mask, y_mask=None): - """Generate an attention mask from the linear scale durations. - - Args: - dr (Tensor): Linear scale durations. - x_mask (Tensor): Mask for the input (character) sequence. 
- y_mask (Tensor): Mask for the output (spectrogram) sequence. Compute it from the predicted durations - if None. Defaults to None. - - Shapes - - dr: :math:`(B, T_{en})` - - x_mask: :math:`(B, T_{en})` - - y_mask: :math:`(B, T_{de})` - """ - # compute decode mask from the durations - if y_mask is None: - y_lengths = dr.sum(1).long() - y_lengths[y_lengths < 1] = 1 - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype) - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) - attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype) - return attn - - def _expand_encoder_with_durations( - self, - o_en: torch.FloatTensor, - dr: torch.IntTensor, - x_mask: torch.IntTensor, - y_lengths: torch.IntTensor, - ): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype) - attn = self.generate_attn(dr, x_mask, y_mask) - o_en_ex = torch.einsum("kmn, kjm -> kjn", [attn.float(), o_en]) - return y_mask, o_en_ex, attn.transpose(1, 2) - def _forward_aligner( self, x: torch.FloatTensor, @@ -339,8 +303,8 @@ def forward( {"d_vectors": d_vectors, "speaker_ids": speaker_idx} ) # pylint: disable=unused-variable - src_mask = get_mask_from_lengths(src_lens) # [B, T_src] - mel_mask = get_mask_from_lengths(mel_lens) # [B, T_mel] + src_mask = ~sequence_mask(src_lens) # [B, T_src] + mel_mask = ~sequence_mask(mel_lens) # [B, T_mel] # Token embeddings token_embeddings = self.src_word_emb(tokens) # [B, T_src, C_hidden] @@ -419,8 +383,8 @@ def forward( encoder_outputs = encoder_outputs.transpose(1, 2) + pitch_emb + energy_emb log_duration_prediction = self.duration_predictor(x=encoder_outputs_res.detach(), mask=src_mask) - mel_pred_mask, encoder_outputs_ex, alignments = self._expand_encoder_with_durations( - o_en=encoder_outputs, y_lengths=mel_lens, dr=dr, x_mask=~src_mask[:, None] + encoder_outputs_ex, alignments, mel_pred_mask = expand_encoder_outputs( + encoder_outputs, y_lengths=mel_lens, duration=dr, x_mask=~src_mask[:, None] ) x = self.decoder( @@ -434,7 +398,7 @@ def forward( dr = torch.log(dr + 1) dr_pred = torch.exp(log_duration_prediction) - 1 - alignments_dp = self.generate_attn(dr_pred, src_mask.unsqueeze(1), mel_pred_mask) # [B, T_max, T_max2'] + alignments_dp = generate_attention(dr_pred, src_mask.unsqueeze(1), mel_pred_mask) # [B, T_max, T_max2'] return { "model_outputs": x, @@ -447,7 +411,7 @@ def forward( "p_prosody_pred": p_prosody_pred, "p_prosody_ref": p_prosody_ref, "alignments_dp": alignments_dp, - "alignments": alignments, # [B, T_de, T_en] + "alignments": alignments.transpose(1, 2), # [B, T_de, T_en] "aligner_soft": aligner_soft, "aligner_mas": aligner_mas, "aligner_durations": aligner_durations, @@ -468,7 +432,7 @@ def inference( pitch_transform: Callable = None, energy_transform: Callable = None, ) -> torch.Tensor: - src_mask = get_mask_from_lengths(torch.tensor([tokens.shape[1]], dtype=torch.int64, device=tokens.device)) + src_mask = ~sequence_mask(torch.tensor([tokens.shape[1]], dtype=torch.int64, device=tokens.device)) src_lens = torch.tensor(tokens.shape[1:2]).to(tokens.device) # pylint: disable=unused-variable sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable {"d_vectors": d_vectors, "speaker_ids": speaker_idx} @@ -535,11 +499,11 @@ def inference( duration_pred = torch.round(duration_pred) # -> [B, T_src] mel_lens = duration_pred.sum(1) # -> [B,] - _, encoder_outputs_ex, alignments = self._expand_encoder_with_durations( - o_en=encoder_outputs, y_lengths=mel_lens, dr=duration_pred.squeeze(1), x_mask=~src_mask[:, 
None] + encoder_outputs_ex, alignments, _ = expand_encoder_outputs( + encoder_outputs, y_lengths=mel_lens, duration=duration_pred.squeeze(1), x_mask=~src_mask[:, None] ) - mel_mask = get_mask_from_lengths( + mel_mask = ~sequence_mask( torch.tensor([encoder_outputs_ex.shape[2]], dtype=torch.int64, device=encoder_outputs_ex.device) ) @@ -556,7 +520,7 @@ def inference( x = self.to_mel(x) outputs = { "model_outputs": x, - "alignments": alignments, + "alignments": alignments.transpose(1, 2), # "pitch": pitch_emb_pred, "durations": duration_pred, "pitch": pitch_pred, diff --git a/TTS/tts/layers/delightful_tts/conformer.py b/TTS/tts/layers/delightful_tts/conformer.py index b2175b3b96..227a871c69 100644 --- a/TTS/tts/layers/delightful_tts/conformer.py +++ b/TTS/tts/layers/delightful_tts/conformer.py @@ -1,20 +1,14 @@ ### credit: https://github.com/dunky11/voicesmith import math -from typing import Tuple import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F -from TTS.tts.layers.delightful_tts.conv_layers import Conv1dGLU, DepthWiseConv1d, PointwiseConv1d +from TTS.tts.layers.delightful_tts.conv_layers import Conv1dGLU, DepthWiseConv1d, PointwiseConv1d, calc_same_padding from TTS.tts.layers.delightful_tts.networks import GLUActivation -def calc_same_padding(kernel_size: int) -> Tuple[int, int]: - pad = kernel_size // 2 - return (pad, pad - (kernel_size + 1) % 2) - - class Conformer(nn.Module): def __init__( self, @@ -322,7 +316,7 @@ def forward( value: torch.Tensor, mask: torch.Tensor, encoding: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: batch_size, seq_length, _ = key.size() # pylint: disable=unused-variable encoding = encoding[:, : key.shape[1]] encoding = encoding.repeat(batch_size, 1, 1) @@ -378,7 +372,7 @@ def forward( value: torch.Tensor, pos_embedding: torch.Tensor, mask: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: batch_size = query.shape[0] query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head) key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3) @@ -411,40 +405,3 @@ def _relative_shift(self, pos_score: torch.Tensor) -> torch.Tensor: # pylint: d padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1) pos_score = padded_pos_score[:, :, 1:].view_as(pos_score) return pos_score - - -class MultiHeadAttention(nn.Module): - """ - input: - query --- [N, T_q, query_dim] - key --- [N, T_k, key_dim] - output: - out --- [N, T_q, num_units] - """ - - def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int): - super().__init__() - self.num_units = num_units - self.num_heads = num_heads - self.key_dim = key_dim - - self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False) - self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) - self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) - - def forward(self, query: torch.Tensor, key: torch.Tensor) -> torch.Tensor: - querys = self.W_query(query) # [N, T_q, num_units] - keys = self.W_key(key) # [N, T_k, num_units] - values = self.W_value(key) - split_size = self.num_units // self.num_heads - querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0) # [h, N, T_q, num_units/h] - keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] - values = 
torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] - # score = softmax(QK^T / (d_k ** 0.5)) - scores = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k] - scores = scores / (self.key_dim**0.5) - scores = F.softmax(scores, dim=3) - # out = score * V - out = torch.matmul(scores, values) # [h, N, T_q, num_units/h] - out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units] - return out diff --git a/TTS/tts/layers/delightful_tts/conv_layers.py b/TTS/tts/layers/delightful_tts/conv_layers.py index fb9aa4495f..1d5139571e 100644 --- a/TTS/tts/layers/delightful_tts/conv_layers.py +++ b/TTS/tts/layers/delightful_tts/conv_layers.py @@ -3,9 +3,6 @@ import torch import torch.nn as nn # pylint: disable=consider-using-from-import import torch.nn.functional as F -from torch.nn.utils import parametrize - -from TTS.tts.layers.delightful_tts.kernel_predictor import KernelPredictor def calc_same_padding(kernel_size: int) -> Tuple[int, int]: @@ -530,142 +527,3 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.addcoords(x) x = self.conv(x) return x - - -class LVCBlock(torch.nn.Module): - """the location-variable convolutions""" - - def __init__( # pylint: disable=dangerous-default-value - self, - in_channels, - cond_channels, - stride, - dilations=[1, 3, 9, 27], - lReLU_slope=0.2, - conv_kernel_size=3, - cond_hop_length=256, - kpnet_hidden_channels=64, - kpnet_conv_size=3, - kpnet_dropout=0.0, - ): - super().__init__() - - self.cond_hop_length = cond_hop_length - self.conv_layers = len(dilations) - self.conv_kernel_size = conv_kernel_size - - self.kernel_predictor = KernelPredictor( - cond_channels=cond_channels, - conv_in_channels=in_channels, - conv_out_channels=2 * in_channels, - conv_layers=len(dilations), - conv_kernel_size=conv_kernel_size, - kpnet_hidden_channels=kpnet_hidden_channels, - kpnet_conv_size=kpnet_conv_size, - kpnet_dropout=kpnet_dropout, - kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope}, - ) - - self.convt_pre = nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.parametrizations.weight_norm( - nn.ConvTranspose1d( - in_channels, - in_channels, - 2 * stride, - stride=stride, - padding=stride // 2 + stride % 2, - output_padding=stride % 2, - ) - ), - ) - - self.conv_blocks = nn.ModuleList() - for dilation in dilations: - self.conv_blocks.append( - nn.Sequential( - nn.LeakyReLU(lReLU_slope), - nn.utils.parametrizations.weight_norm( - nn.Conv1d( - in_channels, - in_channels, - conv_kernel_size, - padding=dilation * (conv_kernel_size - 1) // 2, - dilation=dilation, - ) - ), - nn.LeakyReLU(lReLU_slope), - ) - ) - - def forward(self, x, c): - """forward propagation of the location-variable convolutions. 
- Args: - x (Tensor): the input sequence (batch, in_channels, in_length) - c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) - - Returns: - Tensor: the output sequence (batch, in_channels, in_length) - """ - _, in_channels, _ = x.shape # (B, c_g, L') - - x = self.convt_pre(x) # (B, c_g, stride * L') - kernels, bias = self.kernel_predictor(c) - - for i, conv in enumerate(self.conv_blocks): - output = conv(x) # (B, c_g, stride * L') - - k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length) - b = bias[:, i, :, :] # (B, 2 * c_g, cond_length) - - output = self.location_variable_convolution( - output, k, b, hop_size=self.cond_hop_length - ) # (B, 2 * c_g, stride * L'): LVC - x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh( - output[:, in_channels:, :] - ) # (B, c_g, stride * L'): GAU - - return x - - def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): # pylint: disable=no-self-use - """perform location-variable convolution operation on the input sequence (x) using the local convolution kernl. - Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100. - Args: - x (Tensor): the input sequence (batch, in_channels, in_length). - kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length) - bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length) - dilation (int): the dilation of convolution. - hop_size (int): the hop_size of the conditioning sequence. - Returns: - (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length). - """ - batch, _, in_length = x.shape - batch, _, out_channels, kernel_size, kernel_length = kernel.shape - assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched" - - padding = dilation * int((kernel_size - 1) / 2) - x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding) - x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding) - - if hop_size < dilation: - x = F.pad(x, (0, dilation), "constant", 0) - x = x.unfold( - 3, dilation, dilation - ) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation) - x = x[:, :, :, :, :hop_size] - x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation) - x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size) - - o = torch.einsum("bildsk,biokl->bolsd", x, kernel) - o = o.to(memory_format=torch.channels_last_3d) - bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d) - o = o + bias - o = o.contiguous().view(batch, out_channels, -1) - - return o - - def remove_weight_norm(self): - self.kernel_predictor.remove_weight_norm() - parametrize.remove_parametrizations(self.convt_pre[1], "weight") - for block in self.conv_blocks: - parametrize.remove_parametrizations(block[1], "weight") diff --git a/TTS/tts/layers/delightful_tts/encoders.py b/TTS/tts/layers/delightful_tts/encoders.py index 0878f0677a..bd0c319dc1 100644 --- a/TTS/tts/layers/delightful_tts/encoders.py +++ b/TTS/tts/layers/delightful_tts/encoders.py @@ -7,14 +7,7 @@ from TTS.tts.layers.delightful_tts.conformer import ConformerMultiHeadedSelfAttention from TTS.tts.layers.delightful_tts.conv_layers import CoordConv1d from TTS.tts.layers.delightful_tts.networks import STL - - -def 
get_mask_from_lengths(lengths: torch.Tensor) -> torch.Tensor: - batch_size = lengths.shape[0] - max_len = torch.max(lengths).item() - ids = torch.arange(0, max_len, device=lengths.device).unsqueeze(0).expand(batch_size, -1) - mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) - return mask +from TTS.tts.utils.helpers import sequence_mask def stride_lens(lens: torch.Tensor, stride: int = 2) -> torch.Tensor: @@ -93,7 +86,7 @@ def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> Tuple[torch.Tensor outputs --- [N, E//2] """ - mel_masks = get_mask_from_lengths(mel_lens).unsqueeze(1) + mel_masks = ~sequence_mask(mel_lens).unsqueeze(1) x = x.masked_fill(mel_masks, 0) for conv, norm in zip(self.convs, self.norms): x = conv(x) @@ -103,7 +96,7 @@ def forward(self, x: torch.Tensor, mel_lens: torch.Tensor) -> Tuple[torch.Tensor for _ in range(2): mel_lens = stride_lens(mel_lens) - mel_masks = get_mask_from_lengths(mel_lens) + mel_masks = ~sequence_mask(mel_lens) x = x.masked_fill(mel_masks.unsqueeze(1), 0) x = x.permute((0, 2, 1)) diff --git a/TTS/tts/layers/delightful_tts/kernel_predictor.py b/TTS/tts/layers/delightful_tts/kernel_predictor.py deleted file mode 100644 index 96c550b6c2..0000000000 --- a/TTS/tts/layers/delightful_tts/kernel_predictor.py +++ /dev/null @@ -1,128 +0,0 @@ -import torch.nn as nn # pylint: disable=consider-using-from-import -from torch.nn.utils import parametrize - - -class KernelPredictor(nn.Module): - """Kernel predictor for the location-variable convolutions - - Args: - cond_channels (int): number of channel for the conditioning sequence, - conv_in_channels (int): number of channel for the input sequence, - conv_out_channels (int): number of channel for the output sequence, - conv_layers (int): number of layers - - """ - - def __init__( # pylint: disable=dangerous-default-value - self, - cond_channels, - conv_in_channels, - conv_out_channels, - conv_layers, - conv_kernel_size=3, - kpnet_hidden_channels=64, - kpnet_conv_size=3, - kpnet_dropout=0.0, - kpnet_nonlinear_activation="LeakyReLU", - kpnet_nonlinear_activation_params={"negative_slope": 0.1}, - ): - super().__init__() - - self.conv_in_channels = conv_in_channels - self.conv_out_channels = conv_out_channels - self.conv_kernel_size = conv_kernel_size - self.conv_layers = conv_layers - - kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w - kpnet_bias_channels = conv_out_channels * conv_layers # l_b - - self.input_conv = nn.Sequential( - nn.utils.parametrizations.weight_norm( - nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - ) - - self.residual_convs = nn.ModuleList() - padding = (kpnet_conv_size - 1) // 2 - for _ in range(3): - self.residual_convs.append( - nn.Sequential( - nn.Dropout(kpnet_dropout), - nn.utils.parametrizations.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_hidden_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - nn.utils.parametrizations.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_hidden_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ), - getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params), - ) - ) - self.kernel_conv = nn.utils.parametrizations.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_kernel_channels, - kpnet_conv_size, - padding=padding, - 
bias=True, - ) - ) - self.bias_conv = nn.utils.parametrizations.weight_norm( - nn.Conv1d( - kpnet_hidden_channels, - kpnet_bias_channels, - kpnet_conv_size, - padding=padding, - bias=True, - ) - ) - - def forward(self, c): - """ - Args: - c (Tensor): the conditioning sequence (batch, cond_channels, cond_length) - """ - batch, _, cond_length = c.shape - c = self.input_conv(c) - for residual_conv in self.residual_convs: - residual_conv.to(c.device) - c = c + residual_conv(c) - k = self.kernel_conv(c) - b = self.bias_conv(c) - kernels = k.contiguous().view( - batch, - self.conv_layers, - self.conv_in_channels, - self.conv_out_channels, - self.conv_kernel_size, - cond_length, - ) - bias = b.contiguous().view( - batch, - self.conv_layers, - self.conv_out_channels, - cond_length, - ) - - return kernels, bias - - def remove_weight_norm(self): - parametrize.remove_parametrizations(self.input_conv[0], "weight") - parametrize.remove_parametrizations(self.kernel_conv, "weight") - parametrize.remove_parametrizations(self.bias_conv, "weight") - for block in self.residual_convs: - parametrize.remove_parametrizations(block[1], "weight") - parametrize.remove_parametrizations(block[3], "weight") diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 5ebed81dda..db62430c9d 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -309,6 +309,24 @@ def forward(self, attn_logprob, in_lens, out_lens): return total_loss +class NLLLoss(nn.Module): + """Negative log likelihood loss.""" + + def forward(self, log_prob: torch.Tensor) -> dict: # pylint: disable=no-self-use + """Compute the loss. + + Args: + logits (Tensor): [B, T, D] + + Returns: + Tensor: [1] + + """ + return_dict = {} + return_dict["loss"] = -log_prob.mean() + return return_dict + + ######################## # MODEL LOSS LAYERS ######################## @@ -619,6 +637,28 @@ def forward( return {"loss": loss, "loss_l1": spec_loss, "loss_ssim": ssim_loss, "loss_dur": dur_loss, "mdn_loss": mdn_loss} +def feature_loss(feats_real, feats_generated): + loss = 0 + for dr, dg in zip(feats_real, feats_generated): + for rl, gl in zip(dr, dg): + rl = rl.float().detach() + gl = gl.float() + loss += torch.mean(torch.abs(rl - gl)) + return loss * 2 + + +def generator_loss(scores_fake): + loss = 0 + gen_losses = [] + for dg in scores_fake: + dg = dg.float() + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + + class VitsGeneratorLoss(nn.Module): def __init__(self, c: Coqpit): super().__init__() @@ -640,28 +680,6 @@ def __init__(self, c: Coqpit): do_amp_to_db=True, ) - @staticmethod - def feature_loss(feats_real, feats_generated): - loss = 0 - for dr, dg in zip(feats_real, feats_generated): - for rl, gl in zip(dr, dg): - rl = rl.float().detach() - gl = gl.float() - loss += torch.mean(torch.abs(rl - gl)) - return loss * 2 - - @staticmethod - def generator_loss(scores_fake): - loss = 0 - gen_losses = [] - for dg in scores_fake: - dg = dg.float() - l = torch.mean((1 - dg) ** 2) - gen_losses.append(l) - loss += l - - return loss, gen_losses - @staticmethod def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): """ @@ -722,10 +740,8 @@ def forward( self.kl_loss(z_p=z_p, logs_q=logs_q, m_p=m_p, logs_p=logs_p, z_mask=z_mask.unsqueeze(1)) * self.kl_loss_alpha ) - loss_feat = ( - self.feature_loss(feats_real=feats_disc_real, feats_generated=feats_disc_fake) * self.feat_loss_alpha - ) - loss_gen = self.generator_loss(scores_fake=scores_disc_fake)[0] * self.gen_loss_alpha + loss_feat = 
feature_loss(feats_real=feats_disc_real, feats_generated=feats_disc_fake) * self.feat_loss_alpha + loss_gen = generator_loss(scores_fake=scores_disc_fake)[0] * self.gen_loss_alpha loss_mel = torch.nn.functional.l1_loss(mel_slice, mel_slice_hat) * self.mel_loss_alpha loss_duration = torch.sum(loss_duration.float()) * self.dur_loss_alpha loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration @@ -779,6 +795,15 @@ def forward(self, scores_disc_real, scores_disc_fake): return return_dict +def _binary_alignment_loss(alignment_hard, alignment_soft): + """Binary loss that forces soft alignments to match the hard alignments. + + Explained in `https://arxiv.org/pdf/2108.10447.pdf`. + """ + log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum() + return -log_sum / alignment_hard.sum() + + class ForwardTTSLoss(nn.Module): """Generic configurable ForwardTTS loss.""" @@ -820,14 +845,6 @@ def __init__(self, c): self.dur_loss_alpha = c.dur_loss_alpha self.binary_alignment_loss_alpha = c.binary_align_loss_alpha - @staticmethod - def _binary_alignment_loss(alignment_hard, alignment_soft): - """Binary loss that forces soft alignments to match the hard alignments as - explained in `https://arxiv.org/pdf/2108.10447.pdf`. - """ - log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum() - return -log_sum / alignment_hard.sum() - def forward( self, decoder_output, @@ -879,7 +896,7 @@ def forward( return_dict["loss_aligner"] = self.aligner_loss_alpha * aligner_loss if self.binary_alignment_loss_alpha > 0 and alignment_hard is not None: - binary_alignment_loss = self._binary_alignment_loss(alignment_hard, alignment_soft) + binary_alignment_loss = _binary_alignment_loss(alignment_hard, alignment_soft) loss = loss + self.binary_alignment_loss_alpha * binary_alignment_loss if binary_loss_weight: return_dict["loss_binary_alignment"] = ( diff --git a/TTS/tts/layers/tacotron/capacitron_layers.py b/TTS/tts/layers/tacotron/capacitron_layers.py index 2181ffa7ec..817f42771b 100644 --- a/TTS/tts/layers/tacotron/capacitron_layers.py +++ b/TTS/tts/layers/tacotron/capacitron_layers.py @@ -3,6 +3,8 @@ from torch.distributions.multivariate_normal import MultivariateNormal as MVN from torch.nn import functional as F +from TTS.tts.layers.tacotron.common_layers import calculate_post_conv_height + class CapacitronVAE(nn.Module): """Effective Use of Variational Embedding Capacity for prosody transfer. 
@@ -97,7 +99,7 @@ def __init__(self, num_mel, out_dim): self.training = False self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]]) - post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 2, num_layers) + post_conv_height = calculate_post_conv_height(num_mel, 3, 2, 2, num_layers) self.recurrence = nn.LSTM( input_size=filters[-1] * post_conv_height, hidden_size=out_dim, batch_first=True, bidirectional=False ) @@ -155,13 +157,6 @@ def forward(self, inputs, input_lengths): return last_output.to(inputs.device) # [B, 128] - @staticmethod - def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs): - """Height of spec after n convolutions with fixed kernel/stride/pad.""" - for _ in range(n_convs): - height = (height - kernel_size + 2 * pad) // stride + 1 - return height - class TextSummary(nn.Module): def __init__(self, embedding_dim, encoder_output_dim): diff --git a/TTS/tts/layers/tacotron/common_layers.py b/TTS/tts/layers/tacotron/common_layers.py index f78ff1e75f..16e517fdca 100644 --- a/TTS/tts/layers/tacotron/common_layers.py +++ b/TTS/tts/layers/tacotron/common_layers.py @@ -3,6 +3,13 @@ from torch.nn import functional as F +def calculate_post_conv_height(height: int, kernel_size: int, stride: int, pad: int, n_convs: int) -> int: + """Height of spec after n convolutions with fixed kernel/stride/pad.""" + for _ in range(n_convs): + height = (height - kernel_size + 2 * pad) // stride + 1 + return height + + class Linear(nn.Module): """Linear layer with a specific initialization. diff --git a/TTS/tts/layers/tacotron/gst_layers.py b/TTS/tts/layers/tacotron/gst_layers.py index 05dba7084f..4a83fb1c83 100644 --- a/TTS/tts/layers/tacotron/gst_layers.py +++ b/TTS/tts/layers/tacotron/gst_layers.py @@ -2,6 +2,8 @@ import torch.nn.functional as F from torch import nn +from TTS.tts.layers.tacotron.common_layers import calculate_post_conv_height + class GST(nn.Module): """Global Style Token Module for factorizing prosody in speech. 
@@ -44,7 +46,7 @@ def __init__(self, num_mel, embedding_dim): self.convs = nn.ModuleList(convs) self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]]) - post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 1, num_layers) + post_conv_height = calculate_post_conv_height(num_mel, 3, 2, 1, num_layers) self.recurrence = nn.GRU( input_size=filters[-1] * post_conv_height, hidden_size=embedding_dim // 2, batch_first=True ) @@ -71,13 +73,6 @@ def forward(self, inputs): return out.squeeze(0) - @staticmethod - def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs): - """Height of spec after n convolutions with fixed kernel/stride/pad.""" - for _ in range(n_convs): - height = (height - kernel_size + 2 * pad) // stride + 1 - return height - class StyleTokenLayer(nn.Module): """NN Module attending to style tokens based on prosody encodings.""" @@ -117,7 +112,7 @@ class MultiHeadAttention(nn.Module): out --- [N, T_q, num_units] """ - def __init__(self, query_dim, key_dim, num_units, num_heads): + def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int): super().__init__() self.num_units = num_units self.num_heads = num_heads @@ -127,7 +122,7 @@ def __init__(self, query_dim, key_dim, num_units, num_heads): self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False) - def forward(self, query, key): + def forward(self, query: torch.Tensor, key: torch.Tensor) -> torch.Tensor: queries = self.W_query(query) # [N, T_q, num_units] keys = self.W_key(key) # [N, T_k, num_units] values = self.W_value(key) @@ -137,13 +132,11 @@ def forward(self, query, key): keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h] - # score = softmax(QK^T / (d_k**0.5)) + # score = softmax(QK^T / (d_k ** 0.5)) scores = torch.matmul(queries, keys.transpose(2, 3)) # [h, N, T_q, T_k] scores = scores / (self.key_dim**0.5) scores = F.softmax(scores, dim=3) # out = score * V out = torch.matmul(scores, values) # [h, N, T_q, num_units/h] - out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units] - - return out + return torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units] diff --git a/TTS/tts/layers/tortoise/arch_utils.py b/TTS/tts/layers/tortoise/arch_utils.py index 52c2526695..4c3733e691 100644 --- a/TTS/tts/layers/tortoise/arch_utils.py +++ b/TTS/tts/layers/tortoise/arch_utils.py @@ -93,12 +93,10 @@ def __init__( channels, num_heads=1, num_head_channels=-1, - do_checkpoint=True, relative_pos_embeddings=False, ): super().__init__() self.channels = channels - self.do_checkpoint = do_checkpoint if num_head_channels == -1: self.num_heads = num_heads else: @@ -185,115 +183,7 @@ def forward(self, x): return self.op(x) -class ResBlock(nn.Module): - def __init__( - self, - channels, - dropout, - out_channels=None, - use_conv=False, - use_scale_shift_norm=False, - up=False, - down=False, - kernel_size=3, - ): - super().__init__() - self.channels = channels - self.dropout = dropout - self.out_channels = out_channels or channels - self.use_conv = use_conv - self.use_scale_shift_norm = use_scale_shift_norm - padding = 1 if kernel_size == 3 else 2 - - self.in_layers = nn.Sequential( - normalization(channels), - nn.SiLU(), - nn.Conv1d(channels, self.out_channels, kernel_size, 
padding=padding), - ) - - self.updown = up or down - - if up: - self.h_upd = Upsample(channels, False) - self.x_upd = Upsample(channels, False) - elif down: - self.h_upd = Downsample(channels, False) - self.x_upd = Downsample(channels, False) - else: - self.h_upd = self.x_upd = nn.Identity() - - self.out_layers = nn.Sequential( - normalization(self.out_channels), - nn.SiLU(), - nn.Dropout(p=dropout), - zero_module(nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding)), - ) - - if self.out_channels == channels: - self.skip_connection = nn.Identity() - elif use_conv: - self.skip_connection = nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding) - else: - self.skip_connection = nn.Conv1d(channels, self.out_channels, 1) - - def forward(self, x): - if self.updown: - in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] - h = in_rest(x) - h = self.h_upd(h) - x = self.x_upd(x) - h = in_conv(h) - else: - h = self.in_layers(x) - h = self.out_layers(h) - return self.skip_connection(x) + h - - -class AudioMiniEncoder(nn.Module): - def __init__( - self, - spec_dim, - embedding_dim, - base_channels=128, - depth=2, - resnet_blocks=2, - attn_blocks=4, - num_attn_heads=4, - dropout=0, - downsample_factor=2, - kernel_size=3, - ): - super().__init__() - self.init = nn.Sequential(nn.Conv1d(spec_dim, base_channels, 3, padding=1)) - ch = base_channels - res = [] - for l in range(depth): - for r in range(resnet_blocks): - res.append(ResBlock(ch, dropout, kernel_size=kernel_size)) - res.append(Downsample(ch, use_conv=True, out_channels=ch * 2, factor=downsample_factor)) - ch *= 2 - self.res = nn.Sequential(*res) - self.final = nn.Sequential(normalization(ch), nn.SiLU(), nn.Conv1d(ch, embedding_dim, 1)) - attn = [] - for a in range(attn_blocks): - attn.append( - AttentionBlock( - embedding_dim, - num_attn_heads, - ) - ) - self.attn = nn.Sequential(*attn) - self.dim = embedding_dim - - def forward(self, x): - h = self.init(x) - h = self.res(h) - h = self.final(h) - h = self.attn(h) - return h[:, :, 0] - - -DEFAULT_MEL_NORM_FILE = "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth" +DEFAULT_MEL_NORM_FILE = "https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/mel_norms.pth" class TorchMelSpectrogram(nn.Module): diff --git a/TTS/tts/layers/tortoise/audio_utils.py b/TTS/tts/layers/tortoise/audio_utils.py index 4f299a8fd9..c67ee6c44b 100644 --- a/TTS/tts/layers/tortoise/audio_utils.py +++ b/TTS/tts/layers/tortoise/audio_utils.py @@ -9,7 +9,7 @@ import torchaudio from scipy.io.wavfile import read -from TTS.utils.audio.torch_transforms import TorchSTFT +from TTS.utils.audio.torch_transforms import TorchSTFT, amp_to_db from TTS.utils.generic_utils import is_pytorch_at_least_2_4 logger = logging.getLogger(__name__) @@ -88,24 +88,6 @@ def normalize_tacotron_mel(mel): return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1 -def dynamic_range_compression(x, C=1, clip_val=1e-5): - """ - PARAMS - ------ - C: compression factor - """ - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def dynamic_range_decompression(x, C=1): - """ - PARAMS - ------ - C: compression factor used to compress - """ - return torch.exp(x) / C - - def get_voices(extra_voice_dirs: List[str] = []): dirs = extra_voice_dirs voices: Dict[str, List[str]] = {} @@ -175,7 +157,7 @@ def wav_to_univnet_mel(wav, do_normalization=False, device="cuda"): ) stft = stft.to(device) mel = stft(wav) - mel = dynamic_range_compression(mel) + mel = amp_to_db(mel) if 
do_normalization: mel = normalize_tacotron_mel(mel) return mel diff --git a/TTS/tts/layers/tortoise/autoregressive.py b/TTS/tts/layers/tortoise/autoregressive.py index aaae695516..07cf3d542b 100644 --- a/TTS/tts/layers/tortoise/autoregressive.py +++ b/TTS/tts/layers/tortoise/autoregressive.py @@ -1,5 +1,6 @@ # AGPL: a notification must be added stating that changes have been made to that file. import functools +import random from typing import Optional import torch @@ -123,7 +124,7 @@ def forward( else: emb = self.embeddings(input_ids) emb = emb + self.text_pos_embedding.get_fixed_embedding( - attention_mask.shape[1] - mel_len, attention_mask.device + attention_mask.shape[1] - (mel_len + 1), attention_mask.device ) transformer_outputs = self.transformer( @@ -175,8 +176,6 @@ def __init__( embedding_dim, attn_blocks=6, num_attn_heads=4, - do_checkpointing=False, - mean=False, ): super().__init__() attn = [] @@ -185,34 +184,46 @@ def __init__( attn.append(AttentionBlock(embedding_dim, num_attn_heads)) self.attn = nn.Sequential(*attn) self.dim = embedding_dim - self.do_checkpointing = do_checkpointing - self.mean = mean def forward(self, x): + """ + x: (b, 80, s) + """ h = self.init(x) h = self.attn(h) - if self.mean: - return h.mean(dim=2) - else: - return h[:, :, 0] + return h class LearnedPositionEmbeddings(nn.Module): - def __init__(self, seq_len, model_dim, init=0.02): + def __init__(self, seq_len, model_dim, init=0.02, relative=False): super().__init__() self.emb = nn.Embedding(seq_len, model_dim) # Initializing this way is standard for GPT-2 self.emb.weight.data.normal_(mean=0.0, std=init) + self.relative = relative + self.seq_len = seq_len def forward(self, x): sl = x.shape[1] - return self.emb(torch.arange(0, sl, device=x.device)) + if self.relative: + start = random.randint(sl, self.seq_len) - sl + return self.emb(torch.arange(start, start + sl, device=x.device)) + else: + return self.emb(torch.arange(0, sl, device=x.device)) def get_fixed_embedding(self, ind, dev): - return self.emb(torch.arange(0, ind, device=dev))[ind - 1 : ind] - - -def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text_seq_len, checkpointing): + return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0) + + +def build_hf_gpt_transformer( + layers: int, + model_dim: int, + heads: int, + max_mel_seq_len: int, + max_text_seq_len: int, + checkpointing: bool, + max_prompt_len: int = 0, +): """ GPT-2 implemented by the HuggingFace library. """ @@ -220,8 +231,8 @@ def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text gpt_config = GPT2Config( vocab_size=256, # Unused. - n_positions=max_mel_seq_len + max_text_seq_len, - n_ctx=max_mel_seq_len + max_text_seq_len, + n_positions=max_mel_seq_len + max_text_seq_len + max_prompt_len, + n_ctx=max_mel_seq_len + max_text_seq_len + max_prompt_len, n_embd=model_dim, n_layer=layers, n_head=heads, @@ -234,13 +245,18 @@ def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text gpt.wpe = functools.partial(null_position_embeddings, dim=model_dim) # Built-in token embeddings are unused. 
del gpt.wte - return ( - gpt, - LearnedPositionEmbeddings(max_mel_seq_len, model_dim), - LearnedPositionEmbeddings(max_text_seq_len, model_dim), - None, - None, + + mel_pos_emb = ( + LearnedPositionEmbeddings(max_mel_seq_len, model_dim) + if max_mel_seq_len != -1 + else functools.partial(null_position_embeddings, dim=model_dim) + ) + text_pos_emb = ( + LearnedPositionEmbeddings(max_text_seq_len, model_dim) + if max_mel_seq_len != -1 + else functools.partial(null_position_embeddings, dim=model_dim) ) + return gpt, mel_pos_emb, text_pos_emb, None, None class MelEncoder(nn.Module): @@ -334,12 +350,12 @@ def __init__( self.mel_layer_pos_embedding, self.text_layer_pos_embedding, ) = build_hf_gpt_transformer( - layers, - model_dim, - heads, - self.max_mel_tokens + 2 + self.max_conditioning_inputs, - self.max_text_tokens + 2, - checkpointing, + layers=layers, + model_dim=model_dim, + heads=heads, + max_mel_seq_len=self.max_mel_tokens + 2 + self.max_conditioning_inputs, + max_text_seq_len=self.max_text_tokens + 2, + checkpointing=checkpointing, ) if train_solo_embeddings: self.mel_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * 0.02, requires_grad=True) @@ -455,7 +471,7 @@ def get_conditioning(self, speech_conditioning_input): ) conds = [] for j in range(speech_conditioning_input.shape[1]): - conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) + conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])[:, :, 0]) conds = torch.stack(conds, dim=1) conds = conds.mean(dim=1) return conds diff --git a/TTS/tts/layers/tortoise/classifier.py b/TTS/tts/layers/tortoise/classifier.py index 8764bb070b..c72834e9a8 100644 --- a/TTS/tts/layers/tortoise/classifier.py +++ b/TTS/tts/layers/tortoise/classifier.py @@ -16,7 +16,6 @@ def __init__( up=False, down=False, kernel_size=3, - do_checkpoint=True, ): super().__init__() self.channels = channels @@ -24,7 +23,6 @@ def __init__( self.out_channels = out_channels or channels self.use_conv = use_conv self.use_scale_shift_norm = use_scale_shift_norm - self.do_checkpoint = do_checkpoint padding = 1 if kernel_size == 3 else 2 self.in_layers = nn.Sequential( @@ -92,14 +90,14 @@ def __init__( self.layers = depth for l in range(depth): for r in range(resnet_blocks): - res.append(ResBlock(ch, dropout, do_checkpoint=False, kernel_size=kernel_size)) + res.append(ResBlock(ch, dropout, kernel_size=kernel_size)) res.append(Downsample(ch, use_conv=True, out_channels=ch * 2, factor=downsample_factor)) ch *= 2 self.res = nn.Sequential(*res) self.final = nn.Sequential(normalization(ch), nn.SiLU(), nn.Conv1d(ch, embedding_dim, 1)) attn = [] for a in range(attn_blocks): - attn.append(AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False)) + attn.append(AttentionBlock(embedding_dim, num_attn_heads)) self.attn = nn.Sequential(*attn) self.dim = embedding_dim diff --git a/TTS/tts/layers/tortoise/clvp.py b/TTS/tts/layers/tortoise/clvp.py index 241dfdd4f4..44da1324e7 100644 --- a/TTS/tts/layers/tortoise/clvp.py +++ b/TTS/tts/layers/tortoise/clvp.py @@ -8,10 +8,6 @@ from TTS.tts.layers.tortoise.xtransformers import Encoder -def exists(val): - return val is not None - - def masked_mean(t, mask, dim=1): t = t.masked_fill(~mask[:, :, None], 0.0) return t.sum(dim=1) / mask.sum(dim=1)[..., None] diff --git a/TTS/tts/layers/tortoise/diffusion_decoder.py b/TTS/tts/layers/tortoise/diffusion_decoder.py index 0d3cf7698a..15bbfb7121 100644 --- a/TTS/tts/layers/tortoise/diffusion_decoder.py +++ b/TTS/tts/layers/tortoise/diffusion_decoder.py @@ 
-5,7 +5,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch import autocast from TTS.tts.layers.tortoise.arch_utils import AttentionBlock, normalization @@ -197,31 +196,26 @@ def __init__( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, ), AttentionBlock( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, ), AttentionBlock( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, ), AttentionBlock( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, ), AttentionBlock( model_channels * 2, num_heads, relative_pos_embeddings=True, - do_checkpoint=False, ), ) self.unconditioned_embedding = nn.Parameter(torch.randn(1, model_channels, 1)) @@ -385,7 +379,7 @@ def forward( unused_params.extend(list(lyr.parameters())) else: # First and last blocks will have autocast disabled for improved precision. - with autocast(x.device.type, enabled=self.enable_fp16 and i != 0): + with torch.autocast(x.device.type, enabled=self.enable_fp16 and i != 0): x = lyr(x, time_emb) x = x.float() diff --git a/TTS/tts/layers/tortoise/transformer.py b/TTS/tts/layers/tortoise/transformer.py index 6cb1bab96a..ed4d79d4ab 100644 --- a/TTS/tts/layers/tortoise/transformer.py +++ b/TTS/tts/layers/tortoise/transformer.py @@ -1,22 +1,19 @@ +from typing import TypeVar, Union + import torch import torch.nn.functional as F from einops import rearrange from torch import nn -# helpers - +from TTS.utils.generic_utils import exists -def exists(val): - return val is not None - - -def default(val, d): - return val if exists(val) else d +# helpers +_T = TypeVar("_T") -def cast_tuple(val, depth=1): +def cast_tuple(val: Union[tuple[_T], list[_T], _T], depth: int = 1) -> tuple[_T]: if isinstance(val, list): - val = tuple(val) + return tuple(val) return val if isinstance(val, tuple) else (val,) * depth diff --git a/TTS/tts/layers/tortoise/xtransformers.py b/TTS/tts/layers/tortoise/xtransformers.py index 9325b8c720..0892fee19d 100644 --- a/TTS/tts/layers/tortoise/xtransformers.py +++ b/TTS/tts/layers/tortoise/xtransformers.py @@ -1,13 +1,15 @@ import math from collections import namedtuple from functools import partial -from inspect import isfunction import torch import torch.nn.functional as F from einops import rearrange, repeat from torch import einsum, nn +from TTS.tts.layers.tortoise.transformer import cast_tuple, max_neg_value +from TTS.utils.generic_utils import default, exists + DEFAULT_DIM_HEAD = 64 Intermediates = namedtuple("Intermediates", ["pre_softmax_attn", "post_softmax_attn"]) @@ -25,20 +27,6 @@ # helpers -def exists(val): - return val is not None - - -def default(val, d): - if exists(val): - return val - return d() if isfunction(d) else d - - -def cast_tuple(val, depth): - return val if isinstance(val, tuple) else (val,) * depth - - class always: def __init__(self, val): self.val = val @@ -63,10 +51,6 @@ def __call__(self, x, *args, **kwargs): return x == self.val -def max_neg_value(tensor): - return -torch.finfo(tensor.dtype).max - - def l2norm(t): return F.normalize(t, p=2, dim=-1) diff --git a/TTS/tts/layers/vits/discriminator.py b/TTS/tts/layers/vits/discriminator.py index 3449739fdc..49f7a0d074 100644 --- a/TTS/tts/layers/vits/discriminator.py +++ b/TTS/tts/layers/vits/discriminator.py @@ -2,7 +2,7 @@ from torch import nn from torch.nn.modules.conv import Conv1d -from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP +from 
TTS.vocoder.models.hifigan_discriminator import LRELU_SLOPE, DiscriminatorP class DiscriminatorS(torch.nn.Module): @@ -39,7 +39,7 @@ def forward(self, x): feat = [] for l in self.convs: x = l(x) - x = torch.nn.functional.leaky_relu(x, 0.1) + x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE) feat.append(x) x = self.conv_post(x) feat.append(x) diff --git a/TTS/tts/layers/vits/networks.py b/TTS/tts/layers/vits/networks.py index 50ed1024de..ab2ca5667a 100644 --- a/TTS/tts/layers/vits/networks.py +++ b/TTS/tts/layers/vits/networks.py @@ -256,7 +256,7 @@ def __init__( ) self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - def forward(self, x, x_lengths, g=None): + def forward(self, x, x_lengths, g=None, tau=1.0): """ Shapes: - x: :math:`[B, C, T]` @@ -268,5 +268,5 @@ def forward(self, x, x_lengths, g=None): x = self.enc(x, x_mask, g=g) stats = self.proj(x) * x_mask mean, log_scale = torch.split(stats, self.out_channels, dim=1) - z = (mean + torch.randn_like(mean) * torch.exp(log_scale)) * x_mask + z = (mean + torch.randn_like(mean) * tau * torch.exp(log_scale)) * x_mask return z, mean, log_scale, x_mask diff --git a/TTS/tts/layers/xtts/dvae.py b/TTS/tts/layers/xtts/dvae.py index 73970fb0bf..4f806f82cb 100644 --- a/TTS/tts/layers/xtts/dvae.py +++ b/TTS/tts/layers/xtts/dvae.py @@ -14,10 +14,6 @@ logger = logging.getLogger(__name__) -def default(val, d): - return val if val is not None else d - - def eval_decorator(fn): def inner(model, *args, **kwargs): was_training = model.training diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py index b3c3b31b47..20eff26ecc 100644 --- a/TTS/tts/layers/xtts/gpt.py +++ b/TTS/tts/layers/xtts/gpt.py @@ -1,6 +1,5 @@ # ported from: https://github.com/neonbjb/tortoise-tts -import functools import random import torch @@ -8,83 +7,16 @@ import torch.nn.functional as F from transformers import GPT2Config -from TTS.tts.layers.tortoise.autoregressive import _prepare_attention_mask_for_generation +from TTS.tts.layers.tortoise.autoregressive import ( + ConditioningEncoder, + LearnedPositionEmbeddings, + _prepare_attention_mask_for_generation, + build_hf_gpt_transformer, +) from TTS.tts.layers.xtts.gpt_inference import GPT2InferenceModel -from TTS.tts.layers.xtts.latent_encoder import ConditioningEncoder from TTS.tts.layers.xtts.perceiver_encoder import PerceiverResampler -def null_position_embeddings(range, dim): - return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device) - - -class LearnedPositionEmbeddings(nn.Module): - def __init__(self, seq_len, model_dim, init=0.02, relative=False): - super().__init__() - # nn.Embedding - self.emb = torch.nn.Embedding(seq_len, model_dim) - # Initializing this way is standard for GPT-2 - self.emb.weight.data.normal_(mean=0.0, std=init) - self.relative = relative - self.seq_len = seq_len - - def forward(self, x): - sl = x.shape[1] - if self.relative: - start = random.randint(sl, self.seq_len) - sl - return self.emb(torch.arange(start, start + sl, device=x.device)) - else: - return self.emb(torch.arange(0, sl, device=x.device)) - - def get_fixed_embedding(self, ind, dev): - return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0) - - -def build_hf_gpt_transformer( - layers, - model_dim, - heads, - max_mel_seq_len, - max_text_seq_len, - max_prompt_len, - checkpointing, -): - """ - GPT-2 implemented by the HuggingFace library. - """ - from transformers import GPT2Config, GPT2Model - - gpt_config = GPT2Config( - vocab_size=256, # Unused. 
- n_positions=max_mel_seq_len + max_text_seq_len + max_prompt_len, - n_ctx=max_mel_seq_len + max_text_seq_len + max_prompt_len, - n_embd=model_dim, - n_layer=layers, - n_head=heads, - gradient_checkpointing=checkpointing, - use_cache=not checkpointing, - ) - gpt = GPT2Model(gpt_config) - # Override the built in positional embeddings - del gpt.wpe - gpt.wpe = functools.partial(null_position_embeddings, dim=model_dim) - # Built-in token embeddings are unused. - del gpt.wte - - mel_pos_emb = ( - LearnedPositionEmbeddings(max_mel_seq_len, model_dim) - if max_mel_seq_len != -1 - else functools.partial(null_position_embeddings, dim=model_dim) - ) - text_pos_emb = ( - LearnedPositionEmbeddings(max_text_seq_len, model_dim) - if max_mel_seq_len != -1 - else functools.partial(null_position_embeddings, dim=model_dim) - ) - # gpt = torch.compile(gpt, mode="reduce-overhead", fullgraph=True) - return gpt, mel_pos_emb, text_pos_emb, None, None - - class GPT(nn.Module): def __init__( self, @@ -149,13 +81,13 @@ def __init__( self.mel_layer_pos_embedding, self.text_layer_pos_embedding, ) = build_hf_gpt_transformer( - layers, - model_dim, - heads, - self.max_mel_tokens, - self.max_text_tokens, - self.max_prompt_tokens, - checkpointing, + layers=layers, + model_dim=model_dim, + heads=heads, + max_mel_seq_len=self.max_mel_tokens, + max_text_seq_len=self.max_text_tokens, + max_prompt_len=self.max_prompt_tokens, + checkpointing=checkpointing, ) if train_solo_embeddings: self.mel_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * 0.02, requires_grad=True) @@ -303,19 +235,6 @@ def get_logits( else: return first_logits - def get_conditioning(self, speech_conditioning_input): - speech_conditioning_input = ( - speech_conditioning_input.unsqueeze(1) - if len(speech_conditioning_input.shape) == 3 - else speech_conditioning_input - ) - conds = [] - for j in range(speech_conditioning_input.shape[1]): - conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])) - conds = torch.stack(conds, dim=1) - conds = conds.mean(dim=1) - return conds - def get_prompts(self, prompt_codes): """ Create a prompt from the mel codes. This is used to condition the model on the mel codes. @@ -354,6 +273,7 @@ def get_style_emb(self, cond_input, return_latent=False): """ cond_input: (b, 80, s) or (b, 1, 80, s) conds: (b, 1024, s) + output: (b, 1024, 32) """ conds = None if not return_latent: diff --git a/TTS/tts/layers/xtts/hifigan_decoder.py b/TTS/tts/layers/xtts/hifigan_decoder.py index 5ef0030b8b..2e6ac01a87 100644 --- a/TTS/tts/layers/xtts/hifigan_decoder.py +++ b/TTS/tts/layers/xtts/hifigan_decoder.py @@ -1,618 +1,13 @@ import logging import torch -import torchaudio -from torch import nn -from torch.nn import Conv1d, ConvTranspose1d -from torch.nn import functional as F -from torch.nn.utils.parametrizations import weight_norm -from torch.nn.utils.parametrize import remove_parametrizations from trainer.io import load_fsspec -from TTS.utils.generic_utils import is_pytorch_at_least_2_4 -from TTS.vocoder.models.hifigan_generator import get_padding +from TTS.encoder.models.resnet import ResNetSpeakerEncoder +from TTS.vocoder.models.hifigan_generator import HifiganGenerator logger = logging.getLogger(__name__) -LRELU_SLOPE = 0.1 - - -class ResBlock1(torch.nn.Module): - """Residual Block Type 1. It has 3 convolutional layers in each convolutional block. 
- - Network:: - - x -> lrelu -> conv1_1 -> conv1_2 -> conv1_3 -> z -> lrelu -> conv2_1 -> conv2_2 -> conv2_3 -> o -> + -> o - |--------------------------------------------------------------------------------------------------| - - - Args: - channels (int): number of hidden channels for the convolutional layers. - kernel_size (int): size of the convolution filter in each layer. - dilations (list): list of dilation value for each conv layer in a block. - """ - - def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super().__init__() - self.convs1 = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[2], - padding=get_padding(kernel_size, dilation[2]), - ) - ), - ] - ) - - self.convs2 = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=1, - padding=get_padding(kernel_size, 1), - ) - ), - ] - ) - - def forward(self, x): - """ - Args: - x (Tensor): input tensor. - Returns: - Tensor: output tensor. - Shapes: - x: [B, C, T] - """ - for c1, c2 in zip(self.convs1, self.convs2): - xt = F.leaky_relu(x, LRELU_SLOPE) - xt = c1(xt) - xt = F.leaky_relu(xt, LRELU_SLOPE) - xt = c2(xt) - x = xt + x - return x - - def remove_weight_norm(self): - for l in self.convs1: - remove_parametrizations(l, "weight") - for l in self.convs2: - remove_parametrizations(l, "weight") - - -class ResBlock2(torch.nn.Module): - """Residual Block Type 2. It has 1 convolutional layers in each convolutional block. - - Network:: - - x -> lrelu -> conv1-> -> z -> lrelu -> conv2-> o -> + -> o - |---------------------------------------------------| - - - Args: - channels (int): number of hidden channels for the convolutional layers. - kernel_size (int): size of the convolution filter in each layer. - dilations (list): list of dilation value for each conv layer in a block. 
- """ - - def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super().__init__() - self.convs = nn.ModuleList( - [ - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]), - ) - ), - weight_norm( - Conv1d( - channels, - channels, - kernel_size, - 1, - dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]), - ) - ), - ] - ) - - def forward(self, x): - for c in self.convs: - xt = F.leaky_relu(x, LRELU_SLOPE) - xt = c(xt) - x = xt + x - return x - - def remove_weight_norm(self): - for l in self.convs: - remove_parametrizations(l, "weight") - - -class HifiganGenerator(torch.nn.Module): - def __init__( - self, - in_channels, - out_channels, - resblock_type, - resblock_dilation_sizes, - resblock_kernel_sizes, - upsample_kernel_sizes, - upsample_initial_channel, - upsample_factors, - inference_padding=5, - cond_channels=0, - conv_pre_weight_norm=True, - conv_post_weight_norm=True, - conv_post_bias=True, - cond_in_each_up_layer=False, - ): - r"""HiFiGAN Generator with Multi-Receptive Field Fusion (MRF) - - Network: - x -> lrelu -> upsampling_layer -> resblock1_k1x1 -> z1 -> + -> z_sum / #resblocks -> lrelu -> conv_post_7x1 -> tanh -> o - .. -> zI ---| - resblockN_kNx1 -> zN ---' - - Args: - in_channels (int): number of input tensor channels. - out_channels (int): number of output tensor channels. - resblock_type (str): type of the `ResBlock`. '1' or '2'. - resblock_dilation_sizes (List[List[int]]): list of dilation values in each layer of a `ResBlock`. - resblock_kernel_sizes (List[int]): list of kernel sizes for each `ResBlock`. - upsample_kernel_sizes (List[int]): list of kernel sizes for each transposed convolution. - upsample_initial_channel (int): number of channels for the first upsampling layer. This is divided by 2 - for each consecutive upsampling layer. - upsample_factors (List[int]): upsampling factors (stride) for each upsampling layer. - inference_padding (int): constant padding applied to the input at inference time. Defaults to 5. 
- """ - super().__init__() - self.inference_padding = inference_padding - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_factors) - self.cond_in_each_up_layer = cond_in_each_up_layer - - # initial upsampling layers - self.conv_pre = weight_norm(Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)) - resblock = ResBlock1 if resblock_type == "1" else ResBlock2 - # upsampling layers - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_factors, upsample_kernel_sizes)): - self.ups.append( - weight_norm( - ConvTranspose1d( - upsample_initial_channel // (2**i), - upsample_initial_channel // (2 ** (i + 1)), - k, - u, - padding=(k - u) // 2, - ) - ) - ) - # MRF blocks - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): - self.resblocks.append(resblock(ch, k, d)) - # post convolution layer - self.conv_post = weight_norm(Conv1d(ch, out_channels, 7, 1, padding=3, bias=conv_post_bias)) - if cond_channels > 0: - self.cond_layer = nn.Conv1d(cond_channels, upsample_initial_channel, 1) - - if not conv_pre_weight_norm: - remove_parametrizations(self.conv_pre, "weight") - - if not conv_post_weight_norm: - remove_parametrizations(self.conv_post, "weight") - - if self.cond_in_each_up_layer: - self.conds = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - self.conds.append(nn.Conv1d(cond_channels, ch, 1)) - - def forward(self, x, g=None): - """ - Args: - x (Tensor): feature input tensor. - g (Tensor): global conditioning input tensor. - - Returns: - Tensor: output waveform. - - Shapes: - x: [B, C, T] - Tensor: [B, 1, T] - """ - o = self.conv_pre(x) - if hasattr(self, "cond_layer"): - o = o + self.cond_layer(g) - for i in range(self.num_upsamples): - o = F.leaky_relu(o, LRELU_SLOPE) - o = self.ups[i](o) - - if self.cond_in_each_up_layer: - o = o + self.conds[i](g) - - z_sum = None - for j in range(self.num_kernels): - if z_sum is None: - z_sum = self.resblocks[i * self.num_kernels + j](o) - else: - z_sum += self.resblocks[i * self.num_kernels + j](o) - o = z_sum / self.num_kernels - o = F.leaky_relu(o) - o = self.conv_post(o) - o = torch.tanh(o) - return o - - @torch.no_grad() - def inference(self, c): - """ - Args: - x (Tensor): conditioning input tensor. - - Returns: - Tensor: output waveform. 
- - Shapes: - x: [B, C, T] - Tensor: [B, 1, T] - """ - c = c.to(self.conv_pre.weight.device) - c = torch.nn.functional.pad(c, (self.inference_padding, self.inference_padding), "replicate") - return self.forward(c) - - def remove_weight_norm(self): - logger.info("Removing weight norm...") - for l in self.ups: - remove_parametrizations(l, "weight") - for l in self.resblocks: - l.remove_weight_norm() - remove_parametrizations(self.conv_pre, "weight") - remove_parametrizations(self.conv_post, "weight") - - def load_checkpoint( - self, config, checkpoint_path, eval=False, cache=False - ): # pylint: disable=unused-argument, redefined-builtin - state = torch.load(checkpoint_path, map_location=torch.device("cpu"), weights_only=is_pytorch_at_least_2_4()) - self.load_state_dict(state["model"]) - if eval: - self.eval() - assert not self.training - self.remove_weight_norm() - - -class SELayer(nn.Module): - def __init__(self, channel, reduction=8): - super(SELayer, self).__init__() - self.avg_pool = nn.AdaptiveAvgPool2d(1) - self.fc = nn.Sequential( - nn.Linear(channel, channel // reduction), - nn.ReLU(inplace=True), - nn.Linear(channel // reduction, channel), - nn.Sigmoid(), - ) - - def forward(self, x): - b, c, _, _ = x.size() - y = self.avg_pool(x).view(b, c) - y = self.fc(y).view(b, c, 1, 1) - return x * y - - -class SEBasicBlock(nn.Module): - expansion = 1 - - def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): - super(SEBasicBlock, self).__init__() - self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.relu = nn.ReLU(inplace=True) - self.se = SELayer(planes, reduction) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - residual = x - - out = self.conv1(x) - out = self.relu(out) - out = self.bn1(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.se(out) - - if self.downsample is not None: - residual = self.downsample(x) - - out += residual - out = self.relu(out) - return out - - -def set_init_dict(model_dict, checkpoint_state, c): - # Partial initialization: if there is a mismatch with new and old layer, it is skipped. - for k, v in checkpoint_state.items(): - if k not in model_dict: - logger.warning("Layer missing in the model definition: %s", k) - # 1. filter out unnecessary keys - pretrained_dict = {k: v for k, v in checkpoint_state.items() if k in model_dict} - # 2. filter out different size layers - pretrained_dict = {k: v for k, v in pretrained_dict.items() if v.numel() == model_dict[k].numel()} - # 3. skip reinit layers - if c.has("reinit_layers") and c.reinit_layers is not None: - for reinit_layer_name in c.reinit_layers: - pretrained_dict = {k: v for k, v in pretrained_dict.items() if reinit_layer_name not in k} - # 4. 
overwrite entries in the existing state dict - model_dict.update(pretrained_dict) - logger.info("%d / %d layers are restored.", len(pretrained_dict), len(model_dict)) - return model_dict - - -class PreEmphasis(nn.Module): - def __init__(self, coefficient=0.97): - super().__init__() - self.coefficient = coefficient - self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0)) - - def forward(self, x): - assert len(x.size()) == 2 - - x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") - return torch.nn.functional.conv1d(x, self.filter).squeeze(1) - - -class ResNetSpeakerEncoder(nn.Module): - """This is copied from 🐸TTS to remove it from the dependencies.""" - - # pylint: disable=W0102 - def __init__( - self, - input_dim=64, - proj_dim=512, - layers=[3, 4, 6, 3], - num_filters=[32, 64, 128, 256], - encoder_type="ASP", - log_input=False, - use_torch_spec=False, - audio_config=None, - ): - super(ResNetSpeakerEncoder, self).__init__() - - self.encoder_type = encoder_type - self.input_dim = input_dim - self.log_input = log_input - self.use_torch_spec = use_torch_spec - self.audio_config = audio_config - self.proj_dim = proj_dim - - self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1) - self.relu = nn.ReLU(inplace=True) - self.bn1 = nn.BatchNorm2d(num_filters[0]) - - self.inplanes = num_filters[0] - self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0]) - self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2)) - self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2)) - self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2)) - - self.instancenorm = nn.InstanceNorm1d(input_dim) - - if self.use_torch_spec: - self.torch_spec = torch.nn.Sequential( - PreEmphasis(audio_config["preemphasis"]), - torchaudio.transforms.MelSpectrogram( - sample_rate=audio_config["sample_rate"], - n_fft=audio_config["fft_size"], - win_length=audio_config["win_length"], - hop_length=audio_config["hop_length"], - window_fn=torch.hamming_window, - n_mels=audio_config["num_mels"], - ), - ) - - else: - self.torch_spec = None - - outmap_size = int(self.input_dim / 8) - - self.attention = nn.Sequential( - nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1), - nn.ReLU(), - nn.BatchNorm1d(128), - nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1), - nn.Softmax(dim=2), - ) - - if self.encoder_type == "SAP": - out_dim = num_filters[3] * outmap_size - elif self.encoder_type == "ASP": - out_dim = num_filters[3] * outmap_size * 2 - else: - raise ValueError("Undefined encoder") - - self.fc = nn.Linear(out_dim, proj_dim) - - self._init_layers() - - def _init_layers(self): - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - def create_layer(self, block, planes, blocks, stride=1): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(planes * block.expansion), - ) - - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample)) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append(block(self.inplanes, planes)) - - return 
nn.Sequential(*layers) - - # pylint: disable=R0201 - def new_parameter(self, *size): - out = nn.Parameter(torch.FloatTensor(*size)) - nn.init.xavier_normal_(out) - return out - - def forward(self, x, l2_norm=False): - """Forward pass of the model. - - Args: - x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True` - to compute the spectrogram on-the-fly. - l2_norm (bool): Whether to L2-normalize the outputs. - - Shapes: - - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` - """ - x.squeeze_(1) - # if you torch spec compute it otherwise use the mel spec computed by the AP - if self.use_torch_spec: - x = self.torch_spec(x) - - if self.log_input: - x = (x + 1e-6).log() - x = self.instancenorm(x).unsqueeze(1) - - x = self.conv1(x) - x = self.relu(x) - x = self.bn1(x) - - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - - x = x.reshape(x.size()[0], -1, x.size()[-1]) - - w = self.attention(x) - - if self.encoder_type == "SAP": - x = torch.sum(x * w, dim=2) - elif self.encoder_type == "ASP": - mu = torch.sum(x * w, dim=2) - sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5)) - x = torch.cat((mu, sg), 1) - - x = x.view(x.size()[0], -1) - x = self.fc(x) - - if l2_norm: - x = torch.nn.functional.normalize(x, p=2, dim=1) - return x - - def load_checkpoint( - self, - checkpoint_path: str, - eval: bool = False, - use_cuda: bool = False, - criterion=None, - cache=False, - ): - state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache) - try: - self.load_state_dict(state["model"]) - logger.info("Model fully restored.") - except (KeyError, RuntimeError) as error: - # If eval raise the error - if eval: - raise error - - logger.info("Partial model initialization.") - model_dict = self.state_dict() - model_dict = set_init_dict(model_dict, state["model"]) - self.load_state_dict(model_dict) - del model_dict - - # load the criterion for restore_path - if criterion is not None and "criterion" in state: - try: - criterion.load_state_dict(state["criterion"]) - except (KeyError, RuntimeError) as error: - logger.exception("Criterion load ignored because of: %s", error) - - if use_cuda: - self.cuda() - if criterion is not None: - criterion = criterion.cuda() - - if eval: - self.eval() - assert not self.training - - if not eval: - return criterion, state["step"] - return criterion - class HifiDecoder(torch.nn.Module): def __init__( diff --git a/TTS/tts/layers/xtts/latent_encoder.py b/TTS/tts/layers/xtts/latent_encoder.py index f9d62a36f1..6becffb8b7 100644 --- a/TTS/tts/layers/xtts/latent_encoder.py +++ b/TTS/tts/layers/xtts/latent_encoder.py @@ -6,10 +6,7 @@ from torch import nn from torch.nn import functional as F - -class GroupNorm32(nn.GroupNorm): - def forward(self, x): - return super().forward(x.float()).type(x.dtype) +from TTS.tts.layers.tortoise.arch_utils import normalization, zero_module def conv_nd(dims, *args, **kwargs): @@ -22,24 +19,6 @@ def conv_nd(dims, *args, **kwargs): raise ValueError(f"unsupported dimensions: {dims}") -def normalization(channels): - groups = 32 - if channels <= 16: - groups = 8 - elif channels <= 64: - groups = 16 - while channels % groups != 0: - groups = int(groups / 2) - assert groups > 2 - return GroupNorm32(groups, channels) - - -def zero_module(module): - for p in module.parameters(): - p.detach().zero_() - return module - - class QKVAttention(nn.Module): def __init__(self, n_heads): super().__init__() @@ -114,28 +93,3 @@ def 
forward(self, x, mask=None, qk_bias=0): h = self.proj_out(h) xp = self.x_proj(x) return (xp + h).reshape(b, xp.shape[1], *spatial) - - -class ConditioningEncoder(nn.Module): - def __init__( - self, - spec_dim, - embedding_dim, - attn_blocks=6, - num_attn_heads=4, - ): - super().__init__() - attn = [] - self.init = nn.Conv1d(spec_dim, embedding_dim, kernel_size=1) - for a in range(attn_blocks): - attn.append(AttentionBlock(embedding_dim, num_attn_heads)) - self.attn = nn.Sequential(*attn) - self.dim = embedding_dim - - def forward(self, x): - """ - x: (b, 80, s) - """ - h = self.init(x) - h = self.attn(h) - return h diff --git a/TTS/tts/layers/xtts/perceiver_encoder.py b/TTS/tts/layers/xtts/perceiver_encoder.py index f4b6e84123..7477087283 100644 --- a/TTS/tts/layers/xtts/perceiver_encoder.py +++ b/TTS/tts/layers/xtts/perceiver_encoder.py @@ -9,9 +9,8 @@ from einops.layers.torch import Rearrange from torch import einsum, nn - -def exists(val): - return val is not None +from TTS.tts.layers.tortoise.transformer import GEGLU +from TTS.utils.generic_utils import default, exists def once(fn): @@ -151,12 +150,6 @@ def Sequential(*mods): return nn.Sequential(*filter(exists, mods)) -def default(val, d): - if exists(val): - return val - return d() if callable(d) else d - - class RMSNorm(nn.Module): def __init__(self, dim, scale=True, dim_cond=None): super().__init__() @@ -194,12 +187,6 @@ def forward(self, x): return super().forward(causal_padded_x) -class GEGLU(nn.Module): - def forward(self, x): - x, gate = x.chunk(2, dim=-1) - return F.gelu(gate) * x - - def FeedForward(dim, mult=4, causal_conv=False): dim_inner = int(dim * mult * 2 / 3) diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index e87eb0766b..076727239c 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -15,6 +15,7 @@ from tokenizers import Tokenizer from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words +from TTS.tts.utils.text.cleaners import collapse_whitespace, lowercase logger = logging.getLogger(__name__) @@ -72,8 +73,6 @@ def split_sentence(text, lang, text_split_length=250): return text_splits -_whitespace_re = re.compile(r"\s+") - # List of (regular expression, replacement) pairs for abbreviations: _abbreviations = { "en": [ @@ -564,14 +563,6 @@ def expand_numbers_multilingual(text, lang="en"): return text -def lowercase(text): - return text.lower() - - -def collapse_whitespace(text): - return re.sub(_whitespace_re, " ", text) - - def multilingual_cleaners(text, lang): text = text.replace('"', "") if lang == "tr": @@ -586,13 +577,6 @@ def multilingual_cleaners(text, lang): return text -def basic_cleaners(text): - """Basic pipeline that lowercases and collapses whitespace without transliteration.""" - text = lowercase(text) - text = collapse_whitespace(text) - return text - - def chinese_transliterate(text): try: import pypinyin diff --git a/TTS/tts/layers/xtts/trainer/gpt_trainer.py b/TTS/tts/layers/xtts/trainer/gpt_trainer.py index 9d9edd5758..0253d65ddd 100644 --- a/TTS/tts/layers/xtts/trainer/gpt_trainer.py +++ b/TTS/tts/layers/xtts/trainer/gpt_trainer.py @@ -50,7 +50,7 @@ class GPTArgs(XttsArgs): max_wav_length: int = 255995 # ~11.6 seconds max_text_length: int = 200 tokenizer_file: str = "" - mel_norm_file: str = "https://coqui.gateway.scarf.sh/v0.14.0_models/mel_norms.pth" + mel_norm_file: str = "https://github.com/coqui-ai/TTS/releases/download/v0.14.0_models/mel_norms.pth" dvae_checkpoint: str = "" xtts_checkpoint: str = "" 
gpt_checkpoint: str = "" # if defined it will replace the gpt weights on xtts model diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 2d27a57850..28a52bc558 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -3,6 +3,7 @@ import torch from coqpit import Coqpit +from monotonic_alignment_search import maximum_path from torch import nn from trainer.io import load_fsspec @@ -12,7 +13,7 @@ from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask +from TTS.tts.utils.helpers import expand_encoder_outputs, generate_attention, sequence_mask from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -168,35 +169,6 @@ def compute_align_path(self, mu, log_sigma, y, x_mask, y_mask): dr_mas = torch.sum(attn, -1) return dr_mas.squeeze(1), log_p - @staticmethod - def generate_attn(dr, x_mask, y_mask=None): - # compute decode mask from the durations - if y_mask is None: - y_lengths = dr.sum(1).long() - y_lengths[y_lengths < 1] = 1 - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype) - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) - attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype) - return attn - - def expand_encoder_outputs(self, en, dr, x_mask, y_mask): - """Generate attention alignment map from durations and - expand encoder outputs - - Examples:: - - encoder output: [a,b,c,d] - - durations: [1, 3, 2, 1] - - - expanded: [a, b, b, b, c, c, d] - - attention map: [[0, 0, 0, 0, 0, 0, 1], - [0, 0, 0, 0, 1, 1, 0], - [0, 1, 1, 1, 0, 0, 0], - [1, 0, 0, 0, 0, 0, 0]] - """ - attn = self.generate_attn(dr, x_mask, y_mask) - o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2), en.transpose(1, 2)).transpose(1, 2) - return o_en_ex, attn - def format_durations(self, o_dr_log, x_mask): o_dr = (torch.exp(o_dr_log) - 1) * x_mask * self.length_scale o_dr[o_dr < 1] = 1.0 @@ -242,9 +214,8 @@ def _forward_encoder(self, x, x_lengths, g=None): return o_en, o_en_dp, x_mask, g def _forward_decoder(self, o_en, o_en_dp, dr, x_mask, y_lengths, g): - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) # expand o_en with durations - o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) + o_en_ex, attn, y_mask = expand_encoder_outputs(o_en, dr, x_mask, y_lengths) # positional encoding if hasattr(self, "pos_encoder"): o_en_ex = self.pos_encoder(o_en_ex, y_mask) @@ -281,7 +252,7 @@ def forward( o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) dr_mas, mu, log_sigma, logp = self._forward_mdn(o_en, y, y_lengths, x_mask) y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en_dp.dtype) - attn = self.generate_attn(dr_mas, x_mask, y_mask) + attn = generate_attention(dr_mas, x_mask, y_mask) elif phase == 1: # train decoder o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) diff --git a/TTS/tts/models/delightful_tts.py b/TTS/tts/models/delightful_tts.py index a938a3a4ab..e6db116081 100644 --- a/TTS/tts/models/delightful_tts.py +++ b/TTS/tts/models/delightful_tts.py @@ -8,31 +8,36 @@ import numpy as np import torch import torch.distributed as dist -import torchaudio from coqpit import Coqpit -from librosa.filters import mel as librosa_mel_fn from torch 
import nn -from torch.cuda.amp.autocast_mode import autocast -from torch.nn import functional as F from torch.utils.data import DataLoader from torch.utils.data.sampler import WeightedRandomSampler from trainer.io import load_fsspec from trainer.torch import DistributedSampler, DistributedSamplerWrapper from trainer.trainer_utils import get_optimizer, get_scheduler -from TTS.tts.datasets.dataset import F0Dataset, TTSDataset, _parse_sample +from TTS.tts.datasets.dataset import F0Dataset, TTSDataset, _parse_sample, get_attribute_balancer_weights from TTS.tts.layers.delightful_tts.acoustic_model import AcousticModel -from TTS.tts.layers.losses import ForwardSumLoss, VitsDiscriminatorLoss +from TTS.tts.layers.losses import ( + ForwardSumLoss, + VitsDiscriminatorLoss, + _binary_alignment_loss, + feature_loss, + generator_loss, +) from TTS.tts.layers.vits.discriminator import VitsDiscriminator from TTS.tts.models.base_tts import BaseTTSE2E +from TTS.tts.models.vits import load_audio from TTS.tts.utils.helpers import average_over_durations, compute_attn_prior, rand_segments, segment, sequence_mask from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.synthesis import embedding_to_torch, id_to_torch, numpy_to_torch from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, plot_pitch, plot_spectrogram from TTS.utils.audio.numpy_transforms import build_mel_basis, compute_f0 from TTS.utils.audio.numpy_transforms import db_to_amp as db_to_amp_numpy from TTS.utils.audio.numpy_transforms import mel_to_wav as mel_to_wav_numpy from TTS.utils.audio.processor import AudioProcessor +from TTS.utils.audio.torch_transforms import wav_to_mel, wav_to_spec from TTS.vocoder.layers.losses import MultiScaleSTFTLoss from TTS.vocoder.models.hifigan_generator import HifiganGenerator from TTS.vocoder.utils.generic_utils import plot_results @@ -40,284 +45,20 @@ logger = logging.getLogger(__name__) -def id_to_torch(aux_id, cuda=False): - if aux_id is not None: - aux_id = np.asarray(aux_id) - aux_id = torch.from_numpy(aux_id) - if cuda: - return aux_id.cuda() - return aux_id - - -def embedding_to_torch(d_vector, cuda=False): - if d_vector is not None: - d_vector = np.asarray(d_vector) - d_vector = torch.from_numpy(d_vector).float() - d_vector = d_vector.squeeze().unsqueeze(0) - if cuda: - return d_vector.cuda() - return d_vector - - -def numpy_to_torch(np_array, dtype, cuda=False): - if np_array is None: - return None - tensor = torch.as_tensor(np_array, dtype=dtype) - if cuda: - return tensor.cuda() - return tensor - - -def get_mask_from_lengths(lengths: torch.Tensor) -> torch.Tensor: - batch_size = lengths.shape[0] - max_len = torch.max(lengths).item() - ids = torch.arange(0, max_len, device=lengths.device).unsqueeze(0).expand(batch_size, -1) - mask = ids >= lengths.unsqueeze(1).expand(-1, max_len) - return mask - - -def pad(input_ele: List[torch.Tensor], max_len: int) -> torch.Tensor: - out_list = torch.jit.annotate(List[torch.Tensor], []) - for batch in input_ele: - if len(batch.shape) == 1: - one_batch_padded = F.pad(batch, (0, max_len - batch.size(0)), "constant", 0.0) - else: - one_batch_padded = F.pad(batch, (0, 0, 0, max_len - batch.size(0)), "constant", 0.0) - out_list.append(one_batch_padded) - out_padded = torch.stack(out_list) - return out_padded - - -def stride_lens(lens: torch.Tensor, stride: int = 2) -> torch.Tensor: - return torch.ceil(lens / stride).int() - - -def initialize_embeddings(shape: Tuple[int]) -> torch.Tensor: - assert 
len(shape) == 2, "Can only initialize 2-D embedding matrices ..." - return torch.randn(shape) * np.sqrt(2 / shape[1]) - - -# pylint: disable=redefined-outer-name -def calc_same_padding(kernel_size: int) -> Tuple[int, int]: - pad = kernel_size // 2 - return (pad, pad - (kernel_size + 1) % 2) - - hann_window = {} mel_basis = {} -@torch.no_grad() -def weights_reset(m: nn.Module): - # check if the current module has reset_parameters and if it is reset the weight - reset_parameters = getattr(m, "reset_parameters", None) - if callable(reset_parameters): - m.reset_parameters() - - -def get_module_weights_sum(mdl: nn.Module): - dict_sums = {} - for name, w in mdl.named_parameters(): - if "weight" in name: - value = w.data.sum().item() - dict_sums[name] = value - return dict_sums - - -def load_audio(file_path: str): - """Load the audio file normalized in [-1, 1] - - Return Shapes: - - x: :math:`[1, T]` - """ - x, sr = torchaudio.load( - file_path, - ) - assert (x > 1).sum() + (x < -1).sum() == 0 - return x, sr - - -def _amp_to_db(x, C=1, clip_val=1e-5): - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def _db_to_amp(x, C=1): - return torch.exp(x) / C - - -def amp_to_db(magnitudes): - output = _amp_to_db(magnitudes) - return output - - -def db_to_amp(magnitudes): - output = _db_to_amp(magnitudes) - return output - - -def _wav_to_spec(y, n_fft, hop_length, win_length, center=False): - y = y.squeeze(1) - - if torch.min(y) < -1.0: - logger.info("min value is %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("max value is %.3f", torch.max(y)) - - global hann_window # pylint: disable=global-statement - dtype_device = str(y.dtype) + "_" + str(y.device) - wnsize_dtype_device = str(win_length) + "_" + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - return spec - - -def wav_to_spec(y, n_fft, hop_length, win_length, center=False): - """ - Args Shapes: - - y : :math:`[B, 1, T]` - - Return Shapes: - - spec : :math:`[B,C,T]` - """ - spec = _wav_to_spec(y, n_fft, hop_length, win_length, center=center) - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - return spec - - def wav_to_energy(y, n_fft, hop_length, win_length, center=False): - spec = _wav_to_spec(y, n_fft, hop_length, win_length, center=center) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + spec = wav_to_spec(y, n_fft, hop_length, win_length, center=center) return torch.norm(spec, dim=1, keepdim=True) -def name_mel_basis(spec, n_fft, fmax): - n_fft_len = f"{n_fft}_{fmax}_{spec.dtype}_{spec.device}" - return n_fft_len - - -def spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax): - """ - Args Shapes: - - spec : :math:`[B,C,T]` - - Return Shapes: - - mel : :math:`[B,C,T]` - """ - global mel_basis # pylint: disable=global-statement - mel_basis_key = name_mel_basis(spec, n_fft, fmax) - # pylint: disable=too-many-function-args - if mel_basis_key not in mel_basis: - # pylint: disable=missing-kwoa - mel = librosa_mel_fn(sample_rate, n_fft, num_mels, fmin, fmax) - mel_basis[mel_basis_key] = 
torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) - mel = torch.matmul(mel_basis[mel_basis_key], spec) - mel = amp_to_db(mel) - return mel - - -def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False): - """ - Args Shapes: - - y : :math:`[B, 1, T_y]` - - Return Shapes: - - spec : :math:`[B,C,T_spec]` - """ - y = y.squeeze(1) - - if torch.min(y) < -1.0: - logger.info("min value is %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("max value is %.3f", torch.max(y)) - - global mel_basis, hann_window # pylint: disable=global-statement - mel_basis_key = name_mel_basis(y, n_fft, fmax) - wnsize_dtype_device = str(win_length) + "_" + str(y.dtype) + "_" + str(y.device) - if mel_basis_key not in mel_basis: - # pylint: disable=missing-kwoa - mel = librosa_mel_fn( - sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) # pylint: disable=too-many-function-args - mel_basis[mel_basis_key] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - spec = torch.matmul(mel_basis[mel_basis_key], spec) - spec = amp_to_db(spec) - return spec - - ############################## # DATASET ############################## -def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: dict = None): - """Create balancer weight for torch WeightedSampler""" - attr_names_samples = np.array([item[attr_name] for item in items]) - unique_attr_names = np.unique(attr_names_samples).tolist() - attr_idx = [unique_attr_names.index(l) for l in attr_names_samples] - attr_count = np.array([len(np.where(attr_names_samples == l)[0]) for l in unique_attr_names]) - weight_attr = 1.0 / attr_count - dataset_samples_weight = np.array([weight_attr[l] for l in attr_idx]) - dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) - if multi_dict is not None: - multiplier_samples = np.array([multi_dict.get(item[attr_name], 1.0) for item in items]) - dataset_samples_weight *= multiplier_samples - return ( - torch.from_numpy(dataset_samples_weight).float(), - unique_attr_names, - np.unique(dataset_samples_weight).tolist(), - ) - - class ForwardTTSE2eF0Dataset(F0Dataset): """Override F0Dataset to avoid slow computing of pitches""" @@ -952,7 +693,7 @@ def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): ) # compute loss - with autocast(enabled=False): # use float32 for the criterion + with torch.autocast("cuda", enabled=False): # use float32 for the criterion loss_dict = criterion[optimizer_idx]( scores_disc_fake=scores_d_fake, scores_disc_real=scores_d_real, @@ -963,7 +704,7 @@ def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): if optimizer_idx == 1: mel = batch["mel_input"] # compute melspec segment - with autocast(enabled=False): + with torch.autocast("cuda", enabled=False): mel_slice = segment( mel.float(), self.model_outputs_cache["slice_ids"], 
self.args.spec_segment_size, pad_short=True ) @@ -991,7 +732,7 @@ def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int): ) # compute losses - with autocast(enabled=True): # use float32 for the criterion + with torch.autocast("cuda", enabled=True): # use float32 for the criterion loss_dict = criterion[optimizer_idx]( mel_output=self.model_outputs_cache["acoustic_model_outputs"].transpose(1, 2), mel_target=batch["mel_input"], @@ -1197,7 +938,7 @@ def synthesize( **kwargs, ): # pylint: disable=unused-argument # TODO: add cloning support with ref_waveform - is_cuda = next(self.parameters()).is_cuda + device = next(self.parameters()).device # convert text to sequence of token IDs text_inputs = np.asarray( @@ -1211,14 +952,14 @@ def synthesize( if isinstance(speaker_id, str) and self.args.use_speaker_embedding: # get the speaker id for the speaker embedding layer _speaker_id = self.speaker_manager.name_to_id[speaker_id] - _speaker_id = id_to_torch(_speaker_id, cuda=is_cuda) + _speaker_id = id_to_torch(_speaker_id, device=device) if speaker_id is not None and self.args.use_d_vector_file: # get the average d_vector for the speaker d_vector = self.speaker_manager.get_mean_embedding(speaker_id, num_samples=None, randomize=False) - d_vector = embedding_to_torch(d_vector, cuda=is_cuda) + d_vector = embedding_to_torch(d_vector, device=device) - text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=is_cuda) + text_inputs = numpy_to_torch(text_inputs, torch.long, device=device) text_inputs = text_inputs.unsqueeze(0) # synthesize voice @@ -1241,7 +982,7 @@ def synthesize( return return_dict def synthesize_with_gl(self, text: str, speaker_id, d_vector): - is_cuda = next(self.parameters()).is_cuda + device = next(self.parameters()).device # convert text to sequence of token IDs text_inputs = np.asarray( @@ -1250,12 +991,12 @@ def synthesize_with_gl(self, text: str, speaker_id, d_vector): ) # pass tensors to backend if speaker_id is not None: - speaker_id = id_to_torch(speaker_id, cuda=is_cuda) + speaker_id = id_to_torch(speaker_id, device=device) if d_vector is not None: - d_vector = embedding_to_torch(d_vector, cuda=is_cuda) + d_vector = embedding_to_torch(d_vector, device=device) - text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=is_cuda) + text_inputs = numpy_to_torch(text_inputs, torch.long, device=device) text_inputs = text_inputs.unsqueeze(0) # synthesize voice @@ -1602,36 +1343,6 @@ def __init__(self, config): self.gen_loss_alpha = config.gen_loss_alpha self.multi_scale_stft_loss_alpha = config.multi_scale_stft_loss_alpha - @staticmethod - def _binary_alignment_loss(alignment_hard, alignment_soft): - """Binary loss that forces soft alignments to match the hard alignments as - explained in `https://arxiv.org/pdf/2108.10447.pdf`. 
- """ - log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum() - return -log_sum / alignment_hard.sum() - - @staticmethod - def feature_loss(feats_real, feats_generated): - loss = 0 - for dr, dg in zip(feats_real, feats_generated): - for rl, gl in zip(dr, dg): - rl = rl.float().detach() - gl = gl.float() - loss += torch.mean(torch.abs(rl - gl)) - return loss * 2 - - @staticmethod - def generator_loss(scores_fake): - loss = 0 - gen_losses = [] - for dg in scores_fake: - dg = dg.float() - l = torch.mean((1 - dg) ** 2) - gen_losses.append(l) - loss += l - - return loss, gen_losses - def forward( self, mel_output, @@ -1729,7 +1440,7 @@ def forward( ) if self.binary_alignment_loss_alpha > 0 and aligner_hard is not None: - binary_alignment_loss = self._binary_alignment_loss(aligner_hard, aligner_soft) + binary_alignment_loss = _binary_alignment_loss(aligner_hard, aligner_soft) total_loss = total_loss + self.binary_alignment_loss_alpha * binary_alignment_loss * binary_loss_weight if binary_loss_weight: loss_dict["loss_binary_alignment"] = ( @@ -1749,8 +1460,8 @@ def forward( # vocoder losses if not skip_disc: - loss_feat = self.feature_loss(feats_real=feats_real, feats_generated=feats_fake) * self.feat_loss_alpha - loss_gen = self.generator_loss(scores_fake=scores_fake)[0] * self.gen_loss_alpha + loss_feat = feature_loss(feats_real=feats_real, feats_generated=feats_fake) * self.feat_loss_alpha + loss_gen = generator_loss(scores_fake=scores_fake)[0] * self.gen_loss_alpha loss_dict["vocoder_loss_feat"] = loss_feat loss_dict["vocoder_loss_gen"] = loss_gen loss_dict["loss"] = loss_dict["loss"] + loss_feat + loss_gen diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index 4b74462dd5..d09e3ea91b 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -4,8 +4,8 @@ import torch from coqpit import Coqpit +from monotonic_alignment_search import maximum_path from torch import nn -from torch.cuda.amp.autocast_mode import autocast from trainer.io import load_fsspec from TTS.tts.layers.feed_forward.decoder import Decoder @@ -14,7 +14,7 @@ from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask +from TTS.tts.utils.helpers import average_over_durations, expand_encoder_outputs, generate_attention, sequence_mask from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_avg_energy, plot_avg_pitch, plot_spectrogram @@ -310,49 +310,6 @@ def init_multispeaker(self, config: Coqpit): self.emb_g = nn.Embedding(self.num_speakers, self.args.hidden_channels) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) - @staticmethod - def generate_attn(dr, x_mask, y_mask=None): - """Generate an attention mask from the durations. 
- - Shapes - - dr: :math:`(B, T_{en})` - - x_mask: :math:`(B, T_{en})` - - y_mask: :math:`(B, T_{de})` - """ - # compute decode mask from the durations - if y_mask is None: - y_lengths = dr.sum(1).long() - y_lengths[y_lengths < 1] = 1 - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype) - attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) - attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype) - return attn - - def expand_encoder_outputs(self, en, dr, x_mask, y_mask): - """Generate attention alignment map from durations and - expand encoder outputs - - Shapes: - - en: :math:`(B, D_{en}, T_{en})` - - dr: :math:`(B, T_{en})` - - x_mask: :math:`(B, T_{en})` - - y_mask: :math:`(B, T_{de})` - - Examples:: - - encoder output: [a,b,c,d] - durations: [1, 3, 2, 1] - - expanded: [a, b, b, b, c, c, d] - attention map: [[0, 0, 0, 0, 0, 0, 1], - [0, 0, 0, 0, 1, 1, 0], - [0, 1, 1, 1, 0, 0, 0], - [1, 0, 0, 0, 0, 0, 0]] - """ - attn = self.generate_attn(dr, x_mask, y_mask) - o_en_ex = torch.matmul(attn.squeeze(1).transpose(1, 2).to(en.dtype), en.transpose(1, 2)).transpose(1, 2) - return o_en_ex, attn - def format_durations(self, o_dr_log, x_mask): """Format predicted durations. 1. Convert to linear scale from log scale @@ -443,9 +400,8 @@ def _forward_decoder( Returns: Tuple[torch.FloatTensor, torch.FloatTensor]: Decoder output, attention map from durations. """ - y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype) # expand o_en with durations - o_en_ex, attn = self.expand_encoder_outputs(o_en, dr, x_mask, y_mask) + o_en_ex, attn, y_mask = expand_encoder_outputs(o_en, dr, x_mask, y_lengths) # positional encoding if hasattr(self, "pos_encoder"): o_en_ex = self.pos_encoder(o_en_ex, y_mask) @@ -624,7 +580,7 @@ def forward( o_dr_log = self.duration_predictor(o_en, x_mask) o_dr = torch.clamp(torch.exp(o_dr_log) - 1, 0, self.max_duration) # generate attn mask from predicted durations - o_attn = self.generate_attn(o_dr.squeeze(1), x_mask) + o_attn = generate_attention(o_dr.squeeze(1), x_mask) # aligner o_alignment_dur = None alignment_soft = None @@ -743,7 +699,7 @@ def train_step(self, batch: dict, criterion: nn.Module): if self.use_aligner: durations = outputs["o_alignment_dur"] # use float32 in AMP - with autocast(enabled=False): + with torch.autocast("cuda", enabled=False): # compute loss loss_dict = criterion( decoder_output=outputs["model_outputs"], diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 64954d283c..5bf4713140 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -4,8 +4,8 @@ import torch from coqpit import Coqpit +from monotonic_alignment_search import maximum_path from torch import nn -from torch.cuda.amp.autocast_mode import autocast from torch.nn import functional as F from trainer.io import load_fsspec @@ -13,7 +13,7 @@ from TTS.tts.layers.glow_tts.decoder import Decoder from TTS.tts.layers.glow_tts.encoder import Encoder from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask +from TTS.tts.utils.helpers import generate_path, sequence_mask from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.tokenizer import TTSTokenizer @@ -415,7 +415,7 @@ def train_step(self, batch: dict, criterion: nn.Module): aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids}, ) - with autocast(enabled=False): # avoid mixed_precision in criterion + with 
torch.autocast("cuda", enabled=False): # avoid mixed_precision in criterion loss_dict = criterion( outputs["z"].float(), outputs["y_mean"].float(), diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py index de5401aac7..0b3fadafbf 100644 --- a/TTS/tts/models/neuralhmm_tts.py +++ b/TTS/tts/models/neuralhmm_tts.py @@ -8,6 +8,7 @@ from trainer.io import load_fsspec from trainer.logging.tensorboard_logger import TensorboardLogger +from TTS.tts.layers.losses import NLLLoss from TTS.tts.layers.overflow.common_layers import Encoder, OverflowUtils from TTS.tts.layers.overflow.neural_hmm import NeuralHMM from TTS.tts.layers.overflow.plotting_utils import ( @@ -373,21 +374,3 @@ def test_log( ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) - - -class NLLLoss(nn.Module): - """Negative log likelihood loss.""" - - def forward(self, log_prob: torch.Tensor) -> dict: # pylint: disable=no-self-use - """Compute the loss. - - Args: - logits (Tensor): [B, T, D] - - Returns: - Tensor: [1] - - """ - return_dict = {} - return_dict["loss"] = -log_prob.mean() - return return_dict diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py index b72f4877cf..ac09e406ad 100644 --- a/TTS/tts/models/overflow.py +++ b/TTS/tts/models/overflow.py @@ -8,6 +8,7 @@ from trainer.io import load_fsspec from trainer.logging.tensorboard_logger import TensorboardLogger +from TTS.tts.layers.losses import NLLLoss from TTS.tts.layers.overflow.common_layers import Encoder, OverflowUtils from TTS.tts.layers.overflow.decoder import Decoder from TTS.tts.layers.overflow.neural_hmm import NeuralHMM @@ -389,21 +390,3 @@ def test_log( ) -> None: logger.test_audios(steps, outputs[1], self.ap.sample_rate) logger.test_figures(steps, outputs[0]) - - -class NLLLoss(nn.Module): - """Negative log likelihood loss.""" - - def forward(self, log_prob: torch.Tensor) -> dict: # pylint: disable=no-self-use - """Compute the loss. 
- - Args: - logits (Tensor): [B, T, D] - - Returns: - Tensor: [1] - - """ - return_dict = {} - return_dict["loss"] = -log_prob.mean() - return return_dict diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 400a86d042..5d3efd2021 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -4,7 +4,6 @@ import torch from torch import nn -from torch.cuda.amp.autocast_mode import autocast from trainer.trainer_utils import get_optimizer, get_scheduler from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE @@ -310,7 +309,7 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dic alignment_lengths = mel_lengths // self.decoder.r # compute loss - with autocast(enabled=False): # use float32 for the criterion + with torch.autocast("cuda", enabled=False): # use float32 for the criterion loss_dict = criterion( outputs["model_outputs"].float(), outputs["decoder_outputs"].float(), diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 4b1317f440..2716a39786 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -4,7 +4,6 @@ import torch from torch import nn -from torch.cuda.amp.autocast_mode import autocast from trainer.trainer_utils import get_optimizer, get_scheduler from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE @@ -338,7 +337,7 @@ def train_step(self, batch: Dict, criterion: torch.nn.Module): alignment_lengths = mel_lengths // self.decoder.r # compute loss - with autocast(enabled=False): # use float32 for the criterion + with torch.autocast("cuda", enabled=False): # use float32 for the criterion loss_dict = criterion( outputs["model_outputs"].float(), outputs["decoder_outputs"].float(), diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index b014e4fdde..7ec2519236 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -10,9 +10,8 @@ import torch.distributed as dist import torchaudio from coqpit import Coqpit -from librosa.filters import mel as librosa_mel_fn +from monotonic_alignment_search import maximum_path from torch import nn -from torch.cuda.amp.autocast_mode import autocast from torch.nn import functional as F from torch.utils.data import DataLoader from torch.utils.data.sampler import WeightedRandomSampler @@ -21,20 +20,21 @@ from trainer.trainer_utils import get_optimizer, get_scheduler from TTS.tts.configs.shared_configs import CharactersConfig -from TTS.tts.datasets.dataset import TTSDataset, _parse_sample +from TTS.tts.datasets.dataset import TTSDataset, _parse_sample, get_attribute_balancer_weights from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.layers.vits.discriminator import VitsDiscriminator from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.fairseq import rehash_fairseq_vits_checkpoint -from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask +from TTS.tts.utils.helpers import generate_path, rand_segments, segment, sequence_mask from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.characters import BaseCharacters, BaseVocabulary, _characters, _pad, _phonemes, _punctuations from TTS.tts.utils.text.tokenizer import TTSTokenizer 
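# Illustrative sketch (not part of the changes above): the `torch.autocast("cuda", enabled=False)`
# pattern that replaces the deprecated `torch.cuda.amp.autocast` in the train steps. The loss
# function and tensor names below are made up for the example.
import torch
import torch.nn.functional as F

def criterion_in_fp32(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    # Locally disable autocast so the loss math runs in float32 even when the
    # surrounding training step uses mixed precision.
    with torch.autocast("cuda", enabled=False):
        return F.l1_loss(pred.float(), target.float())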
from TTS.tts.utils.visual import plot_alignment +from TTS.utils.audio.torch_transforms import spec_to_mel, wav_to_mel, wav_to_spec from TTS.utils.samplers import BucketBatchSampler from TTS.vocoder.models.hifigan_generator import HifiganGenerator from TTS.vocoder.utils.generic_utils import plot_results @@ -45,10 +45,6 @@ # IO / Feature extraction ############################## -# pylint: disable=global-statement -hann_window = {} -mel_basis = {} - @torch.no_grad() def weights_reset(m: nn.Module): @@ -78,143 +74,6 @@ def load_audio(file_path): return x, sr -def _amp_to_db(x, C=1, clip_val=1e-5): - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def _db_to_amp(x, C=1): - return torch.exp(x) / C - - -def amp_to_db(magnitudes): - output = _amp_to_db(magnitudes) - return output - - -def db_to_amp(magnitudes): - output = _db_to_amp(magnitudes) - return output - - -def wav_to_spec(y, n_fft, hop_length, win_length, center=False): - """ - Args Shapes: - - y : :math:`[B, 1, T]` - - Return Shapes: - - spec : :math:`[B,C,T]` - """ - y = y.squeeze(1) - - if torch.min(y) < -1.0: - logger.info("min value is %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("max value is %.3f", torch.max(y)) - - global hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - wnsize_dtype_device = str(win_length) + "_" + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - return spec - - -def spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax): - """ - Args Shapes: - - spec : :math:`[B,C,T]` - - Return Shapes: - - mel : :math:`[B,C,T]` - """ - global mel_basis - dtype_device = str(spec.dtype) + "_" + str(spec.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) - mel = torch.matmul(mel_basis[fmax_dtype_device], spec) - mel = amp_to_db(mel) - return mel - - -def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False): - """ - Args Shapes: - - y : :math:`[B, 1, T]` - - Return Shapes: - - spec : :math:`[B,C,T]` - """ - y = y.squeeze(1) - - if torch.min(y) < -1.0: - logger.info("min value is %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("max value is %.3f", torch.max(y)) - - global mel_basis, hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - wnsize_dtype_device = str(win_length) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) - - y = 
torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_length, - win_length=win_length, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) - spec = amp_to_db(spec) - return spec - - ############################# # CONFIGS ############################# @@ -236,30 +95,6 @@ class VitsAudioConfig(Coqpit): ############################## -def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: dict = None): - """Create inverse frequency weights for balancing the dataset. - Use `multi_dict` to scale relative weights.""" - attr_names_samples = np.array([item[attr_name] for item in items]) - unique_attr_names = np.unique(attr_names_samples).tolist() - attr_idx = [unique_attr_names.index(l) for l in attr_names_samples] - attr_count = np.array([len(np.where(attr_names_samples == l)[0]) for l in unique_attr_names]) - weight_attr = 1.0 / attr_count - dataset_samples_weight = np.array([weight_attr[l] for l in attr_idx]) - dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) - if multi_dict is not None: - # check if all keys are in the multi_dict - for k in multi_dict: - assert k in unique_attr_names, f"{k} not in {unique_attr_names}" - # scale weights - multiplier_samples = np.array([multi_dict.get(item[attr_name], 1.0) for item in items]) - dataset_samples_weight *= multiplier_samples - return ( - torch.from_numpy(dataset_samples_weight).float(), - unique_attr_names, - np.unique(dataset_samples_weight).tolist(), - ) - - class VitsDataset(TTSDataset): def __init__(self, model_args, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1277,7 +1112,7 @@ def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> T ) # compute loss - with autocast(enabled=False): # use float32 for the criterion + with torch.autocast("cuda", enabled=False): # use float32 for the criterion loss_dict = criterion[optimizer_idx]( scores_disc_real, scores_disc_fake, @@ -1288,7 +1123,7 @@ def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> T mel = batch["mel"] # compute melspec segment - with autocast(enabled=False): + with torch.autocast("cuda", enabled=False): if self.args.encoder_sample_rate: spec_segment_size = self.spec_segment_size * int(self.interpolate_factor) else: @@ -1315,7 +1150,7 @@ def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> T ) # compute losses - with autocast(enabled=False): # use float32 for the criterion + with torch.autocast("cuda", enabled=False): # use float32 for the criterion loss_dict = criterion[optimizer_idx]( mel_slice_hat=mel_slice.float(), mel_slice=mel_slice_hat.float(), diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 7c4a76ad7d..35de91e359 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -93,25 +93,6 @@ def load_audio(audiopath, sampling_rate): return audio -def pad_or_truncate(t, length): - """ - Ensure a given tensor t has a specified sequence length by either padding it with zeros or clipping it. - - Args: - t (torch.Tensor): The input tensor to be padded or truncated. - length (int): The desired length of the tensor. 
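# Illustrative sketch (not part of the changes above): with the duplicated helper removed,
# balancing weights are imported from the dataset module. The items and the "speaker_name"
# key below are invented example values, not taken from the repository.
from torch.utils.data.sampler import WeightedRandomSampler

from TTS.tts.datasets.dataset import get_attribute_balancer_weights

items = [{"speaker_name": "spk_a"}, {"speaker_name": "spk_a"}, {"speaker_name": "spk_b"}]
weights, speaker_names, _ = get_attribute_balancer_weights(items=items, attr_name="speaker_name")
# Rarer speakers get larger weights, so they are drawn more often by the sampler.
sampler = WeightedRandomSampler(weights, len(weights))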
- - Returns: - torch.Tensor: The padded or truncated tensor. - """ - tp = t[..., :length] - if t.shape[-1] == length: - tp = t - elif t.shape[-1] < length: - tp = F.pad(t, (0, length - t.shape[-1])) - return tp - - @dataclass class XttsAudioConfig(Coqpit): """ @@ -779,6 +760,12 @@ def load_checkpoint( if os.path.exists(vocab_path): self.tokenizer = VoiceBpeTokenizer(vocab_file=vocab_path) + else: + msg = ( + f"`vocab.json` file not found in `{checkpoint_dir}`. Move the file there or " + "specify alternative path in `model_args.tokenizer_file` in `config.json`" + ) + raise FileNotFoundError(msg) self.init_models() diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index 7429d0fcc8..ff10f751f2 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -1,15 +1,10 @@ +from typing import Optional + import numpy as np import torch from scipy.stats import betabinom from torch.nn import functional as F -try: - from TTS.tts.utils.monotonic_align.core import maximum_path_c - - CYTHON = True -except ModuleNotFoundError: - CYTHON = False - class StandardScaler: """StandardScaler for mean-scale normalization with the given mean and scale values.""" @@ -40,7 +35,7 @@ def inverse_transform(self, X): # from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 -def sequence_mask(sequence_length, max_len=None): +def sequence_mask(sequence_length: torch.Tensor, max_len: Optional[int] = None) -> torch.Tensor: """Create a sequence mask for filtering padding in a sequence tensor. Args: @@ -51,7 +46,7 @@ def sequence_mask(sequence_length, max_len=None): - mask: :math:`[B, T_max]` """ if max_len is None: - max_len = sequence_length.max() + max_len = int(sequence_length.max()) seq_range = torch.arange(max_len, dtype=sequence_length.dtype, device=sequence_length.device) # B x T_max return seq_range.unsqueeze(0) < sequence_length.unsqueeze(1) @@ -150,89 +145,75 @@ def convert_pad_shape(pad_shape: list[list]) -> list: return [item for sublist in l for item in sublist] -def generate_path(duration, mask): - """ +def generate_path(duration: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + """Generate alignment path based on the given segment durations. + Shapes: - duration: :math:`[B, T_en]` - mask: :math:'[B, T_en, T_de]` - path: :math:`[B, T_en, T_de]` """ b, t_x, t_y = mask.shape - cum_duration = torch.cumsum(duration, 1) + cum_duration = torch.cumsum(duration, dim=1) cum_duration_flat = cum_duration.view(b * t_x) path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) path = path.view(b, t_x, t_y) path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] - path = path * mask - return path + return path * mask -def maximum_path(value, mask): - if CYTHON: - return maximum_path_cython(value, mask) - return maximum_path_numpy(value, mask) +def generate_attention( + duration: torch.Tensor, x_mask: torch.Tensor, y_mask: Optional[torch.Tensor] = None +) -> torch.Tensor: + """Generate an attention map from the linear scale durations. + Args: + duration (Tensor): Linear scale durations. + x_mask (Tensor): Mask for the input (character) sequence. + y_mask (Tensor): Mask for the output (spectrogram) sequence. Compute it from the predicted durations + if None. Defaults to None. 
+ + Shapes + - duration: :math:`(B, T_{en})` + - x_mask: :math:`(B, T_{en})` + - y_mask: :math:`(B, T_{de})` + """ + # compute decode mask from the durations + if y_mask is None: + y_lengths = duration.sum(dim=1).long() + y_lengths[y_lengths < 1] = 1 + y_mask = sequence_mask(y_lengths).unsqueeze(1).to(duration.dtype) + attn_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(2) + return generate_path(duration, attn_mask.squeeze(1)).to(duration.dtype) + + +def expand_encoder_outputs( + x: torch.Tensor, duration: torch.Tensor, x_mask: torch.Tensor, y_lengths: torch.Tensor +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Generate attention alignment map from durations and expand encoder outputs. -def maximum_path_cython(value, mask): - """Cython optimised version. Shapes: - - value: :math:`[B, T_en, T_de]` - - mask: :math:`[B, T_en, T_de]` - """ - value = value * mask - device = value.device - dtype = value.dtype - value = value.data.cpu().numpy().astype(np.float32) - path = np.zeros_like(value).astype(np.int32) - mask = mask.data.cpu().numpy() + - x: Encoder output :math:`(B, D_{en}, T_{en})` + - duration: :math:`(B, T_{en})` + - x_mask: :math:`(B, T_{en})` + - y_lengths: :math:`(B)` - t_x_max = mask.sum(1)[:, 0].astype(np.int32) - t_y_max = mask.sum(2)[:, 0].astype(np.int32) - maximum_path_c(path, value, t_x_max, t_y_max) - return torch.from_numpy(path).to(device=device, dtype=dtype) + Examples:: + encoder output: [a,b,c,d] + durations: [1, 3, 2, 1] -def maximum_path_numpy(value, mask, max_neg_val=None): - """ - Monotonic alignment search algorithm - Numpy-friendly version. It's about 4 times faster than torch version. - value: [b, t_x, t_y] - mask: [b, t_x, t_y] + expanded: [a, b, b, b, c, c, d] + attention map: [[0, 0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 1, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0]] """ - if max_neg_val is None: - max_neg_val = -np.inf # Patch for Sphinx complaint - value = value * mask - - device = value.device - dtype = value.dtype - value = value.cpu().detach().numpy() - mask = mask.cpu().detach().numpy().astype(bool) - - b, t_x, t_y = value.shape - direction = np.zeros(value.shape, dtype=np.int64) - v = np.zeros((b, t_x), dtype=np.float32) - x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1) - for j in range(t_y): - v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1] - v1 = v - max_mask = v1 >= v0 - v_max = np.where(max_mask, v1, v0) - direction[:, :, j] = max_mask - - index_mask = x_range <= j - v = np.where(index_mask, v_max + value[:, :, j], max_neg_val) - direction = np.where(mask, direction, 1) - - path = np.zeros(value.shape, dtype=np.float32) - index = mask[:, :, 0].sum(1).astype(np.int64) - 1 - index_range = np.arange(b) - for j in reversed(range(t_y)): - path[index_range, index, j] = 1 - index = index + direction[index_range, index, j] - 1 - path = path * mask.astype(np.float32) - path = torch.from_numpy(path).to(device=device, dtype=dtype) - return path + y_mask = sequence_mask(y_lengths).unsqueeze(1).to(x.dtype) + attn = generate_attention(duration, x_mask, y_mask) + x_expanded = torch.einsum("kmn, kjm -> kjn", [attn.float(), x]) + return x_expanded, attn, y_mask def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=1.0): diff --git a/TTS/tts/utils/monotonic_align/core.pyx b/TTS/tts/utils/monotonic_align/core.pyx deleted file mode 100644 index 091fcc3a50..0000000000 --- a/TTS/tts/utils/monotonic_align/core.pyx +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np - -cimport cython 
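# Illustrative sketch (not part of the changes above): calling the relocated helpers.
# Shapes follow the ForwardTTS convention of a [B, 1, T_en] input mask; all sizes and
# duration values below are made up.
import torch

from TTS.tts.utils.helpers import expand_encoder_outputs, sequence_mask

B, D, T_en = 1, 8, 4
x = torch.randn(B, D, T_en)                                         # encoder output [B, D, T_en]
duration = torch.tensor([[1, 3, 2, 1]])                             # frames per input token
x_mask = sequence_mask(torch.tensor([T_en])).unsqueeze(1).float()   # [B, 1, T_en]
y_lengths = duration.sum(1)                                         # [B], here 7 frames total
x_expanded, attn, y_mask = expand_encoder_outputs(x, duration, x_mask, y_lengths)
# x_expanded: [B, D, 7], attn: [B, T_en, 7], y_mask: [B, 1, 7]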
-cimport numpy as np - -from cython.parallel import prange - - -@cython.boundscheck(False) -@cython.wraparound(False) -cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: - cdef int x - cdef int y - cdef float v_prev - cdef float v_cur - cdef float tmp - cdef int index = t_x - 1 - - for y in range(t_y): - for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): - if x == y: - v_cur = max_neg_val - else: - v_cur = value[x, y-1] - if x == 0: - if y == 0: - v_prev = 0. - else: - v_prev = max_neg_val - else: - v_prev = value[x-1, y-1] - value[x, y] = max(v_cur, v_prev) + value[x, y] - - for y in range(t_y - 1, -1, -1): - path[index, y] = 1 - if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): - index = index - 1 - - -@cython.boundscheck(False) -@cython.wraparound(False) -cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: - cdef int b = values.shape[0] - - cdef int i - for i in prange(b, nogil=True): - maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 797151c254..5dc4cc569f 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -1,17 +1,16 @@ -from typing import Dict +from typing import Dict, Optional, Union import numpy as np import torch from torch import nn -def numpy_to_torch(np_array, dtype, cuda=False, device="cpu"): - if cuda: - device = "cuda" +def numpy_to_torch( + np_array: np.ndarray, dtype: torch.dtype, device: Union[str, torch.device] = "cpu" +) -> Optional[torch.Tensor]: if np_array is None: return None - tensor = torch.as_tensor(np_array, dtype=dtype, device=device) - return tensor + return torch.as_tensor(np_array, dtype=dtype, device=device) def compute_style_mel(style_wav, ap, cuda=False, device="cpu"): @@ -76,18 +75,14 @@ def inv_spectrogram(postnet_output, ap, CONFIG): return wav -def id_to_torch(aux_id, cuda=False, device="cpu"): - if cuda: - device = "cuda" +def id_to_torch(aux_id, device: Union[str, torch.device] = "cpu") -> Optional[torch.Tensor]: if aux_id is not None: aux_id = np.asarray(aux_id) aux_id = torch.from_numpy(aux_id).to(device) return aux_id -def embedding_to_torch(d_vector, cuda=False, device="cpu"): - if cuda: - device = "cuda" +def embedding_to_torch(d_vector, device: Union[str, torch.device] = "cpu") -> Optional[torch.Tensor]: if d_vector is not None: d_vector = np.asarray(d_vector) d_vector = torch.from_numpy(d_vector).type(torch.FloatTensor) diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index 4a8972480c..9c83009b0f 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -1,6 +1,6 @@ import logging from io import BytesIO -from typing import Tuple +from typing import Optional import librosa import numpy as np @@ -16,11 +16,11 @@ def build_mel_basis( *, - sample_rate: int = None, - fft_size: int = None, - num_mels: int = None, - mel_fmax: int = None, - mel_fmin: int = None, + sample_rate: int, + fft_size: int, + num_mels: int, + mel_fmin: int, + mel_fmax: Optional[int] = None, **kwargs, ) -> np.ndarray: """Build melspectrogram basis. 
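# Illustrative sketch (not part of the changes above): the synthesis helpers now take an
# explicit device instead of a cuda flag. The token IDs and embedding size (512) below
# are arbitrary example values.
import numpy as np
import torch

from TTS.tts.utils.synthesis import embedding_to_torch, numpy_to_torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
token_ids = numpy_to_torch(np.array([4, 8, 15]), torch.long, device=device)
d_vector = embedding_to_torch(np.random.randn(512).astype(np.float32), device=device)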
@@ -34,9 +34,7 @@ def build_mel_basis( return librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels, fmin=mel_fmin, fmax=mel_fmax) -def millisec_to_length( - *, frame_length_ms: int = None, frame_shift_ms: int = None, sample_rate: int = None, **kwargs -) -> Tuple[int, int]: +def millisec_to_length(*, frame_length_ms: float, frame_shift_ms: float, sample_rate: int, **kwargs) -> tuple[int, int]: """Compute hop and window length from milliseconds. Returns: @@ -61,7 +59,7 @@ def _exp(x, base): return np.exp(x) -def amp_to_db(*, x: np.ndarray = None, gain: float = 1, base: int = 10, **kwargs) -> np.ndarray: +def amp_to_db(*, x: np.ndarray, gain: float = 1, base: float = 10, **kwargs) -> np.ndarray: """Convert amplitude values to decibels. Args: @@ -77,7 +75,7 @@ def amp_to_db(*, x: np.ndarray = None, gain: float = 1, base: int = 10, **kwargs # pylint: disable=no-self-use -def db_to_amp(*, x: np.ndarray = None, gain: float = 1, base: int = 10, **kwargs) -> np.ndarray: +def db_to_amp(*, x: np.ndarray, gain: float = 1, base: float = 10, **kwargs) -> np.ndarray: """Convert decibels spectrogram to amplitude spectrogram. Args: @@ -104,18 +102,20 @@ def preemphasis(*, x: np.ndarray, coef: float = 0.97, **kwargs) -> np.ndarray: np.ndarray: Decorrelated audio signal. """ if coef == 0: - raise RuntimeError(" [!] Preemphasis is set 0.0.") + msg = " [!] Preemphasis is set 0.0." + raise RuntimeError(msg) return scipy.signal.lfilter([1, -coef], [1], x) -def deemphasis(*, x: np.ndarray = None, coef: float = 0.97, **kwargs) -> np.ndarray: +def deemphasis(*, x: np.ndarray, coef: float = 0.97, **kwargs) -> np.ndarray: """Reverse pre-emphasis.""" if coef == 0: - raise RuntimeError(" [!] Preemphasis is set 0.0.") + msg = " [!] Preemphasis is set 0.0." + raise ValueError(msg) return scipy.signal.lfilter([1], [1, -coef], x) -def spec_to_mel(*, spec: np.ndarray, mel_basis: np.ndarray = None, **kwargs) -> np.ndarray: +def spec_to_mel(*, spec: np.ndarray, mel_basis: np.ndarray, **kwargs) -> np.ndarray: """Convert a full scale linear spectrogram output of a network to a melspectrogram. Args: @@ -130,14 +130,14 @@ def spec_to_mel(*, spec: np.ndarray, mel_basis: np.ndarray = None, **kwargs) -> return np.dot(mel_basis, spec) -def mel_to_spec(*, mel: np.ndarray = None, mel_basis: np.ndarray = None, **kwargs) -> np.ndarray: +def mel_to_spec(*, mel: np.ndarray, mel_basis: np.ndarray, **kwargs) -> np.ndarray: """Convert a melspectrogram to full scale spectrogram.""" assert (mel < 0).sum() == 0, " [!] Input values must be non-negative." inv_mel_basis = np.linalg.pinv(mel_basis) return np.maximum(1e-10, np.dot(inv_mel_basis, mel)) -def wav_to_spec(*, wav: np.ndarray = None, **kwargs) -> np.ndarray: +def wav_to_spec(*, wav: np.ndarray, **kwargs) -> np.ndarray: """Compute a spectrogram from a waveform. 
Args: @@ -151,7 +151,7 @@ def wav_to_spec(*, wav: np.ndarray = None, **kwargs) -> np.ndarray: return S.astype(np.float32) -def wav_to_mel(*, wav: np.ndarray = None, mel_basis=None, **kwargs) -> np.ndarray: +def wav_to_mel(*, wav: np.ndarray, mel_basis: np.ndarray, **kwargs) -> np.ndarray: """Compute a melspectrogram from a waveform.""" D = stft(y=wav, **kwargs) S = spec_to_mel(spec=np.abs(D), mel_basis=mel_basis, **kwargs) @@ -164,20 +164,20 @@ def spec_to_wav(*, spec: np.ndarray, power: float = 1.5, **kwargs) -> np.ndarray return griffin_lim(spec=S**power, **kwargs) -def mel_to_wav(*, mel: np.ndarray = None, power: float = 1.5, **kwargs) -> np.ndarray: +def mel_to_wav(*, mel: np.ndarray, mel_basis: np.ndarray, power: float = 1.5, **kwargs) -> np.ndarray: """Convert a melspectrogram to a waveform using Griffi-Lim vocoder.""" S = mel.copy() - S = mel_to_spec(mel=S, mel_basis=kwargs["mel_basis"]) # Convert back to linear + S = mel_to_spec(mel=S, mel_basis=mel_basis) # Convert back to linear return griffin_lim(spec=S**power, **kwargs) ### STFT and ISTFT ### def stft( *, - y: np.ndarray = None, - fft_size: int = None, - hop_length: int = None, - win_length: int = None, + y: np.ndarray, + fft_size: int, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, pad_mode: str = "reflect", window: str = "hann", center: bool = True, @@ -203,9 +203,9 @@ def stft( def istft( *, - y: np.ndarray = None, - hop_length: int = None, - win_length: int = None, + y: np.ndarray, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, window: str = "hann", center: bool = True, **kwargs, @@ -220,7 +220,7 @@ def istft( return librosa.istft(y, hop_length=hop_length, win_length=win_length, center=center, window=window) -def griffin_lim(*, spec: np.ndarray = None, num_iter=60, **kwargs) -> np.ndarray: +def griffin_lim(*, spec: np.ndarray, num_iter=60, **kwargs) -> np.ndarray: angles = np.exp(2j * np.pi * np.random.rand(*spec.shape)) S_complex = np.abs(spec).astype(complex) y = istft(y=S_complex * angles, **kwargs) @@ -233,11 +233,11 @@ def griffin_lim(*, spec: np.ndarray = None, num_iter=60, **kwargs) -> np.ndarray return y -def compute_stft_paddings( - *, x: np.ndarray = None, hop_length: int = None, pad_two_sides: bool = False, **kwargs -) -> Tuple[int, int]: - """Compute paddings used by Librosa's STFT. Compute right padding (final frame) or both sides padding - (first and final frames)""" +def compute_stft_paddings(*, x: np.ndarray, hop_length: int, pad_two_sides: bool = False, **kwargs) -> tuple[int, int]: + """Compute paddings used by Librosa's STFT. + + Compute right padding (final frame) or both sides padding (first and final frames). 
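# Illustrative sketch (not part of the changes above): the numpy transforms are keyword-only
# and several previously optional arguments are now required. The STFT/mel settings below
# are arbitrary example values.
import numpy as np

from TTS.utils.audio.numpy_transforms import build_mel_basis, wav_to_mel

mel_basis = build_mel_basis(sample_rate=22050, fft_size=1024, num_mels=80, mel_fmin=0)
wav = np.zeros(22050, dtype=np.float32)  # one second of silence as a stand-in waveform
mel = wav_to_mel(wav=wav, mel_basis=mel_basis, fft_size=1024, hop_length=256, win_length=1024)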
+ """ pad = (x.shape[0] // hop_length + 1) * hop_length - x.shape[0] if not pad_two_sides: return 0, pad @@ -246,12 +246,12 @@ def compute_stft_paddings( def compute_f0( *, - x: np.ndarray = None, - pitch_fmax: float = None, - pitch_fmin: float = None, - hop_length: int = None, - win_length: int = None, - sample_rate: int = None, + x: np.ndarray, + pitch_fmax: Optional[float] = None, + pitch_fmin: Optional[float] = None, + hop_length: int, + win_length: int, + sample_rate: int, stft_pad_mode: str = "reflect", center: bool = True, **kwargs, @@ -323,19 +323,18 @@ def compute_energy(y: np.ndarray, **kwargs) -> np.ndarray: """ x = stft(y=y, **kwargs) mag, _ = magphase(x) - energy = np.sqrt(np.sum(mag**2, axis=0)) - return energy + return np.sqrt(np.sum(mag**2, axis=0)) ### Audio Processing ### def find_endpoint( *, - wav: np.ndarray = None, + wav: np.ndarray, trim_db: float = -40, - sample_rate: int = None, - min_silence_sec=0.8, - gain: float = None, - base: int = None, + sample_rate: int, + min_silence_sec: float = 0.8, + gain: float = 1, + base: float = 10, **kwargs, ) -> int: """Find the last point without silence at the end of a audio signal. @@ -344,8 +343,8 @@ def find_endpoint( wav (np.ndarray): Audio signal. threshold_db (int, optional): Silence threshold in decibels. Defaults to -40. min_silence_sec (float, optional): Ignore silences that are shorter then this in secs. Defaults to 0.8. - gian (float, optional): Gain to be used to convert trim_db to trim_amp. Defaults to None. - base (int, optional): Base of the logarithm used to convert trim_db to trim_amp. Defaults to 10. + gain (float, optional): Gain factor to be used to convert trim_db to trim_amp. Defaults to 1. + base (float, optional): Base of the logarithm used to convert trim_db to trim_amp. Defaults to 10. Returns: int: Last point without silence. @@ -361,20 +360,20 @@ def find_endpoint( def trim_silence( *, - wav: np.ndarray = None, - sample_rate: int = None, - trim_db: float = None, - win_length: int = None, - hop_length: int = None, + wav: np.ndarray, + sample_rate: int, + trim_db: float = 60, + win_length: int, + hop_length: int, **kwargs, ) -> np.ndarray: - """Trim silent parts with a threshold and 0.01 sec margin""" + """Trim silent parts with a threshold and 0.01 sec margin.""" margin = int(sample_rate * 0.01) wav = wav[margin:-margin] return librosa.effects.trim(wav, top_db=trim_db, frame_length=win_length, hop_length=hop_length)[0] -def volume_norm(*, x: np.ndarray = None, coef: float = 0.95, **kwargs) -> np.ndarray: +def volume_norm(*, x: np.ndarray, coef: float = 0.95, **kwargs) -> np.ndarray: """Normalize the volume of an audio signal. Args: @@ -387,7 +386,7 @@ def volume_norm(*, x: np.ndarray = None, coef: float = 0.95, **kwargs) -> np.nda return x / abs(x).max() * coef -def rms_norm(*, wav: np.ndarray = None, db_level: float = -27.0, **kwargs) -> np.ndarray: +def rms_norm(*, wav: np.ndarray, db_level: float = -27.0, **kwargs) -> np.ndarray: r = 10 ** (db_level / 20) a = np.sqrt((len(wav) * (r**2)) / np.sum(wav**2)) return wav * a @@ -404,11 +403,10 @@ def rms_volume_norm(*, x: np.ndarray, db_level: float = -27.0, **kwargs) -> np.n np.ndarray: RMS normalized waveform. """ assert -99 <= db_level <= 0, " [!] 
db_level should be between -99 and 0" - wav = rms_norm(wav=x, db_level=db_level) - return wav + return rms_norm(wav=x, db_level=db_level) -def load_wav(*, filename: str, sample_rate: int = None, resample: bool = False, **kwargs) -> np.ndarray: +def load_wav(*, filename: str, sample_rate: Optional[int] = None, resample: bool = False, **kwargs) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before. @@ -427,19 +425,39 @@ def load_wav(*, filename: str, sample_rate: int = None, resample: bool = False, else: # SF is faster than librosa for loading files x, _ = sf.read(filename) + if x.ndim != 1: + logger.warning("Found multi-channel audio. Converting to mono: %s", filename) + x = librosa.to_mono(x) return x -def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, pipe_out=None, **kwargs) -> None: +def save_wav( + *, + wav: np.ndarray, + path: str, + sample_rate: int, + pipe_out=None, + do_rms_norm: bool = False, + db_level: float = -27.0, + **kwargs, +) -> None: """Save float waveform to a file using Scipy. Args: wav (np.ndarray): Waveform with float values in range [-1, 1] to save. path (str): Path to a output file. - sr (int, optional): Sampling rate used for saving to the file. Defaults to None. + sr (int): Sampling rate used for saving to the file. Defaults to None. pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. + do_rms_norm (bool): Whether to apply RMS normalization + db_level (float): Target dB level in RMS. """ - wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) + if do_rms_norm: + if db_level is None: + msg = "`db_level` cannot be None with `do_rms_norm=True`" + raise ValueError(msg) + wav_norm = rms_volume_norm(x=wav, db_level=db_level) + else: + wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) wav_norm = wav_norm.astype(np.int16) if pipe_out: @@ -462,8 +480,7 @@ def mulaw_encode(*, wav: np.ndarray, mulaw_qc: int, **kwargs) -> np.ndarray: def mulaw_decode(*, wav, mulaw_qc: int, **kwargs) -> np.ndarray: """Recovers waveform from quantized values.""" mu = 2**mulaw_qc - 1 - x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) - return x + return np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) def encode_16bits(*, x: np.ndarray, **kwargs) -> np.ndarray: diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index 680e29debc..1d8fed8e39 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -1,11 +1,8 @@ import logging -from io import BytesIO -from typing import Dict, Tuple +from typing import Optional import librosa import numpy as np -import scipy.io.wavfile -import scipy.signal from TTS.tts.utils.helpers import StandardScaler from TTS.utils.audio.numpy_transforms import ( @@ -21,6 +18,7 @@ millisec_to_length, preemphasis, rms_volume_norm, + save_wav, spec_to_mel, stft, trim_silence, @@ -32,7 +30,7 @@ # pylint: disable=too-many-public-methods -class AudioProcessor(object): +class AudioProcessor: """Audio Processor for TTS. Note: @@ -172,7 +170,7 @@ def __init__( db_level=None, stats_path=None, **_, - ): + ) -> None: # setup class attributed self.sample_rate = sample_rate self.resample = resample @@ -210,7 +208,8 @@ def __init__( elif log_func == "np.log10": self.base = 10 else: - raise ValueError(" [!] unknown `log_func` value.") + msg = " [!] unknown `log_func` value." 
+ raise ValueError(msg) # setup stft parameters if hop_length is None: # compute stft parameters from given time values @@ -254,7 +253,7 @@ def init_from_config(config: "Coqpit"): ### normalization ### def normalize(self, S: np.ndarray) -> np.ndarray: - """Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]` + """Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]`. Args: S (np.ndarray): Spectrogram to normalize. @@ -272,10 +271,10 @@ def normalize(self, S: np.ndarray) -> np.ndarray: if hasattr(self, "mel_scaler"): if S.shape[0] == self.num_mels: return self.mel_scaler.transform(S.T).T - elif S.shape[0] == self.fft_size / 2: + if S.shape[0] == self.fft_size / 2: return self.linear_scaler.transform(S.T).T - else: - raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.") + msg = " [!] Mean-Var stats does not match the given feature dimensions." + raise RuntimeError(msg) # range normalization S -= self.ref_level_db # discard certain range of DB assuming it is air noise S_norm = (S - self.min_level_db) / (-self.min_level_db) @@ -286,13 +285,11 @@ def normalize(self, S: np.ndarray) -> np.ndarray: S_norm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type ) return S_norm - else: - S_norm = self.max_norm * S_norm - if self.clip_norm: - S_norm = np.clip(S_norm, 0, self.max_norm) - return S_norm - else: - return S + S_norm = self.max_norm * S_norm + if self.clip_norm: + S_norm = np.clip(S_norm, 0, self.max_norm) + return S_norm + return S def denormalize(self, S: np.ndarray) -> np.ndarray: """Denormalize spectrogram values. @@ -313,10 +310,10 @@ def denormalize(self, S: np.ndarray) -> np.ndarray: if hasattr(self, "mel_scaler"): if S_denorm.shape[0] == self.num_mels: return self.mel_scaler.inverse_transform(S_denorm.T).T - elif S_denorm.shape[0] == self.fft_size / 2: + if S_denorm.shape[0] == self.fft_size / 2: return self.linear_scaler.inverse_transform(S_denorm.T).T - else: - raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.") + msg = " [!] Mean-Var stats does not match the given feature dimensions." + raise RuntimeError(msg) if self.symmetric_norm: if self.clip_norm: S_denorm = np.clip( @@ -324,16 +321,14 @@ def denormalize(self, S: np.ndarray) -> np.ndarray: ) S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db return S_denorm + self.ref_level_db - else: - if self.clip_norm: - S_denorm = np.clip(S_denorm, 0, self.max_norm) - S_denorm = (S_denorm * -self.min_level_db / self.max_norm) + self.min_level_db - return S_denorm + self.ref_level_db - else: - return S_denorm + if self.clip_norm: + S_denorm = np.clip(S_denorm, 0, self.max_norm) + S_denorm = (S_denorm * -self.min_level_db / self.max_norm) + self.min_level_db + return S_denorm + self.ref_level_db + return S_denorm ### Mean-STD scaling ### - def load_stats(self, stats_path: str) -> Tuple[np.array, np.array, np.array, np.array, Dict]: + def load_stats(self, stats_path: str) -> tuple[np.array, np.array, np.array, np.array, dict]: """Loading mean and variance statistics from a `npy` file. Args: @@ -351,7 +346,7 @@ def load_stats(self, stats_path: str) -> Tuple[np.array, np.array, np.array, np. 
stats_config = stats["audio_config"] # check all audio parameters used for computing stats skip_parameters = ["griffin_lim_iters", "stats_path", "do_trim_silence", "ref_level_db", "power"] - for key in stats_config.keys(): + for key in stats_config: if key in skip_parameters: continue if key not in ["sample_rate", "trim_db"]: @@ -415,10 +410,7 @@ def spectrogram(self, y: np.ndarray) -> np.ndarray: win_length=self.win_length, pad_mode=self.stft_pad_mode, ) - if self.do_amp_to_db_linear: - S = amp_to_db(x=np.abs(D), gain=self.spec_gain, base=self.base) - else: - S = np.abs(D) + S = amp_to_db(x=np.abs(D), gain=self.spec_gain, base=self.base) if self.do_amp_to_db_linear else np.abs(D) return self.normalize(S).astype(np.float32) def melspectrogram(self, y: np.ndarray) -> np.ndarray: @@ -467,8 +459,7 @@ def out_linear_to_mel(self, linear_spec: np.ndarray) -> np.ndarray: S = db_to_amp(x=S, gain=self.spec_gain, base=self.base) S = spec_to_mel(spec=np.abs(S), mel_basis=self.mel_basis) S = amp_to_db(x=S, gain=self.spec_gain, base=self.base) - mel = self.normalize(S) - return mel + return self.normalize(S) def _griffin_lim(self, S): return griffin_lim( @@ -502,7 +493,7 @@ def compute_f0(self, x: np.ndarray) -> np.ndarray: if len(x) % self.hop_length == 0: x = np.pad(x, (0, self.hop_length // 2), mode=self.stft_pad_mode) - f0 = compute_f0( + return compute_f0( x=x, pitch_fmax=self.pitch_fmax, pitch_fmin=self.pitch_fmin, @@ -513,8 +504,6 @@ def compute_f0(self, x: np.ndarray) -> np.ndarray: center=True, ) - return f0 - ### Audio Processing ### def find_endpoint(self, wav: np.ndarray, min_silence_sec=0.8) -> int: """Find the last point without silence at the end of a audio signal. @@ -537,7 +526,7 @@ def find_endpoint(self, wav: np.ndarray, min_silence_sec=0.8) -> int: ) def trim_silence(self, wav): - """Trim silent parts with a threshold and 0.01 sec margin""" + """Trim silent parts with a threshold and 0.01 sec margin.""" return trim_silence( wav=wav, sample_rate=self.sample_rate, @@ -558,21 +547,8 @@ def sound_norm(x: np.ndarray) -> np.ndarray: """ return volume_norm(x=x) - def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray: - """Normalize the volume based on RMS of the signal. - - Args: - x (np.ndarray): Raw waveform. - - Returns: - np.ndarray: RMS normalized waveform. - """ - if db_level is None: - db_level = self.db_level - return rms_volume_norm(x=x, db_level=db_level) - ### save and load ### - def load_wav(self, filename: str, sr: int = None) -> np.ndarray: + def load_wav(self, filename: str, sr: Optional[int] = None) -> np.ndarray: """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before. @@ -596,10 +572,10 @@ def load_wav(self, filename: str, sr: int = None) -> np.ndarray: if self.do_sound_norm: x = self.sound_norm(x) if self.do_rms_norm: - x = self.rms_volume_norm(x, self.db_level) + x = rms_volume_norm(x=x, db_level=self.db_level) return x - def save_wav(self, wav: np.ndarray, path: str, sr: int = None, pipe_out=None) -> None: + def save_wav(self, wav: np.ndarray, path: str, sr: Optional[int] = None, pipe_out=None) -> None: """Save a waveform to a file using Scipy. Args: @@ -608,18 +584,14 @@ def save_wav(self, wav: np.ndarray, path: str, sr: int = None, pipe_out=None) -> sr (int, optional): Sampling rate used for saving to the file. Defaults to None. 
pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. """ - if self.do_rms_norm: - wav_norm = self.rms_volume_norm(wav, self.db_level) * 32767 - else: - wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) - - wav_norm = wav_norm.astype(np.int16) - if pipe_out: - wav_buffer = BytesIO() - scipy.io.wavfile.write(wav_buffer, sr if sr else self.sample_rate, wav_norm) - wav_buffer.seek(0) - pipe_out.buffer.write(wav_buffer.read()) - scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm) + save_wav( + wav=wav, + path=path, + sample_rate=sr if sr else self.sample_rate, + pipe_out=pipe_out, + do_rms_norm=self.do_rms_norm, + db_level=self.db_level, + ) def get_duration(self, filename: str) -> float: """Get the duration of a wav file using Librosa. diff --git a/TTS/utils/audio/torch_transforms.py b/TTS/utils/audio/torch_transforms.py index 632969c51a..59bb23cc4f 100644 --- a/TTS/utils/audio/torch_transforms.py +++ b/TTS/utils/audio/torch_transforms.py @@ -1,7 +1,113 @@ +import logging + import librosa import torch from torch import nn +logger = logging.getLogger(__name__) + + +hann_window = {} +mel_basis = {} + + +def amp_to_db(x: torch.Tensor, *, spec_gain: float = 1.0, clip_val: float = 1e-5) -> torch.Tensor: + """Spectral normalization / dynamic range compression.""" + return torch.log(torch.clamp(x, min=clip_val) * spec_gain) + + +def db_to_amp(x: torch.Tensor, *, spec_gain: float = 1.0) -> torch.Tensor: + """Spectral denormalization / dynamic range decompression.""" + return torch.exp(x) / spec_gain + + +def wav_to_spec(y: torch.Tensor, n_fft: int, hop_length: int, win_length: int, *, center: bool = False) -> torch.Tensor: + """ + Args Shapes: + - y : :math:`[B, 1, T]` + + Return Shapes: + - spec : :math:`[B,C,T]` + """ + y = y.squeeze(1) + + if torch.min(y) < -1.0: + logger.info("min value is %.3f", torch.min(y)) + if torch.max(y) > 1.0: + logger.info("max value is %.3f", torch.max(y)) + + global hann_window + wnsize_dtype_device = f"{win_length}_{y.dtype}_{y.device}" + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_length, + win_length=win_length, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + ) + + return torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + +def spec_to_mel( + spec: torch.Tensor, n_fft: int, num_mels: int, sample_rate: int, fmin: float, fmax: float +) -> torch.Tensor: + """ + Args Shapes: + - spec : :math:`[B,C,T]` + + Return Shapes: + - mel : :math:`[B,C,T]` + """ + global mel_basis + fmax_dtype_device = f"{n_fft}_{fmax}_{spec.dtype}_{spec.device}" + if fmax_dtype_device not in mel_basis: + # TODO: switch librosa to torchaudio + mel = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) + mel = torch.matmul(mel_basis[fmax_dtype_device], spec) + return amp_to_db(mel) + + +def wav_to_mel( + y: torch.Tensor, + n_fft: int, + num_mels: int, + sample_rate: int, + hop_length: int, + win_length: int, + fmin: float, + fmax: float, + *, + center: bool = False, +) -> torch.Tensor: + """ 
+ Args Shapes: + - y : :math:`[B, 1, T]` + + Return Shapes: + - spec : :math:`[B,C,T]` + """ + spec = wav_to_spec(y, n_fft, hop_length, win_length, center=center) + return spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax) + class TorchSTFT(nn.Module): # pylint: disable=abstract-method """Some of the audio processing funtions using Torch for faster batch processing. @@ -157,11 +263,3 @@ def _build_mel_basis(self): norm=self.mel_norm, ) self.mel_basis = torch.from_numpy(mel_basis).float() - - @staticmethod - def _amp_to_db(x, spec_gain=1.0): - return torch.log(torch.clamp(x, min=1e-5) * spec_gain) - - @staticmethod - def _db_to_amp(x, spec_gain=1.0): - return torch.exp(x) / spec_gain diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 3ee285232f..087ae7d0e1 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -4,13 +4,26 @@ import logging import re from pathlib import Path -from typing import Dict, Optional +from typing import Callable, Dict, Optional, TypeVar, Union import torch from packaging.version import Version +from typing_extensions import TypeIs logger = logging.getLogger(__name__) +_T = TypeVar("_T") + + +def exists(val: Union[_T, None]) -> TypeIs[_T]: + return val is not None + + +def default(val: Union[_T, None], d: Union[_T, Callable[[], _T]]) -> _T: + if exists(val): + return val + return d() if callable(d) else d + def to_camel(text): text = text.capitalize() @@ -54,25 +67,6 @@ def get_import_path(obj: object) -> str: return ".".join([type(obj).__module__, type(obj).__name__]) -def set_init_dict(model_dict, checkpoint_state, c): - # Partial initialization: if there is a mismatch with new and old layer, it is skipped. - for k, v in checkpoint_state.items(): - if k not in model_dict: - logger.warning("Layer missing in the model finition %s", k) - # 1. filter out unnecessary keys - pretrained_dict = {k: v for k, v in checkpoint_state.items() if k in model_dict} - # 2. filter out different size layers - pretrained_dict = {k: v for k, v in pretrained_dict.items() if v.numel() == model_dict[k].numel()} - # 3. skip reinit layers - if c.has("reinit_layers") and c.reinit_layers is not None: - for reinit_layer_name in c.reinit_layers: - pretrained_dict = {k: v for k, v in pretrained_dict.items() if reinit_layer_name not in k} - # 4. overwrite entries in the existing state dict - model_dict.update(pretrained_dict) - logger.info("%d / %d layers are restored.", len(pretrained_dict), len(model_dict)) - return model_dict - - def format_aux_input(def_args: Dict, kwargs: Dict) -> Dict: """Format kwargs to hande auxilary inputs to models. 
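# A minimal usage sketch for the new module-level helpers introduced above; the
# signatures follow this patch, while the shapes and parameter values below are
# illustrative only.
import torch

from TTS.utils.audio.torch_transforms import amp_to_db, db_to_amp, wav_to_mel, wav_to_spec
from TTS.utils.generic_utils import default, exists

# wav_to_spec/wav_to_mel take a [B, 1, T] waveform batch and return [B, C, T_frames].
y = torch.rand(2, 1, 22050) * 2 - 1  # two one-second clips in [-1, 1]
spec = wav_to_spec(y, n_fft=1024, hop_length=256, win_length=1024, center=False)
mel = wav_to_mel(
    y, n_fft=1024, num_mels=80, sample_rate=22050,
    hop_length=256, win_length=1024, fmin=0, fmax=8000, center=False,
)
print(spec.shape, mel.shape)  # C is n_fft // 2 + 1 = 513 for spec and num_mels = 80 for mel

# amp_to_db/db_to_amp are natural-log based and invert each other (up to clipping),
# which is what the new tests/aux_tests/test_torch_transforms.py further below verifies.
x = torch.rand(11) + 0.1
assert torch.allclose(db_to_amp(amp_to_db(x)), x)

# The new exists()/default() helpers implement the usual "value or lazy fallback" idiom.
assert exists(0) and not exists(None)
assert default(None, lambda: 42) == 42
assert default(7, 42) == 7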
diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index fb5071d9b0..38fcfd60e9 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -230,7 +230,7 @@ def _download_hf_model(self, model_item: Dict, output_path: str): self._download_zip_file(model_item["hf_url"], output_path, self.progress_bar) def download_fairseq_model(self, model_name, output_path): - URI_PREFIX = "https://coqui.gateway.scarf.sh/fairseq/" + URI_PREFIX = "https://dl.fbaipublicfiles.com/mms/tts/" _, lang, _, _ = model_name.split("/") model_download_uri = os.path.join(URI_PREFIX, f"{lang}.tar.gz") self._download_tar_file(model_download_uri, output_path, self.progress_bar) @@ -243,9 +243,9 @@ def set_model_url(model_item: Dict): elif "hf_url" in model_item: model_item["model_url"] = model_item["hf_url"] elif "fairseq" in model_item["model_name"]: - model_item["model_url"] = "https://coqui.gateway.scarf.sh/fairseq/" + model_item["model_url"] = "https://dl.fbaipublicfiles.com/mms/tts/" elif "xtts" in model_item["model_name"]: - model_item["model_url"] = "https://coqui.gateway.scarf.sh/xtts/" + model_item["model_url"] = "https://huggingface.co/coqui/" return model_item def _set_model_item(self, model_name): @@ -278,11 +278,11 @@ def _set_model_item(self, model_name): "contact": "info@coqui.ai", "tos_required": True, "hf_url": [ - f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/model.pth", - f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/config.json", - f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/vocab.json", - f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/hash.md5", - f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/speakers_xtts.pth", + f"https://huggingface.co/coqui/XTTS-v2/resolve/{model_version}/model.pth", + f"https://huggingface.co/coqui/XTTS-v2/resolve/{model_version}/config.json", + f"https://huggingface.co/coqui/XTTS-v2/resolve/{model_version}/vocab.json", + f"https://huggingface.co/coqui/XTTS-v2/resolve/{model_version}/hash.md5", + f"https://huggingface.co/coqui/XTTS-v2/resolve/{model_version}/speakers_xtts.pth", ], } else: @@ -424,7 +424,7 @@ def _find_files(output_path: str) -> Tuple[str, str]: model_file = None config_file = None for file_name in os.listdir(output_path): - if file_name in ["model_file.pth", "model_file.pth.tar", "model.pth"]: + if file_name in ["model_file.pth", "model_file.pth.tar", "model.pth", "checkpoint.pth"]: model_file = os.path.join(output_path, file_name) elif file_name == "config.json": config_file = os.path.join(output_path, file_name) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 90af4f48f9..a9b9feffc1 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -1,6 +1,7 @@ import logging import os import time +from pathlib import Path from typing import List import numpy as np @@ -15,7 +16,9 @@ from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence from TTS.utils.audio import AudioProcessor from TTS.utils.audio.numpy_transforms import save_wav +from TTS.vc.configs.openvoice_config import OpenVoiceConfig from TTS.vc.models import setup_model as setup_vc_model +from TTS.vc.models.openvoice import OpenVoice from TTS.vocoder.models import setup_model as setup_vocoder_model from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input @@ -25,6 +28,7 @@ class Synthesizer(nn.Module): def __init__( self, + *, tts_checkpoint: str = "", tts_config_path: str = "", tts_speakers_file: str = "", @@ -91,23 +95,20 @@ def __init__( 
if tts_checkpoint: self._load_tts(tts_checkpoint, tts_config_path, use_cuda) - self.output_sample_rate = self.tts_config.audio["sample_rate"] if vocoder_checkpoint: self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda) - self.output_sample_rate = self.vocoder_config.audio["sample_rate"] - if vc_checkpoint: + if vc_checkpoint and model_dir is None: self._load_vc(vc_checkpoint, vc_config, use_cuda) - self.output_sample_rate = self.vc_config.audio["output_sample_rate"] if model_dir: if "fairseq" in model_dir: self._load_fairseq_from_dir(model_dir, use_cuda) - self.output_sample_rate = self.tts_config.audio["sample_rate"] + elif "openvoice" in model_dir: + self._load_openvoice_from_dir(Path(model_dir), use_cuda) else: self._load_tts_from_dir(model_dir, use_cuda) - self.output_sample_rate = self.tts_config.audio["output_sample_rate"] @staticmethod def _get_segmenter(lang: str): @@ -136,6 +137,7 @@ def _load_vc(self, vc_checkpoint: str, vc_config_path: str, use_cuda: bool) -> N """ # pylint: disable=global-statement self.vc_config = load_config(vc_config_path) + self.output_sample_rate = self.vc_config.audio["output_sample_rate"] self.vc_model = setup_vc_model(config=self.vc_config) self.vc_model.load_checkpoint(self.vc_config, vc_checkpoint) if use_cuda: @@ -150,9 +152,24 @@ def _load_fairseq_from_dir(self, model_dir: str, use_cuda: bool) -> None: self.tts_model = Vits.init_from_config(self.tts_config) self.tts_model.load_fairseq_checkpoint(self.tts_config, checkpoint_dir=model_dir, eval=True) self.tts_config = self.tts_model.config + self.output_sample_rate = self.tts_config.audio["sample_rate"] if use_cuda: self.tts_model.cuda() + def _load_openvoice_from_dir(self, checkpoint: Path, use_cuda: bool) -> None: + """Load the OpenVoice model from a directory. + + We assume the model knows how to load itself from the directory and + there is a config.json file in the directory. + """ + self.vc_config = OpenVoiceConfig() + self.vc_model = OpenVoice.init_from_config(self.vc_config) + self.vc_model.load_checkpoint(self.vc_config, checkpoint, eval=True) + self.vc_config = self.vc_model.config + self.output_sample_rate = self.vc_config.audio["output_sample_rate"] + if use_cuda: + self.vc_model.cuda() + def _load_tts_from_dir(self, model_dir: str, use_cuda: bool) -> None: """Load the TTS model from a directory. @@ -160,6 +177,7 @@ def _load_tts_from_dir(self, model_dir: str, use_cuda: bool) -> None: """ config = load_config(os.path.join(model_dir, "config.json")) self.tts_config = config + self.output_sample_rate = self.tts_config.audio["output_sample_rate"] self.tts_model = setup_tts_model(config) self.tts_model.load_checkpoint(config, checkpoint_dir=model_dir, eval=True) if use_cuda: @@ -181,6 +199,7 @@ def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) - """ # pylint: disable=global-statement self.tts_config = load_config(tts_config_path) + self.output_sample_rate = self.tts_config.audio["sample_rate"] if self.tts_config["use_phonemes"] and self.tts_config["phonemizer"] is None: raise ValueError("Phonemizer is not defined in the TTS config.") @@ -218,6 +237,7 @@ def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> N use_cuda (bool): enable/disable CUDA use. 
""" self.vocoder_config = load_config(model_config) + self.output_sample_rate = self.vocoder_config.audio["sample_rate"] self.vocoder_ap = AudioProcessor(**self.vocoder_config.audio) self.vocoder_model = setup_vocoder_model(self.vocoder_config) self.vocoder_model.load_checkpoint(self.vocoder_config, model_file, eval=True) diff --git a/TTS/vc/configs/freevc_config.py b/TTS/vc/configs/freevc_config.py index 207181b303..d600bfb1f4 100644 --- a/TTS/vc/configs/freevc_config.py +++ b/TTS/vc/configs/freevc_config.py @@ -229,7 +229,7 @@ class FreeVCConfig(BaseVCConfig): If true, language embedding is used. Defaults to `False`. Note: - Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters. + Check :class:`TTS.tts.configs.shared_configs.BaseVCConfig` for the inherited parameters. Example: diff --git a/TTS/vc/configs/openvoice_config.py b/TTS/vc/configs/openvoice_config.py new file mode 100644 index 0000000000..261cdd6f47 --- /dev/null +++ b/TTS/vc/configs/openvoice_config.py @@ -0,0 +1,201 @@ +from dataclasses import dataclass, field +from typing import Optional + +from coqpit import Coqpit + +from TTS.vc.configs.shared_configs import BaseVCConfig + + +@dataclass +class OpenVoiceAudioConfig(Coqpit): + """Audio configuration + + Args: + input_sample_rate (int): + The sampling rate of the input waveform. + + output_sample_rate (int): + The sampling rate of the output waveform. + + fft_size (int): + The length of the filter. + + hop_length (int): + The hop length. + + win_length (int): + The window length. + """ + + input_sample_rate: int = field(default=22050) + output_sample_rate: int = field(default=22050) + fft_size: int = field(default=1024) + hop_length: int = field(default=256) + win_length: int = field(default=1024) + + +@dataclass +class OpenVoiceArgs(Coqpit): + """OpenVoice model arguments. + + zero_g (bool): + Whether to zero the gradients. + + inter_channels (int): + The number of channels in the intermediate layers. + + hidden_channels (int): + The number of channels in the hidden layers. + + filter_channels (int): + The number of channels in the filter layers. + + n_heads (int): + The number of attention heads. + + n_layers (int): + The number of layers. + + kernel_size (int): + The size of the kernel. + + p_dropout (float): + The dropout probability. + + resblock (str): + The type of residual block. + + resblock_kernel_sizes (List[int]): + The kernel sizes for the residual blocks. + + resblock_dilation_sizes (List[List[int]]): + The dilation sizes for the residual blocks. + + upsample_rates (List[int]): + The upsample rates. + + upsample_initial_channel (int): + The number of channels in the initial upsample layer. + + upsample_kernel_sizes (List[int]): + The kernel sizes for the upsample layers. + + n_layers_q (int): + The number of layers in the quantization network. + + use_spectral_norm (bool): + Whether to use spectral normalization. + + gin_channels (int): + The number of channels in the global conditioning vector. 
+ + tau (float): + Tau parameter for the posterior encoder + """ + + zero_g: bool = field(default=True) + inter_channels: int = field(default=192) + hidden_channels: int = field(default=192) + filter_channels: int = field(default=768) + n_heads: int = field(default=2) + n_layers: int = field(default=6) + kernel_size: int = field(default=3) + p_dropout: float = field(default=0.1) + resblock: str = field(default="1") + resblock_kernel_sizes: list[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes: list[list[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates: list[int] = field(default_factory=lambda: [8, 8, 2, 2]) + upsample_initial_channel: int = field(default=512) + upsample_kernel_sizes: list[int] = field(default_factory=lambda: [16, 16, 4, 4]) + n_layers_q: int = field(default=3) + use_spectral_norm: bool = field(default=False) + gin_channels: int = field(default=256) + tau: float = field(default=0.3) + + +@dataclass +class OpenVoiceConfig(BaseVCConfig): + """Defines parameters for OpenVoice VC model. + + Args: + model (str): + Model name. Do not change unless you know what you are doing. + + model_args (OpenVoiceArgs): + Model architecture arguments. Defaults to `OpenVoiceArgs()`. + + audio (OpenVoiceAudioConfig): + Audio processing configuration. Defaults to `OpenVoiceAudioConfig()`. + + return_wav (bool): + If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`. + + compute_linear_spec (bool): + If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`. + + use_weighted_sampler (bool): + If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`. + + weighted_sampler_attrs (dict): + Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities + by overweighting `root_path` by 2.0. Defaults to `{}`. + + weighted_sampler_multipliers (dict): + Weight each unique value of a key returned by the formatter for weighted sampling. + For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`. + It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`. + + r (int): + Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`. + + add_blank (bool): + If true, a blank token is added in between every character. Defaults to `True`. + + Note: + Check :class:`TTS.tts.configs.shared_configs.BaseVCConfig` for the inherited parameters. 
+ + Example: + + >>> from TTS.vc.configs.openvoice_config import OpenVoiceConfig + >>> config = OpenVoiceConfig() + """ + + model: str = "openvoice" + # model specific params + model_args: OpenVoiceArgs = field(default_factory=OpenVoiceArgs) + audio: OpenVoiceAudioConfig = field(default_factory=OpenVoiceAudioConfig) + + # optimizer + # TODO with training support + + # loss params + # TODO with training support + + # data loader params + return_wav: bool = True + compute_linear_spec: bool = True + + # sampler params + use_weighted_sampler: bool = False # TODO: move it to the base config + weighted_sampler_attrs: dict = field(default_factory=lambda: {}) + weighted_sampler_multipliers: dict = field(default_factory=lambda: {}) + + # overrides + r: int = 1 # DO NOT CHANGE + add_blank: bool = True + + # multi-speaker settings + # use speaker embedding layer + num_speakers: int = 0 + speakers_file: Optional[str] = None + speaker_embedding_channels: int = 256 + + # use d-vectors + use_d_vector_file: bool = False + d_vector_file: Optional[list[str]] = None + d_vector_dim: Optional[int] = None + + def __post_init__(self) -> None: + for key, val in self.model_args.items(): + if hasattr(self, key): + self[key] = val diff --git a/TTS/tts/utils/monotonic_align/__init__.py b/TTS/vc/layers/__init__.py similarity index 100% rename from TTS/tts/utils/monotonic_align/__init__.py rename to TTS/vc/layers/__init__.py diff --git a/TTS/vc/modules/__init__.py b/TTS/vc/layers/freevc/__init__.py similarity index 100% rename from TTS/vc/modules/__init__.py rename to TTS/vc/layers/freevc/__init__.py diff --git a/TTS/vc/modules/freevc/commons.py b/TTS/vc/layers/freevc/commons.py similarity index 81% rename from TTS/vc/modules/freevc/commons.py rename to TTS/vc/layers/freevc/commons.py index feea7f34dc..49889e4816 100644 --- a/TTS/vc/modules/freevc/commons.py +++ b/TTS/vc/layers/freevc/commons.py @@ -3,7 +3,7 @@ import torch from torch.nn import functional as F -from TTS.tts.utils.helpers import convert_pad_shape, sequence_mask +from TTS.tts.utils.helpers import convert_pad_shape def init_weights(m: torch.nn.Module, mean: float = 0.0, std: float = 0.01) -> None: @@ -96,37 +96,11 @@ def subsequent_mask(length): return mask -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - def shift_1d(x): x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] return x -def generate_path(duration, mask): - """ - duration: [b, 1, t_x] - mask: [b, 1, t_y, t_x] - """ - b, _, t_y, t_x = mask.shape - cum_duration = torch.cumsum(duration, -1) - - cum_duration_flat = cum_duration.view(b * t_x) - path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) - path = path.view(b, t_x, t_y) - path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] - path = path.unsqueeze(1).transpose(2, 3) * mask - return path - - def clip_grad_value_(parameters, clip_value, norm_type=2): if isinstance(parameters, torch.Tensor): parameters = [parameters] diff --git a/TTS/vc/layers/freevc/mel_processing.py b/TTS/vc/layers/freevc/mel_processing.py new file mode 100644 index 0000000000..017d900284 --- /dev/null +++ b/TTS/vc/layers/freevc/mel_processing.py @@ -0,0 +1,58 @@ +import logging + +import torch +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn + +from 
TTS.utils.audio.torch_transforms import amp_to_db + +logger = logging.getLogger(__name__) + +MAX_WAV_VALUE = 32768.0 + +mel_basis = {} +hann_window = {} + + +def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): + if torch.min(y) < -1.0: + logger.info("Min value is: %.3f", torch.min(y)) + if torch.max(y) > 1.0: + logger.info("Max value is: %.3f", torch.max(y)) + + global mel_basis, hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad( + y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" + ) + y = y.squeeze(1) + + spec = torch.view_as_real( + torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + ) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + + spec = torch.matmul(mel_basis[fmax_dtype_device], spec) + spec = amp_to_db(spec) + + return spec diff --git a/TTS/vc/modules/freevc/modules.py b/TTS/vc/layers/freevc/modules.py similarity index 98% rename from TTS/vc/modules/freevc/modules.py rename to TTS/vc/layers/freevc/modules.py index 722444a303..c34f22d701 100644 --- a/TTS/vc/modules/freevc/modules.py +++ b/TTS/vc/layers/freevc/modules.py @@ -5,9 +5,9 @@ from torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations -import TTS.vc.modules.freevc.commons as commons from TTS.tts.layers.generic.normalization import LayerNorm2 -from TTS.vc.modules.freevc.commons import init_weights +from TTS.tts.layers.generic.wavenet import fused_add_tanh_sigmoid_multiply +from TTS.vc.layers.freevc.commons import init_weights from TTS.vocoder.models.hifigan_generator import get_padding LRELU_SLOPE = 0.1 @@ -99,7 +99,7 @@ def forward(self, x, x_mask, g=None, **kwargs): else: g_l = torch.zeros_like(x_in) - acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) acts = self.drop(acts) res_skip_acts = self.res_skip_layers[i](acts) diff --git a/TTS/vc/modules/freevc/__init__.py b/TTS/vc/layers/freevc/speaker_encoder/__init__.py similarity index 100% rename from TTS/vc/modules/freevc/__init__.py rename to TTS/vc/layers/freevc/speaker_encoder/__init__.py diff --git a/TTS/vc/modules/freevc/speaker_encoder/audio.py b/TTS/vc/layers/freevc/speaker_encoder/audio.py similarity index 97% rename from TTS/vc/modules/freevc/speaker_encoder/audio.py rename to TTS/vc/layers/freevc/speaker_encoder/audio.py index 5b23a4dbb6..5fa317ce45 100644 --- a/TTS/vc/modules/freevc/speaker_encoder/audio.py +++ b/TTS/vc/layers/freevc/speaker_encoder/audio.py @@ -5,7 +5,7 @@ import librosa import numpy as np -from TTS.vc.modules.freevc.speaker_encoder.hparams import ( +from TTS.vc.layers.freevc.speaker_encoder.hparams import ( audio_norm_target_dBFS, mel_n_channels, mel_window_length, diff --git 
a/TTS/vc/modules/freevc/speaker_encoder/hparams.py b/TTS/vc/layers/freevc/speaker_encoder/hparams.py similarity index 100% rename from TTS/vc/modules/freevc/speaker_encoder/hparams.py rename to TTS/vc/layers/freevc/speaker_encoder/hparams.py diff --git a/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py similarity index 98% rename from TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py rename to TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py index 294bf322cb..a6d5bcf942 100644 --- a/TTS/vc/modules/freevc/speaker_encoder/speaker_encoder.py +++ b/TTS/vc/layers/freevc/speaker_encoder/speaker_encoder.py @@ -7,8 +7,8 @@ from torch import nn from trainer.io import load_fsspec -from TTS.vc.modules.freevc.speaker_encoder import audio -from TTS.vc.modules.freevc.speaker_encoder.hparams import ( +from TTS.vc.layers.freevc.speaker_encoder import audio +from TTS.vc.layers.freevc.speaker_encoder.hparams import ( mel_n_channels, mel_window_step, model_embedding_size, diff --git a/TTS/vc/modules/freevc/wavlm/__init__.py b/TTS/vc/layers/freevc/wavlm/__init__.py similarity index 94% rename from TTS/vc/modules/freevc/wavlm/__init__.py rename to TTS/vc/layers/freevc/wavlm/__init__.py index 4046e137f5..62f7e74aaf 100644 --- a/TTS/vc/modules/freevc/wavlm/__init__.py +++ b/TTS/vc/layers/freevc/wavlm/__init__.py @@ -6,7 +6,7 @@ from trainer.io import get_user_data_dir from TTS.utils.generic_utils import is_pytorch_at_least_2_4 -from TTS.vc.modules.freevc.wavlm.wavlm import WavLM, WavLMConfig +from TTS.vc.layers.freevc.wavlm.wavlm import WavLM, WavLMConfig logger = logging.getLogger(__name__) diff --git a/TTS/vc/modules/freevc/wavlm/config.json b/TTS/vc/layers/freevc/wavlm/config.json similarity index 100% rename from TTS/vc/modules/freevc/wavlm/config.json rename to TTS/vc/layers/freevc/wavlm/config.json diff --git a/TTS/vc/modules/freevc/wavlm/modules.py b/TTS/vc/layers/freevc/wavlm/modules.py similarity index 100% rename from TTS/vc/modules/freevc/wavlm/modules.py rename to TTS/vc/layers/freevc/wavlm/modules.py diff --git a/TTS/vc/modules/freevc/wavlm/wavlm.py b/TTS/vc/layers/freevc/wavlm/wavlm.py similarity index 99% rename from TTS/vc/modules/freevc/wavlm/wavlm.py rename to TTS/vc/layers/freevc/wavlm/wavlm.py index 10dd09ed0c..775f3e5979 100644 --- a/TTS/vc/modules/freevc/wavlm/wavlm.py +++ b/TTS/vc/layers/freevc/wavlm/wavlm.py @@ -17,7 +17,7 @@ import torch.nn.functional as F from torch.nn import LayerNorm -from TTS.vc.modules.freevc.wavlm.modules import ( +from TTS.vc.layers.freevc.wavlm.modules import ( Fp32GroupNorm, Fp32LayerNorm, GLU_Linear, diff --git a/TTS/vc/models/__init__.py b/TTS/vc/models/__init__.py index a498b292b7..a9807d7006 100644 --- a/TTS/vc/models/__init__.py +++ b/TTS/vc/models/__init__.py @@ -6,11 +6,6 @@ logger = logging.getLogger(__name__) -def to_camel(text): - text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) - - def setup_model(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "BaseVC": logger.info("Using model: %s", config.model) # fetch the right model implementation. 
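# The local to_camel() removed from TTS/vc/models/__init__.py above was a duplicate of
# TTS.utils.generic_utils.to_camel (the vocoder model factory further below switches to
# that shared helper). For reference, the shared helper maps snake_case model names from
# the config to CamelCase class-name candidates; the inputs here are illustrative
# examples, not values taken from this patch.
from TTS.utils.generic_utils import to_camel

assert to_camel("hifigan_generator") == "HifiganGenerator"
assert to_camel("freevc") == "Freevc"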
diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py index e5cfdc1e61..c654219c39 100644 --- a/TTS/vc/models/freevc.py +++ b/TTS/vc/models/freevc.py @@ -6,24 +6,23 @@ import torch from coqpit import Coqpit from torch import nn -from torch.nn import Conv1d, Conv2d, ConvTranspose1d +from torch.nn import Conv1d, ConvTranspose1d from torch.nn import functional as F -from torch.nn.utils import spectral_norm from torch.nn.utils.parametrizations import weight_norm from torch.nn.utils.parametrize import remove_parametrizations from trainer.io import load_fsspec -import TTS.vc.modules.freevc.commons as commons -import TTS.vc.modules.freevc.modules as modules +import TTS.vc.layers.freevc.modules as modules +from TTS.tts.layers.vits.discriminator import DiscriminatorS from TTS.tts.utils.helpers import sequence_mask from TTS.tts.utils.speakers import SpeakerManager from TTS.vc.configs.freevc_config import FreeVCConfig +from TTS.vc.layers.freevc.commons import init_weights, rand_slice_segments +from TTS.vc.layers.freevc.mel_processing import mel_spectrogram_torch +from TTS.vc.layers.freevc.speaker_encoder.speaker_encoder import SpeakerEncoder as SpeakerEncoderEx +from TTS.vc.layers.freevc.wavlm import get_wavlm from TTS.vc.models.base_vc import BaseVC -from TTS.vc.modules.freevc.commons import init_weights -from TTS.vc.modules.freevc.mel_processing import mel_spectrogram_torch -from TTS.vc.modules.freevc.speaker_encoder.speaker_encoder import SpeakerEncoder as SpeakerEncoderEx -from TTS.vc.modules.freevc.wavlm import get_wavlm -from TTS.vocoder.models.hifigan_generator import get_padding +from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP logger = logging.getLogger(__name__) @@ -164,75 +163,6 @@ def remove_weight_norm(self): remove_parametrizations(l, "weight") -class DiscriminatorP(torch.nn.Module): - def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): - super(DiscriminatorP, self).__init__() - self.period = period - self.use_spectral_norm = use_spectral_norm - norm_f = weight_norm if use_spectral_norm is False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), - ] - ) - self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) - - def forward(self, x): - fmap = [] - - # 1d to 2d - b, c, t = x.shape - if t % self.period != 0: # pad first - n_pad = self.period - (t % self.period) - x = F.pad(x, (0, n_pad), "reflect") - t = t + n_pad - x = x.view(b, c, t // self.period, self.period) - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class DiscriminatorS(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm is False else spectral_norm - self.convs = nn.ModuleList( - [ - norm_f(Conv1d(1, 16, 15, 1, padding=7)), - norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), - norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), - 
norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), - norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), - norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), - ] - ) - self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) - - def forward(self, x): - fmap = [] - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - class MultiPeriodDiscriminator(torch.nn.Module): def __init__(self, use_spectral_norm=False): super(MultiPeriodDiscriminator, self).__init__() @@ -454,7 +384,7 @@ def forward( z_p = self.flow(z, spec_mask, g=g) # Randomly slice z and compute o using dec - z_slice, ids_slice = commons.rand_slice_segments(z, spec_lengths, self.segment_size) + z_slice, ids_slice = rand_slice_segments(z, spec_lengths, self.segment_size) o = self.dec(z_slice, g=g) return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q) diff --git a/TTS/vc/models/openvoice.py b/TTS/vc/models/openvoice.py new file mode 100644 index 0000000000..135b0861b9 --- /dev/null +++ b/TTS/vc/models/openvoice.py @@ -0,0 +1,320 @@ +import json +import logging +import os +from pathlib import Path +from typing import Any, Mapping, Optional, Union + +import librosa +import numpy as np +import numpy.typing as npt +import torch +from coqpit import Coqpit +from torch import nn +from torch.nn import functional as F +from trainer.io import load_fsspec + +from TTS.tts.layers.vits.networks import PosteriorEncoder +from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.audio.torch_transforms import wav_to_spec +from TTS.vc.configs.openvoice_config import OpenVoiceConfig +from TTS.vc.models.base_vc import BaseVC +from TTS.vc.models.freevc import Generator, ResidualCouplingBlock + +logger = logging.getLogger(__name__) + + +class ReferenceEncoder(nn.Module): + """NN module creating a fixed size prosody embedding from a spectrogram. 
+ + inputs: mel spectrograms [batch_size, num_spec_frames, num_mel] + outputs: [batch_size, embedding_dim] + """ + + def __init__(self, spec_channels: int, embedding_dim: int = 0, layernorm: bool = True) -> None: + super().__init__() + self.spec_channels = spec_channels + ref_enc_filters = [32, 32, 64, 64, 128, 128] + K = len(ref_enc_filters) + filters = [1] + ref_enc_filters + convs = [ + torch.nn.utils.parametrizations.weight_norm( + nn.Conv2d( + in_channels=filters[i], + out_channels=filters[i + 1], + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1), + ) + ) + for i in range(K) + ] + self.convs = nn.ModuleList(convs) + + out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K) + self.gru = nn.GRU( + input_size=ref_enc_filters[-1] * out_channels, + hidden_size=256 // 2, + batch_first=True, + ) + self.proj = nn.Linear(128, embedding_dim) + self.layernorm = nn.LayerNorm(self.spec_channels) if layernorm else None + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + N = inputs.size(0) + + out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs] + if self.layernorm is not None: + out = self.layernorm(out) + + for conv in self.convs: + out = conv(out) + out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K] + + out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K] + T = out.size(1) + N = out.size(0) + out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K] + + self.gru.flatten_parameters() + _memory, out = self.gru(out) # out --- [1, N, 128] + + return self.proj(out.squeeze(0)) + + def calculate_channels(self, L: int, kernel_size: int, stride: int, pad: int, n_convs: int) -> int: + for _ in range(n_convs): + L = (L - kernel_size + 2 * pad) // stride + 1 + return L + + +class OpenVoice(BaseVC): + """ + OpenVoice voice conversion model (inference only). + + Source: https://github.com/myshell-ai/OpenVoice + Paper: https://arxiv.org/abs/2312.01479 + + Paper abstract: + We introduce OpenVoice, a versatile voice cloning approach that requires + only a short audio clip from the reference speaker to replicate their voice and + generate speech in multiple languages. OpenVoice represents a significant + advancement in addressing the following open challenges in the field: 1) + Flexible Voice Style Control. OpenVoice enables granular control over voice + styles, including emotion, accent, rhythm, pauses, and intonation, in addition + to replicating the tone color of the reference speaker. The voice styles are not + directly copied from and constrained by the style of the reference speaker. + Previous approaches lacked the ability to flexibly manipulate voice styles after + cloning. 2) Zero-Shot Cross-Lingual Voice Cloning. OpenVoice achieves zero-shot + cross-lingual voice cloning for languages not included in the massive-speaker + training set. Unlike previous approaches, which typically require extensive + massive-speaker multi-lingual (MSML) dataset for all languages, OpenVoice can + clone voices into a new language without any massive-speaker training data for + that language. OpenVoice is also computationally efficient, costing tens of + times less than commercially available APIs that offer even inferior + performance. To foster further research in the field, we have made the source + code and trained model publicly accessible. We also provide qualitative results + in our demo website. 
Prior to its public release, our internal version of + OpenVoice was used tens of millions of times by users worldwide between May and + October 2023, serving as the backend of MyShell. + """ + + def __init__(self, config: Coqpit, speaker_manager: Optional[SpeakerManager] = None) -> None: + super().__init__(config, None, speaker_manager, None) + + self.init_multispeaker(config) + + self.zero_g = self.args.zero_g + self.inter_channels = self.args.inter_channels + self.hidden_channels = self.args.hidden_channels + self.filter_channels = self.args.filter_channels + self.n_heads = self.args.n_heads + self.n_layers = self.args.n_layers + self.kernel_size = self.args.kernel_size + self.p_dropout = self.args.p_dropout + self.resblock = self.args.resblock + self.resblock_kernel_sizes = self.args.resblock_kernel_sizes + self.resblock_dilation_sizes = self.args.resblock_dilation_sizes + self.upsample_rates = self.args.upsample_rates + self.upsample_initial_channel = self.args.upsample_initial_channel + self.upsample_kernel_sizes = self.args.upsample_kernel_sizes + self.n_layers_q = self.args.n_layers_q + self.use_spectral_norm = self.args.use_spectral_norm + self.gin_channels = self.args.gin_channels + self.tau = self.args.tau + + self.spec_channels = config.audio.fft_size // 2 + 1 + + self.dec = Generator( + self.inter_channels, + self.resblock, + self.resblock_kernel_sizes, + self.resblock_dilation_sizes, + self.upsample_rates, + self.upsample_initial_channel, + self.upsample_kernel_sizes, + gin_channels=self.gin_channels, + ) + self.enc_q = PosteriorEncoder( + self.spec_channels, + self.inter_channels, + self.hidden_channels, + kernel_size=5, + dilation_rate=1, + num_layers=16, + cond_channels=self.gin_channels, + ) + + self.flow = ResidualCouplingBlock( + self.inter_channels, + self.hidden_channels, + kernel_size=5, + dilation_rate=1, + n_layers=4, + gin_channels=self.gin_channels, + ) + + self.ref_enc = ReferenceEncoder(self.spec_channels, self.gin_channels) + + @property + def device(self) -> torch.device: + return next(self.parameters()).device + + @staticmethod + def init_from_config(config: OpenVoiceConfig) -> "OpenVoice": + return OpenVoice(config) + + def init_multispeaker(self, config: Coqpit, data: Optional[list[Any]] = None) -> None: + """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer + or with external `d_vectors` computed from a speaker encoder model. + + You must provide a `speaker_manager` at initialization to set up the multi-speaker modules. + + Args: + config (Coqpit): Model configuration. + data (list, optional): Dataset items to infer number of speakers. Defaults to None. 
+ """ + self.num_spks = config.num_speakers + if self.speaker_manager: + self.num_spks = self.speaker_manager.num_speakers + + def load_checkpoint( + self, + config: OpenVoiceConfig, + checkpoint_path: Union[str, os.PathLike[Any]], + eval: bool = False, + strict: bool = True, + cache: bool = False, + ) -> None: + """Map from OpenVoice's config structure.""" + config_path = Path(checkpoint_path).parent / "config.json" + with open(config_path, encoding="utf-8") as f: + config_org = json.load(f) + self.config.audio.input_sample_rate = config_org["data"]["sampling_rate"] + self.config.audio.output_sample_rate = config_org["data"]["sampling_rate"] + self.config.audio.fft_size = config_org["data"]["filter_length"] + self.config.audio.hop_length = config_org["data"]["hop_length"] + self.config.audio.win_length = config_org["data"]["win_length"] + state = load_fsspec(str(checkpoint_path), map_location=torch.device("cpu"), cache=cache) + self.load_state_dict(state["model"], strict=strict) + if eval: + self.eval() + + def forward(self) -> None: ... + def train_step(self) -> None: ... + def eval_step(self) -> None: ... + + @staticmethod + def _set_x_lengths(x: torch.Tensor, aux_input: Mapping[str, Optional[torch.Tensor]]) -> torch.Tensor: + if "x_lengths" in aux_input and aux_input["x_lengths"] is not None: + return aux_input["x_lengths"] + return torch.tensor(x.shape[1:2]).to(x.device) + + @torch.no_grad() + def inference( + self, + x: torch.Tensor, + aux_input: Mapping[str, Optional[torch.Tensor]] = {"x_lengths": None, "g_src": None, "g_tgt": None}, + ) -> dict[str, torch.Tensor]: + """ + Inference pass of the model + + Args: + x (torch.Tensor): Input tensor. Shape: (batch_size, c_seq_len). + x_lengths (torch.Tensor): Lengths of the input tensor. Shape: (batch_size,). + g_src (torch.Tensor): Source speaker embedding tensor. Shape: (batch_size, spk_emb_dim). + g_tgt (torch.Tensor): Target speaker embedding tensor. Shape: (batch_size, spk_emb_dim). + + Returns: + o_hat: Output spectrogram tensor. Shape: (batch_size, spec_seq_len, spec_dim). + x_mask: Spectrogram mask. Shape: (batch_size, spec_seq_len). + (z, z_p, z_hat): A tuple of latent variables. 
+ """ + x_lengths = self._set_x_lengths(x, aux_input) + if "g_src" in aux_input and aux_input["g_src"] is not None: + g_src = aux_input["g_src"] + else: + raise ValueError("aux_input must define g_src") + if "g_tgt" in aux_input and aux_input["g_tgt"] is not None: + g_tgt = aux_input["g_tgt"] + else: + raise ValueError("aux_input must define g_tgt") + z, _m_q, _logs_q, y_mask = self.enc_q( + x, x_lengths, g=g_src if not self.zero_g else torch.zeros_like(g_src), tau=self.tau + ) + z_p = self.flow(z, y_mask, g=g_src) + z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) + o_hat = self.dec(z_hat * y_mask, g=g_tgt if not self.zero_g else torch.zeros_like(g_tgt)) + return { + "model_outputs": o_hat, + "y_mask": y_mask, + "z": z, + "z_p": z_p, + "z_hat": z_hat, + } + + def load_audio(self, wav: Union[str, npt.NDArray[np.float32], torch.Tensor, list[float]]) -> torch.Tensor: + """Read and format the input audio.""" + if isinstance(wav, str): + out = torch.from_numpy(librosa.load(wav, sr=self.config.audio.input_sample_rate)[0]) + elif isinstance(wav, np.ndarray): + out = torch.from_numpy(wav) + elif isinstance(wav, list): + out = torch.from_numpy(np.array(wav)) + else: + out = wav + return out.to(self.device).float() + + def extract_se(self, audio: Union[str, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]: + audio_ref = self.load_audio(audio) + y = torch.FloatTensor(audio_ref) + y = y.to(self.device) + y = y.unsqueeze(0) + spec = wav_to_spec( + y, + n_fft=self.config.audio.fft_size, + hop_length=self.config.audio.hop_length, + win_length=self.config.audio.win_length, + center=False, + ).to(self.device) + with torch.no_grad(): + g = self.ref_enc(spec.transpose(1, 2)).unsqueeze(-1) + + return g, spec + + @torch.inference_mode() + def voice_conversion(self, src: Union[str, torch.Tensor], tgt: Union[str, torch.Tensor]) -> npt.NDArray[np.float32]: + """ + Voice conversion pass of the model. + + Args: + src (str or torch.Tensor): Source utterance. + tgt (str or torch.Tensor): Target utterance. + + Returns: + Output numpy array. 
+ """ + src_se, src_spec = self.extract_se(src) + tgt_se, _ = self.extract_se(tgt) + + aux_input = {"g_src": src_se, "g_tgt": tgt_se} + audio = self.inference(src_spec, aux_input) + return audio["model_outputs"][0, 0].data.cpu().float().numpy() diff --git a/TTS/vc/modules/freevc/mel_processing.py b/TTS/vc/modules/freevc/mel_processing.py deleted file mode 100644 index a3e251891a..0000000000 --- a/TTS/vc/modules/freevc/mel_processing.py +++ /dev/null @@ -1,133 +0,0 @@ -import logging - -import torch -import torch.utils.data -from librosa.filters import mel as librosa_mel_fn - -logger = logging.getLogger(__name__) - -MAX_WAV_VALUE = 32768.0 - - -def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): - """ - PARAMS - ------ - C: compression factor - """ - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def dynamic_range_decompression_torch(x, C=1): - """ - PARAMS - ------ - C: compression factor used to compress - """ - return torch.exp(x) / C - - -def spectral_normalize_torch(magnitudes): - output = dynamic_range_compression_torch(magnitudes) - return output - - -def spectral_de_normalize_torch(magnitudes): - output = dynamic_range_decompression_torch(magnitudes) - return output - - -mel_basis = {} -hann_window = {} - - -def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): - if torch.min(y) < -1.0: - logger.info("Min value is: %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("Max value is: %.3f", torch.max(y)) - - global hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - wnsize_dtype_device = str(win_size) + "_" + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - return spec - - -def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): - global mel_basis - dtype_device = str(spec.dtype) + "_" + str(spec.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) - spec = spectral_normalize_torch(spec) - return spec - - -def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): - if torch.min(y) < -1.0: - logger.info("Min value is: %.3f", torch.min(y)) - if torch.max(y) > 1.0: - logger.info("Max value is: %.3f", torch.max(y)) - - global mel_basis, hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - wnsize_dtype_device = str(win_size) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = 
torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad( - y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode="reflect" - ) - y = y.squeeze(1) - - spec = torch.view_as_real( - torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True, - ) - ) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) - spec = spectral_normalize_torch(spec) - - return spec diff --git a/TTS/vc/modules/freevc/speaker_encoder/__init__.py b/TTS/vc/modules/freevc/speaker_encoder/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/TTS/vocoder/models/__init__.py b/TTS/vocoder/models/__init__.py index 7a1716f16d..b6a1850484 100644 --- a/TTS/vocoder/models/__init__.py +++ b/TTS/vocoder/models/__init__.py @@ -4,12 +4,9 @@ from coqpit import Coqpit -logger = logging.getLogger(__name__) - +from TTS.utils.generic_utils import to_camel -def to_camel(text): - text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) +logger = logging.getLogger(__name__) def setup_model(config: Coqpit): diff --git a/TTS/vocoder/models/hifigan_generator.py b/TTS/vocoder/models/hifigan_generator.py index afdd59a859..8273d02037 100644 --- a/TTS/vocoder/models/hifigan_generator.py +++ b/TTS/vocoder/models/hifigan_generator.py @@ -178,6 +178,7 @@ def __init__( conv_pre_weight_norm=True, conv_post_weight_norm=True, conv_post_bias=True, + cond_in_each_up_layer=False, ): r"""HiFiGAN Generator with Multi-Receptive Field Fusion (MRF) @@ -202,6 +203,8 @@ def __init__( self.inference_padding = inference_padding self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_factors) + self.cond_in_each_up_layer = cond_in_each_up_layer + # initial upsampling layers self.conv_pre = weight_norm(Conv1d(in_channels, upsample_initial_channel, 7, 1, padding=3)) resblock = ResBlock1 if resblock_type == "1" else ResBlock2 @@ -236,6 +239,12 @@ def __init__( if not conv_post_weight_norm: remove_parametrizations(self.conv_post, "weight") + if self.cond_in_each_up_layer: + self.conds = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + self.conds.append(nn.Conv1d(cond_channels, ch, 1)) + def forward(self, x, g=None): """ Args: @@ -255,6 +264,10 @@ def forward(self, x, g=None): for i in range(self.num_upsamples): o = F.leaky_relu(o, LRELU_SLOPE) o = self.ups[i](o) + + if self.cond_in_each_up_layer: + o = o + self.conds[i](g) + z_sum = None for j in range(self.num_kernels): if z_sum is None: diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py index 6a4d4ca6e7..e60c8781f0 100644 --- a/TTS/vocoder/models/parallel_wavegan_generator.py +++ b/TTS/vocoder/models/parallel_wavegan_generator.py @@ -12,6 +12,13 @@ logger = logging.getLogger(__name__) +def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2**x): + assert layers % stacks == 0 + layers_per_cycle = layers // stacks + dilations = [dilation(i % layers_per_cycle) for i in range(layers)] + return (kernel_size - 1) * sum(dilations) + 1 + + class ParallelWaveganGenerator(torch.nn.Module): """PWGAN generator as in https://arxiv.org/pdf/1910.11480.pdf. It is similar to WaveNet with no causal convolution. 
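# _get_receptive_field_size() is promoted to a module-level helper here so that the
# UnivNet generator below can reuse it. A worked example for a typical ParallelWaveGAN
# setup (30 layers in 3 stacks, kernel size 3 -- values chosen for illustration): each
# 10-layer cycle uses dilations 1, 2, ..., 512, so the receptive field is
# (3 - 1) * 3 * (2**10 - 1) + 1 = 6139 samples.
from TTS.vocoder.models.parallel_wavegan_generator import _get_receptive_field_size

assert _get_receptive_field_size(layers=30, stacks=3, kernel_size=3) == 6139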
@@ -144,16 +151,9 @@ def _apply_weight_norm(m): self.apply(_apply_weight_norm) - @staticmethod - def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2**x): - assert layers % stacks == 0 - layers_per_cycle = layers // stacks - dilations = [dilation(i % layers_per_cycle) for i in range(layers)] - return (kernel_size - 1) * sum(dilations) + 1 - @property def receptive_field_size(self): - return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) + return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size) def load_checkpoint( self, config, checkpoint_path, eval=False, cache=False diff --git a/TTS/vocoder/models/univnet_generator.py b/TTS/vocoder/models/univnet_generator.py index 72e57a9c39..5d1f817927 100644 --- a/TTS/vocoder/models/univnet_generator.py +++ b/TTS/vocoder/models/univnet_generator.py @@ -7,6 +7,7 @@ from torch.nn.utils import parametrize from TTS.vocoder.layers.lvc_block import LVCBlock +from TTS.vocoder.models.parallel_wavegan_generator import _get_receptive_field_size logger = logging.getLogger(__name__) @@ -133,17 +134,10 @@ def _apply_weight_norm(m): self.apply(_apply_weight_norm) - @staticmethod - def _get_receptive_field_size(layers, stacks, kernel_size, dilation=lambda x: 2**x): - assert layers % stacks == 0 - layers_per_cycle = layers // stacks - dilations = [dilation(i % layers_per_cycle) for i in range(layers)] - return (kernel_size - 1) * sum(dilations) + 1 - @property def receptive_field_size(self): """Return receptive field size.""" - return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) + return _get_receptive_field_size(self.layers, self.stacks, self.kernel_size) @torch.no_grad() def inference(self, c): diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 723f18dde2..1847679890 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -17,6 +17,7 @@ from TTS.utils.audio.numpy_transforms import mulaw_decode from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.layers.losses import WaveRNNLoss +from TTS.vocoder.layers.upsample import Stretch2d from TTS.vocoder.models.base_vocoder import BaseVocoder from TTS.vocoder.utils.distribution import sample_from_discretized_mix_logistic, sample_from_gaussian @@ -66,19 +67,6 @@ def forward(self, x): return x -class Stretch2d(nn.Module): - def __init__(self, x_scale, y_scale): - super().__init__() - self.x_scale = x_scale - self.y_scale = y_scale - - def forward(self, x): - b, c, h, w = x.size() - x = x.unsqueeze(-1).unsqueeze(3) - x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale) - return x.view(b, c, h * self.y_scale, w * self.x_scale) - - class UpsampleNetwork(nn.Module): def __init__( self, diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 86ccae9cca..0000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -furo -myst-parser == 2.0.0 -sphinx == 7.2.5 -sphinx_inline_tabs -sphinx_copybutton -linkify-it-py diff --git a/pyproject.toml b/pyproject.toml index 389c0c66b8..5386d274ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,18 +1,31 @@ -[build-system] -requires = [ - "setuptools", - "setuptools-scm", - "cython>=3.0.0", - "numpy>=2.0.0", -] -build-backend = "setuptools.build_meta" +# ,*++++++*, ,*++++++*, +# *++. .+++ *++. 
.++* +# *+* ,++++* *+* *+* ,++++, *+* +# ,+, .++++++++++* ,++,,,,*+, ,++++++++++. *+, +# *+. .++++++++++++..++ *+.,++++++++++++. .+* +# .+* ++++++++++++.*+, .+*.++++++++++++ *+, +# .++ *++++++++* ++, .++.*++++++++* ++, +# ,+++*. . .*++, ,++*. .*+++* +# *+, .,*++**. .**++**. ,+* +# .+* *+, +# *+. Coqui .+* +# *+* +++ TTS +++ *+* +# .+++*. . . *+++. +# ,+* *+++*... ...*+++* *+, +# .++. .""""+++++++****+++++++"""". ++. +# ,++. .++, +# .++* *++. +# *+++, ,+++* +# .,*++++::::::++++*,. +# `````` -[tool.setuptools.packages.find] -include = ["TTS*"] +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" [project] name = "coqui-tts" -version = "0.24.3" +version = "0.25.0" description = "Deep learning for Text to Speech." readme = "README.md" requires-python = ">=3.9, <3.13" @@ -62,13 +75,14 @@ dependencies = [ # Training "matplotlib>=3.7.0", # Coqui stack - "coqui-tts-trainer>=0.1.4,<0.2.0", - "coqpit>=0.0.16", + "coqui-tts-trainer>=0.2.0,<0.3.0", + "coqpit-config>=0.1.1,<0.2.0", + "monotonic-alignment-search>=0.1.0", # Gruut + supported languages "gruut[de,es,fr]>=2.4.0", # Tortoise "einops>=0.6.0", - "transformers>=4.43.0", + "transformers>=4.43.0,<=4.46.2", # Bark "encodec>=0.1.1", # XTTS @@ -77,15 +91,6 @@ dependencies = [ ] [project.optional-dependencies] -# Dependencies for building the documentation -docs = [ - "furo>=2023.5.20", - "myst-parser==2.0.0", - "sphinx==7.2.5", - "sphinx_inline_tabs>=2023.4.21", - "sphinx_copybutton>=0.1", - "linkify-it-py>=2.0.0", -] # Only used in notebooks notebooks = [ "bokeh==1.4.0", @@ -136,6 +141,15 @@ dev = [ "pre-commit>=3", "ruff==0.7.0", ] +# Dependencies for building the documentation +docs = [ + "furo>=2023.5.20", + "myst-parser==2.0.0", + "sphinx==7.2.5", + "sphinx_inline_tabs>=2023.4.21", + "sphinx_copybutton>=0.1", + "linkify-it-py>=2.0.0", +] [project.urls] Homepage = "https://github.com/idiap/coqui-ai-TTS" @@ -151,6 +165,22 @@ tts-server = "TTS.server.server:main" [tool.uv] constraint-dependencies = ["numba>0.58.0"] +[tool.hatch.build] +exclude = [ + "/.github", + "/.gitignore", + "/.pre-commit-config.yaml", + "/.readthedocs.yml", + "/Makefile", + "/dockerfiles", + "/run_bash_tests.sh", + "/scripts", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["TTS"] + [tool.ruff] line-length = 120 extend-exclude = ["*.ipynb"] diff --git a/recipes/ljspeech/xtts_v1/train_gpt_xtts.py b/recipes/ljspeech/xtts_v1/train_gpt_xtts.py index 7d8f4064c5..d31ec8f1ed 100644 --- a/recipes/ljspeech/xtts_v1/train_gpt_xtts.py +++ b/recipes/ljspeech/xtts_v1/train_gpt_xtts.py @@ -41,8 +41,8 @@ # DVAE files -DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/dvae.pth" -MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/mel_stats.pth" +DVAE_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v1/resolve/v1.1.2/dvae.pth" +MEL_NORM_LINK = "https://huggingface.co/coqui/XTTS-v1/resolve/v1.1.2/mel_stats.pth" # Set the path to the downloaded files DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, DVAE_CHECKPOINT_LINK.split("/")[-1]) @@ -55,8 +55,8 @@ # Download XTTS v1.1 checkpoint if needed -TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json" -XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth" +TOKENIZER_FILE_LINK = "https://huggingface.co/coqui/XTTS-v1/resolve/v1.1.2/vocab.json" +XTTS_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v1/resolve/v1.1.2/model.pth" # XTTS transfer learning parameters: You we need to 
provide the paths of XTTS model checkpoint that you want to do the fine tuning. TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, TOKENIZER_FILE_LINK.split("/")[-1]) # vocab.json file diff --git a/recipes/ljspeech/xtts_v2/train_gpt_xtts.py b/recipes/ljspeech/xtts_v2/train_gpt_xtts.py index 626917381a..ccaa97f1e4 100644 --- a/recipes/ljspeech/xtts_v2/train_gpt_xtts.py +++ b/recipes/ljspeech/xtts_v2/train_gpt_xtts.py @@ -41,8 +41,8 @@ # DVAE files -DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth" -MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth" +DVAE_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/dvae.pth" +MEL_NORM_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/mel_stats.pth" # Set the path to the downloaded files DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK)) @@ -55,8 +55,8 @@ # Download XTTS v2.0 checkpoint if needed -TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json" -XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth" +TOKENIZER_FILE_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/vocab.json" +XTTS_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/model.pth" # XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK)) # vocab.json file diff --git a/setup.py b/setup.py deleted file mode 100644 index 1cf2def1d3..0000000000 --- a/setup.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python -# ,*++++++*, ,*++++++*, -# *++. .+++ *++. .++* -# *+* ,++++* *+* *+* ,++++, *+* -# ,+, .++++++++++* ,++,,,,*+, ,++++++++++. *+, -# *+. .++++++++++++..++ *+.,++++++++++++. .+* -# .+* ++++++++++++.*+, .+*.++++++++++++ *+, -# .++ *++++++++* ++, .++.*++++++++* ++, -# ,+++*. . .*++, ,++*. .*+++* -# *+, .,*++**. .**++**. ,+* -# .+* *+, -# *+. Coqui .+* -# *+* +++ TTS +++ *+* -# .+++*. . . *+++. -# ,+* *+++*... ...*+++* *+, -# .++. .""""+++++++****+++++++"""". ++. -# ,++. .++, -# .++* *++. -# *+++, ,+++* -# .,*++++::::::++++*,. 
-# `````` - -import numpy -from Cython.Build import cythonize -from setuptools import Extension, setup - -exts = [ - Extension( - name="TTS.tts.utils.monotonic_align.core", - sources=["TTS/tts/utils/monotonic_align/core.pyx"], - ) -] -setup( - include_dirs=numpy.get_include(), - ext_modules=cythonize(exts, language_level=3), - zip_safe=False, -) diff --git a/tests/tts_tests/test_helpers.py b/tests/aux_tests/test_helpers.py similarity index 76% rename from tests/tts_tests/test_helpers.py rename to tests/aux_tests/test_helpers.py index d07efa3620..6781cbc5d4 100644 --- a/tests/tts_tests/test_helpers.py +++ b/tests/aux_tests/test_helpers.py @@ -1,6 +1,14 @@ import torch as T -from TTS.tts.utils.helpers import average_over_durations, generate_path, rand_segments, segment, sequence_mask +from TTS.tts.utils.helpers import ( + average_over_durations, + expand_encoder_outputs, + generate_attention, + generate_path, + rand_segments, + segment, + sequence_mask, +) def test_average_over_durations(): # pylint: disable=no-self-use @@ -86,3 +94,24 @@ def test_generate_path(): assert all(path[b, t, :current_idx] == 0.0) assert all(path[b, t, current_idx + durations[b, t].item() :] == 0.0) current_idx += durations[b, t].item() + + assert T.all(path == generate_attention(durations, x_mask, y_mask)) + assert T.all(path == generate_attention(durations, x_mask)) + + +def test_expand_encoder_outputs(): + inputs = T.rand(2, 5, 57) + durations = T.randint(1, 4, (2, 57)) + + x_mask = T.ones(2, 1, 57) + y_lengths = T.ones(2) * durations.sum(1).max() + + expanded, _, _ = expand_encoder_outputs(inputs, durations, x_mask, y_lengths) + + for b in range(durations.shape[0]): + index = 0 + for idx, dur in enumerate(durations[b]): + idx_expanded = expanded[b, :, index : index + dur.item()] + diff = (idx_expanded - inputs[b, :, idx].repeat(int(dur)).view(idx_expanded.shape)).sum() + assert abs(diff) < 1e-6, diff + index += dur diff --git a/tests/aux_tests/test_stft_torch.py b/tests/aux_tests/test_stft_torch.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/aux_tests/test_torch_transforms.py b/tests/aux_tests/test_torch_transforms.py new file mode 100644 index 0000000000..2da5a359c1 --- /dev/null +++ b/tests/aux_tests/test_torch_transforms.py @@ -0,0 +1,16 @@ +import numpy as np +import torch + +from TTS.utils.audio import numpy_transforms as np_transforms +from TTS.utils.audio.torch_transforms import amp_to_db, db_to_amp + + +def test_amplitude_db_conversion(): + x = torch.rand(11) + o1 = amp_to_db(x=x, spec_gain=1.0) + o2 = db_to_amp(x=o1, spec_gain=1.0) + np_o1 = np_transforms.amp_to_db(x=x, base=np.e) + np_o2 = np_transforms.db_to_amp(x=np_o1, base=np.e) + assert torch.allclose(x, o2) + assert torch.allclose(o1, np_o1) + assert torch.allclose(o2, np_o2) diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index ce4fc751c2..21cc194131 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -23,7 +23,7 @@ def test_in_out(self): tts_root_path = get_tests_input_path() tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth") tts_config = os.path.join(tts_root_path, "dummy_model_config.json") - synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None) + synthesizer = Synthesizer(tts_checkpoint=tts_checkpoint, tts_config_path=tts_config) synthesizer.tts("Better this test works!!") def test_split_into_sentences(self): diff --git a/tests/tts_tests/test_neuralhmm_tts_train.py 
b/tests/tts_tests/test_neuralhmm_tts_train.py index 25d9aa8148..4789d53d9e 100644 --- a/tests/tts_tests/test_neuralhmm_tts_train.py +++ b/tests/tts_tests/test_neuralhmm_tts_train.py @@ -4,7 +4,7 @@ import shutil import torch -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig diff --git a/tests/tts_tests/test_overflow_train.py b/tests/tts_tests/test_overflow_train.py index 86fa60af72..d86bde6854 100644 --- a/tests/tts_tests/test_overflow_train.py +++ b/tests/tts_tests/test_overflow_train.py @@ -4,7 +4,7 @@ import shutil import torch -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.overflow_config import OverflowConfig diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 530781ef88..2aac7f101d 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py index 99ba4349c4..d2d1d5c35f 100644 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py index 5f1bc3fd50..83a07d1a6c 100644 --- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index 40107070e1..df0e934d8e 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.tacotron2_config import Tacotron2Config diff --git a/tests/tts_tests/test_tacotron_train.py b/tests/tts_tests/test_tacotron_train.py index f7751931ae..17f1fd46a6 100644 --- a/tests/tts_tests/test_tacotron_train.py +++ b/tests/tts_tests/test_tacotron_train.py @@ -2,7 +2,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.tacotron_config import TacotronConfig diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 17992773ad..c8a52e1c1b 100644 --- a/tests/tts_tests/test_vits.py +++ 
b/tests/tts_tests/test_vits.py @@ -13,14 +13,10 @@ Vits, VitsArgs, VitsAudioConfig, - amp_to_db, - db_to_amp, load_audio, - spec_to_mel, - wav_to_mel, - wav_to_spec, ) from TTS.tts.utils.speakers import SpeakerManager +from TTS.utils.audio.torch_transforms import amp_to_db, db_to_amp, spec_to_mel, wav_to_mel, wav_to_spec LANG_FILE = os.path.join(get_tests_input_path(), "language_ids.json") SPEAKER_ENCODER_CONFIG = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json") diff --git a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py index 71597ef32f..09df7d29f2 100644 --- a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.config.shared_configs import BaseDatasetConfig diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index fd58db534a..7ae09c0e5c 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.config.shared_configs import BaseDatasetConfig diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py index b7fe197cfe..69fae21f8d 100644 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.vits_config import VitsConfig diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py index ea5dc02405..78f42d154b 100644 --- a/tests/tts_tests/test_vits_train.py +++ b/tests/tts_tests/test_vits_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.vits_config import VitsConfig diff --git a/tests/tts_tests2/test_align_tts_train.py b/tests/tts_tests2/test_align_tts_train.py index 9b0b730df4..91c3c35bc6 100644 --- a/tests/tts_tests2/test_align_tts_train.py +++ b/tests/tts_tests2/test_align_tts_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.align_tts_config import AlignTTSConfig diff --git a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py index 8fc4ea7e9b..1e5cd49f73 100644 --- a/tests/tts_tests2/test_delightful_tts_d-vectors_train.py +++ b/tests/tts_tests2/test_delightful_tts_d-vectors_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, 
DelightfulTTSConfig diff --git a/tests/tts_tests2/test_delightful_tts_emb_spk.py b/tests/tts_tests2/test_delightful_tts_emb_spk.py index 6fb70c5f61..9bbf7a55ea 100644 --- a/tests/tts_tests2/test_delightful_tts_emb_spk.py +++ b/tests/tts_tests2/test_delightful_tts_emb_spk.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.delightful_tts_config import DelightfulTtsAudioConfig, DelightfulTTSConfig diff --git a/tests/tts_tests2/test_delightful_tts_train.py b/tests/tts_tests2/test_delightful_tts_train.py index a917d77657..3e6fbd2e86 100644 --- a/tests/tts_tests2/test_delightful_tts_train.py +++ b/tests/tts_tests2/test_delightful_tts_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.config.shared_configs import BaseAudioConfig diff --git a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py b/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py index 7f79bfcab2..e6bc9f9feb 100644 --- a/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py +++ b/tests/tts_tests2/test_fast_pitch_speaker_emb_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.config.shared_configs import BaseAudioConfig diff --git a/tests/tts_tests2/test_fast_pitch_train.py b/tests/tts_tests2/test_fast_pitch_train.py index a525715b53..fe87c8b600 100644 --- a/tests/tts_tests2/test_fast_pitch_train.py +++ b/tests/tts_tests2/test_fast_pitch_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.config.shared_configs import BaseAudioConfig diff --git a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py b/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py index 35bda597d5..735d2fc4c6 100644 --- a/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py +++ b/tests/tts_tests2/test_fastspeech_2_speaker_emb_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.config.shared_configs import BaseAudioConfig diff --git a/tests/tts_tests2/test_fastspeech_2_train.py b/tests/tts_tests2/test_fastspeech_2_train.py index dd4b07d240..07fc5a1a2c 100644 --- a/tests/tts_tests2/test_fastspeech_2_train.py +++ b/tests/tts_tests2/test_fastspeech_2_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.config.shared_configs import BaseAudioConfig diff --git a/tests/tts_tests2/test_forward_tts.py b/tests/tts_tests2/test_forward_tts.py index cec0f211c8..13a2c270af 100644 --- a/tests/tts_tests2/test_forward_tts.py +++ b/tests/tts_tests2/test_forward_tts.py @@ -6,29 +6,7 @@ # pylint: disable=unused-variable -def expand_encoder_outputs_test(): - model = ForwardTTS(ForwardTTSArgs(num_chars=10)) - - inputs = T.rand(2, 5, 57) - durations = T.randint(1, 4, (2, 57)) - - x_mask = T.ones(2, 1, 57) - y_mask = T.ones(2, 1, durations.sum(1).max()) 
- - expanded, _ = model.expand_encoder_outputs(inputs, durations, x_mask, y_mask) - - for b in range(durations.shape[0]): - index = 0 - for idx, dur in enumerate(durations[b]): - diff = ( - expanded[b, :, index : index + dur.item()] - - inputs[b, :, idx].repeat(dur.item()).view(expanded[b, :, index : index + dur.item()].shape) - ).sum() - assert abs(diff) < 1e-6, diff - index += dur - - -def model_input_output_test(): +def test_model_input_output(): """Assert the output shapes of the model in different modes""" # VANILLA MODEL diff --git a/tests/tts_tests2/test_glow_tts_d-vectors_train.py b/tests/tts_tests2/test_glow_tts_d-vectors_train.py index f1cfd4368f..8236607c25 100644 --- a/tests/tts_tests2/test_glow_tts_d-vectors_train.py +++ b/tests/tts_tests2/test_glow_tts_d-vectors_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig diff --git a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py b/tests/tts_tests2/test_glow_tts_speaker_emb_train.py index b1eb6237a4..4a8bd0658d 100644 --- a/tests/tts_tests2/test_glow_tts_speaker_emb_train.py +++ b/tests/tts_tests2/test_glow_tts_speaker_emb_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig diff --git a/tests/tts_tests2/test_glow_tts_train.py b/tests/tts_tests2/test_glow_tts_train.py index 0a8e226b65..1d7f913575 100644 --- a/tests/tts_tests2/test_glow_tts_train.py +++ b/tests/tts_tests2/test_glow_tts_train.py @@ -3,7 +3,7 @@ import os import shutil -from trainer import get_last_checkpoint +from trainer.io import get_last_checkpoint from tests import get_device_id, get_tests_output_path, run_cli from TTS.tts.configs.glow_tts_config import GlowTTSConfig diff --git a/tests/vc_tests/test_freevc.py b/tests/vc_tests/test_freevc.py index c90551b494..fe07b2723c 100644 --- a/tests/vc_tests/test_freevc.py +++ b/tests/vc_tests/test_freevc.py @@ -22,31 +22,19 @@ class TestFreeVC(unittest.TestCase): def _create_inputs(self, config, batch_size=2): - input_dummy = torch.rand(batch_size, 30 * config.audio["hop_length"]).to(device) - input_lengths = torch.randint(100, 30 * config.audio["hop_length"], (batch_size,)).long().to(device) - input_lengths[-1] = 30 * config.audio["hop_length"] spec = torch.rand(batch_size, 30, config.audio["filter_length"] // 2 + 1).to(device) mel = torch.rand(batch_size, 30, config.audio["n_mel_channels"]).to(device) spec_lengths = torch.randint(20, 30, (batch_size,)).long().to(device) spec_lengths[-1] = spec.size(2) waveform = torch.rand(batch_size, spec.size(2) * config.audio["hop_length"]).to(device) - return input_dummy, input_lengths, mel, spec, spec_lengths, waveform + return mel, spec, spec_lengths, waveform @staticmethod def _create_inputs_inference(): - source_wav = torch.rand(16000) + source_wav = torch.rand(15999) target_wav = torch.rand(16000) return source_wav, target_wav - @staticmethod - def _check_parameter_changes(model, model_ref): - count = 0 - for param, param_ref in zip(model.parameters(), model_ref.parameters()): - assert (param != param_ref).any(), "param {} with shape {} not updated!! 
\n{}\n{}".format( - count, param.shape, param, param_ref - ) - count += 1 - def test_methods(self): config = FreeVCConfig() model = FreeVC(config).to(device) @@ -69,7 +57,7 @@ def _test_forward(self, batch_size): model.train() print(" > Num parameters for FreeVC model:%s" % (count_parameters(model))) - _, _, mel, spec, spec_lengths, waveform = self._create_inputs(config, batch_size) + mel, spec, spec_lengths, waveform = self._create_inputs(config, batch_size) wavlm_vec = model.extract_wavlm_features(waveform) wavlm_vec_lengths = torch.ones(batch_size, dtype=torch.long) @@ -86,7 +74,7 @@ def _test_inference(self, batch_size): model = FreeVC(config).to(device) model.eval() - _, _, mel, _, _, waveform = self._create_inputs(config, batch_size) + mel, _, _, waveform = self._create_inputs(config, batch_size) wavlm_vec = model.extract_wavlm_features(waveform) wavlm_vec_lengths = torch.ones(batch_size, dtype=torch.long) @@ -108,8 +96,8 @@ def test_voice_conversion(self): source_wav, target_wav = self._create_inputs_inference() output_wav = model.voice_conversion(source_wav, target_wav) assert ( - output_wav.shape[0] + config.audio.hop_length == source_wav.shape[0] - ), f"{output_wav.shape} != {source_wav.shape}" + output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length + ), f"{output_wav.shape} != {source_wav.shape}, {config.audio.hop_length}" def test_train_step(self): ... diff --git a/tests/vc_tests/test_openvoice.py b/tests/vc_tests/test_openvoice.py new file mode 100644 index 0000000000..c9f7ae3931 --- /dev/null +++ b/tests/vc_tests/test_openvoice.py @@ -0,0 +1,42 @@ +import os +import unittest + +import torch + +from tests import get_tests_input_path +from TTS.vc.models.openvoice import OpenVoice, OpenVoiceConfig + +torch.manual_seed(1) +use_cuda = torch.cuda.is_available() +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +c = OpenVoiceConfig() + +WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") + + +class TestOpenVoice(unittest.TestCase): + + @staticmethod + def _create_inputs_inference(): + source_wav = torch.rand(16100) + target_wav = torch.rand(16000) + return source_wav, target_wav + + def test_load_audio(self): + config = OpenVoiceConfig() + model = OpenVoice(config).to(device) + wav = model.load_audio(WAV_FILE) + wav2 = model.load_audio(wav) + assert all(torch.isclose(wav, wav2)) + + def test_voice_conversion(self): + config = OpenVoiceConfig() + model = OpenVoice(config).to(device) + model.eval() + + source_wav, target_wav = self._create_inputs_inference() + output_wav = model.voice_conversion(source_wav, target_wav) + assert ( + output_wav.shape[0] == source_wav.shape[0] - source_wav.shape[0] % config.audio.hop_length + ), f"{output_wav.shape} != {source_wav.shape}"