From 0971bc236ea41e22970764b11dacefcd8f2273b8 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Thu, 7 Nov 2024 00:33:54 +0100 Subject: [PATCH 1/4] refactor: use external package for monotonic alignment --- .../layers/delightful_tts/acoustic_model.py | 3 +- TTS/tts/models/align_tts.py | 3 +- TTS/tts/models/forward_tts.py | 3 +- TTS/tts/models/glow_tts.py | 3 +- TTS/tts/models/vits.py | 3 +- TTS/tts/utils/helpers.py | 74 ------------------- TTS/tts/utils/monotonic_align/__init__.py | 0 TTS/tts/utils/monotonic_align/core.pyx | 47 ------------ pyproject.toml | 1 + 9 files changed, 11 insertions(+), 126 deletions(-) delete mode 100644 TTS/tts/utils/monotonic_align/__init__.py delete mode 100644 TTS/tts/utils/monotonic_align/core.pyx diff --git a/TTS/tts/layers/delightful_tts/acoustic_model.py b/TTS/tts/layers/delightful_tts/acoustic_model.py index 83989f9ba4..3c0e3a3a76 100644 --- a/TTS/tts/layers/delightful_tts/acoustic_model.py +++ b/TTS/tts/layers/delightful_tts/acoustic_model.py @@ -5,6 +5,7 @@ import torch import torch.nn.functional as F from coqpit import Coqpit +from monotonic_alignment_search import maximum_path from torch import nn from TTS.tts.layers.delightful_tts.conformer import Conformer @@ -19,7 +20,7 @@ from TTS.tts.layers.delightful_tts.pitch_adaptor import PitchAdaptor from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor from TTS.tts.layers.generic.aligner import AlignmentNetwork -from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask +from TTS.tts.utils.helpers import generate_path, sequence_mask logger = logging.getLogger(__name__) diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 2d27a57850..1c3d57582e 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -3,6 +3,7 @@ import torch from coqpit import Coqpit +from monotonic_alignment_search import maximum_path from torch import nn from trainer.io import load_fsspec @@ -12,7 +13,7 @@ from TTS.tts.layers.feed_forward.encoder import Encoder from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask +from TTS.tts.utils.helpers import generate_path, sequence_mask from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_spectrogram diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py index 4b74462dd5..e7bc86374d 100644 --- a/TTS/tts/models/forward_tts.py +++ b/TTS/tts/models/forward_tts.py @@ -4,6 +4,7 @@ import torch from coqpit import Coqpit +from monotonic_alignment_search import maximum_path from torch import nn from torch.cuda.amp.autocast_mode import autocast from trainer.io import load_fsspec @@ -14,7 +15,7 @@ from TTS.tts.layers.generic.pos_encoding import PositionalEncoding from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask +from TTS.tts.utils.helpers import average_over_durations, generate_path, sequence_mask from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment, plot_avg_energy, plot_avg_pitch, plot_spectrogram diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 64954d283c..5ea69865b2 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -4,6 +4,7 @@ import torch from coqpit import Coqpit +from monotonic_alignment_search import maximum_path from torch import nn from torch.cuda.amp.autocast_mode import autocast from torch.nn import functional as F @@ -13,7 +14,7 @@ from TTS.tts.layers.glow_tts.decoder import Decoder from TTS.tts.layers.glow_tts.encoder import Encoder from TTS.tts.models.base_tts import BaseTTS -from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask +from TTS.tts.utils.helpers import generate_path, sequence_mask from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.tokenizer import TTSTokenizer diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index b014e4fdde..af803a0f76 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -11,6 +11,7 @@ import torchaudio from coqpit import Coqpit from librosa.filters import mel as librosa_mel_fn +from monotonic_alignment_search import maximum_path from torch import nn from torch.cuda.amp.autocast_mode import autocast from torch.nn import functional as F @@ -28,7 +29,7 @@ from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.fairseq import rehash_fairseq_vits_checkpoint -from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask +from TTS.tts.utils.helpers import generate_path, rand_segments, segment, sequence_mask from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index 7429d0fcc8..d1722501f7 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -3,13 +3,6 @@ from scipy.stats import betabinom from torch.nn import functional as F -try: - from TTS.tts.utils.monotonic_align.core import maximum_path_c - - CYTHON = True -except ModuleNotFoundError: - CYTHON = False - class StandardScaler: """StandardScaler for mean-scale normalization with the given mean and scale values.""" @@ -168,73 +161,6 @@ def generate_path(duration, mask): return path -def maximum_path(value, mask): - if CYTHON: - return maximum_path_cython(value, mask) - return maximum_path_numpy(value, mask) - - -def maximum_path_cython(value, mask): - """Cython optimised version. - Shapes: - - value: :math:`[B, T_en, T_de]` - - mask: :math:`[B, T_en, T_de]` - """ - value = value * mask - device = value.device - dtype = value.dtype - value = value.data.cpu().numpy().astype(np.float32) - path = np.zeros_like(value).astype(np.int32) - mask = mask.data.cpu().numpy() - - t_x_max = mask.sum(1)[:, 0].astype(np.int32) - t_y_max = mask.sum(2)[:, 0].astype(np.int32) - maximum_path_c(path, value, t_x_max, t_y_max) - return torch.from_numpy(path).to(device=device, dtype=dtype) - - -def maximum_path_numpy(value, mask, max_neg_val=None): - """ - Monotonic alignment search algorithm - Numpy-friendly version. It's about 4 times faster than torch version. - value: [b, t_x, t_y] - mask: [b, t_x, t_y] - """ - if max_neg_val is None: - max_neg_val = -np.inf # Patch for Sphinx complaint - value = value * mask - - device = value.device - dtype = value.dtype - value = value.cpu().detach().numpy() - mask = mask.cpu().detach().numpy().astype(bool) - - b, t_x, t_y = value.shape - direction = np.zeros(value.shape, dtype=np.int64) - v = np.zeros((b, t_x), dtype=np.float32) - x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1) - for j in range(t_y): - v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1] - v1 = v - max_mask = v1 >= v0 - v_max = np.where(max_mask, v1, v0) - direction[:, :, j] = max_mask - - index_mask = x_range <= j - v = np.where(index_mask, v_max + value[:, :, j], max_neg_val) - direction = np.where(mask, direction, 1) - - path = np.zeros(value.shape, dtype=np.float32) - index = mask[:, :, 0].sum(1).astype(np.int64) - 1 - index_range = np.arange(b) - for j in reversed(range(t_y)): - path[index_range, index, j] = 1 - index = index + direction[index_range, index, j] - 1 - path = path * mask.astype(np.float32) - path = torch.from_numpy(path).to(device=device, dtype=dtype) - return path - - def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=1.0): P, M = phoneme_count, mel_count x = np.arange(0, P) diff --git a/TTS/tts/utils/monotonic_align/__init__.py b/TTS/tts/utils/monotonic_align/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/TTS/tts/utils/monotonic_align/core.pyx b/TTS/tts/utils/monotonic_align/core.pyx deleted file mode 100644 index 091fcc3a50..0000000000 --- a/TTS/tts/utils/monotonic_align/core.pyx +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np - -cimport cython -cimport numpy as np - -from cython.parallel import prange - - -@cython.boundscheck(False) -@cython.wraparound(False) -cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_x, int t_y, float max_neg_val) nogil: - cdef int x - cdef int y - cdef float v_prev - cdef float v_cur - cdef float tmp - cdef int index = t_x - 1 - - for y in range(t_y): - for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): - if x == y: - v_cur = max_neg_val - else: - v_cur = value[x, y-1] - if x == 0: - if y == 0: - v_prev = 0. - else: - v_prev = max_neg_val - else: - v_prev = value[x-1, y-1] - value[x, y] = max(v_cur, v_prev) + value[x, y] - - for y in range(t_y - 1, -1, -1): - path[index, y] = 1 - if index != 0 and (index == y or value[index, y-1] < value[index-1, y-1]): - index = index - 1 - - -@cython.boundscheck(False) -@cython.wraparound(False) -cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_xs, int[::1] t_ys, float max_neg_val=-1e9) nogil: - cdef int b = values.shape[0] - - cdef int i - for i in prange(b, nogil=True): - maximum_path_each(paths[i], values[i], t_xs[i], t_ys[i], max_neg_val) diff --git a/pyproject.toml b/pyproject.toml index 23387fd37d..d13e2145d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,7 @@ dependencies = [ # Coqui stack "coqui-tts-trainer>=0.1.4,<0.2.0", "coqpit>=0.0.16", + "monotonic-alignment-search>=0.1.0", # Gruut + supported languages "gruut[de,es,fr]>=2.4.0", # Tortoise From 9dd7ae6cca4a7c6db254f6d3c42aebcf34170af5 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Thu, 7 Nov 2024 10:32:57 +0100 Subject: [PATCH 2/4] build: switch to hatch Setuptools is not needed anymore because the Cython extension is now built in an external package. --- MANIFEST.in | 10 ---------- pyproject.toml | 49 +++++++++++++++++++++++++++++++++++++++---------- setup.py | 37 ------------------------------------- 3 files changed, 39 insertions(+), 57 deletions(-) delete mode 100644 MANIFEST.in delete mode 100644 setup.py diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 8d092ceff2..0000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,10 +0,0 @@ -include README.md -include LICENSE.txt -include *.cff -recursive-include TTS *.json -recursive-include TTS *.html -recursive-include TTS *.png -recursive-include TTS *.md -recursive-include TTS *.py -recursive-include TTS *.pyx -recursive-include images *.png diff --git a/pyproject.toml b/pyproject.toml index d13e2145d8..379187feed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,14 +1,27 @@ -[build-system] -requires = [ - "setuptools", - "setuptools-scm", - "cython>=3.0.0", - "numpy>=2.0.0", -] -build-backend = "setuptools.build_meta" +# ,*++++++*, ,*++++++*, +# *++. .+++ *++. .++* +# *+* ,++++* *+* *+* ,++++, *+* +# ,+, .++++++++++* ,++,,,,*+, ,++++++++++. *+, +# *+. .++++++++++++..++ *+.,++++++++++++. .+* +# .+* ++++++++++++.*+, .+*.++++++++++++ *+, +# .++ *++++++++* ++, .++.*++++++++* ++, +# ,+++*. . .*++, ,++*. .*+++* +# *+, .,*++**. .**++**. ,+* +# .+* *+, +# *+. Coqui .+* +# *+* +++ TTS +++ *+* +# .+++*. . . *+++. +# ,+* *+++*... ...*+++* *+, +# .++. .""""+++++++****+++++++"""". ++. +# ,++. .++, +# .++* *++. +# *+++, ,+++* +# .,*++++::::::++++*,. +# `````` -[tool.setuptools.packages.find] -include = ["TTS*"] +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" [project] name = "coqui-tts" @@ -152,6 +165,22 @@ tts-server = "TTS.server.server:main" [tool.uv] constraint-dependencies = ["numba>0.58.0"] +[tool.hatch.build] +exclude = [ + "/.github", + "/.gitignore", + "/.pre-commit-config.yaml", + "/.readthedocs.yml", + "/Makefile", + "/dockerfiles", + "/run_bash_tests.sh", + "/scripts", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["TTS"] + [tool.ruff] line-length = 120 extend-exclude = ["*.ipynb"] diff --git a/setup.py b/setup.py deleted file mode 100644 index 1cf2def1d3..0000000000 --- a/setup.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python -# ,*++++++*, ,*++++++*, -# *++. .+++ *++. .++* -# *+* ,++++* *+* *+* ,++++, *+* -# ,+, .++++++++++* ,++,,,,*+, ,++++++++++. *+, -# *+. .++++++++++++..++ *+.,++++++++++++. .+* -# .+* ++++++++++++.*+, .+*.++++++++++++ *+, -# .++ *++++++++* ++, .++.*++++++++* ++, -# ,+++*. . .*++, ,++*. .*+++* -# *+, .,*++**. .**++**. ,+* -# .+* *+, -# *+. Coqui .+* -# *+* +++ TTS +++ *+* -# .+++*. . . *+++. -# ,+* *+++*... ...*+++* *+, -# .++. .""""+++++++****+++++++"""". ++. -# ,++. .++, -# .++* *++. -# *+++, ,+++* -# .,*++++::::::++++*,. -# `````` - -import numpy -from Cython.Build import cythonize -from setuptools import Extension, setup - -exts = [ - Extension( - name="TTS.tts.utils.monotonic_align.core", - sources=["TTS/tts/utils/monotonic_align/core.pyx"], - ) -] -setup( - include_dirs=numpy.get_include(), - ext_modules=cythonize(exts, language_level=3), - zip_safe=False, -) From d30eba573e089ef2770ac574ebff91f59df3e743 Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Thu, 7 Nov 2024 10:33:51 +0100 Subject: [PATCH 3/4] chore: remove obsolete code owners file --- CODE_OWNERS.rst | 75 ------------------------------------------------- 1 file changed, 75 deletions(-) delete mode 100644 CODE_OWNERS.rst diff --git a/CODE_OWNERS.rst b/CODE_OWNERS.rst deleted file mode 100644 index 768b573911..0000000000 --- a/CODE_OWNERS.rst +++ /dev/null @@ -1,75 +0,0 @@ -TTS code owners / governance system -========================================== - -TTS is run under a governance system inspired (and partially copied from) by the `Mozilla module ownership system `_. The project is roughly divided into modules, and each module has its owners, which are responsible for reviewing pull requests and deciding on technical direction for their modules. Module ownership authority is given to people who have worked extensively on areas of the project. - -Module owners also have the authority of naming other module owners or appointing module peers, which are people with authority to review pull requests in that module. They can also sub-divide their module into sub-modules with their owners. - -Module owners are not tyrants. They are chartered to make decisions with input from the community and in the best interest of the community. Module owners are not required to make code changes or additions solely because the community wants them to do so. (Like anyone else, the module owners may write code because they want to, because their employers want them to, because the community wants them to, or for some other reason.) Module owners do need to pay attention to patches submitted to that module. However “pay attention” does not mean agreeing to every patch. Some patches may not make sense for the WebThings project; some may be poorly implemented. Module owners have the authority to decline a patch; this is a necessary part of the role. We ask the module owners to describe in the relevant issue their reasons for wanting changes to a patch, for declining it altogether, or for postponing review for some period. We don’t ask or expect them to rewrite patches to make them acceptable. Similarly, module owners may need to delay review of a promising patch due to an upcoming deadline. For example, a patch may be of interest, but not for the next milestone. In such a case it may make sense for the module owner to postpone review of a patch until after matters needed for a milestone have been finalized. Again, we expect this to be described in the relevant issue. And of course, it shouldn’t go on very often or for very long or escalation and review is likely. - -The work of the various module owners and peers is overseen by the global owners, which are responsible for making final decisions in case there's conflict between owners as well as set the direction for the project as a whole. - -This file describes module owners who are active on the project and which parts of the code they have expertise on (and interest in). If you're making changes to the code and are wondering who's an appropriate person to talk to, this list will tell you who to ping. - -There's overlap in the areas of expertise of each owner, and in particular when looking at which files are covered by each area, there is a lot of overlap. Don't worry about getting it exactly right when requesting review, any code owner will be happy to redirect the request to a more appropriate person. - -Global owners ----------------- - -These are people who have worked on the project extensively and are familiar with all or most parts of it. Their expertise and review guidance is trusted by other code owners to cover their own areas of expertise. In case of conflicting opinions from other owners, global owners will make a final decision. - -- Eren Gölge (@erogol) -- Reuben Morais (@reuben) - -Training, feeding ------------------ - -- Eren Gölge (@erogol) - -Model exporting ---------------- - -- Eren Gölge (@erogol) - -Multi-Speaker TTS ------------------ - -- Eren Gölge (@erogol) -- Edresson Casanova (@edresson) - -TTS ---- - -- Eren Gölge (@erogol) - -Vocoders --------- - -- Eren Gölge (@erogol) - -Speaker Encoder ---------------- - -- Eren Gölge (@erogol) - -Testing & CI ------------- - -- Eren Gölge (@erogol) -- Reuben Morais (@reuben) - -Python bindings ---------------- - -- Eren Gölge (@erogol) -- Reuben Morais (@reuben) - -Documentation -------------- - -- Eren Gölge (@erogol) - -Third party bindings --------------------- - -Owned by the author. From 683ee664a8be4fcec3a5f377890dc7f22394476b Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Thu, 7 Nov 2024 10:34:12 +0100 Subject: [PATCH 4/4] ci: simplify release, cibuildwheel not needed anymore --- .github/workflows/pypi-release.yml | 48 ++++++++++++------------------ 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 822990e967..4122f69f73 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -7,8 +7,7 @@ defaults: shell: bash jobs: - build-sdist: - name: Build source distribution + build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -20,37 +19,29 @@ jobs: if [[ "v$version" != "$tag" ]]; then exit 1 fi - - uses: actions/setup-python@v5 + - name: Install uv + uses: astral-sh/setup-uv@v3 with: - python-version: 3.9 - - run: | - python -m pip install -U pip setuptools build - - run: | - python -m build - - run: | - pip install dist/*.tar.gz - - uses: actions/upload-artifact@v4 - with: - name: build-sdist - path: dist/*.tar.gz - build-wheels: - name: Build wheels on ${{ matrix.os }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - steps: - - uses: actions/checkout@v4 - - name: Build wheels - uses: pypa/cibuildwheel@v2.21.1 + version: "0.4.27" + enable-cache: true + cache-dependency-glob: "**/pyproject.toml" + - name: Set up Python + run: uv python install 3.12 + - name: Build sdist and wheel + run: uv build + - name: Test installation of sdist and wheel + run: | + uv venv --no-project + uv pip install dist/*.tar.gz + uv pip install dist/*.whl - uses: actions/upload-artifact@v4 with: - name: build-wheels-${{ matrix.os }} - path: ./wheelhouse/*.whl + name: build + path: dist/* publish-artifacts: name: Publish to PyPI runs-on: ubuntu-latest - needs: [build-sdist, build-wheels] + needs: [build] environment: name: release url: https://pypi.org/p/coqui-tts @@ -60,8 +51,7 @@ jobs: - uses: actions/download-artifact@v4 with: path: dist - pattern: build-* - merge-multiple: true + pattern: build - run: | ls -lh dist/ - name: Publish package distributions to PyPI