Use external package for monotonic alignment search #135

Merged · 4 commits · Nov 8, 2024
48 changes: 19 additions & 29 deletions .github/workflows/pypi-release.yml

@@ -7,8 +7,7 @@ defaults:
     shell:
       bash
 jobs:
-  build-sdist:
-    name: Build source distribution
+  build:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
@@ -20,37 +19,29 @@ jobs:
           if [[ "v$version" != "$tag" ]]; then
             exit 1
           fi
-      - uses: actions/setup-python@v5
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
         with:
-          python-version: 3.9
-      - run: |
-          python -m pip install -U pip setuptools build
-      - run: |
-          python -m build
-      - run: |
-          pip install dist/*.tar.gz
-      - uses: actions/upload-artifact@v4
-        with:
-          name: build-sdist
-          path: dist/*.tar.gz
-  build-wheels:
-    name: Build wheels on ${{ matrix.os }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-latest, windows-latest, macos-latest]
-    steps:
-      - uses: actions/checkout@v4
-      - name: Build wheels
-        uses: pypa/[email protected]
+          version: "0.4.27"
+          enable-cache: true
+          cache-dependency-glob: "**/pyproject.toml"
+      - name: Set up Python
+        run: uv python install 3.12
+      - name: Build sdist and wheel
+        run: uv build
+      - name: Test installation of sdist and wheel
+        run: |
+          uv venv --no-project
+          uv pip install dist/*.tar.gz
+          uv pip install dist/*.whl
       - uses: actions/upload-artifact@v4
         with:
-          name: build-wheels-${{ matrix.os }}
-          path: ./wheelhouse/*.whl
+          name: build
+          path: dist/*
   publish-artifacts:
     name: Publish to PyPI
     runs-on: ubuntu-latest
-    needs: [build-sdist, build-wheels]
+    needs: [build]
     environment:
       name: release
       url: https://pypi.org/p/coqui-tts
@@ -60,8 +51,7 @@ jobs:
       - uses: actions/download-artifact@v4
         with:
           path: dist
-          pattern: build-*
-          merge-multiple: true
+          pattern: build
       - run: |
           ls -lh dist/
       - name: Publish package distributions to PyPI
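With the bundled Cython extension removed, the workflow drops the per-OS cibuildwheel job and builds a single pure-Python sdist and wheel with uv. As a minimal sketch of what the new "Test installation of sdist and wheel" step exercises (the file name check_install.py is illustrative and not part of this PR), something like the following could be run inside the freshly created venv:

# check_install.py -- illustrative only, not part of this PR.
# Confirms that the pure-Python artifacts pull in the new external
# monotonic-alignment-search dependency instead of a compiled extension.
import importlib.util

spec = importlib.util.find_spec("monotonic_alignment_search")
assert spec is not None, "coqui-tts should depend on monotonic-alignment-search"
print("monotonic_alignment_search found at", spec.origin)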
75 changes: 0 additions & 75 deletions CODE_OWNERS.rst

This file was deleted.

10 changes: 0 additions & 10 deletions MANIFEST.in

This file was deleted.

3 changes: 2 additions & 1 deletion TTS/tts/layers/delightful_tts/acoustic_model.py

@@ -5,6 +5,7 @@
 import torch
 import torch.nn.functional as F
 from coqpit import Coqpit
+from monotonic_alignment_search import maximum_path
 from torch import nn
 
 from TTS.tts.layers.delightful_tts.conformer import Conformer
@@ -19,7 +20,7 @@
 from TTS.tts.layers.delightful_tts.pitch_adaptor import PitchAdaptor
 from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
 from TTS.tts.layers.generic.aligner import AlignmentNetwork
-from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
+from TTS.tts.utils.helpers import generate_path, sequence_mask
 
 logger = logging.getLogger(__name__)
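The same one-line import swap repeats in align_tts.py, forward_tts.py, glow_tts.py, and vits.py below; no call sites change in this diff, which relies on the external package exposing the same maximum_path(value, mask) entry point as the removed helper. A minimal, self-contained sketch of that interface (the toy shapes and variable names are illustrative, not taken from this PR):

# Illustrative usage of the external dependency; assumes torch and
# monotonic-alignment-search are installed.
import torch
from monotonic_alignment_search import maximum_path

batch, t_en, t_de = 2, 5, 9                 # text tokens x decoder frames
attn_logp = torch.randn(batch, t_en, t_de)  # pairwise alignment log-likelihoods
attn_mask = torch.ones(batch, t_en, t_de)   # all positions valid in this toy case
attn = maximum_path(attn_logp, attn_mask)   # hard monotonic alignment, same shape
assert attn.shape == (batch, t_en, t_de)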
3 changes: 2 additions & 1 deletion TTS/tts/models/align_tts.py

@@ -3,6 +3,7 @@
 
 import torch
 from coqpit import Coqpit
+from monotonic_alignment_search import maximum_path
 from torch import nn
 from trainer.io import load_fsspec
 
@@ -12,7 +13,7 @@
 from TTS.tts.layers.feed_forward.encoder import Encoder
 from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
+from TTS.tts.utils.helpers import generate_path, sequence_mask
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
3 changes: 2 additions & 1 deletion TTS/tts/models/forward_tts.py

@@ -4,6 +4,7 @@
 
 import torch
 from coqpit import Coqpit
+from monotonic_alignment_search import maximum_path
 from torch import nn
 from torch.cuda.amp.autocast_mode import autocast
 from trainer.io import load_fsspec
@@ -14,7 +15,7 @@
 from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
 from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask
+from TTS.tts.utils.helpers import average_over_durations, generate_path, sequence_mask
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.tts.utils.visual import plot_alignment, plot_avg_energy, plot_avg_pitch, plot_spectrogram
3 changes: 2 additions & 1 deletion TTS/tts/models/glow_tts.py

@@ -4,6 +4,7 @@
 
 import torch
 from coqpit import Coqpit
+from monotonic_alignment_search import maximum_path
 from torch import nn
 from torch.cuda.amp.autocast_mode import autocast
 from torch.nn import functional as F
@@ -13,7 +14,7 @@
 from TTS.tts.layers.glow_tts.decoder import Decoder
 from TTS.tts.layers.glow_tts.encoder import Encoder
 from TTS.tts.models.base_tts import BaseTTS
-from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
+from TTS.tts.utils.helpers import generate_path, sequence_mask
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.synthesis import synthesis
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
3 changes: 2 additions & 1 deletion TTS/tts/models/vits.py

@@ -11,6 +11,7 @@
 import torchaudio
 from coqpit import Coqpit
 from librosa.filters import mel as librosa_mel_fn
+from monotonic_alignment_search import maximum_path
 from torch import nn
 from torch.cuda.amp.autocast_mode import autocast
 from torch.nn import functional as F
@@ -28,7 +29,7 @@
 from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
 from TTS.tts.models.base_tts import BaseTTS
 from TTS.tts.utils.fairseq import rehash_fairseq_vits_checkpoint
-from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask
+from TTS.tts.utils.helpers import generate_path, rand_segments, segment, sequence_mask
 from TTS.tts.utils.languages import LanguageManager
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.synthesis import synthesis
74 changes: 0 additions & 74 deletions TTS/tts/utils/helpers.py

@@ -3,13 +3,6 @@
 from scipy.stats import betabinom
 from torch.nn import functional as F
 
-try:
-    from TTS.tts.utils.monotonic_align.core import maximum_path_c
-
-    CYTHON = True
-except ModuleNotFoundError:
-    CYTHON = False
-
 
 class StandardScaler:
     """StandardScaler for mean-scale normalization with the given mean and scale values."""
@@ -168,73 +161,6 @@ def generate_path(duration, mask):
     return path
 
 
-def maximum_path(value, mask):
-    if CYTHON:
-        return maximum_path_cython(value, mask)
-    return maximum_path_numpy(value, mask)
-
-
-def maximum_path_cython(value, mask):
-    """Cython optimised version.
-    Shapes:
-        - value: :math:`[B, T_en, T_de]`
-        - mask: :math:`[B, T_en, T_de]`
-    """
-    value = value * mask
-    device = value.device
-    dtype = value.dtype
-    value = value.data.cpu().numpy().astype(np.float32)
-    path = np.zeros_like(value).astype(np.int32)
-    mask = mask.data.cpu().numpy()
-
-    t_x_max = mask.sum(1)[:, 0].astype(np.int32)
-    t_y_max = mask.sum(2)[:, 0].astype(np.int32)
-    maximum_path_c(path, value, t_x_max, t_y_max)
-    return torch.from_numpy(path).to(device=device, dtype=dtype)
-
-
-def maximum_path_numpy(value, mask, max_neg_val=None):
-    """
-    Monotonic alignment search algorithm
-    Numpy-friendly version. It's about 4 times faster than torch version.
-    value: [b, t_x, t_y]
-    mask: [b, t_x, t_y]
-    """
-    if max_neg_val is None:
-        max_neg_val = -np.inf  # Patch for Sphinx complaint
-    value = value * mask
-
-    device = value.device
-    dtype = value.dtype
-    value = value.cpu().detach().numpy()
-    mask = mask.cpu().detach().numpy().astype(bool)
-
-    b, t_x, t_y = value.shape
-    direction = np.zeros(value.shape, dtype=np.int64)
-    v = np.zeros((b, t_x), dtype=np.float32)
-    x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)
-    for j in range(t_y):
-        v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1]
-        v1 = v
-        max_mask = v1 >= v0
-        v_max = np.where(max_mask, v1, v0)
-        direction[:, :, j] = max_mask
-
-        index_mask = x_range <= j
-        v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
-    direction = np.where(mask, direction, 1)
-
-    path = np.zeros(value.shape, dtype=np.float32)
-    index = mask[:, :, 0].sum(1).astype(np.int64) - 1
-    index_range = np.arange(b)
-    for j in reversed(range(t_y)):
-        path[index_range, index, j] = 1
-        index = index + direction[index_range, index, j] - 1
-    path = path * mask.astype(np.float32)
-    path = torch.from_numpy(path).to(device=device, dtype=dtype)
-    return path
-
-
 def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=1.0):
     P, M = phoneme_count, mel_count
     x = np.arange(0, P)
Empty file.
47 changes: 0 additions & 47 deletions TTS/tts/utils/monotonic_align/core.pyx

This file was deleted.
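With the bundled Cython (core.pyx) and numpy implementations removed, the external package becomes the single code path for monotonic alignment search. A quick property check of its output, assuming it matches the behaviour documented in the deleted helpers (the script is illustrative and not part of this PR): each decoder frame selects exactly one text token, and the selection never moves backwards.

# Illustrative sanity check against the shapes documented in the removed
# helpers: value/mask are [B, T_en, T_de]; the returned path is one-hot per
# decoder frame and monotonically non-decreasing along the text axis.
import torch
from monotonic_alignment_search import maximum_path

value = torch.randn(1, 4, 7)
mask = torch.ones(1, 4, 7)
path = maximum_path(value, mask)

assert path.shape == value.shape
assert torch.all(path.sum(dim=1) == 1)       # one text token per decoder frame
idx = path[0].argmax(dim=0)                  # chosen token index per frame
assert torch.all(idx[1:] >= idx[:-1])        # alignment only moves forward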
