Skip to content

Commit

Permalink
Merge pull request #135 from idiap/mas
Browse files Browse the repository at this point in the history
Use external package for monotonic alignment search
  • Loading branch information
eginhard authored Nov 8, 2024
2 parents 59996ff + 683ee66 commit e18f7da
Show file tree
Hide file tree
Showing 13 changed files with 69 additions and 287 deletions.
48 changes: 19 additions & 29 deletions .github/workflows/pypi-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@ defaults:
shell:
bash
jobs:
build-sdist:
name: Build source distribution
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
Expand All @@ -20,37 +19,29 @@ jobs:
if [[ "v$version" != "$tag" ]]; then
exit 1
fi
- uses: actions/setup-python@v5
- name: Install uv
uses: astral-sh/setup-uv@v3
with:
python-version: 3.9
- run: |
python -m pip install -U pip setuptools build
- run: |
python -m build
- run: |
pip install dist/*.tar.gz
- uses: actions/upload-artifact@v4
with:
name: build-sdist
path: dist/*.tar.gz
build-wheels:
name: Build wheels on ${{ matrix.os }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
steps:
- uses: actions/checkout@v4
- name: Build wheels
uses: pypa/[email protected]
version: "0.4.27"
enable-cache: true
cache-dependency-glob: "**/pyproject.toml"
- name: Set up Python
run: uv python install 3.12
- name: Build sdist and wheel
run: uv build
- name: Test installation of sdist and wheel
run: |
uv venv --no-project
uv pip install dist/*.tar.gz
uv pip install dist/*.whl
- uses: actions/upload-artifact@v4
with:
name: build-wheels-${{ matrix.os }}
path: ./wheelhouse/*.whl
name: build
path: dist/*
publish-artifacts:
name: Publish to PyPI
runs-on: ubuntu-latest
needs: [build-sdist, build-wheels]
needs: [build]
environment:
name: release
url: https://pypi.org/p/coqui-tts
Expand All @@ -60,8 +51,7 @@ jobs:
- uses: actions/download-artifact@v4
with:
path: dist
pattern: build-*
merge-multiple: true
pattern: build
- run: |
ls -lh dist/
- name: Publish package distributions to PyPI
Expand Down
75 changes: 0 additions & 75 deletions CODE_OWNERS.rst

This file was deleted.

10 changes: 0 additions & 10 deletions MANIFEST.in

This file was deleted.

3 changes: 2 additions & 1 deletion TTS/tts/layers/delightful_tts/acoustic_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import torch
import torch.nn.functional as F
from coqpit import Coqpit
from monotonic_alignment_search import maximum_path
from torch import nn

from TTS.tts.layers.delightful_tts.conformer import Conformer
Expand All @@ -19,7 +20,7 @@
from TTS.tts.layers.delightful_tts.pitch_adaptor import PitchAdaptor
from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
from TTS.tts.layers.generic.aligner import AlignmentNetwork
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
from TTS.tts.utils.helpers import generate_path, sequence_mask

logger = logging.getLogger(__name__)

Expand Down
3 changes: 2 additions & 1 deletion TTS/tts/models/align_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import torch
from coqpit import Coqpit
from monotonic_alignment_search import maximum_path
from torch import nn
from trainer.io import load_fsspec

Expand All @@ -12,7 +13,7 @@
from TTS.tts.layers.feed_forward.encoder import Encoder
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
from TTS.tts.utils.helpers import generate_path, sequence_mask
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
Expand Down
3 changes: 2 additions & 1 deletion TTS/tts/models/forward_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import torch
from coqpit import Coqpit
from monotonic_alignment_search import maximum_path
from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from trainer.io import load_fsspec
Expand All @@ -14,7 +15,7 @@
from TTS.tts.layers.generic.pos_encoding import PositionalEncoding
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask
from TTS.tts.utils.helpers import average_over_durations, generate_path, sequence_mask
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_avg_energy, plot_avg_pitch, plot_spectrogram
Expand Down
3 changes: 2 additions & 1 deletion TTS/tts/models/glow_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import torch
from coqpit import Coqpit
from monotonic_alignment_search import maximum_path
from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from torch.nn import functional as F
Expand All @@ -13,7 +14,7 @@
from TTS.tts.layers.glow_tts.decoder import Decoder
from TTS.tts.layers.glow_tts.encoder import Encoder
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
from TTS.tts.utils.helpers import generate_path, sequence_mask
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.tokenizer import TTSTokenizer
Expand Down
3 changes: 2 additions & 1 deletion TTS/tts/models/vits.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import torchaudio
from coqpit import Coqpit
from librosa.filters import mel as librosa_mel_fn
from monotonic_alignment_search import maximum_path
from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from torch.nn import functional as F
Expand All @@ -28,7 +29,7 @@
from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.fairseq import rehash_fairseq_vits_checkpoint
from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask
from TTS.tts.utils.helpers import generate_path, rand_segments, segment, sequence_mask
from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.synthesis import synthesis
Expand Down
74 changes: 0 additions & 74 deletions TTS/tts/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,6 @@
from scipy.stats import betabinom
from torch.nn import functional as F

# Optional accelerated backend: try to import the compiled monotonic-alignment
# kernel and record in CYTHON whether it is available, so callers can fall
# back to the NumPy implementation when the extension module is missing.
try:
    from TTS.tts.utils.monotonic_align.core import maximum_path_c

    CYTHON = True
except ModuleNotFoundError:
    # The extension module could not be found; use the NumPy fallback.
    CYTHON = False


class StandardScaler:
"""StandardScaler for mean-scale normalization with the given mean and scale values."""
Expand Down Expand Up @@ -168,73 +161,6 @@ def generate_path(duration, mask):
return path


def maximum_path(value, mask):
    """Run monotonic alignment search with the fastest available backend.

    Dispatches to ``maximum_path_cython`` when the module-level ``CYTHON``
    flag is true, otherwise to ``maximum_path_numpy``.

    Args:
        value: Score tensor, forwarded unchanged to the selected backend.
        mask: Mask tensor, forwarded unchanged to the selected backend.

    Returns:
        Whatever the selected backend returns for ``(value, mask)``.
    """
    if CYTHON:
        return maximum_path_cython(value, mask)
    return maximum_path_numpy(value, mask)


def maximum_path_cython(value, mask):
    """Cython optimised version.

    Shapes:
        - value: :math:`[B, T_en, T_de]`
        - mask: :math:`[B, T_en, T_de]`

    Returns:
        Tensor with the same shape as ``value``, placed on the device and cast
        to the dtype of the masked ``value`` tensor.
    """
    value = value * mask
    # Remember where the result has to go back to before leaving torch.
    device = value.device
    dtype = value.dtype
    # The compiled kernel is fed float32 scores and an int32 output buffer.
    value = value.data.cpu().numpy().astype(np.float32)
    path = np.zeros_like(value).astype(np.int32)
    mask = mask.data.cpu().numpy()

    # Per-item valid lengths, read from the first column / first row of the mask.
    t_x_max = mask.sum(1)[:, 0].astype(np.int32)
    t_y_max = mask.sum(2)[:, 0].astype(np.int32)
    # NOTE(review): maximum_path_c presumably fills `path` in place, since its
    # return value is ignored here -- confirm against the extension source.
    maximum_path_c(path, value, t_x_max, t_y_max)
    return torch.from_numpy(path).to(device=device, dtype=dtype)


def maximum_path_numpy(value, mask, max_neg_val=None):
    """Monotonic alignment search algorithm.

    Numpy-friendly version. It's about 4 times faster than torch version.

    Args:
        value: Score tensor of shape ``[b, t_x, t_y]``.
        mask: Mask tensor of shape ``[b, t_x, t_y]``; non-zero marks valid cells.
        max_neg_val: Score assigned to unreachable cells. Defaults to
            ``-np.inf`` when ``None``.

    Returns:
        Tensor of shape ``[b, t_x, t_y]`` with ones on the selected path and
        zeros elsewhere, on the device and in the dtype of the masked ``value``.
    """
    if max_neg_val is None:
        max_neg_val = -np.inf  # Patch for Sphinx complaint
    value = value * mask

    # Remember where the result has to go back to before leaving torch.
    device = value.device
    dtype = value.dtype
    value = value.cpu().detach().numpy()
    mask = mask.cpu().detach().numpy().astype(bool)

    b, t_x, t_y = value.shape
    direction = np.zeros(value.shape, dtype=np.int64)
    v = np.zeros((b, t_x), dtype=np.float32)
    x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)

    # Forward pass: for each column j keep the best cumulative score per row
    # and record whether it came from the same row (1) or the row above (0).
    for j in range(t_y):
        # v0 is v shifted down by one row, i.e. the score of the row above.
        v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1]
        v1 = v
        max_mask = v1 >= v0
        v_max = np.where(max_mask, v1, v0)
        direction[:, :, j] = max_mask

        # Row x cannot be reached before column x, so those cells stay at
        # max_neg_val.
        index_mask = x_range <= j
        v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
    # Outside the mask always "stay" in the same row while backtracking.
    direction = np.where(mask, direction, 1)

    # Backward pass: start from the last valid row of each item and walk the
    # columns right to left following the recorded directions.
    path = np.zeros(value.shape, dtype=np.float32)
    index = mask[:, :, 0].sum(1).astype(np.int64) - 1
    index_range = np.arange(b)
    for j in reversed(range(t_y)):
        path[index_range, index, j] = 1
        index = index + direction[index_range, index, j] - 1
    # Drop anything that was marked in padded (masked-out) cells.
    path = path * mask.astype(np.float32)
    path = torch.from_numpy(path).to(device=device, dtype=dtype)
    return path


def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=1.0):
P, M = phoneme_count, mel_count
x = np.arange(0, P)
Expand Down
Empty file.
47 changes: 0 additions & 47 deletions TTS/tts/utils/monotonic_align/core.pyx

This file was deleted.

Loading

0 comments on commit e18f7da

Please sign in to comment.