From whisper to sensevoice (#482)

* Fix win deps * Fix win docs * Fix indent = 4 * Fix zh docs indent * Update whisper asr: no split * Add sensevoice labeling pipeline * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * allow api accept any ref audio * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update mel-band-roformer * revert to bs-roformer * Add option to save emo * fsmnvad -> silerovad * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update req * max single seg time * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
fishaudio · Aug 19, 2024 · 8702c61 · 8702c61
1 parent 5a842d1
commit 8702c61
Show file tree

Hide file tree

Showing 16 changed files with 1,194 additions and 165 deletions.
diff --git a/docs/zh/index.md b/docs/zh/index.md
@@ -13,8 +13,8 @@
 </div>
 
 !!! warning
-   我们不对代码库的任何非法使用承担任何责任. 请参阅您当地关于 DMCA (数字千年法案) 和其他相关法律法规. <br/>
-   此代码库与所有模型根据 CC-BY-NC-SA-4.0 许可证发布.
+    我们不对代码库的任何非法使用承担任何责任. 请参阅您当地关于 DMCA (数字千年法案) 和其他相关法律法规. <br/>
+    此代码库与所有模型根据 CC-BY-NC-SA-4.0 许可证发布.
 
 <p align="center">
    <img src="../assets/figs/diagram.png" width="75%">
@@ -33,23 +33,23 @@ Windows 非专业用户可考虑以下为免 Linux 环境的基础运行方法
 
 1. 解压项目压缩包。
 2. 点击 `install_env.bat` 安装环境。
-   - 可以通过编辑 `install_env.bat` 的 `USE_MIRROR` 项来决定是否使用镜像站下载。
-   - `USE_MIRROR=false` 使用原始站下载最新稳定版 `torch` 环境。`USE_MIRROR=true` 为从镜像站下载最新 `torch` 环境。默认为 `true`。
-   - 可以通过编辑 `install_env.bat` 的 `INSTALL_TYPE` 项来决定是否启用可编译环境下载。
-   - `INSTALL_TYPE=preview` 下载开发版编译环境。`INSTALL_TYPE=stable` 下载稳定版不带编译环境。
+    - 可以通过编辑 `install_env.bat` 的 `USE_MIRROR` 项来决定是否使用镜像站下载。
+    - `USE_MIRROR=false` 使用原始站下载最新稳定版 `torch` 环境。`USE_MIRROR=true` 为从镜像站下载最新 `torch` 环境。默认为 `true`。
+    - 可以通过编辑 `install_env.bat` 的 `INSTALL_TYPE` 项来决定是否启用可编译环境下载。
+    - `INSTALL_TYPE=preview` 下载开发版编译环境。`INSTALL_TYPE=stable` 下载稳定版不带编译环境。
 3. 若第 2 步 `INSTALL_TYPE=preview` 则执行这一步（可跳过，此步为激活编译模型环境）
-   1. 使用如下链接下载 LLVM 编译器。
-      - [LLVM-17.0.6（原站站点下载）](https://huggingface.co/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true)
-      - [LLVM-17.0.6（镜像站点下载）](https://hf-mirror.com/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true)
-      - 下载完 `LLVM-17.0.6-win64.exe` 后，双击进行安装，选择合适的安装位置，最重要的是勾选 `Add Path to Current User` 添加环境变量。
-      - 确认安装完成。
-   2. 下载安装 Microsoft Visual C++ 可再发行程序包，解决潜在 .dll 丢失问题。
-      - [MSVC++ 14.40.33810.0 下载](https://aka.ms/vs/17/release/vc_redist.x64.exe)
-   3. 下载安装 Visual Studio 社区版以获取 MSVC++ 编译工具, 解决 LLVM 的头文件依赖问题。
-      - [Visual Studio 下载](https://visualstudio.microsoft.com/zh-hans/downloads/)
-      - 安装好 Visual Studio Installer 之后，下载 Visual Studio Community 2022
-      - 如下图点击`修改`按钮，找到`使用C++的桌面开发`项，勾选下载
-   4. 下载安装 [CUDA Toolkit 12](https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64)
+    1. 使用如下链接下载 LLVM 编译器。
+        - [LLVM-17.0.6（原站站点下载）](https://huggingface.co/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true)
+        - [LLVM-17.0.6（镜像站点下载）](https://hf-mirror.com/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true)
+        - 下载完 `LLVM-17.0.6-win64.exe` 后，双击进行安装，选择合适的安装位置，最重要的是勾选 `Add Path to Current User` 添加环境变量。
+        - 确认安装完成。
+    2. 下载安装 Microsoft Visual C++ 可再发行程序包，解决潜在 .dll 丢失问题。
+        - [MSVC++ 14.40.33810.0 下载](https://aka.ms/vs/17/release/vc_redist.x64.exe)
+    3. 下载安装 Visual Studio 社区版以获取 MSVC++ 编译工具, 解决 LLVM 的头文件依赖问题。
+        - [Visual Studio 下载](https://visualstudio.microsoft.com/zh-hans/downloads/)
+        - 安装好 Visual Studio Installer 之后，下载 Visual Studio Community 2022
+        - 如下图点击`修改`按钮，找到`使用C++的桌面开发`项，勾选下载
+    4. 下载安装 [CUDA Toolkit 12](https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64)
 4. 双击 `start.bat` 打开训练推理 WebUI 管理界面. 如有需要，可照下列提示修改`API_FLAGS`.
 
 !!! info "可选"

diff --git a/fish_speech/utils/file.py b/fish_speech/utils/file.py
@@ -1,55 +1,5 @@
 import os
-from glob import glob
 from pathlib import Path
-from typing import Union
-
-from loguru import logger
-from natsort import natsorted
-
-AUDIO_EXTENSIONS = {
-    ".mp3",
-    ".wav",
-    ".flac",
-    ".ogg",
-    ".m4a",
-    ".wma",
-    ".aac",
-    ".aiff",
-    ".aif",
-    ".aifc",
-}
-
-
-def list_files(
-    path: Union[Path, str],
-    extensions: set[str] = None,
-    recursive: bool = False,
-    sort: bool = True,
-) -> list[Path]:
-    """List files in a directory.
-
-    Args:
-        path (Path): Path to the directory.
-        extensions (set, optional): Extensions to filter. Defaults to None.
-        recursive (bool, optional): Whether to search recursively. Defaults to False.
-        sort (bool, optional): Whether to sort the files. Defaults to True.
-
-    Returns:
-        list: List of files.
-    """
-
-    if isinstance(path, str):
-        path = Path(path)
-
-    if not path.exists():
-        raise FileNotFoundError(f"Directory {path} does not exist.")
-
-    files = [file for ext in extensions for file in path.rglob(f"*{ext}")]
-
-    if sort:
-        files = natsorted(files)
-
-    return files
 
 
 def get_latest_checkpoint(path: Path | str) -> Path | None:
@@ -64,56 +14,3 @@ def get_latest_checkpoint(path: Path | str) -> Path | None:
         return None
 
     return ckpts[-1]
-
-
-def load_filelist(path: Path | str) -> list[tuple[Path, str, str, str]]:
-    """
-    Load a Bert-VITS2 style filelist.
-    """
-
-    files = set()
-    results = []
-    count_duplicated, count_not_found = 0, 0
-
-    LANGUAGE_TO_LANGUAGES = {
-        "zh": ["zh", "en"],
-        "jp": ["jp", "en"],
-        "en": ["en"],
-    }
-
-    with open(path, "r", encoding="utf-8") as f:
-        for line in f.readlines():
-            splits = line.strip().split("|", maxsplit=3)
-            if len(splits) != 4:
-                logger.warning(f"Invalid line: {line}")
-                continue
-
-            filename, speaker, language, text = splits
-            file = Path(filename)
-            language = language.strip().lower()
-
-            if language == "ja":
-                language = "jp"
-
-            assert language in ["zh", "jp", "en"], f"Invalid language {language}"
-            languages = LANGUAGE_TO_LANGUAGES[language]
-
-            if file in files:
-                logger.warning(f"Duplicated file: {file}")
-                count_duplicated += 1
-                continue
-
-            if not file.exists():
-                logger.warning(f"File not found: {file}")
-                count_not_found += 1
-                continue
-
-            results.append((file, speaker, languages, text))
-
-    if count_duplicated > 0:
-        logger.warning(f"Total duplicated files: {count_duplicated}")
-
-    if count_not_found > 0:
-        logger.warning(f"Total files not found: {count_not_found}")
-
-    return results
diff --git a/pyproject.toml b/pyproject.toml
@@ -38,9 +38,11 @@ dependencies = [
     "zstandard>=0.22.0",
     "pydub",
     "faster_whisper",
-    "modelscope==1.16.1",
-    "funasr==1.1.2",
+    "modelscope==1.17.1",
+    "funasr==1.1.5",
     "opencc-python-reimplemented==0.1.7",
+    "audio-separator[gpu]==0.18.3",
+    "silero-vad",
 ]
 
 [project.optional-dependencies]

diff --git a/run_cmd.bat b/run_cmd.bat
@@ -29,7 +29,7 @@ set INSTALL_ENV_DIR=%cd%\fishenv\env
 
 
 set PYTHONNOUSERSITE=1
-set PYTHONPATH=
+set PYTHONPATH=%~dp0
 set PYTHONHOME=
 
 

diff --git a/tools/api.py b/tools/api.py
@@ -3,18 +3,19 @@
 import json
 import queue
 import random
+import sys
 import traceback
 import wave
 from argparse import ArgumentParser
 from http import HTTPStatus
 from pathlib import Path
 from typing import Annotated, Literal, Optional
 
-import librosa
 import numpy as np
 import pyrootutils
 import soundfile as sf
 import torch
+import torchaudio
 from kui.asgi import (
     Body,
     HTTPException,
@@ -87,7 +88,18 @@ def load_audio(reference_audio, sr):
         except base64.binascii.Error:
             raise ValueError("Invalid path or base64 string")
 
-    audio, _ = librosa.load(reference_audio, sr=sr, mono=True)
+    waveform, original_sr = torchaudio.load(
+        reference_audio, backend="sox" if sys.platform == "linux" else "soundfile"
+    )
+
+    if waveform.shape[0] > 1:
+        waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+    if original_sr != sr:
+        resampler = torchaudio.transforms.Resample(orig_freq=original_sr, new_freq=sr)
+        waveform = resampler(waveform)
+
+    audio = waveform.squeeze().numpy()
     return audio
 
 

diff --git a/tools/file.py b/tools/file.py
@@ -0,0 +1,108 @@
+from pathlib import Path
+from typing import Union
+
+from loguru import logger
+from natsort import natsorted
+
+# Dotted, lower-case file suffixes for audio formats.
+# NOTE(review): presumably intended as the ``extensions`` argument of
+# ``list_files`` below -- confirm against callers.
+AUDIO_EXTENSIONS = {
+    ".mp3",
+    ".wav",
+    ".flac",
+    ".ogg",
+    ".m4a",
+    ".wma",
+    ".aac",
+    ".aiff",
+    ".aif",
+    ".aifc",
+}
+
+# Dotted, lower-case file suffixes for video containers.
+VIDEO_EXTENSIONS = {
+    ".mp4",
+    ".avi",
+}
+
+
+def list_files(
+    path: Union[Path, str],
+    extensions: set[str] = None,
+    recursive: bool = False,
+    sort: bool = True,
+) -> list[Path]:
+    """List files below a directory, optionally filtered by suffix.
+
+    Args:
+        path (Path | str): Directory to search.
+        extensions (set, optional): Suffixes (including the leading dot) to
+            keep, e.g. ``{".wav", ".mp3"}``. ``None`` (the default) applies
+            no filter and returns every regular file found.
+        recursive (bool, optional): Accepted for interface compatibility.
+            NOTE: this flag is not consulted; the search always descends
+            into sub-directories (``Path.rglob``), as it did before.
+        sort (bool, optional): Whether to natural-sort the result.
+            Defaults to True.
+
+    Returns:
+        list: Matching paths.
+
+    Raises:
+        FileNotFoundError: If ``path`` does not exist.
+    """
+
+    # ``Path(...)`` accepts both ``str`` and ``Path`` inputs.
+    path = Path(path)
+
+    if not path.exists():
+        raise FileNotFoundError(f"Directory {path} does not exist.")
+
+    if extensions is None:
+        # Previously the ``None`` default was iterated directly and raised
+        # ``TypeError``; treat it as "no filter" instead.
+        files = [file for file in path.rglob("*") if file.is_file()]
+    else:
+        files = [file for ext in extensions for file in path.rglob(f"*{ext}")]
+
+    if sort:
+        files = natsorted(files)
+
+    return files
+
+
+def load_filelist(path: Path | str) -> list[tuple[Path, str, list[str], str]]:
+    """
+    Load a Bert-VITS2 style filelist.
+
+    Each line is expected to look like ``filename|speaker|language|text``.
+    Lines with fewer than four ``|``-separated fields are skipped with a
+    warning, as are repeated filenames and files that do not exist on disk.
+
+    Args:
+        path: Path to the UTF-8 encoded filelist.
+
+    Returns:
+        A list of ``(file, speaker, languages, text)`` tuples, where
+        ``languages`` is the list of language codes mapped from the line's
+        language field (e.g. ``"zh"`` -> ``["zh", "en"]``).
+
+    Raises:
+        AssertionError: If a line names a language other than zh/jp/ja/en.
+    """
+
+    files = set()
+    results = []
+    count_duplicated, count_not_found = 0, 0
+
+    LANGUAGE_TO_LANGUAGES = {
+        "zh": ["zh", "en"],
+        "jp": ["jp", "en"],
+        "en": ["en"],
+    }
+
+    with open(path, "r", encoding="utf-8") as f:
+        # Iterate lazily instead of materializing every line up front.
+        for line in f:
+            # maxsplit=3 keeps any further "|" characters inside the text.
+            splits = line.strip().split("|", maxsplit=3)
+            if len(splits) != 4:
+                logger.warning(f"Invalid line: {line}")
+                continue
+
+            filename, speaker, language, text = splits
+            file = Path(filename)
+            language = language.strip().lower()
+
+            # Normalize the ISO code to the "jp" tag used by the mapping.
+            if language == "ja":
+                language = "jp"
+
+            assert language in ["zh", "jp", "en"], f"Invalid language {language}"
+            languages = LANGUAGE_TO_LANGUAGES[language]
+
+            if file in files:
+                logger.warning(f"Duplicated file: {file}")
+                count_duplicated += 1
+                continue
+
+            # Record the path so later repeats are detected; the set was
+            # previously never filled, which made the check above dead code.
+            files.add(file)
+
+            if not file.exists():
+                logger.warning(f"File not found: {file}")
+                count_not_found += 1
+                continue
+
+            results.append((file, speaker, languages, text))
+
+    if count_duplicated > 0:
+        logger.warning(f"Total duplicated files: {count_duplicated}")
+
+    if count_not_found > 0:
+        logger.warning(f"Total files not found: {count_not_found}")
+
+    return results
diff --git a/tools/merge_asr_files.py b/tools/merge_asr_files.py
@@ -4,7 +4,7 @@
 from pydub import AudioSegment
 from tqdm import tqdm
 
-from fish_speech.utils.file import AUDIO_EXTENSIONS, list_files
+from tools.file import AUDIO_EXTENSIONS, list_files
 
 
 def merge_and_delete_files(save_dir, original_files):