From fc5b9f5a2cd1cdffb8885a44335d4e76fcbe782a Mon Sep 17 00:00:00 2001
From: paidax <40118038+kenwaytis@users.noreply.github.com>
Date: Thu, 21 Dec 2023 10:42:23 +0800
Subject: [PATCH] 1.Supports FLAC, WAV, MP3 2.Fixed conversion path issue.
 (#22)

* 1.Supports FLAC, WAV, MP3 2.Fixed conversion path issue.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* 1.Use list_files to filter audio 2.Use the click library 3.Implement sample rate conversion.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 tools/whisper_asr.py | 135 ++++++++++++++++++------------------------
 1 file changed, 57 insertions(+), 78 deletions(-)

diff --git a/tools/whisper_asr.py b/tools/whisper_asr.py
index 5d5a854f..24894a07 100644
--- a/tools/whisper_asr.py
+++ b/tools/whisper_asr.py
@@ -21,42 +21,18 @@
 
 Note: Be aware of your audio sample rate, which defaults to 44.1kHz.
 """
-
-import argparse
-import os
 from pathlib import Path
 
-import librosa
-import numpy as np
+import click
 import whisper
-from scipy.io import wavfile
-from tqdm import tqdm
-
-
-def load_and_normalize_audio(filepath, target_sr):
-    wav, sr = librosa.load(filepath, sr=None, mono=True)
-    wav, _ = librosa.effects.trim(wav, top_db=20)
-    peak = np.abs(wav).max()
-    if peak > 1.0:
-        wav /= peak / 0.98
-    return librosa.resample(wav, orig_sr=sr, target_sr=target_sr), target_sr
-
+from pydub import AudioSegment
+from tqdm import tqdm
 
-def transcribe_audio(model, filepath):
-    return model.transcribe(
-        filepath, word_timestamps=True, task="transcribe", beam_size=5, best_of=5
-    )
+from fish_speech.utils.file import list_files
 
 
-def save_audio_segments(segments, wav, sr, save_path):
-    for i, seg in enumerate(segments):
-        start_time, end_time = seg["start"], seg["end"]
-        wav_seg = wav[int(start_time * sr) : int(end_time * sr)]
-        wav_seg_name = f"{save_path.stem}_{i}.wav"
-        out_fpath = save_path / wav_seg_name
-        wavfile.write(
-            out_fpath, rate=sr, data=(wav_seg * np.iinfo(np.int16).max).astype(np.int16)
-        )
+def transcribe_audio(model, filepath, language):
+    return model.transcribe(filepath, language=language)
 
 
 def transcribe_segment(model, filepath):
@@ -70,57 +46,60 @@ def transcribe_segment(model, filepath):
     return result.text, lang
 
 
-def process_output(save_dir, language, out_file):
-    with open(out_file, "w", encoding="utf-8") as wf:
-        ch_name = save_dir.stem
-        for file in save_dir.glob("*.lab"):
-            with open(file, "r", encoding="utf-8") as perFile:
-                line = perFile.readline().strip()
-                result = (
-                    f"{save_dir}/{ch_name}/{file.stem}.wav|{ch_name}|{language}|{line}"
-                )
-                wf.write(f"{result}\n")
-
-
+def load_audio(file_path, file_suffix):
+    try:
+        if file_suffix == ".wav":
+            audio = AudioSegment.from_wav(file_path)
+        elif file_suffix == ".mp3":
+            audio = AudioSegment.from_mp3(file_path)
+        elif file_suffix == ".flac":
+            audio = AudioSegment.from_file(file_path, format="flac")
+        return audio
+    except Exception as e:
+        print(f"Error processing file {file_path}: {e}")
+        return None
+
+
+@click.command()
+@click.option("--model_size", default="large", help="Size of the Whisper model")
+@click.option("--audio_dir", required=True, help="Directory containing audio files")
+@click.option(
+    "--save_dir", required=True, help="Directory to save processed audio files"
+)
+@click.option("--language", default="ZH", help="Language of the transcription")
+@click.option("--out_sr", default=44100, type=int, help="Output sample rate")
 def main(model_size, audio_dir, save_dir, out_sr, language):
+    print("Loading/Downloading OpenAI Whisper model...")
     model = whisper.load_model(model_size)
-    audio_dir, save_dir = Path(audio_dir), Path(save_dir)
-    save_dir.mkdir(exist_ok=True)
-
-    for filepath in tqdm(list(audio_dir.glob("*.wav")), desc="Processing files"):
-        wav, sr = load_and_normalize_audio(filepath, out_sr)
-        transcription = transcribe_audio(model, filepath)
-        save_path = save_dir / filepath.stem
-        save_audio_segments(transcription["segments"], wav, sr, save_path)
-
-        for segment_file in tqdm(
-            list(save_path.glob("*.wav")), desc="Transcribing segments"
-        ):
-            text, _ = transcribe_segment(model, segment_file)
-            with open(segment_file.with_suffix(".lab"), "w", encoding="utf-8") as f:
+    save_path = Path(save_dir)
+    save_path.mkdir(parents=True, exist_ok=True)
+    audio_files = list_files(
+        path=audio_dir, extensions=[".wav", ".mp3", ".flac"], recursive=True
+    )
+    for file_path in tqdm(audio_files, desc="Processing audio file"):
+        file_stem = file_path.stem
+        file_suffix = file_path.suffix
+        file_path = str(file_path)
+        audio = load_audio(file_path, file_suffix)
+        if not audio:
+            continue
+        transcription = transcribe_audio(model, file_path, language)
+        for segment in transcription.get("segments", []):
+            print(segment)
+            id, text, start, end = (
+                segment["id"],
+                segment["text"],
+                segment["start"],
+                segment["end"],
+            )
+            extract = audio[int(start * 1000) : int(end * 1000)].set_frame_rate(out_sr)
+            extract.export(
+                save_path / f"{file_stem}_{id}{file_suffix}",
+                format=file_suffix.lower().strip("."),
+            )
+            with open(save_path / f"{file_stem}_{id}.lab", "w", encoding="utf-8") as f:
                 f.write(text)
 
-    # process_output(save_dir, language, save_dir / "output.txt") # Dont need summarize to one file
-
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Audio Transcription with Whisper")
-    parser.add_argument(
-        "--model_size", type=str, default="large", help="Size of the Whisper model"
-    )
-    parser.add_argument(
-        "--audio_dir", type=str, required=True, help="Directory containing audio files"
-    )
-    parser.add_argument(
-        "--save_dir",
-        type=str,
-        required=True,
-        help="Directory to save processed audio files",
-    )
-    parser.add_argument(
-        "--language", type=str, default="ZH", help="Language of the transcription"
-    )
-    parser.add_argument("--out_sr", type=int, default=44100, help="Output sample rate")
-    args = parser.parse_args()
-
-    main(args.model_size, args.audio_dir, args.save_dir, args.out_sr, args.language)
+    main()
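
Note (not part of the patch): after this change the tool is driven by click,
e.g. python tools/whisper_asr.py --audio_dir RAW --save_dir OUT, with
hypothetical directory names and the remaining options left at their defaults.
The per-segment export in main() relies on pydub indexing AudioSegment objects
by milliseconds; a minimal standalone sketch of that technique, with
hypothetical file names and timestamps:

    from pydub import AudioSegment

    audio = AudioSegment.from_wav("speaker.wav")       # hypothetical input file
    start, end = 1.25, 3.80                            # Whisper reports segment times in seconds
    clip = audio[int(start * 1000) : int(end * 1000)]  # pydub slices in milliseconds
    clip = clip.set_frame_rate(44100)                  # resample, as main() does with --out_sr
    clip.export("speaker_0.wav", format="wav")         # mirrors f"{file_stem}_{id}{file_suffix}"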