"""Transcribe or translate audio with the OpenAI Whisper API.

Files larger than the API's 25 MB upload limit are split into chunks with
ffmpeg, and the per-chunk results are merged back together.
"""
import glob
import logging
import os
import subprocess

from openai import OpenAI
from openai.types.audio.transcription import Transcription
from openai.types.audio.translation import Translation

client = OpenAI()
logger = logging.getLogger(__name__)


def split_audio(file, split_sec=3600):
    """Split an audio file into chunks of at most split_sec seconds with ffmpeg."""
    logger.debug(f"Splitting {file} into {split_sec} second chunks.")
    root, ext = os.path.splitext(file)
    # Drop any video stream (-vn), re-encode the audio at 50 kb/s, and cut it
    # into numbered segments of split_sec seconds each.
    output = subprocess.run(
        [
            "ffmpeg",
            "-i",
            file,
            "-vn",
            "-b:a",
            "50k",
            "-f",
            "segment",
            "-segment_time",
            str(split_sec),
            f"{root}-%03d{ext}",
        ],
        capture_output=True,
    )
    if output.returncode != 0:
        logger.debug(f"FFmpeg stderr: {output.stderr.decode()}")
        logger.debug(f"FFmpeg stdout: {output.stdout.decode()}")
        raise RuntimeError(
            f"FFmpeg failed to split {file} into {split_sec} second chunks."
        )
    # Return the output filenames, sorted so the chunks stay in playback order
    # (glob.glob makes no ordering guarantee).
    files = sorted(glob.glob(f"{glob.escape(root)}-*{glob.escape(ext)}"))
    logger.debug(f"Split {file} into {len(files)} files.")
    return files


def merge(transcriptions: list[Transcription | Translation], merged_transcription=None):
    """Merge chunked verbose_json results into one transcription dict,
    offsetting each segment's timestamps by the running duration of the
    chunks merged so far."""
    logger.debug(f"Merging {len(transcriptions)} transcriptions.")
    transcription = merged_transcription or {
        "task": transcriptions[0].task,
        "language": transcriptions[0].language,
        "duration": 0.00,
        "text": "",
        "segments": [],
    }
    # Continue numbering after any segments already merged in, so ids stay
    # unique when merge() is called repeatedly with the accumulated result.
    segment_id = len(transcription["segments"])
    for t in transcriptions:
        for s in t.segments:
            transcription["segments"].append(
                {
                    "id": segment_id,
                    # seek is in 10 ms units, hence the factor of 100.
                    "seek": s.seek + transcription["duration"] * 100,
                    "start": s.start + transcription["duration"],
                    "end": s.end + transcription["duration"],
                    "text": s.text,
                    "tokens": s.tokens,
                    "temperature": s.temperature,
                    "avg_logprob": s.avg_logprob,
                    "compression_ratio": s.compression_ratio,
                    "no_speech_prob": s.no_speech_prob,
                }
            )
            segment_id += 1
        transcription["duration"] += t.duration
        transcription["text"] += t.text
    return transcription


def transcribe_audio(file, prompt=None, language=None):
    """Transcribe an audio file with Whisper, splitting it first if it
    exceeds the API's 25 MB upload limit."""
    files = [file]
    if os.path.getsize(file) > 25 * 1024 * 1024:
        files = split_audio(file)
    merged_transcription = None
    for chunk in files:
        logger.debug(f"Transcribing {chunk}.")
        with open(chunk, "rb") as f:
            # Transcribe this chunk with the Whisper model.
            response = client.audio.transcriptions.create(
                model="whisper-1",
                file=f,
                prompt=prompt,
                response_format="verbose_json",
                language=language,
            )
        merged_transcription = merge([response], merged_transcription)
    return merged_transcription


def translate_audio(file, prompt=None):
    """Translate an audio file to English with Whisper, splitting it first
    if it exceeds the API's 25 MB upload limit."""
    files = [file]
    if os.path.getsize(file) > 25 * 1024 * 1024:
        files = split_audio(file)
    merged_translation = None
    for chunk in files:
        logger.debug(f"Translating {chunk}.")
        with open(chunk, "rb") as f:
            # Translate this chunk to English with the Whisper model.
            response = client.audio.translations.create(
                model="whisper-1", file=f, prompt=prompt, response_format="verbose_json"
            )
        merged_translation = merge([response], merged_translation)
    return merged_translation
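

# A minimal usage sketch, not part of the original module: "interview.mp3" is
# a hypothetical default input, and the logging setup is only here so the
# module's debug messages become visible when the file is run directly.
if __name__ == "__main__":
    import json
    import sys

    logging.basicConfig(level=logging.DEBUG)
    path = sys.argv[1] if len(sys.argv) > 1 else "interview.mp3"
    # Print the merged verbose_json-style result as JSON.
    result = transcribe_audio(path)
    json.dump(result, sys.stdout, indent=2, ensure_ascii=False)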