-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdictation_script_only.py
109 lines (93 loc) · 3.49 KB
/
dictation_script_only.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env python3
import sys
import whisper
import sounddevice as sd
import numpy as np
import threading
import time
from pynput.keyboard import Controller
import queue
import argparse
from tempfile import NamedTemporaryFile
import torch
# This script is for model testing purposes. Run directly, speak, and the
# transcribed text will be printed to the console and also typed into the
# focused window via the virtual keyboard.
#
# Stop recording / exit via Ctrl-C
class DictationSystem:
    """Capture microphone audio with sounddevice and transcribe it with Whisper.

    Usage: call start_recording() (blocks, accumulating audio chunks) from a
    context that can be interrupted, then stop_recording() to get the text,
    and optionally type_text() to emit it through a virtual keyboard.
    """

    def __init__(self, model_name="base", device=None):
        """Load the Whisper model and initialize capture state.

        Args:
            model_name: Whisper model size name (e.g. "tiny", "base", "small").
            device: Torch device string ("cuda"/"cpu"); None lets Whisper choose.
        """
        self.model = whisper.load_model(model_name, device=device)
        self.keyboard = Controller()
        self.recording = False
        self.audio_queue = queue.Queue()
        # Fix: initialize here so stop_recording() is safe even if
        # start_recording() was never entered (previously this attribute was
        # only created inside start_recording, risking AttributeError).
        self.audio_data = []
        # Whisper expects 16 kHz mono float32 audio.
        self.sample_rate = 16000
        self.dtype = np.float32

    def callback(self, indata, frames, time_info, status):
        """sounddevice stream callback: queue each captured chunk.

        Fix: third parameter renamed from ``time`` to ``time_info`` so it no
        longer shadows the module-level ``time`` import; sounddevice invokes
        the callback positionally, so the rename is caller-compatible.
        """
        if status:
            print(status, file=sys.stderr)
        if self.recording:
            # Copy the buffer: sounddevice reuses it after the callback returns.
            self.audio_queue.put(indata.copy())

    def start_recording(self):
        """Open the input stream and accumulate audio until recording stops.

        Blocks until self.recording is cleared (e.g. by stop_recording() from
        an interrupt/exception handler).
        """
        self.recording = True
        self.audio_data = []
        with sd.InputStream(callback=self.callback,
                            channels=1,
                            samplerate=self.sample_rate,
                            dtype=self.dtype):
            while self.recording:
                try:
                    # Short timeout keeps the loop responsive to stop requests.
                    self.audio_data.append(self.audio_queue.get(timeout=0.1))
                except queue.Empty:
                    continue

    def stop_recording(self):
        """Stop capture and return the transcription.

        Returns:
            The transcribed text (stripped), or "" if no audio was captured.
        """
        self.recording = False
        if not self.audio_data:
            return ""
        # Combine all queued chunks into one contiguous array.
        audio = np.concatenate(self.audio_data, axis=0)
        print(f"Processing audio: length={len(audio)}, max={np.max(audio)}, min={np.min(audio)}")
        result = self.model.transcribe(audio.flatten(), language="en")
        return result["text"].strip()

    def type_text(self, text):
        """Type *text* plus a trailing space via the virtual keyboard; no-op if empty."""
        if text:
            self.keyboard.type(text + " ")
def _run_session(dictation):
    """Run one dictation session: record until Ctrl+C, then transcribe,
    print, and type the result."""
    print("\nPress Ctrl+C to stop recording and transcribe")
    try:
        # Blocks until interrupted; KeyboardInterrupt ends the session.
        dictation.start_recording()
    except KeyboardInterrupt:
        text = dictation.stop_recording()
        print("\nTranscribed text:", text)
        dictation.type_text(text)


def main():
    """Parse CLI args, report audio devices, then run two dictation sessions.

    Fix: the record/transcribe/print/type sequence was duplicated verbatim
    (copy-paste, with an inconsistent prompt string); it is factored into
    _run_session and executed twice to preserve the original two-session flow.
    """
    parser = argparse.ArgumentParser(description="Whisper-based dictation system")
    parser.add_argument("--model", default="base", choices=["tiny", "base", "small", "medium", "large"],
                        help="Whisper model to use")
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device to run the model on (cuda/cpu)")
    args = parser.parse_args()

    print(f"Initializing dictation system with {args.model} model on {args.device}")
    dictation = DictationSystem(args.model, args.device)

    # Debug output: show what sounddevice can see, to diagnose capture problems.
    devices = sd.query_devices()
    print("\nAvailable audio devices:")
    print(devices)
    print("\nDefault input device:")
    print(sd.query_devices(kind='input'))

    for _ in range(2):
        _run_session(dictation)


if __name__ == "__main__":
    main()