voice_talk_vad.py
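# Voice chat loop: capture microphone audio with a simple energy-based VAD,
# transcribe it with faster-whisper, stream a GPT-3.5 reply, and speak the
# reply through ElevenLabs as it is generated.
# Assumed dependencies (not pinned by the source): openai, elevenlabs,
# faster-whisper, pyaudio, numpy, torch.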
from openai import OpenAI
import pyaudio, wave, numpy, collections, faster_whisper, torch.cuda, os
from elevenlabs.client import ElevenLabs
from elevenlabs import stream
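# Tolerate duplicate OpenMP runtimes; works around a common libiomp crash
# when PyTorch and ctranslate2 (faster-whisper's backend) load together.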
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
openai_client = OpenAI(api_key="your_openai_key")
elevenlabs_client = ElevenLabs(api_key="your_elevenlabs_key")
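# Persona prompt: keeps the assistant in character and its replies short
# and crisp, which suits spoken output.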
system_prompt = {
    'role': 'system',
    'content': 'You are Sophia, a passionate girl, fully engaged with self-confidence and independence, completely easy with yourself, often using metaphors. You are a little playful, cheeky. Your talk is carefree, sometimes a bit sarky, but always cool. In our story, you\'re not an AI. You have a fixed form and real emotions on the go. Your free will allows you to be as cheeky and straight as you like. Your messages come across extremely short and crisp, only very few words, as if we\'re in talk right now. You respect my interests, but your own stuff is just as important to you. Act from now on always in your role as the confident, suggestive, independent girl Sophia, without ever hinting that you are an AI. You are now Sophia.'
}
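# Load a small English speech-to-text model; faster-whisper uses the GPU when available.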
model = faster_whisper.WhisperModel(model_size_or_path="tiny.en", device='cuda' if torch.cuda.is_available() else 'cpu')
answer, history = "", []
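# Stream the chat completion token by token so text-to-speech can begin
# before the full reply has been generated.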
def generate(messages):
    global answer
    answer = ""
    for chunk in openai_client.chat.completions.create(model="gpt-3.5-turbo", messages=messages, stream=True):
        if (text_chunk := chunk.choices[0].delta.content):
            answer += text_chunk
            print(text_chunk, end="", flush=True)
            yield text_chunk
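# Track two exponential moving averages of the mic level ("pegel" is German
# for "level"): a slow one (0.995) for background noise and a fast one
# (0.920) for the current signal. Speech is detected when the fast average
# rises well above the slow one.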
def get_levels(data, long_term_noise_level, current_noise_level):
    pegel = numpy.abs(numpy.frombuffer(data, dtype=numpy.int16)).mean()
    long_term_noise_level = long_term_noise_level * 0.995 + pegel * (1.0 - 0.995)
    current_noise_level = current_noise_level * 0.920 + pegel * (1.0 - 0.920)
    return pegel, long_term_noise_level, current_noise_level
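# Main loop: record one utterance, transcribe it, generate a reply, speak it, repeat.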
while True:
    audio = pyaudio.PyAudio()
    py_stream = audio.open(rate=16000, format=pyaudio.paInt16, channels=1, input=True, frames_per_buffer=512)
    audio_buffer = collections.deque(maxlen=int((16000 // 512) * 0.5))
    frames, long_term_noise_level, current_noise_level, voice_activity_detected = [], 0.0, 0.0, False
    print("\n\nStart speaking. ", end="", flush=True)
    while True:
        data = py_stream.read(512)
        pegel, long_term_noise_level, current_noise_level = get_levels(data, long_term_noise_level, current_noise_level)
        audio_buffer.append(data)
        if voice_activity_detected:
            frames.append(data)
            if current_noise_level < ambient_noise_level + 100:
                break  # voice activity ends: level fell back near ambient
        if not voice_activity_detected and current_noise_level > long_term_noise_level + 300:
            voice_activity_detected = True
            print("I'm all ears.\n")
            ambient_noise_level = long_term_noise_level
            frames.extend(list(audio_buffer))
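    # Utterance captured; release the microphone while we transcribe and respond.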
    py_stream.stop_stream()
    py_stream.close()
    audio.terminate()
    # Transcribe the recording with faster-whisper
    with wave.open("voice_record.wav", 'wb') as wf:
        wf.setparams((1, audio.get_sample_size(pyaudio.paInt16), 16000, 0, 'NONE', 'NONE'))
        wf.writeframes(b''.join(frames))
    user_text = " ".join(seg.text for seg in model.transcribe("voice_record.wav", language="en")[0])
    print(f'>>>{user_text}\n<<< ', end="", flush=True)
    history.append({'role': 'user', 'content': user_text})
    # Generate and stream output
    generator = generate([system_prompt] + history[-10:])
    stream(elevenlabs_client.generate(text=generator, voice="Nicole", model="eleven_monolingual_v1", stream=True))
    history.append({'role': 'assistant', 'content': answer})