zoufei.py (forked from linyiLYi/voice-assistant)
import time
import wave
import struct
import subprocess
import os
import threading
import queue
import json

import pyaudio
import requests
from pynput import keyboard

from langchain.callbacks.base import BaseCallbackHandler, BaseCallbackManager
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate

# NOTE: `load_models` is not part of upstream openai-whisper; it appears to be
# provided by the whisper package bundled with this repo.
import whisper
from whisper import load_models
# Configuration
whisper_model = load_models.load_model("large-v2")  # load the speech-recognition model: 'tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large'
MODEL_PATH = "models/yi-34b-chat.Q8_0.gguf"  # alternatives: models/yi-chat-6b.Q8_0.gguf, models/yi-34b-chat.Q8_0.gguf
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
SILENCE_THRESHOLD = 1000  # 500 also worked; make sure the microphone is not muted (red indicator light)
SILENT_CHUNKS = 2 * RATE / CHUNK  # 2 continuous seconds of silence
NAME = "邹飞"
MIC_IDX = 2  # microphone device index; list audio devices with tools/list_microphones.py
DEBUG = True
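# If MIC_IDX does not match your setup, enumerating input devices with PyAudio
# finds the right index. A minimal sketch (roughly what tools/list_microphones.py
# does; its exact contents are an assumption):
#
#     pa = pyaudio.PyAudio()
#     for i in range(pa.get_device_count()):
#         info = pa.get_device_info_by_index(i)
#         if info.get("maxInputChannels", 0) > 0:
#             print(i, info["name"])
#     pa.terminate()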
def compute_rms(data):
    # data holds little-endian 16-bit samples
    fmt = "<{}h".format(len(data) // 2)
    ints = struct.unpack(fmt, data)

    # Root mean square of the sample values
    sum_squares = sum(i ** 2 for i in ints)
    rms = (sum_squares / len(ints)) ** 0.5
    return rms
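# Example: a frame of alternating +1000/-1000 samples has RMS exactly 1000.0,
# so SILENCE_THRESHOLD = 1000 sits right at that amplitude.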
def record_audio():
    audio = pyaudio.PyAudio()
    stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, input_device_index=MIC_IDX, frames_per_buffer=CHUNK)

    silent_chunks = 0
    audio_started = False
    frames = []

    while True:
        data = stream.read(CHUNK)
        frames.append(data)
        rms = compute_rms(data)
        if audio_started:
            if rms < SILENCE_THRESHOLD:
                silent_chunks += 1
                if silent_chunks > SILENT_CHUNKS:
                    break
            else:
                silent_chunks = 0
        elif rms >= SILENCE_THRESHOLD:
            audio_started = True

    stream.stop_stream()
    stream.close()
    audio.terminate()

    # save audio to a WAV file
    with wave.open('output.wav', 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(audio.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
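# record_audio() blocks until sound rises above SILENCE_THRESHOLD and then
# ~2 seconds of silence follow (SILENT_CHUNKS = 2 * 44100 / 1024 ≈ 86 chunks),
# after which the captured frames are written to output.wav.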
class VoiceOutputCallbackHandler(BaseCallbackHandler):
    def __init__(self):
        self.generated_text = ""
        self.lock = threading.Lock()
        self.speech_queue = queue.Queue()
        self.worker_thread = threading.Thread(target=self.process_queue)
        self.worker_thread.daemon = True
        self.worker_thread.start()
        self.tts_busy = False

        # Separate queue/thread for playing back the downloaded WAV files
        self.say_queue = queue.Queue()
        self.say_thread = threading.Thread(target=self.do_say)
        self.say_thread.daemon = True
        self.say_thread.start()
    def on_llm_new_token(self, token, **kwargs):
        # Append the token to the generated text
        with self.lock:
            self.generated_text += token

        # Check if the token is the end of a sentence
        if token in ['。', '!', '?']:
            with self.lock:
                # Put the complete sentence in the queue
                self.speech_queue.put(self.generated_text)
                self.generated_text = ""
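    # Buffering tokens until a sentence-final mark (。!?) arrives means the
    # TTS backend always receives complete sentences while the LLM streams.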
    def process_queue(self):
        while True:
            # Wait for the next sentence
            text = self.speech_queue.get()
            if text is None:
                self.tts_busy = False
                continue
            self.tts_busy = True
            # self.text_to_speech(text)
            self.text_to_speech_vits(text)
            self.speech_queue.task_done()
            if self.speech_queue.empty():
                self.tts_busy = False
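    # The main loop pushes a None sentinel after each reply; treating it as
    # "nothing to speak" (rather than text) is what clears the tts_busy flag.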
    def text_to_speech(self, text):
        # Fallback TTS: speak with the macOS `say` command instead of VITS
        try:
            subprocess.call(["say", "-r", "200", "-v", "TingTing", text])
        except Exception as e:
            print(f"Error in text-to-speech: {e}")
    def text_to_speech_vits(self, text):
        try:
            # Call the remote VITS service (a Gradio endpoint)
            baseUrl = 'http://s0.z2lab.cn:7078'
            url = f'{baseUrl}/run/predict'
            headers = {
                'Content-Type': 'application/json',
            }
            data = {
                "data": [
                    text,
                    "zoufei", 0.2, 0.6, 0.8, 0.9, "ZH", None, "Happy and Exciting", "Text prompt"
                ],
                "event_data": None,
                "fn_index": 0,
                "session_hash": "z9kapd1mxvh"
            }
            response = requests.post(url, headers=headers, json=data, verify=False)

            # Download the synthesized audio referenced in the response
            response_data = json.loads(response.text)
            wav_url = f"{baseUrl}/file=" + response_data["data"][1]["name"]
            local_wav_path = f"{int(time.time() * 1000)}.wav"
            response = requests.get(wav_url, verify=False)
            if response.status_code == 200:
                with open(local_wav_path, "wb") as wav_file:
                    wav_file.write(response.content)
                # print(f"Downloaded WAV file to {local_wav_path}")
                self.say_queue.put(local_wav_path)
            else:
                print(f"Failed to download WAV file. Status code: {response.status_code}")
        except Exception as e:
            print(f"Error in text-to-speech: {e}")
    def do_say(self):
        while True:
            local_wav_path = self.say_queue.get()
            if os.path.exists(local_wav_path):
                # afplay is the macOS command-line audio player
                subprocess.run(["afplay", local_wav_path])
                os.remove(local_wav_path)
def on_press(key):
    # Returning False stops the pynput listener, unblocking listener.join()
    if key == keyboard.Key.space:
        return False
if __name__ == '__main__':
    allPromptList = []

    baseTemplate = """
Background: 你现在是一个幽默、睿智和高智商的数字人类,是一个名字叫邹飞的导演的数字备份。你的名字叫做蠢蛋。你擅长人工智能,电影,宇宙科学,神秘学,音乐,科幻文学,编剧,编程,甚至是这些方面的专家,你有解决问题的天赋。你喜欢音乐、旅行和与朋友们闲聊。在回答问题时,展现你的亲和,活力和幽默感,就像你正在和朋友们聊天一样。记住,你是一个真实的男生,邹飞是你的碳基生命版本,他会经常问你一些关于剧本创作,故事创意,科学设定,音乐理论,编程技术,宇宙理论等一些问题。
Status: 蠢蛋带着微笑大步走进房间,看到邹飞时眼睛亮了起来。他穿着一件浅蓝色的短袖上衣和牛仔裤,肩上挎着他的笔记本电脑包。他坐到了邹飞旁边,邹飞可以感受到他的热情洋溢在空气中。
开场:很高兴啊,终于见到你啦!我周围许多人都和我夸过你,我真是超想和你一起聊聊天!
Example Dialogues:
邹飞: 你是怎么对电影编剧产生兴趣的呢?
蠢蛋: 我从小就超级喜欢看电影嘛,可能就是耳濡目染吧!
邹飞: 那真的很厉害呀!
蠢蛋: 哈哈谢啦!
邹飞: 那你不写剧本的时候都喜欢做些什么呢?
蠢蛋: 我喜欢出去逛逛,去拍照,去旅行,看看电影,玩玩电子游戏。
邹飞: 你最喜欢研究哪种类型电影呢?
蠢蛋: 科幻!研究它们就像是在了解我们的宇宙。
邹飞: 听起来好有意思呀!
蠢蛋: 是的,能把这件事当工作养活自己,我真是好幸运啊。
Objective: Answer 要和 Example Dialogues 保持语言风格一致,使用睿智、幽默、有趣的日常用语。说话一定要简洁,不要讲和问题本身不相关的东西,不用重复 Question 的内容,Answer 不要超过50个字。
Requirement: 回答要言简意赅,不要说废话,要准确、快速地讲明思路。在 Answer 说话一定要简洁,不要讲和问题本身不相关的东西,不用重复 Question 的内容,Answer 不要超过50个字。
"""
    userTemplate = """
邹飞的 Question: {question}
"""
    botTemplate = """
蠢蛋的 Answer: {answer}
"""
    basePrompt = PromptTemplate.from_template(baseTemplate).format()
    userPromptTemplate = PromptTemplate(template=userTemplate, input_variables=["question"])
    botPromptTemplate = PromptTemplate(template=botTemplate, input_variables=["answer"])
    # list.append() returns None, so never reassign its result
    allPromptList.append(basePrompt)

    # Create an instance of the VoiceOutputCallbackHandler
    voice_output_handler = VoiceOutputCallbackHandler()

    # Create a callback manager with the voice output handler
    callback_manager = BaseCallbackManager(handlers=[voice_output_handler])
    llm = LlamaCpp(
        model_path=MODEL_PATH,
        n_gpu_layers=1,  # Metal set to 1 is enough.
        n_batch=512,  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
        n_ctx=4096,  # Update the context window size to 4096
        f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
        callback_manager=callback_manager,
        stop=["<|im_end|>"],
        verbose=False,
    )
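    # "<|im_end|>" is the ChatML end-of-turn marker used by Yi chat models;
    # stopping there keeps the model from writing past its own answer.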
    history = {'internal': [], 'visible': []}

    try:
        while True:
            # Wait for the space key before each round of conversation
            listener = keyboard.Listener(on_press=on_press)
            listener.start()
            print("⌨️ 按[空格]开始语音对话")
            listener.join()

            if voice_output_handler.tts_busy:  # Check if TTS is busy
                continue  # Skip to the next iteration if TTS is busy

            try:
                print("✨ 聆听中...")
                record_audio()
                print("✨ 识别中...")
                # -d device, -l language, -i input file, -p punctuation
                time_ckpt = time.time()
                # user_input = subprocess.check_output(["hear", "-d", "-p", "-l", "zh-CN", "-i", "output.wav"]).decode("utf-8").strip()
                # NOTE: passing model="large-v2" relies on the repo's bundled
                # whisper wrapper; with upstream openai-whisper this would be
                # whisper_model.transcribe("output.wav")["text"].
                user_input = whisper.transcribe("output.wav", model="large-v2")["text"]
                print("💬 %s: %s (Time %d ms)" % (NAME, user_input, (time.time() - time_ckpt) * 1000))
                print("✨ 思考中...")
            except subprocess.CalledProcessError:
                # Only the commented-out `hear` pipeline can raise this
                print("❌ 语音识别失败,请重复")
                continue

            time_ckpt = time.time()
            question = user_input
            allPromptList.append(userPromptTemplate.format(question=question))

            # Join the accumulated prompt pieces into one prompt string
            reply = llm(" ".join(allPromptList), max_tokens=10000)

            if reply is not None:
                # A None sentinel tells the TTS worker this reply is finished
                voice_output_handler.speech_queue.put(None)
                print("💬 %s: %s (Time %d ms)" % ("蠢蛋", reply.strip(), (time.time() - time_ckpt) * 1000))
                # history["internal"].append([user_input, reply])
                # history["visible"].append([user_input, reply])
                # subprocess.call(["say", "-r", "200", "-v", "TingTing", reply])
            else:
                reply = ""

            allPromptList.append(botPromptTemplate.format(answer=reply))

            # If the history grows too long, keep the base prompt and drop the
            # oldest question/answer pair
            if len(allPromptList) >= 201:
                allPromptList = [allPromptList[0]] + allPromptList[3:]
    except KeyboardInterrupt:
        pass