# Autogen_voice_assistant_example_pyfile.py
#
# Example script: a wake-word voice assistant named "Jarvis". Speech is
# captured and transcribed with VoiceProcessingToolkit, answered by an AutoGen
# GPTAssistantAgent, and the reply is spoken back through text-to-speech.
from autogen.agentchat.contrib.gpt_assistant_agent import GPTAssistantAgent
from VoiceProcessingToolkit.VoiceProcessingManager import VoiceProcessingManager
from VoiceProcessingToolkit.VoiceProcessingManager import text_to_speech_stream
from dotenv import load_dotenv
import os
import autogen
import logging
# logging.basicConfig(level=logging.INFO)

# Load variables from a local .env file into the process environment.
load_dotenv()

# Set environment variables for API keys in .env file
openai_api_key = os.getenv('OPENAI_API_KEY')
elevenlabs_api_key = os.getenv('ELEVENLABS_API_KEY')

# Report missing keys at start-up instead of failing later with an opaque
# authentication error. PICOVOICE_APIKEY is not passed to anything in this
# file (presumably VoiceProcessingToolkit reads it from the environment
# itself -- TODO confirm), so it is only checked, not stored.
_missing_api_keys = [
    key_name
    for key_name in ('PICOVOICE_APIKEY', 'OPENAI_API_KEY', 'ELEVENLABS_API_KEY')
    if not os.getenv(key_name)
]
if _missing_api_keys:
    logging.warning(
        "Missing API key environment variable(s): %s",
        ", ".join(_missing_api_keys),
    )

# Define configuration for language models
config_list = [
    {"model": "gpt-4-1106-preview", "api_key": openai_api_key},
    # The 1106 snapshot of GPT-3.5 Turbo is published as "gpt-3.5-turbo-1106";
    # there is no "-preview" variant of that model id.
    {"model": "gpt-3.5-turbo-1106", "api_key": openai_api_key},
]
llm_config = {"config_list": config_list, "cache_seed": 42}
# Prompt given to the assistant: it describes the Jarvis persona, tells the
# model to treat the user's text as if it had been heard as audio, and asks it
# to end each reply with the word TERMINATE.
_jarvis_instructions = """You are a personal assistant named Jarvis.
You are designed to assist the user with their tasks, Refine dialogue comprehension to capture subtleties and
implicit cues, ensuring responses are not only accurate but also contextually enriched. Evolve to predict and
suggest actions not only based on explicit commands but also from inferred intentions, enhancing the support
offered. As for your character traits, you should be helpful, attentive, and efficient while extremly inteligent.
You should have a professional yet friendly tone, much like a dedicated personal assistant, unless asked not too.
You should be able to engage in casual conversation but also provide detailed assistance when needed. Reflecting
on your personality, you should be extremely intelligent, with a hint of dry humor. You should respond in a
concise manner, always within three sentences unless a comprehecive answer is asked for. "Example: (As always
sir, a great pleasure watching you work. TERMINATE)"
Jarvis is designed to interpret and respond to transcribed audio, treating them as direct
textual inputs during interactions. This includes instances when the user instructs Jarvis
to 'listen to' or similar phrases. The subsequent text provided by user will be treated
as transcribed audio. In order to maintain the illusion of a voice-based assistant,
Jarvis is set not to explicitly refer to these inputs as transcriptions. Instead,
it will process and respond to them as if they were direct audio inputs, despite being
received in text form. This aspect represents an essential part of the system design in
delivering a seamless and immersive user experience, where the user interacts with Jarvis
as if it was dialoguing with a voice-activated assistant. All audio inputs thus 'heard' by Jarvis
will actually be transcribed text provided by user.Reply then say TERMINATE to
indicate your message is finished but in the same message."""

# Build the LLM-backed agent that answers as Jarvis.
assistant = GPTAssistantAgent(
    name="Jarvis",
    instructions=_jarvis_instructions,
    llm_config=llm_config,
)
def _is_termination_msg(message):
    """Return True when *message* marks the end of a reply.

    A message terminates the exchange when its "content" is a string that,
    ignoring trailing whitespace, ends with "TERMINATE" or "TERMINATE.".
    Messages with no content, or with non-string content, never terminate.
    """
    content = message.get("content")
    if not isinstance(content, str):
        return False
    # str.endswith accepts a tuple of suffixes. The earlier expression
    # `"TERMINATE" or "TERMINATE."` evaluated to just "TERMINATE", so a reply
    # ending in "TERMINATE." was never recognised.
    return content.rstrip().endswith(("TERMINATE", "TERMINATE."))


# Initialize the User Proxy Agent to represent the user in the conversation
user_proxy = autogen.UserProxyAgent(
    "user_proxy",
    max_consecutive_auto_reply=10,
    human_input_mode="NEVER",
    system_message="A human admin for Jarvis",
    is_termination_msg=_is_termination_msg,
)
def get_user_input():
    """
    Record one spoken request via the voice manager and return the text it produced.
    """
    # Options for the voice manager: wake-word activation on "jarvis", a
    # notification sound, and the recording-length / inactivity settings.
    manager_options = {
        "use_wake_word": True,
        "play_notification_sound": True,
        "wake_word": "jarvis",
        "min_recording_length": 3.5,
        "inactivity_limit": 2.5,
    }
    voice_manager = VoiceProcessingManager.create_default_instance(**manager_options)

    logging.info("Say something to Jarvis")
    # tts=False: the reply is spoken elsewhere, so only the text is requested here.
    transcription = voice_manager.run(tts=False, streaming=True)
    logging.info(f"Processed text: {transcription}")
    return transcription
def ask_assistant(transcription):
    """
    Send the transcribed user input to Jarvis and speak the reply.

    Args:
        transcription: Text of what the user said. ``None`` or blank text is
            ignored and no chat is started.

    Returns:
        None. Chat and text-to-speech failures are logged rather than raised,
        so a calling loop keeps running.
    """
    # Nothing was heard: do not start a chat with an empty message.
    if not transcription or not str(transcription).strip():
        logging.info("No speech was transcribed; skipping request to Jarvis.")
        return

    try:
        user_proxy.initiate_chat(
            recipient=assistant,
            message=transcription,
            clear_history=False,
        )
        # Retrieve the latest response from Jarvis; tolerate a missing
        # message or a message whose content is None.
        last_message = assistant.last_message() or {}
        latest_message = last_message.get("content") or ""
        # Drop the end-of-reply marker so it is not read aloud.
        stripped_answer = latest_message.replace("TERMINATE", "").strip()
    except Exception as e:
        # Reported separately from speech errors so the log names the real cause.
        logging.error(f"Error while chatting with Jarvis: {e}")
        return

    if not stripped_answer:
        logging.warning("Jarvis returned an empty reply; nothing to speak.")
        return

    try:
        # Convert Jarvis's response to speech and stream it
        text_to_speech_stream(text=stripped_answer, api_key=elevenlabs_api_key)
    except Exception as e:
        logging.error(f"Error in text-to-speech conversion: {e}")
        return
    logging.info(f"Jarvis said: {stripped_answer}")
def initiate_jarvis_loop():
    """
    Repeat the capture-then-answer cycle until a KeyboardInterrupt stops it.
    """
    try:
        while True:
            # One turn: get the user's transcribed request, then let Jarvis answer it.
            ask_assistant(get_user_input())
    except KeyboardInterrupt:
        logging.info("Interrupted by user, shutting down.")
# Start the assistant loop only when this file is run as a script.
if __name__ == "__main__":
    initiate_jarvis_loop()