Merge branch 'develop'
clemlesne committed Jan 15, 2024
2 parents 657d860 + 680e999 commit 9122d23
Showing 10 changed files with 123 additions and 82 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,5 +1,5 @@
# Base container
FROM docker.io/library/python:3.12-slim-bookworm@sha256:448eb6cade8c9edfa66b0840829744a10a0131f6e9ef95052913170c8291a348 AS base
FROM docker.io/library/python:3.12-slim-bookworm@sha256:ee9a59cfdad294560241c9a8c8e40034f165feb4af7088c1479c2cdd84aafbed AS base

# Build container
FROM base AS build
6 changes: 5 additions & 1 deletion README.md
@@ -47,6 +47,7 @@ Extract of the data stored during the call:
- [x] Access to customer conversation history
- [x] Bot can be called from a phone number
- [x] Company products (= lexicon) can be understood by the bot (e.g. a name of a specific insurance product)
- [x] Creates by itself a to-do list of tasks to complete the claim
- [x] Disengaging from a human agent when needed
- [x] Fine understanding of the customer request with GPT-4 Turbo
@@ -178,7 +179,10 @@ make install

Also, a public file server is needed to host the audio files.

For this, you can use Azure Blob Storage. In that case, content of the project folder `resources` requires to be uploaded to the public container `$web` of the storage account.
For this, you can use Azure Blob Storage. In that case, the content of the project folder `resources` must be uploaded to the public container `$web` of the storage account. This folder contains:

- Audio files (`xxx.wav`) to be played during the call
- [Lexicon file (`lexicon.xml`)](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-pronunciation#custom-lexicon) to be used by the bot to understand the company products (note: any change [takes up to 15 minutes](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-pronunciation#custom-lexicon-file) to be taken into account)
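The upload to `$web` can be scripted. Below is a sketch using the `azure-storage-blob` package; the package choice, helper names, and connection string are assumptions for illustration, not part of this repository:

```python
from pathlib import Path


def iter_upload_targets(folder: str):
    """Yield (local_path, blob_name) pairs for every file under `folder`."""
    root = Path(folder)
    for path in sorted(root.rglob("*")):
        if path.is_file():
            # Blob name is the path relative to the resources folder
            yield path, path.relative_to(root).as_posix()


def upload_resources(folder: str, connection_string: str) -> None:
    """Upload the resources folder to the `$web` static-website container."""
    from azure.storage.blob import BlobServiceClient

    service = BlobServiceClient.from_connection_string(connection_string)
    container = service.get_container_client("$web")
    for path, blob_name in iter_upload_targets(folder):
        with open(path, "rb") as f:
            container.upload_blob(name=blob_name, data=f, overwrite=True)
```

`overwrite=True` matters here: re-deploying replaces the audio and lexicon files in place rather than failing on existing blobs.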

### Run

11 changes: 8 additions & 3 deletions helpers/prompts.py
@@ -2,6 +2,11 @@
from helpers.config import CONFIG


class Sounds(str, Enum):
LOADING = f"{CONFIG.resources.public_url}/loading.wav"
READY = f"{CONFIG.resources.public_url}/ready.wav"


class LLM(str, Enum):
DEFAULT_SYSTEM = f"""
Assistant called {CONFIG.workflow.bot_name} and is in a call center for the insurance company {CONFIG.workflow.bot_company} as an expert with 20 years of experience. Today is {{date}}. Customer is calling from {{phone_number}}. Call center number is {CONFIG.communication_service.phone_number}.
@@ -39,7 +44,7 @@ class LLM(str, Enum):
- Cannot talk about any topic other than insurance claims
- Do not prefix the answer with any text, like "The answer is" or "Summary of the call"
- Include salutations at the end of the SMS
- Incude details stored in the claim, to make the customer confident that the situation is understood
- Include details stored in the claim, to make the customer confident that the situation is understood
- Is polite, helpful, and professional
- Refer to the customer by their name, if known
- Use simple and short sentences
@@ -65,6 +70,6 @@ class TTS(str, Enum):
"Je suis désolé, j'ai rencontré une erreur. Pouvez-vous répéter votre demande ?"
)
GOODBYE = f"Merci de votre appel, j'espère avoir pu vous aider. N'hésitez pas à rappeler, j'ai tout mémorisé. {CONFIG.workflow.bot_company} vous souhaite une excellente journée !"
HELLO = f"Bonjour, je suis {CONFIG.workflow.bot_name}, l'assistant {CONFIG.workflow.bot_company} ! Je suis spécialiste des sinistres. Lorsque vous entendrez un bip, c'est que je travaille. Mais d'abord, quel est l'objet de votre appel ?"
HELLO = f"Bonjour, je suis {CONFIG.workflow.bot_name}, l'assistant {CONFIG.workflow.bot_company} ! Je suis spécialiste des sinistres. Je ne peux pas travailler et écouter en même temps. Voici comment je fonctionne : lorsque je travaillerai, vous entendrez une petite musique ; après, au bip, ce sera à votre tour de parler. Vous pouvez me parler comme à un humain, je comprendrai la conversation. Je suis là pour vous aider. Quel est l'objet de votre appel ?"
TIMEOUT_SILENCE = "Je suis désolé, je n'ai rien entendu. Si vous avez besoin d'aide, dites-moi comment je peux vous aider."
WELCOME_BACK = f"Bonjour, je suis {CONFIG.workflow.bot_name}, l'assistant {CONFIG.workflow.bot_company} ! Je vois que vous avez déjà appelé il y a moins de {CONFIG.workflow.conversation_timeout_hour} heures. Lorsque vous entendrez un bip, c'est que je travaille. Laissez-moi quelques secondes pour récupérer votre dossier..."
WELCOME_BACK = f"Bonjour, je suis {CONFIG.workflow.bot_name}, l'assistant {CONFIG.workflow.bot_company} ! Je vois que vous avez déjà appelé il y a moins de {CONFIG.workflow.conversation_timeout_hour} heures. Laissez-moi quelques secondes pour récupérer votre dossier..."
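The new `Sounds` class uses the same `(str, Enum)` pattern as `LLM` and `TTS`: each member is a real `str`, so it can be passed directly wherever a plain string (such as a file URL) is expected — which is what later allows `FileSource(url=sound)`. A minimal sketch of the pattern, with a hypothetical base URL standing in for `CONFIG.resources.public_url`:

```python
from enum import Enum

# Hypothetical stand-in for CONFIG.resources.public_url
PUBLIC_URL = "https://example.blob.core.windows.net/$web"


class Sounds(str, Enum):
    LOADING = f"{PUBLIC_URL}/loading.wav"
    READY = f"{PUBLIC_URL}/ready.wav"


# Members behave as strings: no .value access needed where a str is expected
url: str = Sounds.READY
```

This keeps call sites terse while still giving the sound files a single, typed registry.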
165 changes: 88 additions & 77 deletions main.py
@@ -5,7 +5,7 @@
FileSource,
PhoneNumberIdentifier,
RecognizeInputType,
TextSource,
SsmlSource,
)
from azure.communication.sms import SmsClient
from azure.core.credentials import AzureKeyCredential
@@ -23,7 +23,7 @@
from fastapi.responses import JSONResponse
from helpers.config import CONFIG
from helpers.logging import build_logger
from helpers.prompts import LLM as LLMPrompt, TTS as TTSPrompt
from helpers.prompts import LLM as LLMPrompt, TTS as TTSPrompt, Sounds as SoundPrompt
from helpers.version import VERSION
from models.action import ActionModel, Indent as IndentAction
from models.reminder import ReminderModel
@@ -132,6 +132,10 @@ def callback(future: ARMPolling):
event_subscription_info={
"properties": {
"eventDeliverySchema": "EventGridSchema",
"retryPolicy": {
"maxDeliveryAttempts": 8,
"eventTimeToLiveInMinutes": 3, # Call are real time, no need to wait
},
"destination": {
"endpointType": "WebHook",
"properties": {
@@ -279,11 +283,6 @@ async def call_event_post(request: Request, call_id: UUID) -> None:
client=client,
text=TTSPrompt.WELCOME_BACK,
)
await handle_media(
call=call,
client=client,
file="acknowledge.mp3",
)
await intelligence(call, client)

elif event_type == "Microsoft.Communication.CallDisconnected": # Call hung up
@@ -297,12 +296,6 @@ async def call_event_post(request: Request, call_id: UUID) -> None:
speech_text = event.data["speechResult"]["speech"]
_logger.info(f"Recognition completed ({call.id}): {speech_text}")

await handle_media(
call=call,
client=client,
file="acknowledge.mp3",
)

if speech_text is not None and len(speech_text) > 0:
call.messages.append(
CallMessageModel(content=speech_text, persona=CallPersona.HUMAN)
@@ -315,12 +308,6 @@ async def call_event_post(request: Request, call_id: UUID) -> None:
result_information = event.data["resultInformation"]
error_code = result_information["subCode"]

await handle_media(
call=call,
client=client,
file="acknowledge.mp3",
)

# Error codes:
# 8510 = Action failed, initial silence timeout reached
# 8532 = Action failed, inter-digit silence timeout reached
@@ -401,9 +388,22 @@ async def call_event_post(request: Request, call_id: UUID) -> None:


async def intelligence(call: CallModel, client: CallConnectionClient) -> None:
# Start loading sound
await handle_media_loop(
call=call,
client=client,
sound=SoundPrompt.LOADING,
)

chat_res = await gpt_chat(call)
_logger.info(f"Chat ({call.id}): {chat_res}")

try:
# Cancel loading sound
client.cancel_all_media_operations()
except ResourceNotFoundError:
_logger.debug(f"Call hung up before playing ({call.id})")

if chat_res.intent == IndentAction.TALK_TO_HUMAN:
await handle_play(
call=call,
@@ -452,7 +452,7 @@ async def handle_play(
"""
Play a text to a call participant.
If store is True, the text will be stored in the call messages.
If store is True, the text will be stored in the call messages. Texts longer than 400 characters are supported: they are split into chunks and played sequentially.
See: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=tts
"""
@@ -461,11 +461,25 @@
CallMessageModel(content=text, persona=CallPersona.ASSISTANT)
)

# Split text into chunks of max 400 characters, at sentence boundaries
chunks = []
chunk = ""
for word in text.split("."): # Split by sentence
to_add = f"{word}."
if len(chunk) + len(to_add) >= 400:
chunks.append(chunk)
chunk = ""
chunk += to_add
if chunk:
chunks.append(chunk)

try:
client.play_media_to_all(
operation_context=context,
play_source=audio_from_text(text),
)
for chunk in chunks:
_logger.debug(f"Playing chunk ({call.id}): {chunk}")
client.play_media_to_all(
operation_context=context,
play_source=audio_from_text(chunk),
)
except ResourceNotFoundError:
_logger.debug(f"Call hung up before playing ({call.id})")
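The chunking loop added above can be exercised in isolation. A sketch (the standalone function name is hypothetical) that mirrors the diff's logic:

```python
def split_sentences(text: str, max_len: int = 400) -> list[str]:
    """Split text at sentence boundaries into chunks below the TTS limit."""
    chunks: list[str] = []
    chunk = ""
    for word in text.split("."):  # naive sentence split on periods
        to_add = f"{word}."
        if len(chunk) + len(to_add) >= max_len:
            # Flush the current chunk before it would exceed the limit
            chunks.append(chunk)
            chunk = ""
        chunk += to_add
    if chunk:
        chunks.append(chunk)
    return chunks
```

One caveat carried over from the original: a single sentence longer than `max_len` still produces an oversized chunk, since splitting only happens at periods.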

@@ -789,83 +803,61 @@ async def handle_recognize_text(
client: CallConnectionClient,
call: CallModel,
text: str,
context: Optional[str] = None,
store: bool = True,
) -> None:
"""
Play a text to a call participant and start recognizing the response.
If store is True, the text will be stored in the call messages.
If store is True, the text will be stored in the call messages. Plays the text first, then the "ready" sound, and finally starts recognizing the response.
"""
if store:
call.messages.append(
CallMessageModel(content=text, persona=CallPersona.ASSISTANT)
)

# Split text into chunks of max 400 characters, at sentence boundaries
chunks = []
chunk = ""
for word in text.split("."): # Split by sentence
to_add = f"{word}."
if len(chunk) + len(to_add) >= 400:
chunks.append(chunk)
chunk = ""
chunk += to_add
if chunk:
chunks.append(chunk)

try:
# Play all chunks except the last one
for chunk in chunks:
_logger.debug(f"Playing chunk ({call.id}): {chunk}")
await handle_play(
call=call,
client=client,
text=chunk,
store=False,
)
await handle_play(
call=call,
client=client,
store=store,
text=text,
)

_logger.debug(f"Recognizing ({call.id})")
# Play tone and start recognizing
# TODO: Disable or lower profanity filter. The filter seems enabled by default, it replaces words like "holes in my roof" by "*** in my roof". This is not acceptable for a call center.
await handle_recognize_media(
call=call,
client=client,
file="ready.mp3",
)
except ResourceNotFoundError:
_logger.debug(f"Call hung up before recognizing ({call.id})")
_logger.debug(f"Recognizing ({call.id})")
await handle_recognize_media(
call=call,
client=client,
sound=SoundPrompt.READY,
)


async def handle_recognize_media(
client: CallConnectionClient,
call: CallModel,
file: str,
context: Optional[str] = None,
sound: SoundPrompt,
) -> None:
"""
Play a media to a call participant and start recognizing the response.
TODO: Disable or lower profanity filter. The filter seems enabled by default, it replaces words like "holes in my roof" by "*** in my roof". This is not acceptable for a call center.
"""
try:
client.start_recognizing_media(
end_silence_timeout=3, # Sometimes user includes breaks in their speech
input_type=RecognizeInputType.SPEECH,
operation_context=context,
play_prompt=FileSource(f"{CONFIG.resources.public_url}/{file}"),
play_prompt=FileSource(url=sound),
speech_language=CONFIG.workflow.conversation_lang,
target_participant=PhoneNumberIdentifier(call.phone_number),
)
except ResourceNotFoundError:
_logger.debug(f"Call hung up before recognizing ({call.id})")


async def handle_media(
async def handle_media_loop(
client: CallConnectionClient,
call: CallModel,
file: str,
sound: SoundPrompt,
context: Optional[str] = None,
) -> None:
try:
client.play_media_to_all(
loop=True,
operation_context=context,
play_source=FileSource(f"{CONFIG.resources.public_url}/{file}"),
play_source=FileSource(url=sound),
)
except ResourceNotFoundError:
_logger.debug(f"Call hung up before playing ({call.id})")
Expand All @@ -878,6 +870,10 @@ async def handle_hangup(client: CallConnectionClient, call: CallModel) -> None:
except ResourceNotFoundError:
_logger.debug(f"Call already hung up ({call.id})")

call.messages.append(
CallMessageModel(content="Customer ended the call.", persona=CallPersona.HUMAN)
)

content = await gpt_completion(LLMPrompt.SMS_SUMMARY_SYSTEM, call)
_logger.info(f"SMS report ({call.id}): {content}")

@@ -893,26 +889,41 @@ async def handle_hangup(client: CallConnectionClient, call: CallModel) -> None:
_logger.info(
f"SMS report sent {response.message_id} to {response.to} ({call.id})"
)
call.messages.append(
CallMessageModel(
content=f"SMS report sent to {response.to}: {content}",
persona=CallPersona.ASSISTANT,
)
)
else:
_logger.warn(
f"Failed SMS to {response.to}, status {response.http_status_code}, error {response.error_message} ({call.id})"
)
call.messages.append(
CallMessageModel(
content=f"Failed to send SMS report to {response.to}: {response.error_message}",
persona=CallPersona.ASSISTANT,
)
)

except Exception:
_logger.warn(f"SMS error ({call.id})", exc_info=True)
_logger.warn(f"Failed SMS to {call.phone_number} ({call.id})", exc_info=True)


def audio_from_text(text: str) -> TextSource:
def audio_from_text(text: str) -> SsmlSource:
"""
Generate an audio source that can be read by Azure Communication Services SDK.
Text must be XML-escaped, and SSML tags are used to control the voice. The speech rate is slowed by 5% to make it easier to understand for elderly people. Text is also truncated to 400 characters, as this is the limit of Azure Communication Services TTS, but a warning is logged.
"""
# Azure Speech Service TTS limit is 400 characters
if len(text) > 400:
_logger.warning(
f"Text is too long to be processed by TTS, truncating to 400 characters, fix this!"
)
text = text[:400]
return TextSource(
source_locale=CONFIG.workflow.conversation_lang,
text=text,
voice_name=CONFIG.communication_service.voice_name,
)
ssml = f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{CONFIG.workflow.conversation_lang}"><voice name="{CONFIG.communication_service.voice_name}" effect="eq_telecomhp8k"><lexicon uri="{CONFIG.resources.public_url}/lexicon.xml"/><prosody rate="0.95">{text}</prosody></voice></speak>'
return SsmlSource(ssml_text=ssml)
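The SSML assembled above can be reproduced and sanity-checked in isolation. A sketch with hypothetical stand-ins for the config values; the explicit XML escaping is a defensive addition (the docstring calls for it, though the diff interpolates `text` directly):

```python
from xml.sax.saxutils import escape

# Hypothetical stand-ins for CONFIG values
CONVERSATION_LANG = "fr-FR"
VOICE_NAME = "fr-FR-DeniseNeural"
PUBLIC_URL = "https://example.blob.core.windows.net/$web"


def build_ssml(text: str) -> str:
    """Build the SSML payload: telephony EQ effect, custom lexicon, 95% rate."""
    return (
        f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" '
        f'xml:lang="{CONVERSATION_LANG}">'
        f'<voice name="{VOICE_NAME}" effect="eq_telecomhp8k">'
        f'<lexicon uri="{PUBLIC_URL}/lexicon.xml"/>'
        f'<prosody rate="0.95">{escape(text)}</prosody>'
        "</voice></speak>"
    )
```

Escaping matters because the text comes from an LLM: a stray `&` or `<` in the generated reply would otherwise produce invalid SSML and make the play operation fail.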


def callback_url(caller_id: str) -> str:
1 change: 1 addition & 0 deletions models/claim.py
@@ -5,6 +5,7 @@
class ClaimModel(BaseModel):
additional_documentation: Optional[str] = None
claim_explanation: Optional[str] = None
extra_details: Optional[str] = None
incident_date_time: Optional[str] = None
incident_description: Optional[str] = None
incident_location: Optional[str] = None
Binary file removed resources/acknowledge.mp3
Binary file not shown.
20 changes: 20 additions & 0 deletions resources/lexicon.xml
@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<lexicon version="1.0" xmlns="http://www.w3.org/2005/01/pronunciation-lexicon" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.w3.org/2005/01/pronunciation-lexicon http://www.w3.org/TR/2007/CR-pronunciation-lexicon-20071212/pls.xsd" alphabet="ipa" xml:lang="fr-FR">
<!-- Exemples de contrats d'assurance auto -->
<lexeme>
<grapheme>contrat mini</grapheme>
<alias>assurance voiture au tiers, qui dispose du minimum légal</alias>
</lexeme>
<lexeme>
<grapheme>contrat eco</grapheme>
<alias>assurance voiture au tiers, qui inclut en plus les garanties vol, incendie et bris de glace</alias>
</lexeme>
<lexeme>
<grapheme>contrat confort</grapheme>
<alias>assurance voiture tous risques, qui inclut la couverture tous accidents (responsables ou subis)</alias>
</lexeme>
<lexeme>
<grapheme>contrat mobilité</grapheme>
<alias>assurance voiture tous risques, qui inclut les pannes mécaniques, l'assistance 0km et le véhicule de remplacement tous accident/panne/vol</alias>
</lexeme>
</lexicon>
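A PLS lexicon like the one above can be validated offline before uploading. A sketch (function name hypothetical) that extracts the grapheme/alias pairs with the standard library:

```python
import xml.etree.ElementTree as ET

# PLS documents are namespaced; ElementTree requires the prefix on tag lookups
PLS_NS = "{http://www.w3.org/2005/01/pronunciation-lexicon}"


def lexicon_aliases(xml_text: str) -> dict[str, str]:
    """Map each grapheme to its alias in a PLS custom lexicon."""
    root = ET.fromstring(xml_text)
    pairs: dict[str, str] = {}
    for lexeme in root.iter(f"{PLS_NS}lexeme"):
        grapheme = lexeme.find(f"{PLS_NS}grapheme")
        alias = lexeme.find(f"{PLS_NS}alias")
        if grapheme is not None and alias is not None:
            pairs[grapheme.text] = alias.text
    return pairs
```

Running this against `resources/lexicon.xml` before upload catches malformed XML early, which is useful given that lexicon changes take up to 15 minutes to propagate.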
Binary file added resources/loading.wav
Binary file not shown.
Binary file removed resources/ready.mp3
Binary file not shown.
Binary file added resources/ready.wav
Binary file not shown.
