Merge pull request #69 from MSUSAzureAccelerators/main

Merged updated
pablomarin · Dec 9, 2023 · c234fac · c234fac
2 parents 75a800e + 193cb02
commit c234fac
Show file tree

Hide file tree

Showing 20 changed files with 2,822 additions and 2,559 deletions.
diff --git a/03-Quering-AOpenAI.ipynb b/03-Quering-AOpenAI.ipynb
diff --git a/04-Complex-Docs.ipynb b/04-Complex-Docs.ipynb
@@ -24,7 +24,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 22,
    "id": "15f6044e-463f-4988-bc46-a3c3d641c15c",
    "metadata": {},
    "outputs": [],
@@ -38,15 +38,10 @@
     "import urllib.request\n",
     "from tqdm import tqdm\n",
     "import langchain\n",
-    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
-    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
-    "from langchain.vectorstores import Chroma, FAISS\n",
-    "from langchain import OpenAI, VectorDBQA\n",
+    "from langchain.embeddings import AzureOpenAIEmbeddings\n",
     "from langchain.chat_models import AzureChatOpenAI\n",
     "from langchain.chat_models import ChatOpenAI\n",
-    "from langchain.chains import RetrievalQAWithSourcesChain\n",
     "from langchain.docstore.document import Document\n",
-    "from langchain.chains.question_answering import load_qa_chain\n",
     "from langchain.chains.qa_with_sources import load_qa_with_sources_chain\n",
     "\n",
     "from common.utils import parse_pdf, read_pdf_files, text_to_base64\n",
@@ -87,10 +82,7 @@
    "outputs": [],
    "source": [
     "# Set the ENV variables that Langchain needs to connect to Azure OpenAI\n",
-    "os.environ[\"OPENAI_API_BASE\"] = os.environ[\"AZURE_OPENAI_ENDPOINT\"]\n",
-    "os.environ[\"OPENAI_API_KEY\"] = os.environ[\"AZURE_OPENAI_API_KEY\"]\n",
-    "os.environ[\"OPENAI_API_VERSION\"] = os.environ[\"AZURE_OPENAI_API_VERSION\"]\n",
-    "os.environ[\"OPENAI_API_TYPE\"] = \"azure\""
+    "os.environ[\"OPENAI_API_VERSION\"] = os.environ[\"AZURE_OPENAI_API_VERSION\"]"
    ]
   },
   {
@@ -100,7 +92,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "embedder = OpenAIEmbeddings(deployment=\"text-embedding-ada-002\", chunk_size=1) "
+    "embedder = AzureOpenAIEmbeddings(deployment=\"text-embedding-ada-002\", chunk_size=1) "
    ]
   },
   {
@@ -153,7 +145,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 5/5 [00:03<00:00,  1.64it/s]\n"
+      "100%|██████████| 5/5 [00:02<00:00,  2.02it/s]\n"
      ]
     }
    ],
@@ -192,27 +184,27 @@
      "text": [
       "Extracting Text from Azure_Cognitive_Search_Documentation.pdf ...\n",
       "Extracting text using PyPDF\n",
-      "Parsing took: 35.489580 seconds\n",
+      "Parsing took: 32.466733 seconds\n",
       "Azure_Cognitive_Search_Documentation.pdf contained 1947 pages\n",
       "\n",
       "Extracting Text from Boundaries_When_to_Say_Yes_How_to_Say_No_to_Take_Control_of_Your_Life.pdf ...\n",
       "Extracting text using PyPDF\n",
-      "Parsing took: 1.850767 seconds\n",
+      "Parsing took: 1.773100 seconds\n",
       "Boundaries_When_to_Say_Yes_How_to_Say_No_to_Take_Control_of_Your_Life.pdf contained 357 pages\n",
       "\n",
       "Extracting Text from Fundamentals_of_Physics_Textbook.pdf ...\n",
       "Extracting text using PyPDF\n",
-      "Parsing took: 104.721881 seconds\n",
+      "Parsing took: 96.595367 seconds\n",
       "Fundamentals_of_Physics_Textbook.pdf contained 1450 pages\n",
       "\n",
       "Extracting Text from Made_To_Stick.pdf ...\n",
       "Extracting text using PyPDF\n",
-      "Parsing took: 8.234870 seconds\n",
+      "Parsing took: 7.461382 seconds\n",
       "Made_To_Stick.pdf contained 225 pages\n",
       "\n",
       "Extracting Text from Pere_Riche_Pere_Pauvre.pdf ...\n",
       "Extracting text using PyPDF\n",
-      "Parsing took: 1.019250 seconds\n",
+      "Parsing took: 0.995148 seconds\n",
       "Pere_Riche_Pere_Pauvre.pdf contained 225 pages\n",
       "\n"
      ]
@@ -258,23 +250,25 @@
      "output_type": "stream",
      "text": [
       "Azure_Cognitive_Search_Documentation.pdf \n",
-      " chunk text: 1. Download sample data  consisting of a small file set of different types. Unzi ...\n",
+      " chunk text: 3. In Description, choose a field that provides details that might help someone  ...\n",
       "\n",
       "Boundaries_When_to_Say_Yes_How_to_Say_No_to_Take_Control_of_Your_Life.pdf \n",
-      " chunk text: 17\n",
-      "Your problems with him are the same as mine. Walt and I have\n",
-      "a real struggle  ...\n",
+      " chunk text: 14\n",
+      "11:59 A.M.\n",
+      "The rest of Sherrie’s morning proceeded fairly well. A tal-\n",
+      "ented  ...\n",
       "\n",
       "Fundamentals_of_Physics_Textbook.pdf \n",
-      " chunk text: 11PROBLEMS33A ton is a measure of volume frequently used in ship-ping, but that  ...\n",
+      " chunk text: 192-2INSTANTANEOUS VELOCITY AND SPEED\n",
+      "Figure 2-6(a) The x(t) curve for an elevat ...\n",
       "\n",
       "Made_To_Stick.pdf \n",
-      " chunk text: sound bites, because lots of sound bites are empty or misleading— \n",
-      "they're compa ...\n",
+      " chunk text: the artistic feel of the movie will change. When st ars are hired to play \n",
+      "the p ...\n",
       "\n",
       "Pere_Riche_Pere_Pauvre.pdf \n",
-      " chunk text: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
-      " ...\n",
+      " chunk text: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
+      "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ...\n",
       "\n"
      ]
     }
@@ -304,8 +298,8 @@
      "output_type": "stream",
      "text": [
       "Extracting text using Azure Document Intelligence\n",
-      "CPU times: user 11.6 s, sys: 204 ms, total: 11.8 s\n",
-      "Wall time: 49.2 s\n"
+      "CPU times: user 10.6 s, sys: 160 ms, total: 10.8 s\n",
+      "Wall time: 42.4 s\n"
      ]
     }
    ],
@@ -339,7 +333,7 @@
      "output_type": "stream",
      "text": [
       "Pere_Riche_Pere_Pauvre.pdf \n",
-      " chunk text: Après être sortie de la «foire d'empoigne », je surveillai pendant les deux heur ...\n",
+      " chunk text: règles différentes. Les employés sont perdants; les patrons et les investisseurs ...\n",
       "\n"
      ]
     }
@@ -481,7 +475,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 13,
    "id": "f5c8aa55-1b60-4057-93db-0d4a89993a57",
    "metadata": {},
    "outputs": [
@@ -496,21 +490,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 1947/1947 [07:39<00:00,  4.24it/s]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Uploading chunks from Boundaries_When_to_Say_Yes_How_to_Say_No_to_Take_Control_of_Your_Life.pdf\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 357/357 [01:25<00:00,  4.20it/s]\n"
+      "100%|██████████| 357/357 [01:36<00:00,  3.71it/s]\n"
      ]
     },
     {
@@ -524,7 +504,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 1450/1450 [06:15<00:00,  3.86it/s]\n"
+      "100%|██████████| 1450/1450 [07:10<00:00,  3.37it/s]\n"
      ]
     },
     {
@@ -538,7 +518,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 225/225 [00:54<00:00,  4.13it/s]\n"
+      "100%|██████████| 225/225 [01:03<00:00,  3.56it/s]\n"
      ]
     },
     {
@@ -552,15 +532,15 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 225/225 [00:54<00:00,  4.12it/s]"
+      "100%|██████████| 225/225 [01:02<00:00,  3.61it/s]"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 3min 14s, sys: 4.67 s, total: 3min 19s\n",
-      "Wall time: 17min 9s\n"
+      "CPU times: user 3min 6s, sys: 4.32 s, total: 3min 10s\n",
+      "Wall time: 19min 29s\n"
      ]
     },
     {
@@ -616,7 +596,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 14,
    "id": "8b408798-5527-44ca-9dba-cad2ee726aca",
    "metadata": {},
    "outputs": [],
@@ -631,7 +611,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 15,
    "id": "1b182ade-0ddd-47a1-b1eb-2cbf435c317f",
    "metadata": {},
    "outputs": [],
@@ -657,7 +637,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 16,
    "id": "410ff796-dab1-4817-a3a5-82eeff6c0c57",
    "metadata": {},
    "outputs": [],
@@ -701,9 +681,9 @@
      "text": [
       "System prompt token count: 1669\n",
       "Max Completion Token count: 1000\n",
-      "Combined docs (context) token count: 3595\n",
+      "Combined docs (context) token count: 3485\n",
       "--------\n",
-      "Requested token count: 6264\n",
+      "Requested token count: 6154\n",
       "Token limit for gpt-35-turbo-16k : 16384\n",
       "Chain Type selected: stuff\n"
      ]
@@ -734,7 +714,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 23,
    "id": "62cf3a3f-2b4d-4806-8b92-eb982c52b0cd",
    "metadata": {},
    "outputs": [],
@@ -751,16 +731,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 24,
    "id": "3b412c56-650f-4ca4-a868-9954f83679fa",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 7.77 ms, sys: 27 µs, total: 7.8 ms\n",
-      "Wall time: 6.54 s\n"
+      "CPU times: user 21.9 ms, sys: 0 ns, total: 21.9 ms\n",
+      "Wall time: 7.75 s\n"
      ]
     }
    ],
@@ -772,53 +752,56 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 25,
    "id": "63f07b08-87bd-4518-b2f2-03ee1096f59f",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/markdown": [
-       "To push documents with vectors to an index using the Python SDK, you can follow the steps outlined in the extracted parts. Here is an example of how to do it:\n",
-       "\n",
-       "1. Set up your development environment with Python 3.9, Git, and Visual Studio Code with the Python extension.\n",
-       "2. Create a search service in the Azure portal.\n",
-       "3. Use the Azure Cognitive Search REST APIs or Azure SDKs like the Azure SDK for Python to create a data source, index, indexer, and skillset.\n",
-       "4. Define a pipeline that uses Optical Character Recognition (OCR), language detection, and entity and key phrase recognition.\n",
-       "5. Execute the pipeline to invoke transformations and create and load a search index.\n",
-       "6. Explore the results using full-text search and a rich query syntax.\n",
-       "\n",
-       "Additionally, you can use the Azure SDK for Python to upload documents to the index. Here is an example of how to do it:\n",
+       "To push documents with vectors to an index using the Python SDK, you can use the following code example:\n",
        "\n",
        "```python\n",
-       "import os\n",
        "from azure.core.credentials import AzureKeyCredential\n",
        "from azure.search.documents import SearchClient\n",
        "\n",
-       "index_name = \"hotels\"\n",
-       "endpoint = os.environ[\"SEARCH_ENDPOINT\"]\n",
-       "key = os.environ[\"SEARCH_API_KEY\"]\n",
+       "# Set up the connection\n",
+       "endpoint = \"your_search_service_endpoint\"\n",
+       "index_name = \"your_index_name\"\n",
+       "api_key = \"your_api_key\"\n",
        "\n",
-       "DOCUMENT = {\n",
-       "    'Category': 'Hotel',\n",
-       "    'hotelId': '1000',\n",
-       "    'rating': 4.0,\n",
-       "    'rooms': [],\n",
-       "    'hotelName': 'Azure Inn',\n",
-       "}\n",
+       "# Create the search client\n",
+       "search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(api_key))\n",
        "\n",
-       "search_client = SearchClient(endpoint, index_name, AzureKeyCredential(key))\n",
-       "result = search_client.upload_documents(documents=[DOCUMENT])\n",
-       "print(\"Upload of new document succeeded: {}\".format(result[0].succeeded))\n",
-       "```\n",
+       "# Define your documents with vectors\n",
+       "documents = [\n",
+       "    {\n",
+       "        \"@search.action\": \"upload\",\n",
+       "        \"id\": \"1\",\n",
+       "        \"title\": \"Document 1\",\n",
+       "        \"vector\": [0.1, 0.2, 0.3]\n",
+       "    },\n",
+       "    {\n",
+       "        \"@search.action\": \"upload\",\n",
+       "        \"id\": \"2\",\n",
+       "        \"title\": \"Document 2\",\n",
+       "        \"vector\": [0.4, 0.5, 0.6]\n",
+       "    },\n",
+       "    # Add more documents as needed\n",
+       "]\n",
+       "\n",
+       "# Upload the documents to the index\n",
+       "result = search_client.upload_documents(documents=documents)\n",
        "\n",
-       "This example uses the Azure SDK for Python to create a search client and upload a document to the specified index.\n",
+       "# Check if the upload succeeded\n",
+       "print(\"Upload of new documents succeeded:\", result[0].succeeded)\n",
+       "```\n",
        "\n",
-       "Source: [5]\n",
+       "This code creates a search client using the endpoint, index name, and API key. It then defines the documents to be uploaded, including a unique ID, a title, and a vector. The `@search.action` field specifies that the action is an upload. Finally, the `upload_documents` method is called to push the documents to the index.\n",
        "\n",
-       "Please note that the above example is just one way to push documents with vectors to an index using the Python SDK. There may be other methods or variations depending on your specific requirements and use case.\n",
+       "Please note that you need to replace `\"your_search_service_endpoint\"`, `\"your_index_name\"`, and `\"your_api_key\"` with your own values.\n",
        "\n",
-       "Let me know if there is anything else I can help you with."
+       "[1]<sup><a href=\"https://demodatasetsp.blob.core.windows.net/books/Azure_Cognitive_Search_Documentation.pdf?sv=2022-11-02&ss=bf&srt=sco&sp=rl&se=2025-11-06T23:27:04Z&st=2023-11-06T15:27:04Z&spr=https&sig=IxmYt1nWtSI0MtBHeQBC1t%2F4VeoN19HqQM1Xu6tvacU%3D\" target=\"_blank\">Source</a></sup>"
       ],
       "text/plain": [
        "<IPython.core.display.Markdown object>"
@@ -858,6 +841,14 @@
     "\n",
     "In the next notebook, we will look at the concept of **memory**. Memory is necessary for a chatbot to hold a conversation with the user. Without memory, there is no real conversation."
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20ef4e89-a769-4e68-9f28-66fab1ec873f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {