diff --git a/bootcamp/RAG/readthedocs_zilliz_langchain.ipynb b/bootcamp/RAG/readthedocs_zilliz_langchain.ipynb index 7bd67b3e4..34202c9ca 100755 --- a/bootcamp/RAG/readthedocs_zilliz_langchain.ipynb +++ b/bootcamp/RAG/readthedocs_zilliz_langchain.ipynb @@ -34,7 +34,7 @@ "outputs": [], "source": [ "# For colab install these libraries in this order:\n", - "# !pip install pymilvus, langchain, torch, transformers, python-dotenv, accelerate\n", + "# !pip install pymilvus langchain torch transformers python-dotenv\n", "\n", "# Import common libraries.\n", "import time\n", @@ -74,20 +74,17 @@ "# !wget -r -A.html -P rtdocs --header=\"Accept-Charset: UTF-8\" $DOCS_PAGE" ] }, - { - "cell_type": "markdown", - "id": "8a67e382", - "metadata": {}, - "source": [ - "## Start up a Zilliz free tier cluster." - ] - }, { "cell_type": "markdown", "id": "fb844837", "metadata": {}, "source": [ - "Code in this notebook uses fully-managed Milvus on [Ziliz Cloud free trial](https://cloud.zilliz.com/login). Choose the default \"Starter\" option when you provision > Create collection > Give it a name > Create cluster and collection.\n", + "## Start up a Zilliz free tier cluster.\n", + "\n", + "Code in this notebook uses fully-managed Milvus on [Zilliz Cloud free trial](https://cloud.zilliz.com/login). \n", + " 1. Choose the default \"Starter\" option when you provision > Create collection > Give it a name > Create cluster and collection. \n", + " 2. On the Cluster main page, copy your `API Key` and store it locally in a .env variable. See the note below on how to do that.\n", + " 3. Also on the Cluster main page, copy the `Public Endpoint URI`.\n", "\n", "πŸ’‘ Note: To keep your tokens private, best practice is to use an **env variable**. See [how to save api key in env variable](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety).
\n", "\n", @@ -118,10 +115,9 @@ "load_dotenv()\n", "TOKEN = os.getenv(\"ZILLIZ_API_KEY\")\n", "\n", - "# Connect to Zilliz cloud using enpoint URI and API key TOKEN.\n", + "# Connect to Zilliz cloud using endpoint URI and API key TOKEN.\n", "# TODO change this before checking into github.\n", - "# CLUSTER_ENDPOINT=\"https://in03-xxxx.api.gcp-us-west1.zillizcloud.com:443\"\n", - "\n", + "CLUSTER_ENDPOINT=\"https://in03-xxxx.api.gcp-us-west1.zillizcloud.com:443\"\n", "connections.connect(\n", " alias='default',\n", " # Public endpoint obtained from Zilliz Cloud\n", @@ -165,7 +161,7 @@ ")\n", "model_name: BAAI/bge-base-en-v1.5\n", "EMBEDDING_LENGTH: 768\n", - "MAX_SEQ_LENGTH: 1536\n" + "MAX_SEQ_LENGTH: 512\n" ] } ], @@ -189,9 +185,13 @@ "# Get the model parameters and save for later.\n", "EMBEDDING_LENGTH = encoder.get_sentence_embedding_dimension()\n", "MAX_SEQ_LENGTH_IN_TOKENS = encoder.get_max_seq_length() \n", - "# Assume tokens are 3 characters long.\n", - "MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS * 3\n", - "HF_EOS_TOKEN_LENGTH = 1 * 3\n", + "# # Assume tokens are 3 characters long.\n", + "# MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS * 3\n", + "# HF_EOS_TOKEN_LENGTH = 1 * 3\n", + "# Test with 512 sequence length.\n", + "MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS\n", + "HF_EOS_TOKEN_LENGTH = 1\n", + "\n", "\n", "# Inspect model parameters.\n", "print(f\"model_name: {model_name}\")\n", @@ -383,23 +383,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "chunking time: 0.013296842575073242\n", - "docs: 8, split into: 8\n", - "split into chunks: 55, type: list of \n", - "\n", - "Looking at a sample chunk...\n", - "InstallationΒΆ Installing via pipΒΆ PyMilvus is in the Python Package Index. 
PyMilvus only support pyt\n", - "{'h1': 'Installation', 'h2': 'Installing via pip', 'source': 'rtdocs/pymilvus.readthedocs.io/en/latest/install.html'}\n" - ] - } - ], + "outputs": [], "source": [ "from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter\n", "from bs4 import BeautifulSoup\n", @@ -415,6 +401,7 @@ "# Use the embedding model parameters.\n", "chunk_size = MAX_SEQ_LENGTH - HF_EOS_TOKEN_LENGTH\n", "chunk_overlap = np.round(chunk_size * 0.10, 0)\n", + "print(f\"chunk_size: {chunk_size}, chunk_overlap: {chunk_overlap}\")\n", "\n", "# Create an instance of the RecursiveCharacterTextSplitter\n", "child_splitter = RecursiveCharacterTextSplitter(\n", @@ -550,11 +537,7 @@ " 'h1': chunk.metadata['h1'][:50],\n", " 'h2': h2,\n", " }\n", - " chunk_list.append(chunk_dict)\n", - "\n", - "# # TODO - remove this before saving in github.\n", - "# for chunk in chunk_list[:1]:\n", - "# print(chunk)" + " chunk_list.append(chunk_dict)" ] }, { @@ -568,7 +551,7 @@ "output_type": "stream", "text": [ "Start inserting entities\n", - "Milvus insert time for 55 vectors: 0.19388794898986816 seconds\n", + "Milvus insert time for 156 vectors: 1.3558049201965332 seconds\n", "[{\"name\":\"_default\",\"collection_name\":\"MilvusDocs\",\"description\":\"\"}]\n" ] } @@ -633,7 +616,7 @@ "# Define a sample question about your data.\n", "QUESTION1 = \"What do the parameters for HNSW mean?\"\n", "QUESTION2 = \"What are good default values for HNSW parameters with 25K vectors dim 768?\"\n", - "QUESTION3 = \"Default distance metric used in AUTOINDEX?\"\n", + "QUESTION3 = \"What is the default AUTOINDEX distance metric in Milvus Client?\"\n", "QUERY = [QUESTION1, QUESTION2, QUESTION3]\n", "\n", "# Inspect the length of the query.\n", @@ -678,7 +661,7 @@ "output_type": "stream", "text": [ "Loaded milvus collection into memory.\n", - "Milvus search time: 0.0818781852722168 sec\n", + "Milvus search time: 0.07079720497131348 sec\n", "type: , count: 3\n" ] } @@ -700,6 +683,9 @@ "# Return top k results with AUTOINDEX.\n", "TOP_K = 3\n", "\n", + "# Define output fields to return.\n", + "OUTPUT_FIELDS = [\"h1\", \"h2\", \"source\", \"text\"]\n", + "\n", "# Run semantic vector search using your query and the vector database.\n", "start_time = time.time()\n", "results = mc.search(\n", @@ -709,7 +695,7 @@ " param={},\n", " # Milvus can utilize metadata to enhance the search experience in boolean expressions.\n", " # expr=\"\",\n", - " output_fields=[\"h1\", \"h2\", \"text\", \"source\"], \n", + " output_fields=OUTPUT_FIELDS, \n", " limit=TOP_K,\n", " consistency_level=\"Eventually\"\n", " )\n", @@ -732,29 +718,23 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Length of context: 1533\n", - "1 1\n" - ] - } - ], + "outputs": [], "source": [ - "# # TODO - remove printing before saving in github.\n", - "# for n, hits in enumerate(results):\n", - "# print(f\"{n}th query result\")\n", - "# for hit in hits:\n", - "# print(hit)\n", - "\n", - "# Assemble the context and context metadata.\n", - "# context, context_metadata = _utils.assemble_retrieved_context(results, num_shot_answers=TOP_K)\n", - "context, context_metadata = _utils.assemble_retrieved_context(results, num_shot_answers=1)\n", - "print(len(context_metadata), len(context_metadata))" + "# Assemble just the top retrieved 1st context and context metadata.\n", + "# context, context_metadata = 
_utils.assemble_retrieved_context(results, METADATA_FIELDS, num_shot_answers=TOP_K)\n", + "\n", + "context, context_metadata = _utils.assemble_retrieved_context(results, OUTPUT_FIELDS, num_shot_answers=1)\n", + "print(f\"Stuffing {len(context)} contexts and {len(context_metadata)} metadata entries into the prompt.\")\n", + "print(f\"Length context: {len(context[0])}\")\n", + "\n", + "# TODO - remove printing before saving in github.\n", + "# Loop through each context and its metadata and print.\n", + "for i in range(len(context)):\n", + " print(f\"Context: {context[i][:200]}\")\n", + " print(f\"Metadata: {context_metadata[i]}\")\n", + " print()" ] }, { @@ -769,7 +749,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "id": "3e7fa0b6", "metadata": {}, "outputs": [ @@ -811,7 +791,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "id": "a68e87b1", "metadata": {}, "outputs": [ @@ -831,7 +811,7 @@ "# NOW ASK THE SAME LLM THE SAME QUESTION USING THE RETRIEVED CONTEXT.\n", "QA_input = {\n", " 'question': SAMPLE_QUESTION,\n", - " 'context': context,\n", + " 'context': context[0],\n", "}\n", "\n", "nlp = pipeline('question-answering', \n", @@ -862,7 +842,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -872,6 +852,8 @@ "\n", "# Define the generation llm model to use.\n", "LLM_NAME = \"gpt-3.5-turbo-1106\"\n", + "TEMPERATURE = 0.1\n", + "RANDOM_SEED = 415\n", "\n", "# See how to save api key in env variable.\n", "# https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety\n", @@ -883,7 +865,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Question: What do the parameters for HNSW mean?\n", - "('Answer: The parameters for HNSW are M, which denotes the maximum degree of '\n", - " 'nodes on each layer of the graph, and efConstruction, which specifies a '\n", - " 'search range during index construction. M has values ranging from 4 to 64, '\n", - " 'and efConstruction ranges from 8 to 512. The ef parameter is used during the '\n", - " 'search stage and should be larger than top_k, with values from top_k to '\n", - " '32768. These parameters help control the graph structure and search scope '\n", - " 'for HNSW indexing. (Source: Milvus support to create index to accelerate '\n", - " 'vector search - https://pymilvus.readthedocs.io/en/latest/param.html)')\n", + "('Answer: Answer: The parameter M represents the maximum degree of nodes on '\n", + " 'each layer of the graph in the HNSW index, while efConstruction (when '\n", + " 'building index) or ef (when searching targets) specifies the search range. M '\n", + " 'ranges from 4 to 64, and efConstruction ranges from 8 to 512.')\n", "\n", "\n" ] } ], "source": [ "# CAREFUL!! THIS COSTS MONEY!!\n", "# Generate response\n", "\n", - "prompt = f\"\"\"Answer the question using the context provided. Be succinct.\n", - "Echo in the answer the Grounding Sources.\n", + "PROMPT = f\"\"\"Answer the question using the context provided. Be concise. 
Use the format below.\n", + "Answer: The answer to the question.\n", "Grounding sources: {context_metadata}\n", "\"\"\"\n", "\n", "# response = openai_client.chat.completions.create(\n", + "# # response_format={\n", + "# # \"type\": \"json_object\", \n", + "# # \"schema\": Result.schema_json()\n", + "# # },\n", "# messages=[\n", - "# {\n", - "# \"role\": \"system\",\n", - "# \"content\": prompt,\n", - "# },\n", - "# {\n", - "# \"role\": \"user\",\n", - "# \"content\": f\"question: {SAMPLE_QUESTION}, context: {context}\",\n", - "# }\n", + "# {\"role\": \"system\", \"content\": PROMPT,},\n", + "# {\"role\": \"user\", \"content\": f\"question: {SAMPLE_QUESTION}, context: {context[0]}\",}\n", "# ],\n", "# model=LLM_NAME,\n", + "# temperature=TEMPERATURE,\n", + "# seed=RANDOM_SEED,\n", "# )\n", "\n", "# Print the question and answer along with grounding sources and citations.\n", @@ -938,6 +916,14 @@ "\n", "# Question1: What do the parameters for HNSW mean?\n", "# Answer: Perfect!\n", + "# ('Answer: The parameters for HNSW are M, which limits the maximum degree of '\n", + "# 'nodes on each layer of the graph, and efConstruction, which specifies the '\n", + "# 'search range during index construction. Additionally, when searching '\n", + "# 'targets, the parameter ef is used to specify the search range. \\n'\n", + "# '\\n'\n", + "# \"Grounding sources: {'h1': 'Index', 'h2': 'Milvus support to create index to \"\n", + "# \"accelerate vecto', 'source': \"\n", + "# \"'https://pymilvus.readthedocs.io/en/latest/param.html'}\")\n", "# Best answer: M: maximum degree of nodes in a layer of the graph. \n", "# efConstruction: number of nearest neighbors to consider when connecting nodes in the graph.\n", "# ef: number of nearest neighbors to consider when searching for similar vectors. 
\n", @@ -974,22 +960,15 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Length context: 4229, Len metadata: 3\n" - ] - } - ], + "outputs": [], "source": [ "import requests, json\n", "\n", "# Define the URL, headers, and data\n", - "url = \"https://controller.api.gcp-us-west1.zillizcloud.com/v1/pipelines/pipe-458714b66ffc8ff3ab1bf1/run\"\n", + "# TODO change this before checking into github.\n", + "url = \"https://controller.api.gcp-us-west1.zillizcloud.com/v1/pipelines/pipe-xxx/run\"\n", "headers = {\n", " \"Content-Type\": \"application/json\",\n", " \"Authorization\": f\"Bearer {TOKEN}\",\n", @@ -1002,15 +981,17 @@ " \"limit\": TOP_K,\n", " \"offset\": 0,\n", " \"outputFields\": [\"chunk_text\", \"chunk_id\", \"doc_name\", \"source\"],\n", - " \"filter\": \"chunk_id >= 0\"\n", + " \"filter\": \"chunk_id >= 0 && doc_name == 'param.html'\",\n", " }\n", "}\n", "\n", "# Send the POST request\n", "response = requests.post(url, headers=headers, json=data)\n", - "# print(type(response))\n", - "# # Print the response\n", - "# pprint.pprint(response.json())\n", + "print(type(response))\n", + "\n", + "# TODO: Remove this before saving in github.\n", + "# Print the response\n", + "pprint.pprint(response.json())\n", "\n", "# Assemble context from Pipeline retriever.\n", "pipeline_context = \"\"\n", @@ -1031,7 +1012,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1039,13 +1020,10 @@ "output_type": "stream", "text": [ "Question: What do the parameters for HNSW mean?\n", - "('Answer: The parameters for HNSW are used to control the structure of the '\n", - " 'multi-layer navigation graph. \"M\" determines the maximum degree of nodes on '\n", - " 'each layer, and \"efConstruction\" specifies the search range during index '\n", - " 'construction. When searching for targets, the parameter \"ef\" should be set '\n", - " 'and be larger than \"top_k\".\\n'\n", - " 'Sources: (https://pymilvus.readthedocs.io/en/latest/param.html), '\n", - " '(https://pymilvus.readthedocs.io/en/latest/api.html)')\n", + "('Answer: The parameters for HNSW are M, which is the maximum degree of the '\n", + " 'node, and efConstruction, which specifies the search range during index '\n", + " 'construction. The M parameter ranges from 4 to 64, and efConstruction ranges '\n", + " 'from 8 to 512. Source: https://pymilvus.readthedocs.io/en/latest/param.html')\n", "\n", "\n" ] @@ -1053,24 +1031,15 @@ ], "source": [ "# CAREFUL!! THIS COSTS MONEY!!\n", - "# Generate response\n", - "prompt = f\"\"\"Answer the question using the context provided. 
Be succinct.\n", - "Echo in the answer the Grounding sources.\n", - "Grounding sources: {pipeline_context_metadata}\n", - "\"\"\"\n", "\n", "# response_pipeline = openai_client.chat.completions.create(\n", "# messages=[\n", - "# {\n", - "# \"role\": \"system\",\n", - "# \"content\": prompt,\n", - "# },\n", - "# {\n", - "# \"role\": \"user\",\n", - "# \"content\": f\"question: {SAMPLE_QUESTION}, context: {pipeline_context}\",\n", - "# }\n", - "# ],\n", + "# {\"role\": \"system\", \"content\": PROMPT,},\n", + "# {\"role\": \"user\", \"content\": f\"question: {SAMPLE_QUESTION}, context: {pipeline_context}\"}\n", + "# ],\n", "# model=LLM_NAME,\n", + "# temperature=TEMPERATURE,\n", + "# seed=RANDOM_SEED,\n", "# )\n", "\n", "# Print the question and answer along with grounding sources and citations.\n", @@ -1082,7 +1051,15 @@ " print(\"\\n\")\n", "\n", "# Question1: What do the parameters for HNSW mean?\n", - "# Answer: Perfect!\n", + "# Answer: Good, but missing 'ef', even though it was given 3 contexts instead of the 1 used by the manual method.\n", + "# ('Answer: Answer: The parameters for HNSW in Milvus refer to the configuration '\n", + "# 'options for the Hierarchical Navigable Small World index, which is used to '\n", + "# 'accelerate vector similarity search. The parameters include \"M\" for the '\n", + "# 'number of edges per point and \"efConstruction\" for the size of the dynamic '\n", + "# 'list for the nearest neighbors during index construction.\\n'\n", + "# \"Grounding sources: [{'h1': 'Index', 'h2': 'Milvus support to create index to \"\n", + "# \"accelerate vecto', 'source': \"\n", + "# \"'https://pymilvus.readthedocs.io/en/latest/param.html'}]\")\n", "# Best answer: M: maximum degree of nodes in a layer of the graph. \n", "# efConstruction: number of nearest neighbors to consider when connecting nodes in the graph.\n", "# ef: number of nearest neighbors to consider when searching for similar vectors. \n" ] }, { @@ -1098,7 +1075,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "id": "d0e81e68", "metadata": {}, "outputs": [], "source": [ "# Drop collection\n", "utility.drop_collection(COLLECTION_NAME)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "id": "c777937e", "metadata": {}, "outputs": [ diff --git a/bootcamp/Retrieval/imdb_milvus_client.ipynb b/bootcamp/Retrieval/imdb_milvus_client.ipynb index b9a0df3dd..87627f202 100755 --- a/bootcamp/Retrieval/imdb_milvus_client.ipynb +++ b/bootcamp/Retrieval/imdb_milvus_client.ipynb @@ -27,21 +27,12 @@ "# !pip install milvus, pymilvus, langchain, torch, transformers, python-dotenv\n", "\n", "# Import common libraries.\n", - "from typing import List\n", "import time\n", "import pandas as pd\n", "import numpy as np\n", "\n", "# Import custom functions for splitting and search.\n", - "import imdb_utilities" - ] - }, - { - "cell_type": "markdown", - "id": "8a67e382", - "metadata": {}, - "source": [ - "## Start up a local Milvus server." + "import imdb_utilities as _utils" ] }, { "cell_type": "markdown", "id": "fb844837", "metadata": {}, "source": [ - "Code in this notebook uses [Milvus client](https://milvus.io/docs/using_milvusclient.md) with [Milvus lite](https://milvus.io/docs/milvus_lite.md), which runs a local server. ⛔️ Milvus lite is only meant for demos and local testing.\n", - "- pip install milvus pymilvus\n", + "## Start up a Zilliz free tier cluster.\n", + "\n", + "Code in this notebook uses fully-managed Milvus on [Zilliz Cloud free trial](https://cloud.zilliz.com/login). \n", + " 1. 
Choose the default \"Starter\" option when you provision > Create collection > Give it a name > Create cluster and collection. \n", + " 2. On the Cluster main page, copy your `API Key` and store it locally in a .env variable. See the note below on how to do that.\n", + " 3. Also on the Cluster main page, copy the `Public Endpoint URI`.\n", + "\n", + "πŸ’‘ Note: To keep your tokens private, best practice is to use an **env variable**. See [how to save api key in env variable](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety).
\n", "\n", - "πŸ’‘ **For production purposes**, use a local Milvus docker, Milvus clusters, or fully-managed Milvus on Zilliz Cloud.\n", - "- [Local Milvus docker](https://milvus.io/docs/install_standalone-docker.md) requires local docker installed and running.\n", - "- [Milvus clusters](https://milvus.io/docs/install_cluster-milvusoperator.md) requires a K8s cluster up and running.\n", - "- [Ziliz Cloud free trial](https://cloud.zilliz.com/login) choose a \"Default\" option when you provision.\n" + "In Jupyter, you also need a .env file (in same dir as notebooks) containing lines like this:\n", + "- VARIABLE_NAME=value\n" ] }, { @@ -68,40 +63,32 @@ "name": "stdout", "output_type": "stream", "text": [ - "Milvus server startup time: 7.601609945297241 sec\n", - "v2.3.3-lite\n" + "Type of server: zilliz_cloud\n" ] } ], "source": [ - "from milvus import default_server\n", - "from pymilvus import (\n", - " connections, utility\n", + "# !pip install pymilvus #python sdk for milvus\n", + "from pymilvus import connections, utility\n", + "\n", + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "TOKEN = os.getenv(\"ZILLIZ_API_KEY\")\n", + "\n", + "# Connect to Zilliz cloud using endpoint URI and API key TOKEN.\n", + "# TODO change this before checking into github.\n", + "CLUSTER_ENDPOINT=\"https://in03-xxxx.api.gcp-us-west1.zillizcloud.com:443\"\n", + "connections.connect(\n", + " alias='default',\n", + " # Public endpoint obtained from Zilliz Cloud\n", + " uri=CLUSTER_ENDPOINT,\n", + " # API key or a colon-separated cluster username and password\n", + " token=TOKEN,\n", ")\n", "\n", - "# Cleanup previous data and stop server in case it is still running.\n", - "default_server.stop()\n", - "default_server.cleanup()\n", - "\n", - "# Start a new milvus-lite local server.\n", - "start_time = time.time()\n", - "default_server.start()\n", - "\n", - "end_time = time.time()\n", - "print(f\"Milvus server startup time: {end_time - start_time} sec\")\n", - "# startup time: 5.6739208698272705\n", - "\n", - "# Add wait to avoid error message from trying to connect.\n", - "time.sleep(15)\n", - "\n", - "# Now you could connect with localhost and the given port.\n", - "# Port is defined by default_server.listen_port.\n", - "connections.connect(host='127.0.0.1', \n", - " port=default_server.listen_port,\n", - " show_startup_banner=True)\n", - "\n", - "# Check if the server is ready.\n", - "print(utility.get_server_version())" + "# Check if the server is ready and get colleciton name.\n", + "print(f\"Type of server: {utility.get_server_version()}\")" ] }, { @@ -110,7 +97,7 @@ "metadata": {}, "source": [ "## Load the Embedding Model checkpoint and use it to create vector embeddings\n", - "**Embedding model:** We will use the open-source [sentence transformers](https://www.sbert.net/docs/pretrained_models.html) hosted on HuggingFace to encode the movie review text. We will save the embeddings to a pandas dataframe and then into the milvus database.\n", + "**Embedding model:** We will use the open-source [sentence transformers](https://www.sbert.net/docs/pretrained_models.html) available on HuggingFace to encode the documentation text. We will download the model from HuggingFace and run it locally. \n", "\n", "Two model parameters of note below:\n", "1. EMBEDDING_LENGTH refers to the dimensionality or length of the embedding vector. In this case, the embeddings generated for EACH token in the input text will have the SAME length = 768. 
This size of embedding is often associated with BERT-based models, where the embeddings are used for downstream tasks such as classification, question answering, or text generation.

\n", @@ -157,9 +144,14 @@ "print(encoder)\n", "\n", "# Get the model parameters and save for later.\n", - "MAX_SEQ_LENGTH = encoder.get_max_seq_length() \n", - "HF_EOS_TOKEN_LENGTH = 1\n", "EMBEDDING_LENGTH = encoder.get_sentence_embedding_dimension()\n", + "MAX_SEQ_LENGTH_IN_TOKENS = encoder.get_max_seq_length() \n", + "# # Assume tokens are 3 characters long.\n", + "# MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS * 3\n", + "# HF_EOS_TOKEN_LENGTH = 1 * 3\n", + "# Test with 512 sequence length.\n", + "MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS\n", + "HF_EOS_TOKEN_LENGTH = 1\n", "\n", "# Inspect model parameters.\n", "print(f\"model_name: {model_name}\")\n", @@ -175,30 +167,19 @@ "## Create a Milvus collection\n", "\n", "You can think of a collection in Milvus like a \"table\" in SQL databases. The **collection** will contain the \n", - "- **Schema** (or no-schema Milvus Client). \n", + "- **Schema** (or [no-schema Milvus client](https://milvus.io/docs/using_milvusclient.md)). \n", "πŸ’‘ You'll need the vector `EMBEDDING_LENGTH` parameter from your embedding model.\n", + "Typical values are:\n", + " - 768 for sbert embedding models\n", + " - 1536 for ada-002 OpenAI embedding models\n", "- **Vector index** for efficient vector search\n", "- **Vector distance metric** for measuring nearest neighbor vectors\n", "- **Consistency level**\n", "In Milvus, transactional consistency is possible; however, according to the [CAP theorem](https://en.wikipedia.org/wiki/CAP_theorem), some latency must be sacrificed. πŸ’‘ Searching movie reviews is not mission-critical, so [`eventually`](https://milvus.io/docs/consistency.md) consistent is fine here.\n", "\n", - "## Add a Vector Index\n", - "\n", - "The vector index determines the vector **search algorithm** used to find the closest vectors in your data to the query a user submits. Most vector indexes use different sets of parameters depending on whether the database is:\n", - "- **inserting vectors** (creation mode) - vs - \n", - "- **searching vectors** (search mode) \n", - "\n", - "Scroll down the [docs page](https://milvus.io/docs/index.md) to see a table listing different vector indexes available on Milvus. For example:\n", - "- FLAT - deterministic exhaustive search\n", - "- IVF_FLAT or IVF_SQ8 - Hash index (stochastic approximate search)\n", - "- HNSW - Graph index (stochastic approximate search)\n", - "\n", - "Besides a search algorithm, we also need to specify a **distance metric**, that is, a definition of what is considered \"close\" in vector space. In the cell below, the [`HNSW`](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) search index is chosen. Its possible distance metrics are one of:\n", - "- L2 - L2-norm\n", - "- IP - Dot-product\n", - "- COSINE - Angular distance\n", - "\n", - "πŸ’‘ Most use cases work better with normalized embeddings, in which case L2 is useless (every vector has length=1) and IP and COSINE are the same. Only choose L2 if you plan to keep your embeddings unnormalized." 
+ "Some supported [data types](https://milvus.io/docs/schema.md) for Milvus schemas are:\n", + "- INT64 - primary key\n", + "- FLOAT_VECTOR - embedings = list of `numpy.ndarray` of `numpy.float32` numbers\n" ] }, { @@ -213,27 +194,58 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "20a05d59", "metadata": {}, "outputs": [], "source": [ + "from pymilvus import MilvusClient\n", + "\n", "# Set the Milvus collection name.\n", "COLLECTION_NAME = # TODO (exercise): code here\n", "\n", - "# Use no-schema Milvus client (uses flexible json key:value format).\n", + "# Use no-schema Milvus client uses flexible json key:value format.\n", "# https://milvus.io/docs/using_milvusclient.md\n", - "mc = MilvusClient(uri=\"http://localhost\")\n", + "mc = MilvusClient(\n", + " uri=CLUSTER_ENDPOINT,\n", + " # API key or a colon-separated cluster username and password\n", + " token=TOKEN)\n", + "\n", "mc.drop_collection(COLLECTION_NAME)\n", "mc.create_collection(COLLECTION_NAME, \n", " EMBEDDING_LENGTH, \n", - " #params=index_params # Omit params to use AUTOINDEX.\n", " )\n", "\n", "print(mc.describe_collection(COLLECTION_NAME))\n", "print(f\"Created collection: {COLLECTION_NAME}\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add a Vector Index\n", + "\n", + "The vector index determines the vector **search algorithm** used to find the closest vectors in your data to the query a user submits. \n", + "\n", + "Most vector indexes use different sets of parameters depending on whether the database is:\n", + "- **inserting vectors** (creation mode) - vs - \n", + "- **searching vectors** (search mode) \n", + "\n", + "Scroll down the [docs page](https://milvus.io/docs/index.md) to see a table listing different vector indexes available on Milvus. For example:\n", + "- FLAT - deterministic exhaustive search\n", + "- IVF_FLAT or IVF_SQ8 - Hash index (stochastic approximate search)\n", + "- HNSW - Graph index (stochastic approximate search)\n", + "- AUTOINDEX - Automatically determined based on OSS vs [Zilliz cloud](https://docs.zilliz.com/docs/autoindex-explained), type of GPU, size of data.\n", + "\n", + "Besides a search algorithm, we also need to specify a **distance metric**, that is, a definition of what is considered \"close\" in vector space. In the cell below, the [`HNSW`](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) search index is chosen (Milvus OSS default AUTOINDEX). Its possible distance metrics are one of:\n", + "- L2 - L2-norm\n", + "- IP - Dot-product\n", + "- COSINE - Angular distance\n", + "\n", + "πŸ’‘ Most use cases work better with normalized embeddings, in which case L2 is useless (every vector has length=1) and IP and COSINE are the same. Only choose L2 if you plan to keep your embeddings unnormalized." 
+ ] + }, { "cell_type": "code", "execution_count": 5, @@ -246,12 +258,13 @@ "text": [ "Embedding length: 768\n", "Created collection: movies\n", - "{'collection_name': 'movies', 'auto_id': True, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': 5, 'params': {}, 'element_type': 0, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'vector', 'description': '', 'type': 101, 'params': {'dim': 768}, 'element_type': 0}], 'aliases': [], 'collection_id': 445754962278875466, 'consistency_level': 3, 'properties': {}, 'num_partitions': 1, 'enable_dynamic_field': True}\n" + "{'collection_name': 'movies', 'auto_id': True, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': 5, 'params': {}, 'element_type': 0, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'vector', 'description': '', 'type': 101, 'params': {'dim': 768}, 'element_type': 0}], 'aliases': [], 'collection_id': 446268198612080098, 'consistency_level': 3, 'properties': {}, 'num_partitions': 1, 'enable_dynamic_field': True}\n" ] } ], "source": [ "# Re-run create collection and add vector index specifying custom params.\n", + "from pymilvus import MilvusClient\n", "\n", "# For vector length, use the embedding length from the embedding model.\n", "print(f\"Embedding length: {EMBEDDING_LENGTH}\")\n", @@ -277,10 +290,18 @@ " \"params\": INDEX_PARAMS\n", " }\n", "\n", + "# Check if collection already exists, if so drop it.\n", + "has = utility.has_collection(COLLECTION_NAME)\n", + "if has:\n", + " drop_result = utility.drop_collection(COLLECTION_NAME)\n", + " print(f\"Successfully dropped collection: `{COLLECTION_NAME}`\")\n", + "\n", "# Below example uses no-schema Milvus client (flexible json key:value format).\n", "# https://milvus.io/docs/using_milvusclient.md\n", - "mc = MilvusClient(uri=\"http://localhost\")\n", - "mc.drop_collection(COLLECTION_NAME)\n", + "mc = MilvusClient(\n", + " uri=CLUSTER_ENDPOINT,\n", + " # API key or a colon-separated cluster username and password\n", + " token=TOKEN)\n", "mc.create_collection(\n", " COLLECTION_NAME, \n", " EMBEDDING_LENGTH, \n", @@ -329,7 +350,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "6a381e57", "metadata": {}, "outputs": [ @@ -416,10 +437,11 @@ "df.columns = ['text', 'label_int']\n", "\n", "# Map numbers to text 'Postive' and 'Negative' for sentiment labels.\n", - "df[\"label\"] = df[\"label_int\"].apply(imdb_utilities.sentiment_score_to_name)\n", + "df[\"label\"] = df[\"label_int\"].apply(_utils.sentiment_score_to_name)\n", "\n", "# Split data into train/valid/test.\n", - "df, df_train, df_val, df_test = imdb_utilities.partition_dataset(df, smoke_test=False)\n", + "columns = ['movie_index', 'text', 'label_int', 'label']\n", + "df, df_train, df_val, df_test = _utils.partition_dataset(df, columns, smoke_test=False)\n", "print(f\"original df shape: {df.shape}\")\n", "print(f\"df_train shape: {df_train.shape}, df_val shape: {df_val.shape}, df_test shape: {df_test.shape}\")\n", "assert df_train.shape[0] + df_val.shape[0] + df_test.shape[0] == df.shape[0]\n", @@ -432,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "654dd135", "metadata": {}, "outputs": [ @@ -455,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -479,79 +501,6 @@ "- **Function** = Langchain's convenient 
`RecursiveCharacterTextSplitter` to split up long reviews recursively.\n" ] }, - { - "cell_type": "code", - "execution_count": 11, - "id": "a53595fa", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", - "\n", - "def recursive_splitter_wrapper(text, chunk_size):\n", - "\n", - " # Default chunk overlap is 10% chunk_size.\n", - " chunk_overlap = np.round(chunk_size * 0.10, 0)\n", - "\n", - " # Use langchain's convenient recursive chunking method.\n", - " text_splitter = RecursiveCharacterTextSplitter(\n", - " chunk_size=chunk_size,\n", - " chunk_overlap=chunk_overlap,\n", - " length_function=len,\n", - " )\n", - " chunks: List[str] = text_splitter.split_text(text)\n", - "\n", - " # Replace special characters with spaces.\n", - " chunks = [text.replace(\"

\", \" \") for text in chunks]\n", - "\n", - " return chunks\n", - "\n", - "# Use recursive splitter to chunk text.\n", - "def imdb_chunk_text(batch_size, df, chunk_size):\n", - "\n", - " batch = df.head(batch_size).copy()\n", - " print(f\"chunk size: {chunk_size}\")\n", - " print(f\"original shape: {batch.shape}\")\n", - " \n", - " start_time = time.time()\n", - " # 1. Change primary key type to string.\n", - " batch[\"movie_index\"] = batch[\"movie_index\"].apply(lambda x: str(x))\n", - "\n", - " # 2. Truncate reviews to 512 characters.\n", - " batch['chunk'] = batch['text'].apply(recursive_splitter_wrapper, chunk_size=chunk_size)\n", - " # Explode the 'chunk' column to create new rows for each chunk.\n", - " batch = batch.explode('chunk', ignore_index=True)\n", - " print(f\"new shape: {batch.shape}\")\n", - "\n", - " # 3. Add embeddings as new column in df.\n", - " review_embeddings = torch.tensor(encoder.encode(batch['chunk']))\n", - " # Normalize embeddings to unit length.\n", - " review_embeddings = F.normalize(review_embeddings, p=2, dim=1)\n", - " # Quick check if embeddings are normalized.\n", - " norms = np.linalg.norm(review_embeddings, axis=1)\n", - " assert np.allclose(norms, 1.0, atol=1e-5) == True\n", - "\n", - " # 4. Convert embeddings to list of `numpy.ndarray`, each containing `numpy.float32` numbers.\n", - " converted_values = list(map(np.float32, review_embeddings))\n", - " batch['vector'] = converted_values\n", - "\n", - " # 5. Reorder columns for conveneince, so index first, labels at end.\n", - " new_order = [\"movie_index\", \"text\", \"chunk\", \"vector\", \"label_int\", \"label\"]\n", - " batch = batch[new_order]\n", - "\n", - " end_time = time.time()\n", - " print(f\"Chunking + embedding time for {batch_size} docs: {end_time - start_time} sec\")\n", - "\n", - " # Inspect the batch of data.\n", - " display(batch.head())\n", - " assert len(batch.chunk[0]) <= MAX_SEQ_LENGTH-1\n", - " assert len(batch.vector[0]) == EMBEDDING_LENGTH\n", - " print(f\"type embeddings: {type(batch.vector)} of {type(batch.vector[0])}\")\n", - " print(f\"of numbers: {type(batch.vector[0][0])}\")\n", - "\n", - " return batch" - ] - }, { "cell_type": "markdown", "id": "249e9c74", @@ -564,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "id": "68917def", "metadata": {}, "outputs": [ @@ -575,7 +524,7 @@ "chunk size: 511\n", "original shape: (100, 4)\n", "new shape: (290, 5)\n", - "Chunking + embedding time for 100 docs: 8.375767946243286 sec\n" + "Chunking + embedding time for 100 docs: 7.053884267807007 sec\n" ] }, { @@ -613,7 +562,7 @@ " 80\n", " The whole town of Blackstone is afraid, becaus...\n", " The whole town of Blackstone is afraid, becaus...\n", - " [-0.075508565, -0.022925325, 0.022277957, 0.03...\n", + " [-0.075508595, -0.022925273, 0.022277843, 0.03...\n", " 1\n", " Positive\n", " \n", @@ -622,7 +571,7 @@ " 80\n", " The whole town of Blackstone is afraid, becaus...\n", " Mexican bandits (fighting the Gringos that too...\n", - " [0.0059213955, 0.0042556957, -0.028471153, 0.0...\n", + " [0.005921386, 0.0042556874, -0.028471047, 0.00...\n", " 1\n", " Positive\n", " \n", @@ -631,7 +580,7 @@ " 80\n", " The whole town of Blackstone is afraid, becaus...\n", " and definitely everybody is bad to the bone......\n", - " [-0.004301766, -0.03188503, -0.0051136613, -0....\n", + " [-0.004301741, -0.03188501, -0.005113593, -0.0...\n", " 1\n", " Positive\n", " \n", @@ -640,7 +589,7 @@ " 84\n", " This Harold Lloyd short wasn't really much; no...\n", " This 
Harold Lloyd short wasn't really much; no...\n", - " [-0.007607854, -0.033714272, -0.0077492087, 0....\n", + " [-0.0076078353, -0.033714227, -0.0077491296, 0...\n", " 0\n", " Negative\n", " \n", @@ -649,7 +598,7 @@ " 84\n", " This Harold Lloyd short wasn't really much; no...\n", " part was the last four or five minutes when th...\n", - " [0.014139466, -0.04540589, 0.012334436, 0.0192...\n", + " [0.014139436, -0.045405857, 0.012334452, 0.019...\n", " 0\n", " Negative\n", " \n", @@ -673,11 +622,11 @@ "4 part was the last four or five minutes when th... \n", "\n", " vector label_int label \n", - "0 [-0.075508565, -0.022925325, 0.022277957, 0.03... 1 Positive \n", - "1 [0.0059213955, 0.0042556957, -0.028471153, 0.0... 1 Positive \n", - "2 [-0.004301766, -0.03188503, -0.0051136613, -0.... 1 Positive \n", - "3 [-0.007607854, -0.033714272, -0.0077492087, 0.... 0 Negative \n", - "4 [0.014139466, -0.04540589, 0.012334436, 0.0192... 0 Negative " + "0 [-0.075508595, -0.022925273, 0.022277843, 0.03... 1 Positive \n", + "1 [0.005921386, 0.0042556874, -0.028471047, 0.00... 1 Positive \n", + "2 [-0.004301741, -0.03188501, -0.005113593, -0.0... 1 Positive \n", + "3 [-0.0076078353, -0.033714227, -0.0077491296, 0... 0 Negative \n", + "4 [0.014139436, -0.045405857, 0.012334452, 0.019... 0 Negative " ] }, "metadata": {}, @@ -695,12 +644,20 @@ "source": [ "## Prepare df for insertion into Milvus index.\n", "\n", - "# Use the embedding model parameters to calculate chunk_size and overlap.\n", + "# Use the embedding model parameters.\n", "chunk_size = MAX_SEQ_LENGTH - HF_EOS_TOKEN_LENGTH\n", + "chunk_overlap = np.round(chunk_size * 0.10, 0)\n", "\n", "# Chunk a batch of data from pandas DataFrame and inspect it.\n", "BATCH_SIZE = 100\n", - "batch = imdb_chunk_text(BATCH_SIZE, df, chunk_size)\n", + "batch = _utils.imdb_chunk_text(encoder, BATCH_SIZE, df, chunk_size)\n", + "\n", + "# Inspect the batch of data.\n", + "display(batch.head())\n", + "assert len(batch.chunk[0]) <= MAX_SEQ_LENGTH-1\n", + "assert len(batch.vector[0]) == EMBEDDING_LENGTH\n", + "print(f\"type embeddings: {type(batch.vector)} of {type(batch.vector[0])}\")\n", + "print(f\"of numbers: {type(batch.vector[0][0])}\")\n", "\n", "# Chunking looks good, drop the original text column.\n", "batch.drop(columns=[\"text\"], inplace=True)" @@ -720,7 +677,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "1954c96d", "metadata": {}, "outputs": [], @@ -740,7 +697,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "id": "470a93c7", "metadata": {}, "outputs": [ @@ -751,120 +708,7 @@ "chunk size: 511\n", "original shape: (100, 4)\n", "new shape: (290, 5)\n", - "Chunking + embedding time for 100 docs: 8.245778799057007 sec\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
movie_indextextchunkvectorlabel_intlabel
080The whole town of Blackstone is afraid, becaus...The whole town of Blackstone is afraid, becaus...[-0.075508565, -0.022925325, 0.022277957, 0.03...1Positive
180The whole town of Blackstone is afraid, becaus...Mexican bandits (fighting the Gringos that too...[0.0059213955, 0.0042556957, -0.028471153, 0.0...1Positive
280The whole town of Blackstone is afraid, becaus...and definitely everybody is bad to the bone......[-0.004301766, -0.03188503, -0.0051136613, -0....1Positive
384This Harold Lloyd short wasn't really much; no...This Harold Lloyd short wasn't really much; no...[-0.007607854, -0.033714272, -0.0077492087, 0....0Negative
484This Harold Lloyd short wasn't really much; no...part was the last four or five minutes when th...[0.014139466, -0.04540589, 0.012334436, 0.0192...0Negative
\n", - "
" - ], - "text/plain": [ - " movie_index text \\\n", - "0 80 The whole town of Blackstone is afraid, becaus... \n", - "1 80 The whole town of Blackstone is afraid, becaus... \n", - "2 80 The whole town of Blackstone is afraid, becaus... \n", - "3 84 This Harold Lloyd short wasn't really much; no... \n", - "4 84 This Harold Lloyd short wasn't really much; no... \n", - "\n", - " chunk \\\n", - "0 The whole town of Blackstone is afraid, becaus... \n", - "1 Mexican bandits (fighting the Gringos that too... \n", - "2 and definitely everybody is bad to the bone...... \n", - "3 This Harold Lloyd short wasn't really much; no... \n", - "4 part was the last four or five minutes when th... \n", - "\n", - " vector label_int label \n", - "0 [-0.075508565, -0.022925325, 0.022277957, 0.03... 1 Positive \n", - "1 [0.0059213955, 0.0042556957, -0.028471153, 0.0... 1 Positive \n", - "2 [-0.004301766, -0.03188503, -0.0051136613, -0.... 1 Positive \n", - "3 [-0.007607854, -0.033714272, -0.0077492087, 0.... 0 Negative \n", - "4 [0.014139466, -0.04540589, 0.012334436, 0.0192... 0 Negative " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "type embeddings: of \n", - "of numbers: \n" + "Chunking + embedding time for 100 docs: 6.648707866668701 sec\n" ] } ], @@ -876,7 +720,7 @@ "\n", "# Chunk a batch of data from pandas DataFrame and inspect it.\n", "BATCH_SIZE = 100\n", - "batch = imdb_chunk_text(BATCH_SIZE, df, chunk_size)\n", + "batch = _utils.imdb_chunk_text(encoder, BATCH_SIZE, df, chunk_size)\n", "\n", "# Chunking looks good, drop the original text column.\n", "batch.drop(columns=[\"text\"], inplace=True)" @@ -889,16 +733,22 @@ "source": [ "## Insert data into Milvus\n", "\n", - "Milvus and Milvus Lite support loading pandas dataframes directly.\n", + "For each original text chunk, we'll write the quadruplet (`vector, text, source, h1, h2`) into the database.\n", "\n", - "Milvus Client, however, requires conerting pandas df into a list of dictionaries first.\n", + "
\n", + "\n", + "
\n", + "\n", + "Milvus and Milvus Lite support loading data from:\n", + "- pandas dataframes \n", + "- list of dictionaries\n", "\n", "πŸ€” TODO: This would be a good place to demonstrate Milvus' scalability by using Ray together with Milvus to run batches in parallel. I'll do this in a future tutorial." ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "id": "b51ff139", "metadata": {}, "outputs": [ @@ -913,21 +763,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 29.92it/s]" + "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:02<00:00, 2.06s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Milvus insert time for 290 vectors: 0.03497195243835449 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" + "Milvus insert time for 290 vectors: 2.066472053527832 seconds\n" ] } ], @@ -968,7 +811,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "id": "eb7bc132", "metadata": {}, "outputs": [], @@ -1003,7 +846,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "id": "5e7f41f4", "metadata": {}, "outputs": [ @@ -1037,7 +880,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "id": "a6863a32", "metadata": {}, "outputs": [ @@ -1093,7 +936,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "id": "2ace8d04", "metadata": {}, "outputs": [ @@ -1101,7 +944,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Search time: 0.003979921340942383 sec\n", + "Search time: 0.07567000389099121 sec\n", "type: , count: 10\n" ] } @@ -1124,7 +967,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "id": "c5d98e28", "metadata": {}, "outputs": [ @@ -1132,7 +975,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Search time: 0.0022897720336914062 sec\n", + "Milvus search time: 0.055058956146240234 sec\n", "type: , count: 3\n" ] } @@ -1142,6 +985,7 @@ "\n", "# Return top k results with HNSW index.\n", "TOP_K = 3\n", + "OUTPUT_FIELDS=[\"movie_index\", \"chunk\", \"label\"]\n", "SEARCH_PARAMS = dict({\n", " # Re-use index param for num_candidate_nearest_neighbors.\n", " \"ef\": INDEX_PARAMS['efConstruction']\n", @@ -1153,16 +997,16 @@ " COLLECTION_NAME,\n", " data=query_embeddings, \n", " search_params=SEARCH_PARAMS,\n", - " output_fields=[\"movie_index\", \"chunk\", \"label\"], \n", + " output_fields=OUTPUT_FIELDS, \n", " limit=TOP_K,\n", " consistency_level=\"Eventually\",\n", " )\n", "\n", "elapsed_time = time.time() - start_time\n", - "print(f\"Search time: {elapsed_time} sec\")\n", + "print(f\"Milvus search time: {elapsed_time} sec\")\n", "\n", "# Inspect search result.\n", - "print(f\"type: {type(results)}, count: {len(results[0])}\")\n" + "print(f\"type: {type(results)}, count: {len(results[0])}\")" ] }, { @@ -1177,52 +1021,21 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "d3dfa33a", - "metadata": {}, - "outputs": [], - "source": [ - "## Results returned from MilvusClient are in the form list of lists of dicts.\n", - "\n", - "# Get the movie_indexes, review texts, and labels.\n", - "distances = []\n", - "texts = []\n", - "movie_indexes = []\n", - "labels = []\n", - "for result in results[0]:\n", - " distances.append(result['distance'])\n", - " texts.append(result['entity']['chunk'])\n", - " movie_indexes.append(result['entity']['movie_index'])\n", - " labels.append(result['entity']['label'])\n", - 
"\n", - "# Assemble all the results in a zipped list.\n", - "formatted_results = list(zip(distances, movie_indexes, texts, labels))" - ] - }, - { - "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "22d65363", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0: 0.541, 56, Negative, Dr. K(David H Hickey)has been trying to master a formula that would end all disease and handicaps, b\n", - "1: 0.54, 44, Positive, is not a horror movie, although it does contain some violent scenes, but is rather a comedy. A satir\n", - "2: 0.535, 67, Positive, a good movie with a real good story. The fact that there are so many other big stars who all also ha\n" - ] - } - ], + "outputs": [], "source": [ - "# Print the results.\n", - "# k: distance, movie_index, label, review text\n", + "# Assemble just the top retrieved 1st context and context metadata.\n", + "results = _utils.client_assemble_retrieved_context(results)\n", + "# formatted_results = _utils.client_assemble_retrieved_context(results)\n", + "# print(f\"Length context: {len(context[0])}\")\n", "\n", - "i = 0\n", - "for row in formatted_results:\n", - " print(f\"{i}: {np.round(row[0],3)}, {row[1]}, {row[3]}, {row[2][:100]}\")\n", - " i += 1\n", + "# TODO - remove printing before saving in github.\n", + "# Loop throught each formatted result and print.\n", + "print(f\"#i: (distance, movie_id, label, chunk_text)\")\n", + "for i, result in enumerate(results):\n", + " print(f\"#{i}: {result}\")\n", "\n", "#1: 2006, Serum, \n", "# 0: 0.541, 931, Negative, Dr. K(David H Hickey)has been trying to master a formula that would end all disease and handicaps, b\n", @@ -1246,39 +1059,29 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "922073f2", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0: 0.561, 67, Positive, a good movie with a real good story. The fact that there are so many other big stars who all also ha\n", - "1: 0.56, 13, Positive, the stories but helps Malkovich to provoke some thought. 
I'd say it is worth seeing and the best of \n", - "2: 0.549, 12, Positive, the mini-bio on Woody Strode here as a primer: http://imdb.com/name/nm0834754/bio The film does a g\n" - ] - } - ], + "outputs": [], "source": [ - "# Take as input a user question and conduct semantic vector search using the question.\n", + "# # Take as input a user question and conduct semantic vector search using the question.\n", "question = \"I'm a medical doctor, what movie should I watch?\"\n", - "new_question = \"I'm a medical doctor, suggest only good movies to watch?\"\n", - "new_results = \\\n", - " imdb_utilities.mc_search_imdb([new_question],\n", - " encoder,\n", - " mc,\n", - " SEARCH_PARAMS, 3, \n", - " milvus_client=True,\n", - " COLLECTION_NAME=COLLECTION_NAME,\n", - " )\n", - "\n", - "# Print the results.\n", - "# k: distance, movie_index, label, review text\n", - "i = 0\n", - "for row in new_results:\n", - " print(f\"{i}: {np.round(row[0],3)}, {row[1]}, {row[3]}, {row[2][:100]}\")\n", - " i += 1\n", + "new_question = \"I'm a medical doctor, suggest only positive movies to watch?\"\n", + "print(f\"Question: {new_question}\")\n", + "\n", + "# Run the search and time it.\n", + "start_time = time.time()\n", + "new_results = _utils.mc_client_search_imdb(\n", + " [new_question], encoder, mc, SEARCH_PARAMS, OUTPUT_FIELDS, TOP_K)\n", + " \n", + "elapsed_time = time.time() - start_time\n", + "print(f\"Milvus search time: {elapsed_time} sec\")\n", + "\n", + "# TODO - remove printing before saving in github.\n", + "# Loop throught each formatted result and print.\n", + "print(f\"#i: (distance, movie_id, label, chunk_text)\")\n", + "for i, result in enumerate(new_results):\n", + " print(f\"#{i}: {result}\")\n", "\n", "# As expected, new_question answers are slightly different!\n", "# 0: 0.562, 45719, Positive, the stories but helps Malkovich to provoke some thought.

I'd say it is worth seeing and t\n", @@ -1288,19 +1091,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "d0e81e68", "metadata": {}, "outputs": [], "source": [ - "# Shut down and cleanup the milvus server.\n", - "default_server.stop()\n", - "default_server.cleanup()" + "# Drop collection\n", + "utility.drop_collection(COLLECTION_NAME)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 22, "id": "c777937e", "metadata": {}, "outputs": [ @@ -1311,16 +1113,16 @@ "Author: Christy Bergman\n", "\n", "Python implementation: CPython\n", - "Python version : 3.10.12\n", - "IPython version : 8.15.0\n", + "Python version : 3.11.6\n", + "IPython version : 8.18.1\n", "\n", - "torch : 2.0.1\n", - "transformers: 4.34.1\n", + "torch : 2.1.1\n", + "transformers: 4.35.2\n", "milvus : 2.3.3\n", - "pymilvus : 2.3.3\n", + "pymilvus : 2.3.4\n", "langchain : 0.0.322\n", "\n", - "conda environment: py310\n", + "conda environment: py311\n", "\n" ] } @@ -1358,7 +1160,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.11.6" } }, "nbformat": 4,