, count: 3\n"
]
}
@@ -700,6 +683,9 @@
"# Return top k results with AUTOINDEX.\n",
"TOP_K = 3\n",
"\n",
+ "# Define output fields to return.\n",
+ "OUTPUT_FIELDS = [\"h1\", \"h2\", \"source\", \"text\"]\n",
+ "\n",
"# Run semantic vector search using your query and the vector database.\n",
"start_time = time.time()\n",
"results = mc.search(\n",
@@ -709,7 +695,7 @@
" param={},\n",
" # Milvus can utilize metadata to enhance the search experience in boolean expressions.\n",
" # expr=\"\",\n",
- " output_fields=[\"h1\", \"h2\", \"text\", \"source\"], \n",
+ " output_fields=OUTPUT_FIELDS, \n",
" limit=TOP_K,\n",
" consistency_level=\"Eventually\"\n",
" )\n",
@@ -732,29 +718,23 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Length of context: 1533\n",
- "1 1\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "# # TODO - remove printing before saving in github.\n",
- "# for n, hits in enumerate(results):\n",
- "# print(f\"{n}th query result\")\n",
- "# for hit in hits:\n",
- "# print(hit)\n",
- "\n",
- "# Assemble the context and context metadata.\n",
- "# context, context_metadata = _utils.assemble_retrieved_context(results, num_shot_answers=TOP_K)\n",
- "context, context_metadata = _utils.assemble_retrieved_context(results, num_shot_answers=1)\n",
- "print(len(context_metadata), len(context_metadata))"
+ "# Assemble just the top retrieved 1st context and context metadata.\n",
+ "# context, context_metadata = _utils.assemble_retrieved_context(results, METADATA_FIELDS, num_shot_answers=TOP_K)\n",
+ "\n",
+ "context, context_metadata = _utils.assemble_retrieved_context(results, OUTPUT_FIELDS, num_shot_answers=1)\n",
+ "print(f\"Using {len(context)} contexts and {len(context_metadata)} metadata stuffing in Prompt.\")\n",
+ "print(f\"Length context: {len(context[0])}\")\n",
+ "\n",
+ "# TODO - remove printing before saving in github.\n",
+ "# Loop throught each context and metadata and print.\n",
+ "for i in range(len(context)):\n",
+ " print(f\"Context: {context[i][:200]}\")\n",
+ " print(f\"Metadata: {context_metadata[i]}\")\n",
+ " print()"
]
},
{
@@ -769,7 +749,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 16,
"id": "3e7fa0b6",
"metadata": {},
"outputs": [
@@ -811,7 +791,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 17,
"id": "a68e87b1",
"metadata": {},
"outputs": [
@@ -831,7 +811,7 @@
"# NOW ASK THE SAME LLM THE SAME QUESTION USING THE RETRIEVED CONTEXT.\n",
"QA_input = {\n",
" 'question': SAMPLE_QUESTION,\n",
- " 'context': context,\n",
+ " 'context': context[0],\n",
"}\n",
"\n",
"nlp = pipeline('question-answering', \n",
@@ -862,7 +842,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -872,6 +852,8 @@
"\n",
"# Define the generation llm model to use.\n",
"LLM_NAME = \"gpt-3.5-turbo-1106\"\n",
+ "TEMPERATURE = 0.1\n",
+ "RANDOM_SEED = 415\n",
"\n",
"# See how to save api key in env variable.\n",
"# https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety\n",
@@ -883,7 +865,7 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 20,
"metadata": {},
"outputs": [
{
@@ -891,14 +873,10 @@
"output_type": "stream",
"text": [
"Question: What do the parameters for HNSW mean?\n",
- "('Answer: The parameters for HNSW are M, which denotes the maximum degree of '\n",
- " 'nodes on each layer of the graph, and efConstruction, which specifies a '\n",
- " 'search range during index construction. M has values ranging from 4 to 64, '\n",
- " 'and efConstruction ranges from 8 to 512. The ef parameter is used during the '\n",
- " 'search stage and should be larger than top_k, with values from top_k to '\n",
- " '32768. These parameters help control the graph structure and search scope '\n",
- " 'for HNSW indexing. (Source: Milvus support to create index to accelerate '\n",
- " 'vector search - https://pymilvus.readthedocs.io/en/latest/param.html)')\n",
+ "('Answer: Answer: The parameter M represents the maximum degree of nodes on '\n",
+ " 'each layer of the graph in the HNSW index, while efConstruction (when '\n",
+ " 'building index) or ef (when searching targets) specifies the search range. M '\n",
+ " 'ranges from 4 to 64, and efConstruction ranges from 8 to 512.')\n",
"\n",
"\n"
]
@@ -908,23 +886,23 @@
"# CAREFUL!! THIS COSTS MONEY!!\n",
"# Generate response\n",
"\n",
- "prompt = f\"\"\"Answer the question using the context provided. Be succinct.\n",
- "Echo in the answer the Grounding Sources.\n",
+ "PROMPT = f\"\"\"Answer the question using the context provided. Be concise. Use the format below.\n",
+ "Answer: The answer to the question.\n",
"Grounding sources: {context_metadata}\n",
"\"\"\"\n",
"\n",
"# response = openai_client.chat.completions.create(\n",
+ "# # response_format={\n",
+ "# # \"type\": \"json_object\", \n",
+ "# # \"schema\": Result.schema_json()\n",
+ "# # },\n",
"# messages=[\n",
- "# {\n",
- "# \"role\": \"system\",\n",
- "# \"content\": prompt,\n",
- "# },\n",
- "# {\n",
- "# \"role\": \"user\",\n",
- "# \"content\": f\"question: {SAMPLE_QUESTION}, context: {context}\",\n",
- "# }\n",
+ "# {\"role\": \"system\", \"content\": PROMPT,},\n",
+ "# {\"role\": \"user\", \"content\": f\"question: {SAMPLE_QUESTION}, context: {context[0]}\",}\n",
"# ],\n",
"# model=LLM_NAME,\n",
+ "# temperature=TEMPERATURE,\n",
+ "# seed=RANDOM_SEED,\n",
"# )\n",
"\n",
"# Print the question and answer along with grounding sources and citations.\n",
@@ -938,6 +916,14 @@
"\n",
"# Question1: What do the parameters for HNSW mean?\n",
"# Answer: Perfect!\n",
+ "# ('Answer: The parameters for HNSW are M, which limits the maximum degree of '\n",
+ "# 'nodes on each layer of the graph, and efConstruction, which specifies the '\n",
+ "# 'search range during index construction. Additionally, when searching '\n",
+ "# 'targets, the parameter ef is used to specify the search range. \\n'\n",
+ "# '\\n'\n",
+ "# \"Grounding sources: {'h1': 'Index', 'h2': 'Milvus support to create index to \"\n",
+ "# \"accelerate vecto', 'source': \"\n",
+ "# \"'https://pymilvus.readthedocs.io/en/latest/param.html'}\")\n",
"# Best answer: M: maximum degree of nodes in a layer of the graph. \n",
"# efConstruction: number of nearest neighbors to consider when connecting nodes in the graph.\n",
"# ef: number of nearest neighbors to consider when searching for similar vectors. \n",
@@ -974,22 +960,15 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Length context: 4229, Len metadata: 3\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"import requests, json\n",
"\n",
"# Define the URL, headers, and data\n",
- "url = \"https://controller.api.gcp-us-west1.zillizcloud.com/v1/pipelines/pipe-458714b66ffc8ff3ab1bf1/run\"\n",
+ "# TODO change this before checking into github.\n",
+ "url = \"https://controller.api.gcp-us-west1.zillizcloud.com/v1/pipelines/pipe-xxx/run\"\n",
"headers = {\n",
" \"Content-Type\": \"application/json\",\n",
" \"Authorization\": f\"Bearer {TOKEN}\",\n",
@@ -1002,15 +981,17 @@
" \"limit\": TOP_K,\n",
" \"offset\": 0,\n",
" \"outputFields\": [\"chunk_text\", \"chunk_id\", \"doc_name\", \"source\"],\n",
- " \"filter\": \"chunk_id >= 0\"\n",
+ " \"filter\": \"chunk_id >= 0 && doc_name == 'param.html'\",\n",
" }\n",
"}\n",
"\n",
"# Send the POST request\n",
"response = requests.post(url, headers=headers, json=data)\n",
- "# print(type(response))\n",
- "# # Print the response\n",
- "# pprint.pprint(response.json())\n",
+ "print(type(response))\n",
+ "\n",
+ "# TODO: Remove this before saving in github.\n",
+ "# Print the response\n",
+ "pprint.pprint(response.json())\n",
"\n",
"# Assemble context from Pipeline retriever.\n",
"pipeline_context = \"\"\n",
@@ -1031,7 +1012,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 23,
"metadata": {},
"outputs": [
{
@@ -1039,13 +1020,10 @@
"output_type": "stream",
"text": [
"Question: What do the parameters for HNSW mean?\n",
- "('Answer: The parameters for HNSW are used to control the structure of the '\n",
- " 'multi-layer navigation graph. \"M\" determines the maximum degree of nodes on '\n",
- " 'each layer, and \"efConstruction\" specifies the search range during index '\n",
- " 'construction. When searching for targets, the parameter \"ef\" should be set '\n",
- " 'and be larger than \"top_k\".\\n'\n",
- " 'Sources: (https://pymilvus.readthedocs.io/en/latest/param.html), '\n",
- " '(https://pymilvus.readthedocs.io/en/latest/api.html)')\n",
+ "('Answer: The parameters for HNSW are M, which is the maximum degree of the '\n",
+ " 'node, and efConstruction, which specifies the search range during index '\n",
+ " 'construction. The M parameter ranges from 4 to 64, and efConstruction ranges '\n",
+ " 'from 8 to 512. Source: https://pymilvus.readthedocs.io/en/latest/param.html')\n",
"\n",
"\n"
]
@@ -1053,24 +1031,15 @@
],
"source": [
"# CAREFUL!! THIS COSTS MONEY!!\n",
- "# Generate response\n",
- "prompt = f\"\"\"Answer the question using the context provided. Be succinct.\n",
- "Echo in the answer the Grounding sources.\n",
- "Grounding sources: {pipeline_context_metadata}\n",
- "\"\"\"\n",
"\n",
"# response_pipeline = openai_client.chat.completions.create(\n",
"# messages=[\n",
- "# {\n",
- "# \"role\": \"system\",\n",
- "# \"content\": prompt,\n",
- "# },\n",
- "# {\n",
- "# \"role\": \"user\",\n",
- "# \"content\": f\"question: {SAMPLE_QUESTION}, context: {pipeline_context}\",\n",
- "# }\n",
- "# ],\n",
+ "# {\"role\": \"system\", \"content\": PROMPT,},\n",
+ "# {\"role\": \"user\", \"content\": f\"question: {SAMPLE_QUESTION}, context: {pipeline_context[0]}\"}\n",
+ "# ],\n",
"# model=LLM_NAME,\n",
+ "# temperature=TEMPERATURE,\n",
+ "# seed=RANDOM_SEED,\n",
"# )\n",
"\n",
"# Print the question and answer along with grounding sources and citations.\n",
@@ -1082,7 +1051,15 @@
" print(\"\\n\")\n",
"\n",
"# Question1: What do the parameters for HNSW mean?\n",
- "# Answer: Perfect!\n",
+ "# Answer: Good, but missing 'ef' even though given 3 contexts instead of just 1, which manual method used.\n",
+ "# ('Answer: Answer: The parameters for HNSW in Milvus refer to the configuration '\n",
+ "# 'options for the Hierarchical Navigable Small World index, which is used to '\n",
+ "# 'accelerate vector similarity search. The parameters include \"M\" for the '\n",
+ "# 'number of edges per point and \"efConstruction\" for the size of the dynamic '\n",
+ "# 'list for the nearest neighbors during index construction.\\n'\n",
+ "# \"Grounding sources: [{'h1': 'Index', 'h2': 'Milvus support to create index to \"\n",
+ "# \"accelerate vecto', 'source': \"\n",
+ "# \"'https://pymilvus.readthedocs.io/en/latest/param.html'}]\")\n",
"# Best answer: M: maximum degree of nodes in a layer of the graph. \n",
"# efConstruction: number of nearest neighbors to consider when connecting nodes in the graph.\n",
"# ef: number of nearest neighbors to consider when searching for similar vectors. \n",
@@ -1098,7 +1075,7 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 24,
"id": "d0e81e68",
"metadata": {},
"outputs": [],
@@ -1109,7 +1086,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 25,
"id": "c777937e",
"metadata": {},
"outputs": [
diff --git a/bootcamp/Retrieval/imdb_milvus_client.ipynb b/bootcamp/Retrieval/imdb_milvus_client.ipynb
index b9a0df3dd..87627f202 100755
--- a/bootcamp/Retrieval/imdb_milvus_client.ipynb
+++ b/bootcamp/Retrieval/imdb_milvus_client.ipynb
@@ -27,21 +27,12 @@
"# !pip install milvus, pymilvus, langchain, torch, transformers, python-dotenv\n",
"\n",
"# Import common libraries.\n",
- "from typing import List\n",
"import time\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# Import custom functions for splitting and search.\n",
- "import imdb_utilities"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8a67e382",
- "metadata": {},
- "source": [
- "## Start up a local Milvus server."
+ "import imdb_utilities as _utils"
]
},
{
@@ -49,13 +40,17 @@
"id": "fb844837",
"metadata": {},
"source": [
- "Code in this notebook uses [Milvus client](https://milvus.io/docs/using_milvusclient.md) with [Milvus lite](https://milvus.io/docs/milvus_lite.md), which runs a local server. βοΈ Milvus lite is only meant for demos and local testing.\n",
- "- pip install milvus pymilvus\n",
+ "## Start up a Zilliz free tier cluster.\n",
+ "\n",
+ "Code in this notebook uses fully-managed Milvus on [Ziliz Cloud free trial](https://cloud.zilliz.com/login). \n",
+ " 1. Choose the default \"Starter\" option when you provision > Create collection > Give it a name > Create cluster and collection. \n",
+ " 2. On the Cluster main page, copy your `API Key` and store it locally in a .env variable. See note below how to do that.\n",
+ " 3. Also on the Cluster main page, copy the `Public Endpoint URI`.\n",
+ "\n",
+ "π‘ Note: To keep your tokens private, best practice is to use an **env variable**. See [how to save api key in env variable](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety).
\n",
"\n",
- "π‘ **For production purposes**, use a local Milvus docker, Milvus clusters, or fully-managed Milvus on Zilliz Cloud.\n",
- "- [Local Milvus docker](https://milvus.io/docs/install_standalone-docker.md) requires local docker installed and running.\n",
- "- [Milvus clusters](https://milvus.io/docs/install_cluster-milvusoperator.md) requires a K8s cluster up and running.\n",
- "- [Ziliz Cloud free trial](https://cloud.zilliz.com/login) choose a \"Default\" option when you provision.\n"
+ "In Jupyter, you also need a .env file (in same dir as notebooks) containing lines like this:\n",
+ "- VARIABLE_NAME=value\n"
]
},
{
@@ -68,40 +63,32 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Milvus server startup time: 7.601609945297241 sec\n",
- "v2.3.3-lite\n"
+ "Type of server: zilliz_cloud\n"
]
}
],
"source": [
- "from milvus import default_server\n",
- "from pymilvus import (\n",
- " connections, utility\n",
+ "# !pip install pymilvus #python sdk for milvus\n",
+ "from pymilvus import connections, utility\n",
+ "\n",
+ "import os\n",
+ "from dotenv import load_dotenv\n",
+ "load_dotenv()\n",
+ "TOKEN = os.getenv(\"ZILLIZ_API_KEY\")\n",
+ "\n",
+ "# Connect to Zilliz cloud using endpoint URI and API key TOKEN.\n",
+ "# TODO change this before checking into github.\n",
+ "CLUSTER_ENDPOINT=\"https://in03-xxxx.api.gcp-us-west1.zillizcloud.com:443\"\n",
+ "connections.connect(\n",
+ " alias='default',\n",
+ " # Public endpoint obtained from Zilliz Cloud\n",
+ " uri=CLUSTER_ENDPOINT,\n",
+ " # API key or a colon-separated cluster username and password\n",
+ " token=TOKEN,\n",
")\n",
"\n",
- "# Cleanup previous data and stop server in case it is still running.\n",
- "default_server.stop()\n",
- "default_server.cleanup()\n",
- "\n",
- "# Start a new milvus-lite local server.\n",
- "start_time = time.time()\n",
- "default_server.start()\n",
- "\n",
- "end_time = time.time()\n",
- "print(f\"Milvus server startup time: {end_time - start_time} sec\")\n",
- "# startup time: 5.6739208698272705\n",
- "\n",
- "# Add wait to avoid error message from trying to connect.\n",
- "time.sleep(15)\n",
- "\n",
- "# Now you could connect with localhost and the given port.\n",
- "# Port is defined by default_server.listen_port.\n",
- "connections.connect(host='127.0.0.1', \n",
- " port=default_server.listen_port,\n",
- " show_startup_banner=True)\n",
- "\n",
- "# Check if the server is ready.\n",
- "print(utility.get_server_version())"
+ "# Check if the server is ready and get colleciton name.\n",
+ "print(f\"Type of server: {utility.get_server_version()}\")"
]
},
{
@@ -110,7 +97,7 @@
"metadata": {},
"source": [
"## Load the Embedding Model checkpoint and use it to create vector embeddings\n",
- "**Embedding model:** We will use the open-source [sentence transformers](https://www.sbert.net/docs/pretrained_models.html) hosted on HuggingFace to encode the movie review text. We will save the embeddings to a pandas dataframe and then into the milvus database.\n",
+ "**Embedding model:** We will use the open-source [sentence transformers](https://www.sbert.net/docs/pretrained_models.html) available on HuggingFace to encode the documentation text. We will download the model from HuggingFace and run it locally. \n",
"\n",
"Two model parameters of note below:\n",
"1. EMBEDDING_LENGTH refers to the dimensionality or length of the embedding vector. In this case, the embeddings generated for EACH token in the input text will have the SAME length = 768. This size of embedding is often associated with BERT-based models, where the embeddings are used for downstream tasks such as classification, question answering, or text generation.
\n",
@@ -157,9 +144,14 @@
"print(encoder)\n",
"\n",
"# Get the model parameters and save for later.\n",
- "MAX_SEQ_LENGTH = encoder.get_max_seq_length() \n",
- "HF_EOS_TOKEN_LENGTH = 1\n",
"EMBEDDING_LENGTH = encoder.get_sentence_embedding_dimension()\n",
+ "MAX_SEQ_LENGTH_IN_TOKENS = encoder.get_max_seq_length() \n",
+ "# # Assume tokens are 3 characters long.\n",
+ "# MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS * 3\n",
+ "# HF_EOS_TOKEN_LENGTH = 1 * 3\n",
+ "# Test with 512 sequence length.\n",
+ "MAX_SEQ_LENGTH = MAX_SEQ_LENGTH_IN_TOKENS\n",
+ "HF_EOS_TOKEN_LENGTH = 1\n",
"\n",
"# Inspect model parameters.\n",
"print(f\"model_name: {model_name}\")\n",
@@ -175,30 +167,19 @@
"## Create a Milvus collection\n",
"\n",
"You can think of a collection in Milvus like a \"table\" in SQL databases. The **collection** will contain the \n",
- "- **Schema** (or no-schema Milvus Client). \n",
+ "- **Schema** (or [no-schema Milvus client](https://milvus.io/docs/using_milvusclient.md)). \n",
"π‘ You'll need the vector `EMBEDDING_LENGTH` parameter from your embedding model.\n",
+ "Typical values are:\n",
+ " - 768 for sbert embedding models\n",
+ " - 1536 for ada-002 OpenAI embedding models\n",
"- **Vector index** for efficient vector search\n",
"- **Vector distance metric** for measuring nearest neighbor vectors\n",
"- **Consistency level**\n",
"In Milvus, transactional consistency is possible; however, according to the [CAP theorem](https://en.wikipedia.org/wiki/CAP_theorem), some latency must be sacrificed. π‘ Searching movie reviews is not mission-critical, so [`eventually`](https://milvus.io/docs/consistency.md) consistent is fine here.\n",
"\n",
- "## Add a Vector Index\n",
- "\n",
- "The vector index determines the vector **search algorithm** used to find the closest vectors in your data to the query a user submits. Most vector indexes use different sets of parameters depending on whether the database is:\n",
- "- **inserting vectors** (creation mode) - vs - \n",
- "- **searching vectors** (search mode) \n",
- "\n",
- "Scroll down the [docs page](https://milvus.io/docs/index.md) to see a table listing different vector indexes available on Milvus. For example:\n",
- "- FLAT - deterministic exhaustive search\n",
- "- IVF_FLAT or IVF_SQ8 - Hash index (stochastic approximate search)\n",
- "- HNSW - Graph index (stochastic approximate search)\n",
- "\n",
- "Besides a search algorithm, we also need to specify a **distance metric**, that is, a definition of what is considered \"close\" in vector space. In the cell below, the [`HNSW`](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) search index is chosen. Its possible distance metrics are one of:\n",
- "- L2 - L2-norm\n",
- "- IP - Dot-product\n",
- "- COSINE - Angular distance\n",
- "\n",
- "π‘ Most use cases work better with normalized embeddings, in which case L2 is useless (every vector has length=1) and IP and COSINE are the same. Only choose L2 if you plan to keep your embeddings unnormalized."
+ "Some supported [data types](https://milvus.io/docs/schema.md) for Milvus schemas are:\n",
+ "- INT64 - primary key\n",
+ "- FLOAT_VECTOR - embedings = list of `numpy.ndarray` of `numpy.float32` numbers\n"
]
},
{
@@ -213,27 +194,58 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"id": "20a05d59",
"metadata": {},
"outputs": [],
"source": [
+ "from pymilvus import MilvusClient\n",
+ "\n",
"# Set the Milvus collection name.\n",
"COLLECTION_NAME = # TODO (exercise): code here\n",
"\n",
- "# Use no-schema Milvus client (uses flexible json key:value format).\n",
+ "# Use no-schema Milvus client uses flexible json key:value format.\n",
"# https://milvus.io/docs/using_milvusclient.md\n",
- "mc = MilvusClient(uri=\"http://localhost\")\n",
+ "mc = MilvusClient(\n",
+ " uri=CLUSTER_ENDPOINT,\n",
+ " # API key or a colon-separated cluster username and password\n",
+ " token=TOKEN)\n",
+ "\n",
"mc.drop_collection(COLLECTION_NAME)\n",
"mc.create_collection(COLLECTION_NAME, \n",
" EMBEDDING_LENGTH, \n",
- " #params=index_params # Omit params to use AUTOINDEX.\n",
" )\n",
"\n",
"print(mc.describe_collection(COLLECTION_NAME))\n",
"print(f\"Created collection: {COLLECTION_NAME}\")"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Add a Vector Index\n",
+ "\n",
+ "The vector index determines the vector **search algorithm** used to find the closest vectors in your data to the query a user submits. \n",
+ "\n",
+ "Most vector indexes use different sets of parameters depending on whether the database is:\n",
+ "- **inserting vectors** (creation mode) - vs - \n",
+ "- **searching vectors** (search mode) \n",
+ "\n",
+ "Scroll down the [docs page](https://milvus.io/docs/index.md) to see a table listing different vector indexes available on Milvus. For example:\n",
+ "- FLAT - deterministic exhaustive search\n",
+ "- IVF_FLAT or IVF_SQ8 - Hash index (stochastic approximate search)\n",
+ "- HNSW - Graph index (stochastic approximate search)\n",
+ "- AUTOINDEX - Automatically determined based on OSS vs [Zilliz cloud](https://docs.zilliz.com/docs/autoindex-explained), type of GPU, size of data.\n",
+ "\n",
+ "Besides a search algorithm, we also need to specify a **distance metric**, that is, a definition of what is considered \"close\" in vector space. In the cell below, the [`HNSW`](https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md) search index is chosen (Milvus OSS default AUTOINDEX). Its possible distance metrics are one of:\n",
+ "- L2 - L2-norm\n",
+ "- IP - Dot-product\n",
+ "- COSINE - Angular distance\n",
+ "\n",
+ "π‘ Most use cases work better with normalized embeddings, in which case L2 is useless (every vector has length=1) and IP and COSINE are the same. Only choose L2 if you plan to keep your embeddings unnormalized."
+ ]
+ },
{
"cell_type": "code",
"execution_count": 5,
@@ -246,12 +258,13 @@
"text": [
"Embedding length: 768\n",
"Created collection: movies\n",
- "{'collection_name': 'movies', 'auto_id': True, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': 5, 'params': {}, 'element_type': 0, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'vector', 'description': '', 'type': 101, 'params': {'dim': 768}, 'element_type': 0}], 'aliases': [], 'collection_id': 445754962278875466, 'consistency_level': 3, 'properties': {}, 'num_partitions': 1, 'enable_dynamic_field': True}\n"
+ "{'collection_name': 'movies', 'auto_id': True, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': 5, 'params': {}, 'element_type': 0, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'vector', 'description': '', 'type': 101, 'params': {'dim': 768}, 'element_type': 0}], 'aliases': [], 'collection_id': 446268198612080098, 'consistency_level': 3, 'properties': {}, 'num_partitions': 1, 'enable_dynamic_field': True}\n"
]
}
],
"source": [
"# Re-run create collection and add vector index specifying custom params.\n",
+ "from pymilvus import MilvusClient\n",
"\n",
"# For vector length, use the embedding length from the embedding model.\n",
"print(f\"Embedding length: {EMBEDDING_LENGTH}\")\n",
@@ -277,10 +290,18 @@
" \"params\": INDEX_PARAMS\n",
" }\n",
"\n",
+ "# Check if collection already exists, if so drop it.\n",
+ "has = utility.has_collection(COLLECTION_NAME)\n",
+ "if has:\n",
+ " drop_result = utility.drop_collection(COLLECTION_NAME)\n",
+ " print(f\"Successfully dropped collection: `{COLLECTION_NAME}`\")\n",
+ "\n",
"# Below example uses no-schema Milvus client (flexible json key:value format).\n",
"# https://milvus.io/docs/using_milvusclient.md\n",
- "mc = MilvusClient(uri=\"http://localhost\")\n",
- "mc.drop_collection(COLLECTION_NAME)\n",
+ "mc = MilvusClient(\n",
+ " uri=CLUSTER_ENDPOINT,\n",
+ " # API key or a colon-separated cluster username and password\n",
+ " token=TOKEN)\n",
"mc.create_collection(\n",
" COLLECTION_NAME, \n",
" EMBEDDING_LENGTH, \n",
@@ -329,7 +350,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"id": "6a381e57",
"metadata": {},
"outputs": [
@@ -416,10 +437,11 @@
"df.columns = ['text', 'label_int']\n",
"\n",
"# Map numbers to text 'Postive' and 'Negative' for sentiment labels.\n",
- "df[\"label\"] = df[\"label_int\"].apply(imdb_utilities.sentiment_score_to_name)\n",
+ "df[\"label\"] = df[\"label_int\"].apply(_utils.sentiment_score_to_name)\n",
"\n",
"# Split data into train/valid/test.\n",
- "df, df_train, df_val, df_test = imdb_utilities.partition_dataset(df, smoke_test=False)\n",
+ "columns = ['movie_index', 'text', 'label_int', 'label']\n",
+ "df, df_train, df_val, df_test = _utils.partition_dataset(df, columns, smoke_test=False)\n",
"print(f\"original df shape: {df.shape}\")\n",
"print(f\"df_train shape: {df_train.shape}, df_val shape: {df_val.shape}, df_test shape: {df_test.shape}\")\n",
"assert df_train.shape[0] + df_val.shape[0] + df_test.shape[0] == df.shape[0]\n",
@@ -432,7 +454,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 8,
"id": "654dd135",
"metadata": {},
"outputs": [
@@ -455,7 +477,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -479,79 +501,6 @@
"- **Function** = Langchain's convenient `RecursiveCharacterTextSplitter` to split up long reviews recursively.\n"
]
},
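+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sketch of what the splitter call looks like (illustrative values; the real helper now lives in imdb_utilities.py):\n",
+    "```python\n",
+    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+    "\n",
+    "text_splitter = RecursiveCharacterTextSplitter(\n",
+    "    chunk_size=511,    # MAX_SEQ_LENGTH - HF_EOS_TOKEN_LENGTH\n",
+    "    chunk_overlap=51,  # ~10% of chunk_size\n",
+    "    length_function=len,\n",
+    ")\n",
+    "chunks = text_splitter.split_text(\"a long movie review ...\")\n",
+    "```\n"
+   ]
+  },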
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "a53595fa",
- "metadata": {},
- "outputs": [],
- "source": [
- "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
- "\n",
- "def recursive_splitter_wrapper(text, chunk_size):\n",
- "\n",
- " # Default chunk overlap is 10% chunk_size.\n",
- " chunk_overlap = np.round(chunk_size * 0.10, 0)\n",
- "\n",
- " # Use langchain's convenient recursive chunking method.\n",
- " text_splitter = RecursiveCharacterTextSplitter(\n",
- " chunk_size=chunk_size,\n",
- " chunk_overlap=chunk_overlap,\n",
- " length_function=len,\n",
- " )\n",
- " chunks: List[str] = text_splitter.split_text(text)\n",
- "\n",
- " # Replace special characters with spaces.\n",
- " chunks = [text.replace(\"
\", \" \") for text in chunks]\n",
- "\n",
- " return chunks\n",
- "\n",
- "# Use recursive splitter to chunk text.\n",
- "def imdb_chunk_text(batch_size, df, chunk_size):\n",
- "\n",
- " batch = df.head(batch_size).copy()\n",
- " print(f\"chunk size: {chunk_size}\")\n",
- " print(f\"original shape: {batch.shape}\")\n",
- " \n",
- " start_time = time.time()\n",
- " # 1. Change primary key type to string.\n",
- " batch[\"movie_index\"] = batch[\"movie_index\"].apply(lambda x: str(x))\n",
- "\n",
- " # 2. Truncate reviews to 512 characters.\n",
- " batch['chunk'] = batch['text'].apply(recursive_splitter_wrapper, chunk_size=chunk_size)\n",
- " # Explode the 'chunk' column to create new rows for each chunk.\n",
- " batch = batch.explode('chunk', ignore_index=True)\n",
- " print(f\"new shape: {batch.shape}\")\n",
- "\n",
- " # 3. Add embeddings as new column in df.\n",
- " review_embeddings = torch.tensor(encoder.encode(batch['chunk']))\n",
- " # Normalize embeddings to unit length.\n",
- " review_embeddings = F.normalize(review_embeddings, p=2, dim=1)\n",
- " # Quick check if embeddings are normalized.\n",
- " norms = np.linalg.norm(review_embeddings, axis=1)\n",
- " assert np.allclose(norms, 1.0, atol=1e-5) == True\n",
- "\n",
- " # 4. Convert embeddings to list of `numpy.ndarray`, each containing `numpy.float32` numbers.\n",
- " converted_values = list(map(np.float32, review_embeddings))\n",
- " batch['vector'] = converted_values\n",
- "\n",
- " # 5. Reorder columns for conveneince, so index first, labels at end.\n",
- " new_order = [\"movie_index\", \"text\", \"chunk\", \"vector\", \"label_int\", \"label\"]\n",
- " batch = batch[new_order]\n",
- "\n",
- " end_time = time.time()\n",
- " print(f\"Chunking + embedding time for {batch_size} docs: {end_time - start_time} sec\")\n",
- "\n",
- " # Inspect the batch of data.\n",
- " display(batch.head())\n",
- " assert len(batch.chunk[0]) <= MAX_SEQ_LENGTH-1\n",
- " assert len(batch.vector[0]) == EMBEDDING_LENGTH\n",
- " print(f\"type embeddings: {type(batch.vector)} of {type(batch.vector[0])}\")\n",
- " print(f\"of numbers: {type(batch.vector[0][0])}\")\n",
- "\n",
- " return batch"
- ]
- },
{
"cell_type": "markdown",
"id": "249e9c74",
@@ -564,7 +513,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 10,
"id": "68917def",
"metadata": {},
"outputs": [
@@ -575,7 +524,7 @@
"chunk size: 511\n",
"original shape: (100, 4)\n",
"new shape: (290, 5)\n",
- "Chunking + embedding time for 100 docs: 8.375767946243286 sec\n"
+ "Chunking + embedding time for 100 docs: 7.053884267807007 sec\n"
]
},
{
@@ -613,7 +562,7 @@
" 80 | \n",
" The whole town of Blackstone is afraid, becaus... | \n",
" The whole town of Blackstone is afraid, becaus... | \n",
- " [-0.075508565, -0.022925325, 0.022277957, 0.03... | \n",
+ " [-0.075508595, -0.022925273, 0.022277843, 0.03... | \n",
" 1 | \n",
" Positive | \n",
" \n",
@@ -622,7 +571,7 @@
" 80 | \n",
" The whole town of Blackstone is afraid, becaus... | \n",
" Mexican bandits (fighting the Gringos that too... | \n",
- " [0.0059213955, 0.0042556957, -0.028471153, 0.0... | \n",
+ " [0.005921386, 0.0042556874, -0.028471047, 0.00... | \n",
" 1 | \n",
" Positive | \n",
" \n",
@@ -631,7 +580,7 @@
" 80 | \n",
" The whole town of Blackstone is afraid, becaus... | \n",
" and definitely everybody is bad to the bone...... | \n",
- " [-0.004301766, -0.03188503, -0.0051136613, -0.... | \n",
+ " [-0.004301741, -0.03188501, -0.005113593, -0.0... | \n",
" 1 | \n",
" Positive | \n",
" \n",
@@ -640,7 +589,7 @@
" 84 | \n",
" This Harold Lloyd short wasn't really much; no... | \n",
" This Harold Lloyd short wasn't really much; no... | \n",
- " [-0.007607854, -0.033714272, -0.0077492087, 0.... | \n",
+ " [-0.0076078353, -0.033714227, -0.0077491296, 0... | \n",
" 0 | \n",
" Negative | \n",
" \n",
@@ -649,7 +598,7 @@
" 84 | \n",
" This Harold Lloyd short wasn't really much; no... | \n",
" part was the last four or five minutes when th... | \n",
- " [0.014139466, -0.04540589, 0.012334436, 0.0192... | \n",
+ " [0.014139436, -0.045405857, 0.012334452, 0.019... | \n",
" 0 | \n",
" Negative | \n",
" \n",
@@ -673,11 +622,11 @@
"4 part was the last four or five minutes when th... \n",
"\n",
" vector label_int label \n",
- "0 [-0.075508565, -0.022925325, 0.022277957, 0.03... 1 Positive \n",
- "1 [0.0059213955, 0.0042556957, -0.028471153, 0.0... 1 Positive \n",
- "2 [-0.004301766, -0.03188503, -0.0051136613, -0.... 1 Positive \n",
- "3 [-0.007607854, -0.033714272, -0.0077492087, 0.... 0 Negative \n",
- "4 [0.014139466, -0.04540589, 0.012334436, 0.0192... 0 Negative "
+ "0 [-0.075508595, -0.022925273, 0.022277843, 0.03... 1 Positive \n",
+ "1 [0.005921386, 0.0042556874, -0.028471047, 0.00... 1 Positive \n",
+ "2 [-0.004301741, -0.03188501, -0.005113593, -0.0... 1 Positive \n",
+ "3 [-0.0076078353, -0.033714227, -0.0077491296, 0... 0 Negative \n",
+ "4 [0.014139436, -0.045405857, 0.012334452, 0.019... 0 Negative "
]
},
"metadata": {},
@@ -695,12 +644,20 @@
"source": [
"## Prepare df for insertion into Milvus index.\n",
"\n",
- "# Use the embedding model parameters to calculate chunk_size and overlap.\n",
+ "# Use the embedding model parameters.\n",
"chunk_size = MAX_SEQ_LENGTH - HF_EOS_TOKEN_LENGTH\n",
+ "chunk_overlap = np.round(chunk_size * 0.10, 0)\n",
"\n",
"# Chunk a batch of data from pandas DataFrame and inspect it.\n",
"BATCH_SIZE = 100\n",
- "batch = imdb_chunk_text(BATCH_SIZE, df, chunk_size)\n",
+ "batch = _utils.imdb_chunk_text(encoder, BATCH_SIZE, df, chunk_size)\n",
+ "\n",
+ "# Inspect the batch of data.\n",
+ "display(batch.head())\n",
+ "assert len(batch.chunk[0]) <= MAX_SEQ_LENGTH-1\n",
+ "assert len(batch.vector[0]) == EMBEDDING_LENGTH\n",
+ "print(f\"type embeddings: {type(batch.vector)} of {type(batch.vector[0])}\")\n",
+ "print(f\"of numbers: {type(batch.vector[0][0])}\")\n",
"\n",
"# Chunking looks good, drop the original text column.\n",
"batch.drop(columns=[\"text\"], inplace=True)"
@@ -720,7 +677,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"id": "1954c96d",
"metadata": {},
"outputs": [],
@@ -740,7 +697,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 12,
"id": "470a93c7",
"metadata": {},
"outputs": [
@@ -751,120 +708,7 @@
"chunk size: 511\n",
"original shape: (100, 4)\n",
"new shape: (290, 5)\n",
- "Chunking + embedding time for 100 docs: 8.245778799057007 sec\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " movie_index | \n",
- " text | \n",
- " chunk | \n",
- " vector | \n",
- " label_int | \n",
- " label | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 80 | \n",
- " The whole town of Blackstone is afraid, becaus... | \n",
- " The whole town of Blackstone is afraid, becaus... | \n",
- " [-0.075508565, -0.022925325, 0.022277957, 0.03... | \n",
- " 1 | \n",
- " Positive | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 80 | \n",
- " The whole town of Blackstone is afraid, becaus... | \n",
- " Mexican bandits (fighting the Gringos that too... | \n",
- " [0.0059213955, 0.0042556957, -0.028471153, 0.0... | \n",
- " 1 | \n",
- " Positive | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 80 | \n",
- " The whole town of Blackstone is afraid, becaus... | \n",
- " and definitely everybody is bad to the bone...... | \n",
- " [-0.004301766, -0.03188503, -0.0051136613, -0.... | \n",
- " 1 | \n",
- " Positive | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 84 | \n",
- " This Harold Lloyd short wasn't really much; no... | \n",
- " This Harold Lloyd short wasn't really much; no... | \n",
- " [-0.007607854, -0.033714272, -0.0077492087, 0.... | \n",
- " 0 | \n",
- " Negative | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 84 | \n",
- " This Harold Lloyd short wasn't really much; no... | \n",
- " part was the last four or five minutes when th... | \n",
- " [0.014139466, -0.04540589, 0.012334436, 0.0192... | \n",
- " 0 | \n",
- " Negative | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " movie_index text \\\n",
- "0 80 The whole town of Blackstone is afraid, becaus... \n",
- "1 80 The whole town of Blackstone is afraid, becaus... \n",
- "2 80 The whole town of Blackstone is afraid, becaus... \n",
- "3 84 This Harold Lloyd short wasn't really much; no... \n",
- "4 84 This Harold Lloyd short wasn't really much; no... \n",
- "\n",
- " chunk \\\n",
- "0 The whole town of Blackstone is afraid, becaus... \n",
- "1 Mexican bandits (fighting the Gringos that too... \n",
- "2 and definitely everybody is bad to the bone...... \n",
- "3 This Harold Lloyd short wasn't really much; no... \n",
- "4 part was the last four or five minutes when th... \n",
- "\n",
- " vector label_int label \n",
- "0 [-0.075508565, -0.022925325, 0.022277957, 0.03... 1 Positive \n",
- "1 [0.0059213955, 0.0042556957, -0.028471153, 0.0... 1 Positive \n",
- "2 [-0.004301766, -0.03188503, -0.0051136613, -0.... 1 Positive \n",
- "3 [-0.007607854, -0.033714272, -0.0077492087, 0.... 0 Negative \n",
- "4 [0.014139466, -0.04540589, 0.012334436, 0.0192... 0 Negative "
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "type embeddings: of \n",
- "of numbers: \n"
+ "Chunking + embedding time for 100 docs: 6.648707866668701 sec\n"
]
}
],
@@ -876,7 +720,7 @@
"\n",
"# Chunk a batch of data from pandas DataFrame and inspect it.\n",
"BATCH_SIZE = 100\n",
- "batch = imdb_chunk_text(BATCH_SIZE, df, chunk_size)\n",
+ "batch = _utils.imdb_chunk_text(encoder, BATCH_SIZE, df, chunk_size)\n",
"\n",
"# Chunking looks good, drop the original text column.\n",
"batch.drop(columns=[\"text\"], inplace=True)"
@@ -889,16 +733,22 @@
"source": [
"## Insert data into Milvus\n",
"\n",
- "Milvus and Milvus Lite support loading pandas dataframes directly.\n",
+ "For each original text chunk, we'll write the quadruplet (`vector, text, source, h1, h2`) into the database.\n",
"\n",
- "Milvus Client, however, requires conerting pandas df into a list of dictionaries first.\n",
+ "\n",
+ "
![](\"../../images/db_insert.png\")
\n",
+ "
\n",
+ "\n",
+ "Milvus and Milvus Lite support loading data from:\n",
+ "- pandas dataframes \n",
+ "- list of dictionaries\n",
"\n",
"π€ TODO: This would be a good place to demonstrate Milvus' scalability by using Ray together with Milvus to run batches in parallel. I'll do this in a future tutorial."
]
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 13,
"id": "b51ff139",
"metadata": {},
"outputs": [
@@ -913,21 +763,14 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|ββββββββββ| 1/1 [00:00<00:00, 29.92it/s]"
+ "100%|ββββββββββ| 1/1 [00:02<00:00, 2.06s/it]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Milvus insert time for 290 vectors: 0.03497195243835449 seconds\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
+ "Milvus insert time for 290 vectors: 2.066472053527832 seconds\n"
]
}
],
@@ -968,7 +811,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 14,
"id": "eb7bc132",
"metadata": {},
"outputs": [],
@@ -1003,7 +846,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 15,
"id": "5e7f41f4",
"metadata": {},
"outputs": [
@@ -1037,7 +880,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 16,
"id": "a6863a32",
"metadata": {},
"outputs": [
@@ -1093,7 +936,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 17,
"id": "2ace8d04",
"metadata": {},
"outputs": [
@@ -1101,7 +944,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Search time: 0.003979921340942383 sec\n",
+ "Search time: 0.07567000389099121 sec\n",
"type: , count: 10\n"
]
}
@@ -1124,7 +967,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 18,
"id": "c5d98e28",
"metadata": {},
"outputs": [
@@ -1132,7 +975,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Search time: 0.0022897720336914062 sec\n",
+ "Milvus search time: 0.055058956146240234 sec\n",
"type: , count: 3\n"
]
}
@@ -1142,6 +985,7 @@
"\n",
"# Return top k results with HNSW index.\n",
"TOP_K = 3\n",
+ "OUTPUT_FIELDS=[\"movie_index\", \"chunk\", \"label\"]\n",
"SEARCH_PARAMS = dict({\n",
" # Re-use index param for num_candidate_nearest_neighbors.\n",
" \"ef\": INDEX_PARAMS['efConstruction']\n",
@@ -1153,16 +997,16 @@
" COLLECTION_NAME,\n",
" data=query_embeddings, \n",
" search_params=SEARCH_PARAMS,\n",
- " output_fields=[\"movie_index\", \"chunk\", \"label\"], \n",
+ " output_fields=OUTPUT_FIELDS, \n",
" limit=TOP_K,\n",
" consistency_level=\"Eventually\",\n",
" )\n",
"\n",
"elapsed_time = time.time() - start_time\n",
- "print(f\"Search time: {elapsed_time} sec\")\n",
+ "print(f\"Milvus search time: {elapsed_time} sec\")\n",
"\n",
"# Inspect search result.\n",
- "print(f\"type: {type(results)}, count: {len(results[0])}\")\n"
+ "print(f\"type: {type(results)}, count: {len(results[0])}\")"
]
},
{
@@ -1177,52 +1021,21 @@
},
{
"cell_type": "code",
- "execution_count": 21,
- "id": "d3dfa33a",
- "metadata": {},
- "outputs": [],
- "source": [
- "## Results returned from MilvusClient are in the form list of lists of dicts.\n",
- "\n",
- "# Get the movie_indexes, review texts, and labels.\n",
- "distances = []\n",
- "texts = []\n",
- "movie_indexes = []\n",
- "labels = []\n",
- "for result in results[0]:\n",
- " distances.append(result['distance'])\n",
- " texts.append(result['entity']['chunk'])\n",
- " movie_indexes.append(result['entity']['movie_index'])\n",
- " labels.append(result['entity']['label'])\n",
- "\n",
- "# Assemble all the results in a zipped list.\n",
- "formatted_results = list(zip(distances, movie_indexes, texts, labels))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
+ "execution_count": null,
"id": "22d65363",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0: 0.541, 56, Negative, Dr. K(David H Hickey)has been trying to master a formula that would end all disease and handicaps, b\n",
- "1: 0.54, 44, Positive, is not a horror movie, although it does contain some violent scenes, but is rather a comedy. A satir\n",
- "2: 0.535, 67, Positive, a good movie with a real good story. The fact that there are so many other big stars who all also ha\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "# Print the results.\n",
- "# k: distance, movie_index, label, review text\n",
+ "# Assemble just the top retrieved 1st context and context metadata.\n",
+ "results = _utils.client_assemble_retrieved_context(results)\n",
+ "# formatted_results = _utils.client_assemble_retrieved_context(results)\n",
+ "# print(f\"Length context: {len(context[0])}\")\n",
"\n",
- "i = 0\n",
- "for row in formatted_results:\n",
- " print(f\"{i}: {np.round(row[0],3)}, {row[1]}, {row[3]}, {row[2][:100]}\")\n",
- " i += 1\n",
+ "# TODO - remove printing before saving in github.\n",
+ "# Loop throught each formatted result and print.\n",
+ "print(f\"#i: (distance, movie_id, label, chunk_text)\")\n",
+ "for i, result in enumerate(results):\n",
+ " print(f\"#{i}: {result}\")\n",
"\n",
"#1: 2006, Serum, \n",
"# 0: 0.541, 931, Negative, Dr. K(David H Hickey)has been trying to master a formula that would end all disease and handicaps, b\n",
@@ -1246,39 +1059,29 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": null,
"id": "922073f2",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0: 0.561, 67, Positive, a good movie with a real good story. The fact that there are so many other big stars who all also ha\n",
- "1: 0.56, 13, Positive, the stories but helps Malkovich to provoke some thought. I'd say it is worth seeing and the best of \n",
- "2: 0.549, 12, Positive, the mini-bio on Woody Strode here as a primer: http://imdb.com/name/nm0834754/bio The film does a g\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "# Take as input a user question and conduct semantic vector search using the question.\n",
+ "# # Take as input a user question and conduct semantic vector search using the question.\n",
"question = \"I'm a medical doctor, what movie should I watch?\"\n",
- "new_question = \"I'm a medical doctor, suggest only good movies to watch?\"\n",
- "new_results = \\\n",
- " imdb_utilities.mc_search_imdb([new_question],\n",
- " encoder,\n",
- " mc,\n",
- " SEARCH_PARAMS, 3, \n",
- " milvus_client=True,\n",
- " COLLECTION_NAME=COLLECTION_NAME,\n",
- " )\n",
- "\n",
- "# Print the results.\n",
- "# k: distance, movie_index, label, review text\n",
- "i = 0\n",
- "for row in new_results:\n",
- " print(f\"{i}: {np.round(row[0],3)}, {row[1]}, {row[3]}, {row[2][:100]}\")\n",
- " i += 1\n",
+ "new_question = \"I'm a medical doctor, suggest only positive movies to watch?\"\n",
+ "print(f\"Question: {new_question}\")\n",
+ "\n",
+ "# Run the search and time it.\n",
+ "start_time = time.time()\n",
+ "new_results = _utils.mc_client_search_imdb(\n",
+ " [new_question], encoder, mc, SEARCH_PARAMS, OUTPUT_FIELDS, TOP_K)\n",
+ " \n",
+ "elapsed_time = time.time() - start_time\n",
+ "print(f\"Milvus search time: {elapsed_time} sec\")\n",
+ "\n",
+ "# TODO - remove printing before saving in github.\n",
+ "# Loop throught each formatted result and print.\n",
+ "print(f\"#i: (distance, movie_id, label, chunk_text)\")\n",
+ "for i, result in enumerate(new_results):\n",
+ " print(f\"#{i}: {result}\")\n",
"\n",
"# As expected, new_question answers are slightly different!\n",
"# 0: 0.562, 45719, Positive, the stories but helps Malkovich to provoke some thought.
I'd say it is worth seeing and t\n",
@@ -1288,19 +1091,18 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
"id": "d0e81e68",
"metadata": {},
"outputs": [],
"source": [
- "# Shut down and cleanup the milvus server.\n",
- "default_server.stop()\n",
- "default_server.cleanup()"
+ "# Drop collection\n",
+ "utility.drop_collection(COLLECTION_NAME)"
]
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 22,
"id": "c777937e",
"metadata": {},
"outputs": [
@@ -1311,16 +1113,16 @@
"Author: Christy Bergman\n",
"\n",
"Python implementation: CPython\n",
- "Python version : 3.10.12\n",
- "IPython version : 8.15.0\n",
+ "Python version : 3.11.6\n",
+ "IPython version : 8.18.1\n",
"\n",
- "torch : 2.0.1\n",
- "transformers: 4.34.1\n",
+ "torch : 2.1.1\n",
+ "transformers: 4.35.2\n",
"milvus : 2.3.3\n",
- "pymilvus : 2.3.3\n",
+ "pymilvus : 2.3.4\n",
"langchain : 0.0.322\n",
"\n",
- "conda environment: py310\n",
+ "conda environment: py311\n",
"\n"
]
}
@@ -1358,7 +1160,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.12"
+ "version": "3.11.6"
}
},
"nbformat": 4,