From 37f1b8f236cdf7600863db61fd493c6e8d790b2b Mon Sep 17 00:00:00 2001 From: Igor Shapiro Date: Sat, 8 Apr 2023 21:03:26 +0300 Subject: [PATCH] feat(opensearch): implement filtering by metadata attributes + integration test --- examples/package.json | 2 +- langchain/package.json | 2 +- langchain/src/vectorstores/opensearch.ts | 41 ++++++++++++- .../vectorstores/tests/opensearch.int.test.ts | 42 ++++++++++++++ yarn.lock | 58 +++++++------------ 5 files changed, 104 insertions(+), 41 deletions(-) create mode 100644 langchain/src/vectorstores/tests/opensearch.int.test.ts diff --git a/examples/package.json b/examples/package.json index 38245cd8a4f5..8267513610e3 100644 --- a/examples/package.json +++ b/examples/package.json @@ -53,4 +53,4 @@ "tsx": "^3.12.3", "typescript": "^4.9.5" } -} \ No newline at end of file +} diff --git a/langchain/package.json b/langchain/package.json index 3d082de059c8..72135369d207 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -803,4 +803,4 @@ }, "./package.json": "./package.json" } -} \ No newline at end of file +} diff --git a/langchain/src/vectorstores/opensearch.ts b/langchain/src/vectorstores/opensearch.ts index 14eb25df8149..dbbe3b2b09bb 100644 --- a/langchain/src/vectorstores/opensearch.ts +++ b/langchain/src/vectorstores/opensearch.ts @@ -1,3 +1,4 @@ +/* eslint-disable no-instanceof/no-instanceof */ import { Embeddings } from "embeddings/base.js"; import { Client, RequestParams, errors } from "@opensearch-project/opensearch"; import { v4 as uuid } from "uuid"; @@ -87,14 +88,21 @@ export class OpenSearchVectorStore extends VectorStore { async similaritySearchVectorWithScore( query: number[], k: number, - _filter?: object | undefined + filter?: object | undefined ): Promise<[Document, number][]> { const search: RequestParams.Search = { index: this.indexName, body: { query: { - knn: { - embedding: { vector: query, k }, + bool: { + filter: { bool: { must: this.buildMetadataTerms(filter) } }, + must: [ + { + knn: { + embedding: { vector: query, k }, + }, + }, + ], }, }, size: k, @@ -154,6 +162,15 @@ export class OpenSearchVectorStore extends VectorStore { }, }, mappings: { + dynamic_templates: [ + { + // map all metadata properties to be keyword + "metadata.*": { + match_mapping_type: "*", + mapping: { type: "keyword" }, + }, + }, + ], properties: { text: { type: "text" }, metadata: { type: "object" }, @@ -177,6 +194,17 @@ export class OpenSearchVectorStore extends VectorStore { await this.client.indices.create({ index: this.indexName, body }); } + private buildMetadataTerms( + filter?: object + ): { term: Record }[] { + if (filter == null) return []; + const result = []; + for (const [key, value] of Object.entries(filter)) { + result.push({ term: { [`metadata.${key}`]: value } }); + } + return result; + } + async doesIndexExist(): Promise { try { await this.client.cat.indices({ index: this.indexName }); @@ -188,4 +216,11 @@ export class OpenSearchVectorStore extends VectorStore { throw err; } } + + async deleteIfExists(): Promise { + const indexExists = await this.doesIndexExist(); + if (!indexExists) return; + + await this.client.indices.delete({ index: this.indexName }); + } } diff --git a/langchain/src/vectorstores/tests/opensearch.int.test.ts b/langchain/src/vectorstores/tests/opensearch.int.test.ts new file mode 100644 index 000000000000..5fc311ff66db --- /dev/null +++ b/langchain/src/vectorstores/tests/opensearch.int.test.ts @@ -0,0 +1,42 @@ +/* eslint-disable no-process-env */ +import { test, expect } from "@jest/globals"; +import { Client } from "@opensearch-project/opensearch"; +import { OpenAIEmbeddings } from "../../embeddings/index.js"; +import { OpenSearchVectorStore } from "../opensearch.js"; +import { Document } from "../../document.js"; + +test("OpenSearchVectorStore integration", async () => { + const client = new Client({ + nodes: [process.env.OPENSEARCH_URL!], + }); + + const indexName = "test_index"; + + const embeddings = new OpenAIEmbeddings(undefined, { + baseOptions: { temperature: 0 }, + }); + const store = new OpenSearchVectorStore(embeddings, { client, indexName }); + await store.deleteIfExists(); + + expect(store).toBeDefined(); + + await store.addDocuments([ + { pageContent: "hello", metadata: { a: 2 } }, + { pageContent: "car", metadata: { a: 1 } }, + { pageContent: "adjective", metadata: { a: 1 } }, + { pageContent: "hi", metadata: { a: 1 } }, + ]); + + const results1 = await store.similaritySearch("hello!", 1); + + expect(results1).toHaveLength(1); + expect(results1).toEqual([ + new Document({ metadata: { a: 2 }, pageContent: "hello" }), + ]); + + const results2 = await store.similaritySearchWithScore("hello!", 1, { + a: 1, + }); + + expect(results2).toHaveLength(1); +}); diff --git a/yarn.lock b/yarn.lock index 85ba65d9ace5..338908992720 100644 --- a/yarn.lock +++ b/yarn.lock @@ -12950,6 +12950,7 @@ __metadata: resolution: "examples@workspace:examples" dependencies: "@getmetal/metal-sdk": ^1.0.12 + "@opensearch-project/opensearch": ^2.2.0 "@pinecone-database/pinecone": ^0.0.12 "@prisma/client": ^4.11.0 "@supabase/supabase-js": ^2.10.0 @@ -14437,7 +14438,25 @@ __metadata: languageName: node linkType: hard -"html-entities@npm:^2.3.2": +"html-encoding-sniffer@npm:^2.0.1": + version: 2.0.1 + resolution: "html-encoding-sniffer@npm:2.0.1" + dependencies: + whatwg-encoding: ^1.0.5 + checksum: bf30cce461015ed7e365736fcd6a3063c7bc016a91f74398ef6158886970a96333938f7c02417ab3c12aa82e3e53b40822145facccb9ddfbcdc15a879ae4d7ba + languageName: node + linkType: hard + +"html-encoding-sniffer@npm:^3.0.0": + version: 3.0.0 + resolution: "html-encoding-sniffer@npm:3.0.0" + dependencies: + whatwg-encoding: ^2.0.0 + checksum: 8d806aa00487e279e5ccb573366a951a9f68f65c90298eac9c3a2b440a7ffe46615aff2995a2f61c6746c639234e6179a97e18ca5ccbbf93d3725ef2099a4502 + languageName: node + linkType: hard + +"html-entities@npm:^2.1.0, html-entities@npm:^2.3.2": version: 2.3.3 resolution: "html-entities@npm:2.3.3" checksum: 92521501da8aa5f66fee27f0f022d6e9ceae62667dae93aa6a2f636afa71ad530b7fb24a18d4d6c124c9885970cac5f8a52dbf1731741161002816ae43f98196 @@ -17026,39 +17045,6 @@ __metadata: languageName: node linkType: hard -"langchain-examples@workspace:examples": - version: 0.0.0-use.local - resolution: "langchain-examples@workspace:examples" - dependencies: - "@dqbd/tiktoken": ^1.0.2 - "@getmetal/metal-sdk": ^1.0.12 - "@opensearch-project/opensearch": ^2.2.0 - "@pinecone-database/pinecone": ^0.0.10 - "@prisma/client": ^4.11.0 - "@supabase/supabase-js": ^2.10.0 - "@tsconfig/recommended": ^1.0.2 - "@types/js-yaml": ^4 - "@typescript-eslint/eslint-plugin": ^5.51.0 - "@typescript-eslint/parser": ^5.51.0 - chromadb: ^1.3.0 - dotenv: ^16.0.3 - eslint: ^8.33.0 - eslint-config-airbnb-base: ^15.0.0 - eslint-config-prettier: ^8.6.0 - eslint-plugin-import: ^2.27.5 - eslint-plugin-prettier: ^4.2.1 - js-yaml: ^4.1.0 - langchain: "workspace:*" - prettier: ^2.8.3 - prisma: ^4.11.0 - sqlite3: ^5.1.4 - tsx: ^3.12.3 - typeorm: ^0.3.12 - typescript: ^4.9.5 - zod: ^3.21.4 - languageName: unknown - linkType: soft - "langchain@workspace:*, langchain@workspace:langchain": version: 0.0.0-use.local resolution: "langchain@workspace:langchain" @@ -17071,7 +17057,7 @@ __metadata: "@huggingface/inference": ^1.5.1 "@jest/globals": ^29.5.0 "@opensearch-project/opensearch": ^2.2.0 - "@pinecone-database/pinecone": ^0.0.10 + "@pinecone-database/pinecone": ^0.0.12 "@supabase/supabase-js": ^2.10.0 "@tsconfig/recommended": ^1.0.2 "@types/d3-dsv": ^2 @@ -17131,7 +17117,7 @@ __metadata: "@getmetal/metal-sdk": "*" "@huggingface/inference": ^1.5.1 "@opensearch-project/opensearch": "*" - "@pinecone-database/pinecone": ^0.0.10 + "@pinecone-database/pinecone": "*" "@supabase/supabase-js": ^2.10.0 "@zilliz/milvus2-sdk-node": ^2.2.0 cheerio: ^1.0.0-rc.12