Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 25, 2023
1 parent f5da0cc commit 3cba4c7
Show file tree
Hide file tree
Showing 10 changed files with 74 additions and 29 deletions.
3 changes: 2 additions & 1 deletion provider/docs/src/corpus/corpus.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ export interface Document {

export interface CorpusSearchResult {
docID: DocID
chunk: ChunkIndex
score: number
excerpt: string
}
Expand All @@ -34,7 +35,7 @@ export interface StoredDocument {
/**
* Index of a {@link Chunk} in a {@link StoredDocument}.
*/
export type ChunkIndex = 0
export type ChunkIndex = number

export function createStoredCorpus(docs: Document[]): StoredCorpus {
const storage: StoredCorpus = { docs: [] }
Expand Down
2 changes: 1 addition & 1 deletion provider/docs/src/corpus/search/embeddings.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ describe('embeddingsSearch', () => {
test('finds matches', async () => {
expect(await embeddingsSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<
CorpusSearchResult[]
>([{ docID: 2, score: 1, excerpt: 'b' }])
>([{ docID: 2, chunk: 0, score: 1, excerpt: 'b' }])
})
})

Expand Down
4 changes: 2 additions & 2 deletions provider/docs/src/corpus/search/embeddings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ export async function embeddingsSearch(storage: StoredCorpus, query: string): Pr

const results: CorpusSearchResult[] = []
for (const { doc, chunks } of storage.docs) {
for (const chunk of chunks) {
for (const [i, chunk] of chunks.entries()) {
const chunkVec = await embedText(chunk.text)
const score = cos_sim(queryVec, chunkVec)
results.push({ docID: doc.docID, score, excerpt: chunk.text })
results.push({ docID: doc.docID, chunk: i, score, excerpt: chunk.text })
}
}

Expand Down
15 changes: 13 additions & 2 deletions provider/docs/src/corpus/search/keyword.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,22 @@ import { describe, expect, test } from 'vitest'
import { CorpusSearchResult, createStoredCorpus } from '../corpus'
import { doc } from '../corpus.test'
import { keywordSearch } from './keyword'
import { calculateTFIDF } from './tfidf'

describe('keywordSearch', () => {
test('finds matches', () => {
expect(keywordSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<CorpusSearchResult[]>([
{ docID: 2, score: 1, excerpt: 'b' },
expect(keywordSearch(createStoredCorpus([doc(1, 'aaa'), doc(2, 'bbb')]), 'bbb')).toEqual<CorpusSearchResult[]>([
{
docID: 2,
chunk: 0,
score: calculateTFIDF({
termOccurrencesInChunk: 1,
chunkTermLength: 1,
totalChunks: 2,
termChunkFrequency: 1,
}),
excerpt: 'bbb',
},
])
})
})
13 changes: 9 additions & 4 deletions provider/docs/src/corpus/search/keyword.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
import { CorpusSearchResult, StoredCorpus } from '../corpus'
import { terms } from './terms'
import { createIndexForTFIDF } from './tfidf'

export function keywordSearch(storage: StoredCorpus, query: string): CorpusSearchResult[] {
const terms = query.split(/\s+/)
const queryTerms = terms(query).filter(term => term.length >= 3)
const tfidf = createIndexForTFIDF(storage)

const results: CorpusSearchResult[] = []
for (const { doc, chunks } of storage.docs) {
for (const chunk of chunks) {
if (terms.some(term => chunk.text.includes(term))) {
results.push({ docID: doc.docID, score: 1, excerpt: chunk.text })
for (const [i, chunk] of chunks.entries()) {
const score = queryTerms.reduce((score, term) => score + tfidf(term, doc.docID, i), 0)
if (score > 0) {
results.push({ docID: doc.docID, chunk: i, score, excerpt: chunk.text })
}
}
}
Expand Down
25 changes: 22 additions & 3 deletions provider/docs/src/corpus/search/multi.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,32 @@
import { CorpusSearchResult, StoredCorpus } from '../corpus'
import { ChunkIndex, CorpusSearchResult, DocID, StoredCorpus } from '../corpus'
import { embeddingsSearch } from './embeddings'
import { keywordSearch } from './keyword'

/**
* Search using multiple search methods.
*/
/**
 * Search using multiple search methods, merging their results.
 *
 * Results that refer to the same (docID, chunk) pair are combined by summing
 * their scores; the excerpt from the first method that reported the chunk is
 * kept.
 *
 * @param storage The corpus to search.
 * @param query The search query.
 * @returns The combined search results (order is by document, then chunk, as
 * encountered — not sorted by score).
 */
export async function multiSearch(storage: StoredCorpus, query: string): Promise<CorpusSearchResult[]> {
    const allResults = (await Promise.all(SEARCH_METHODS.map(method => method(storage, query)))).flat()

    // Sum scores for each chunk, keyed by document then chunk index.
    const combinedResults = new Map<DocID, Map<ChunkIndex, CorpusSearchResult>>()
    for (const result of allResults) {
        let docResults = combinedResults.get(result.docID)
        if (!docResults) {
            docResults = new Map<ChunkIndex, CorpusSearchResult>()
            combinedResults.set(result.docID, docResults)
        }

        const chunkResult = docResults.get(result.chunk) ?? {
            docID: result.docID,
            chunk: result.chunk,
            score: 0,
            excerpt: result.excerpt,
        }
        docResults.set(result.chunk, { ...chunkResult, score: chunkResult.score + result.score })
    }

    return Array.from(combinedResults.values()).flatMap(docResults => Array.from(docResults.values()))
}

const SEARCH_METHODS: ((
Expand Down
8 changes: 8 additions & 0 deletions provider/docs/src/corpus/search/terms.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import { describe, expect, test } from 'vitest'
import { terms } from './terms'

// Covers terms(): lowercasing plus the crude suffix stripper
// ('apples' -> 'apple', 'cooler' -> 'cool', 'stored' -> 'stor';
// short words like 'my'/'are'/'when' pass through unchanged).
describe('terms', () => {
    test('splits, stems, normalizes', () => {
        expect(terms('my apples are cooler when stored')).toEqual(['my', 'apple', 'are', 'cool', 'when', 'stor'])
    })
})
14 changes: 14 additions & 0 deletions provider/docs/src/corpus/search/terms.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/** A single normalized search term. */
export type Term = string

/**
 * All terms in the text, with normalization (lowercasing) and a crude
 * suffix-stripping "stemming" applied.
 *
 * @param text The text to split into terms.
 * @returns The terms in order of appearance. Leading/trailing non-word
 * characters in the input produce empty-string entries (String.split
 * behavior); callers are expected to filter out short terms.
 */
export function terms(text: string): Term[] {
    return (
        text
            .toLowerCase()
            .split(/[^a-zA-Z0-9-_]+/)
            // TODO(sqs): get a real stemmer. Strips one common English
            // suffix via greedy backtracking (so 'apples' -> 'apple' via
            // the 's' branch, not 'appl' via 'es'). The previous alternation
            // listed 'ed' and 'ing' twice; duplicate alternatives can never
            // match and were removed.
            .map(term => term.replace(/(.*)(?:es|ed|ing|s|er)$/, '$1'))
    )
}
16 changes: 1 addition & 15 deletions provider/docs/src/corpus/search/tfidf.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { ChunkIndex, DocID, StoredCorpus } from '../corpus'
import { Term, terms } from './terms'

/**
* TF-IDF is a way of measuring the relevance of a term to a document in a corpus. See
Expand Down Expand Up @@ -115,18 +116,3 @@ export function calculateTFIDF({
}): number {
return (termOccurrencesInChunk / chunkTermLength) * Math.log((1 + totalChunks) / (1 + termChunkFrequency))
}

type Term = string

/**
* All terms in the text, with normalization and stemming applied.
*/
function terms(text: string): Term[] {
return (
text
.toLowerCase()
.split(/[^a-zA-Z0-9-_]+/)
// TODO(sqs): get a real stemmer
.map(term => term.replace(/(.*)(?:es|ed|ing|s|ed|ing)$/, '$1'))
)
}
3 changes: 2 additions & 1 deletion provider/docs/src/e2e.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ describe('e2e', () => {
expect(results).toEqual<CorpusSearchResult[]>([
{
docID: 1,
chunk: 3,
excerpt: 'Audio URL parsing\n\nTo parse an audio URL, use the `parseAudioURL` function.',
score: 0.685,
score: 0.755,
},
])
})
Expand Down

0 comments on commit 3cba4c7

Please sign in to comment.