diff --git a/provider/docs/src/corpus/corpus.test.ts b/provider/docs/src/corpus/corpus.test.ts
index 248204a3..26e07b8e 100644
--- a/provider/docs/src/corpus/corpus.test.ts
+++ b/provider/docs/src/corpus/corpus.test.ts
@@ -1,8 +1,8 @@
 import { describe, expect, test } from 'vitest'
-import { createCorpus, Document } from './corpus'
+import { createCorpus, DocID, Document } from './corpus'
 
-export function doc(docID: string | number, text: string): Document {
-    return { docID: typeof docID === 'string' ? docID : docID.toString(), text }
+export function doc(docID: DocID | number, text: string): Document {
+    return { docID, text }
 }
 
 describe('Corpus', () => {
diff --git a/provider/docs/src/corpus/corpus.ts b/provider/docs/src/corpus/corpus.ts
index 2fcaae22..0416a6ad 100644
--- a/provider/docs/src/corpus/corpus.ts
+++ b/provider/docs/src/corpus/corpus.ts
@@ -9,25 +9,32 @@ export interface Corpus {
     length: number
 }
 
+export type DocID = number
+
 export interface Document {
-    docID: string
+    docID: DocID
     text: string
 }
 
 export interface CorpusSearchResult {
-    docID: string
+    docID: DocID
     score: number
     excerpt: string
 }
 
-interface StoredDocument {
+export interface StoredCorpus {
+    docs: StoredDocument[]
+}
+
+export interface StoredDocument {
     doc: Document
     chunks: Chunk[]
 }
 
-export interface StoredCorpus {
-    docs: StoredDocument[]
-}
+/**
+ * Index of a {@link Chunk} in a {@link StoredDocument}.
+ */
+export type ChunkIndex = number
 
 export function createStoredCorpus(docs: Document[]): StoredCorpus {
     const storage: StoredCorpus = { docs: [] }
diff --git a/provider/docs/src/corpus/search/embeddings.test.ts b/provider/docs/src/corpus/search/embeddings.test.ts
index 5def3423..4d5da472 100644
--- a/provider/docs/src/corpus/search/embeddings.test.ts
+++ b/provider/docs/src/corpus/search/embeddings.test.ts
@@ -7,7 +7,7 @@ describe('embeddingsSearch', () => {
     test('finds matches', async () => {
         expect(await embeddingsSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<
             CorpusSearchResult[]
-        >([{ docID: '2', score: 1, excerpt: 'b' }])
+        >([{ docID: 2, score: 1, excerpt: 'b' }])
     })
 })
 
diff --git a/provider/docs/src/corpus/search/keyword.test.ts b/provider/docs/src/corpus/search/keyword.test.ts
index 794136f9..9d62d8f7 100644
--- a/provider/docs/src/corpus/search/keyword.test.ts
+++ b/provider/docs/src/corpus/search/keyword.test.ts
@@ -6,7 +6,7 @@ import { keywordSearch } from './keyword'
 describe('keywordSearch', () => {
     test('finds matches', () => {
         expect(keywordSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual([
-            { docID: '2', score: 1, excerpt: 'b' },
+            { docID: 2, score: 1, excerpt: 'b' },
         ])
     })
 })
diff --git a/provider/docs/src/corpus/search/keyword.ts b/provider/docs/src/corpus/search/keyword.ts
index ea67739d..dcdb5426 100644
--- a/provider/docs/src/corpus/search/keyword.ts
+++ b/provider/docs/src/corpus/search/keyword.ts
@@ -3,9 +3,11 @@ import { CorpusSearchResult, StoredCorpus } from '../corpus'
 export function keywordSearch(storage: StoredCorpus, query: string): CorpusSearchResult[] {
     const terms = query.split(/\s+/)
     const results: CorpusSearchResult[] = []
-    for (const { doc } of storage.docs) {
-        if (terms.some(term => doc.text.includes(term))) {
-            results.push({ docID: doc.docID, score: 1, excerpt: doc.text })
+    for (const { doc, chunks } of storage.docs) {
+        for (const chunk of chunks) {
+            if (terms.some(term => chunk.text.includes(term))) {
+                results.push({ docID: doc.docID, score: 1, excerpt: chunk.text })
+            }
         }
     }
     return results
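With this change, `keywordSearch` matches and excerpts at the chunk level rather than the whole document. A minimal usage sketch (not part of the diff; it assumes `createStoredCorpus` stores each of these short documents as a single chunk, which is the same assumption the keyword.test.ts expectations above rely on):

```ts
import { createStoredCorpus } from '../corpus'
import { keywordSearch } from './keyword'

// Two tiny documents; assumed to be stored as one chunk each.
const storage = createStoredCorpus([
    { docID: 1, text: 'install the parser' },
    { docID: 2, text: 'parse an audio URL' },
])

// Each matching chunk produces one result, so the excerpt is now the
// chunk's text rather than the full document text.
console.log(keywordSearch(storage, 'audio'))
// => [{ docID: 2, score: 1, excerpt: 'parse an audio URL' }]
```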
diff --git a/provider/docs/src/corpus/search/tfidf.test.ts b/provider/docs/src/corpus/search/tfidf.test.ts
new file mode 100644
index 00000000..ee8d10cc
--- /dev/null
+++ b/provider/docs/src/corpus/search/tfidf.test.ts
@@ -0,0 +1,33 @@
+import { describe, expect, test } from 'vitest'
+import { createStoredCorpus } from '../corpus'
+import { calculateTFIDF, createIndexForTFIDF } from './tfidf'
+
+describe('createIndexForTFIDF', () => {
+    const corpus = createStoredCorpus([
+        { docID: 1, text: `a b c c c` },
+        { docID: 2, text: `b c d` },
+        { docID: 3, text: `c d e` },
+    ])
+    const docIDs = corpus.docs.map(({ doc: { docID } }) => docID)
+    const tfidf = createIndexForTFIDF(corpus)
+
+    test('term in 1 doc', () => {
+        expect(docIDs.map(docID => tfidf('a', docID, 0))).toEqual([
+            calculateTFIDF({ termOccurrencesInChunk: 1, chunkTermLength: 5, totalChunks: 3, termChunkFrequency: 1 }),
+            0,
+            0,
+        ])
+    })
+
+    test('term in all docs', () => {
+        expect(docIDs.map(docID => tfidf('c', docID, 0))).toEqual([
+            calculateTFIDF({ termOccurrencesInChunk: 3, chunkTermLength: 5, totalChunks: 3, termChunkFrequency: 3 }),
+            calculateTFIDF({ termOccurrencesInChunk: 1, chunkTermLength: 3, totalChunks: 3, termChunkFrequency: 3 }),
+            calculateTFIDF({ termOccurrencesInChunk: 1, chunkTermLength: 3, totalChunks: 3, termChunkFrequency: 3 }),
+        ])
+    })
+
+    test('unknown term', () => {
+        expect(docIDs.map(docID => tfidf('x', docID, 0))).toEqual([0, 0, 0])
+    })
+})
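To sanity-check those expectations with concrete numbers (a worked sketch; `calculateTFIDF` is the smoothed formula added in tfidf.ts below, and `Math.log` is the natural logarithm):

```ts
import { calculateTFIDF } from './tfidf'

// 'a' occurs once in doc 1's 5-term chunk ('a b c c c') and in 1 of 3 chunks:
// TF = 1/5 = 0.2, IDF = ln((1 + 3) / (1 + 1)) = ln 2 ≈ 0.693, so TF-IDF ≈ 0.139.
console.log(calculateTFIDF({ termOccurrencesInChunk: 1, chunkTermLength: 5, totalChunks: 3, termChunkFrequency: 1 }))

// 'c' occurs in all 3 chunks, so the smoothed IDF is ln((1 + 3) / (1 + 3)) = 0,
// and every 'c' expectation in the 'term in all docs' test evaluates to 0.
console.log(calculateTFIDF({ termOccurrencesInChunk: 3, chunkTermLength: 5, totalChunks: 3, termChunkFrequency: 3 }))
```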
diff --git a/provider/docs/src/corpus/search/tfidf.ts b/provider/docs/src/corpus/search/tfidf.ts
new file mode 100644
index 00000000..7ca47b70
--- /dev/null
+++ b/provider/docs/src/corpus/search/tfidf.ts
@@ -0,0 +1,132 @@
+import { ChunkIndex, DocID, StoredCorpus } from '../corpus'
+
+/**
+ * TF-IDF is a way of measuring the relevance of a term to a document in a corpus. See
+ * https://en.wikipedia.org/wiki/Tf%E2%80%93idf.
+ *
+ * TF-IDF = TF * IDF
+ * - TF = number of occurrences of term in the chunk / number of (non-unique) terms in the chunk
+ * - IDF = log((1 + number of chunks) / (1 + number of chunks containing the term))
+ */
+export type TFIDF = (term: Term, docID: DocID, chunk: ChunkIndex) => number
+
+/**
+ * Index the corpus for fast computation of TF-IDF. @see {TFIDF}
+ */
+export function createIndexForTFIDF(storage: StoredCorpus): TFIDF {
+    /**
+     * Document -> chunk index -> term -> number of occurrences of term in the chunk.
+     *
+     * "TF" in "TF-IDF" (with chunks instead of documents as the unit of analysis).
+     */
+    const termFrequency = new Map<DocID, Map<Term, number>[]>()
+
+    /**
+     * Document -> chunk index -> number of (non-unique) terms in the chunk.
+     */
+    const termLength = new Map<DocID, number[]>()
+
+    /**
+     * Term -> number of chunks containing the term.
+     *
+     * "DF" in "IDF" in "TF-IDF" (with chunks instead of documents as the unit of analysis).
+     */
+    const chunkFrequency = new Map<Term, number>()
+
+    let totalChunks = 0
+
+    for (const { doc, chunks } of storage.docs) {
+        const docTermFrequency: Map<Term, number>[] = new Array(chunks.length)
+        termFrequency.set(doc.docID, docTermFrequency)
+
+        const docTermLength: number[] = new Array(chunks.length)
+        termLength.set(doc.docID, docTermLength)
+
+        for (const [i, chunk] of chunks.entries()) {
+            const chunkTerms = terms(chunk.text)
+
+            // Set chunk frequencies.
+            for (const uniqueTerm of new Set(chunkTerms).values()) {
+                chunkFrequency.set(uniqueTerm, (chunkFrequency.get(uniqueTerm) ?? 0) + 1)
+            }
+
+            // Set term frequencies.
+            const chunkTermFrequency = new Map<Term, number>()
+            docTermFrequency[i] = chunkTermFrequency
+            for (const term of chunkTerms) {
+                chunkTermFrequency.set(term, (chunkTermFrequency.get(term) ?? 0) + 1)
+            }
+
+            // Set the chunk's term length (number of non-unique terms).
+            docTermLength[i] = chunkTerms.length
+
+            // Increment total chunks.
+            totalChunks++
+        }
+    }
+
+    return (termRaw: string, doc: DocID, chunk: ChunkIndex): number => {
+        const processedTerms = terms(termRaw)
+        if (processedTerms.length !== 1) {
+            throw new Error(`term ${JSON.stringify(termRaw)} is not a single term`)
+        }
+        const term = processedTerms[0]
+
+        const docTermLength = termLength.get(doc)
+        if (!docTermLength) {
+            throw new Error(`doc ${doc} not found in termLength`)
+        }
+        if (typeof docTermLength[chunk] !== 'number') {
+            throw new Error(`chunk ${chunk} not found in termLength for doc ${doc}`)
+        }
+
+        const docTermFrequency = termFrequency.get(doc)
+        if (!docTermFrequency) {
+            throw new Error(`doc ${doc} not found in termFrequency`)
+        }
+        if (!(docTermFrequency[chunk] instanceof Map)) {
+            throw new Error(`chunk ${chunk} not found in termFrequency for doc ${doc}`)
+        }
+
+        return calculateTFIDF({
+            termOccurrencesInChunk: docTermFrequency[chunk].get(term) ?? 0,
+            chunkTermLength: docTermLength[chunk],
+            totalChunks,
+            termChunkFrequency: chunkFrequency.get(term) ?? 0,
+        })
+    }
+}
+
+/**
+ * Calculate TF-IDF given the formula inputs. @see {TFIDF}
+ *
+ * Use {@link createIndexForTFIDF} instead of calling this directly.
+ */
+export function calculateTFIDF({
+    termOccurrencesInChunk,
+    chunkTermLength,
+    totalChunks,
+    termChunkFrequency,
+}: {
+    termOccurrencesInChunk: number
+    chunkTermLength: number
+    totalChunks: number
+    termChunkFrequency: number
+}): number {
+    return (termOccurrencesInChunk / chunkTermLength) * Math.log((1 + totalChunks) / (1 + termChunkFrequency))
+}
+
+type Term = string
+
+/**
+ * All terms in the text, with normalization and (naive) stemming applied.
+ */
+function terms(text: string): Term[] {
+    return (
+        text
+            .toLowerCase()
+            .split(/[^a-zA-Z0-9-_]+/)
+            // TODO(sqs): get a real stemmer
+            .map(term => term.replace(/(.*)(?:es|ed|ing|s)$/, '$1'))
+    )
+}
diff --git a/provider/docs/src/e2e.test.ts b/provider/docs/src/e2e.test.ts
index b075cd16..89473ccc 100644
--- a/provider/docs/src/e2e.test.ts
+++ b/provider/docs/src/e2e.test.ts
@@ -8,12 +8,12 @@ describe('e2e', () => {
         const docFile = await fs.readFile(path.join(__dirname, 'testdata/corpus/urlParsing.md'), 'utf8')
         const codeFile = await fs.readFile(path.join(__dirname, 'testdata/code/urlParsing.ts'), 'utf8')
 
-        const corpus = createCorpus([{ docID: '1', text: docFile }])
+        const corpus = createCorpus([{ docID: 1, text: docFile }])
         const results = await corpus.search(codeFile, false)
         roundScores(results)
         expect(results).toEqual([
            {
-                docID: '1',
+                docID: 1,
                excerpt: 'Audio URL parsing\n\nTo parse an audio URL, use the `parseAudioURL` function.',
                score: 0.685,
            },
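Putting the pieces together, an end-to-end sketch of the new TF-IDF index (again assuming each short document is stored as a single chunk, at chunk index 0; the import paths are illustrative):

```ts
import { createStoredCorpus } from './corpus/corpus'
import { createIndexForTFIDF } from './corpus/search/tfidf'

const storage = createStoredCorpus([
    { docID: 1, text: 'a b c c c' },
    { docID: 2, text: 'b c d' },
])
const tfidf = createIndexForTFIDF(storage)

// 'a' appears once in doc 1's 5-term chunk and in 1 of 2 chunks:
// TF = 1/5, IDF = ln((1 + 2) / (1 + 1)) ≈ 0.405, so TF-IDF ≈ 0.081.
console.log(tfidf('a', 1, 0))

// 'b' appears in both chunks, so its smoothed IDF — and its score — is 0.
console.log(tfidf('b', 1, 0))
```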