Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 25, 2023
1 parent f5da0cc commit 3cba4c7
Show file tree
Hide file tree
Showing 10 changed files with 74 additions and 29 deletions.
3 changes: 2 additions & 1 deletion provider/docs/src/corpus/corpus.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ export interface Document {

export interface CorpusSearchResult {
docID: DocID
chunk: ChunkIndex
score: number
excerpt: string
}
Expand All @@ -34,7 +35,7 @@ export interface StoredDocument {
/**
* Index of a {@link Chunk} in a {@link StoredDocument}.
*/
export type ChunkIndex = 0
export type ChunkIndex = number

export function createStoredCorpus(docs: Document[]): StoredCorpus {
const storage: StoredCorpus = { docs: [] }
Expand Down
2 changes: 1 addition & 1 deletion provider/docs/src/corpus/search/embeddings.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ describe('embeddingsSearch', () => {
test('finds matches', async () => {
expect(await embeddingsSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<
CorpusSearchResult[]
>([{ docID: 2, score: 1, excerpt: 'b' }])
>([{ docID: 2, chunk: 0, score: 1, excerpt: 'b' }])
})
})

Expand Down
4 changes: 2 additions & 2 deletions provider/docs/src/corpus/search/embeddings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ export async function embeddingsSearch(storage: StoredCorpus, query: string): Pr

const results: CorpusSearchResult[] = []
for (const { doc, chunks } of storage.docs) {
for (const chunk of chunks) {
for (const [i, chunk] of chunks.entries()) {
const chunkVec = await embedText(chunk.text)
const score = cos_sim(queryVec, chunkVec)
results.push({ docID: doc.docID, score, excerpt: chunk.text })
results.push({ docID: doc.docID, chunk: i, score, excerpt: chunk.text })
}
}

Expand Down
15 changes: 13 additions & 2 deletions provider/docs/src/corpus/search/keyword.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,22 @@ import { describe, expect, test } from 'vitest'
import { CorpusSearchResult, createStoredCorpus } from '../corpus'
import { doc } from '../corpus.test'
import { keywordSearch } from './keyword'
import { calculateTFIDF } from './tfidf'

describe('keywordSearch', () => {
test('finds matches', () => {
expect(keywordSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<CorpusSearchResult[]>([
{ docID: 2, score: 1, excerpt: 'b' },
expect(keywordSearch(createStoredCorpus([doc(1, 'aaa'), doc(2, 'bbb')]), 'bbb')).toEqual<CorpusSearchResult[]>([
{
docID: 2,
chunk: 0,
score: calculateTFIDF({
termOccurrencesInChunk: 1,
chunkTermLength: 1,
totalChunks: 2,
termChunkFrequency: 1,
}),
excerpt: 'bbb',
},
])
})
})
13 changes: 9 additions & 4 deletions provider/docs/src/corpus/search/keyword.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
import { CorpusSearchResult, StoredCorpus } from '../corpus'
import { terms } from './terms'
import { createIndexForTFIDF } from './tfidf'

export function keywordSearch(storage: StoredCorpus, query: string): CorpusSearchResult[] {
const terms = query.split(/\s+/)
const queryTerms = terms(query).filter(term => term.length >= 3)
const tfidf = createIndexForTFIDF(storage)

const results: CorpusSearchResult[] = []
for (const { doc, chunks } of storage.docs) {
for (const chunk of chunks) {
if (terms.some(term => chunk.text.includes(term))) {
results.push({ docID: doc.docID, score: 1, excerpt: chunk.text })
for (const [i, chunk] of chunks.entries()) {
const score = queryTerms.reduce((score, term) => score + tfidf(term, doc.docID, i), 0)
if (score > 0) {
results.push({ docID: doc.docID, chunk: i, score, excerpt: chunk.text })
}
}
}
Expand Down
25 changes: 22 additions & 3 deletions provider/docs/src/corpus/search/multi.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,32 @@
import { CorpusSearchResult, StoredCorpus } from '../corpus'
import { ChunkIndex, CorpusSearchResult, DocID, StoredCorpus } from '../corpus'
import { embeddingsSearch } from './embeddings'
import { keywordSearch } from './keyword'

/**
* Search using multiple search methods.
*/
/**
 * Search using multiple search methods, merging their results.
 *
 * Results that refer to the same (docID, chunk) pair are combined by summing
 * their scores; the excerpt from the first method that reported the chunk is
 * kept.
 *
 * @param storage The corpus to search.
 * @param query The search query.
 * @returns The combined search results (order is by document, then chunk, as
 * encountered — not sorted by score).
 */
export async function multiSearch(storage: StoredCorpus, query: string): Promise<CorpusSearchResult[]> {
    const allResults = (await Promise.all(SEARCH_METHODS.map(method => method(storage, query)))).flat()

    // Sum scores for each chunk, keyed by document then chunk index.
    const combinedResults = new Map<DocID, Map<ChunkIndex, CorpusSearchResult>>()
    for (const result of allResults) {
        let docResults = combinedResults.get(result.docID)
        if (!docResults) {
            docResults = new Map<ChunkIndex, CorpusSearchResult>()
            combinedResults.set(result.docID, docResults)
        }

        const chunkResult = docResults.get(result.chunk) ?? {
            docID: result.docID,
            chunk: result.chunk,
            score: 0,
            excerpt: result.excerpt,
        }
        docResults.set(result.chunk, { ...chunkResult, score: chunkResult.score + result.score })
    }

    return Array.from(combinedResults.values()).flatMap(docResults => Array.from(docResults.values()))
}

const SEARCH_METHODS: ((
Expand Down
8 changes: 8 additions & 0 deletions provider/docs/src/corpus/search/terms.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import { describe, expect, test } from 'vitest'
import { terms } from './terms'

// Covers terms(): lowercasing plus the crude suffix stripper
// ('apples' -> 'apple', 'cooler' -> 'cool', 'stored' -> 'stor';
// short words like 'my'/'are'/'when' pass through unchanged).
describe('terms', () => {
    test('splits, stems, normalizes', () => {
        expect(terms('my apples are cooler when stored')).toEqual(['my', 'apple', 'are', 'cool', 'when', 'stor'])
    })
})
14 changes: 14 additions & 0 deletions provider/docs/src/corpus/search/terms.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/** A single normalized search term. */
export type Term = string

/**
 * All terms in the text, with normalization (lowercasing) and a crude
 * suffix-stripping "stemming" applied.
 *
 * @param text The text to split into terms.
 * @returns The terms in order of appearance. Leading/trailing non-word
 * characters in the input produce empty-string entries (String.split
 * behavior); callers are expected to filter out short terms.
 */
export function terms(text: string): Term[] {
    return (
        text
            .toLowerCase()
            .split(/[^a-zA-Z0-9-_]+/)
            // TODO(sqs): get a real stemmer. Strips one common English
            // suffix via greedy backtracking (so 'apples' -> 'apple' via
            // the 's' branch, not 'appl' via 'es'). The previous alternation
            // listed 'ed' and 'ing' twice; duplicate alternatives can never
            // match and were removed.
            .map(term => term.replace(/(.*)(?:es|ed|ing|s|er)$/, '$1'))
    )
}
16 changes: 1 addition & 15 deletions provider/docs/src/corpus/search/tfidf.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { ChunkIndex, DocID, StoredCorpus } from '../corpus'
import { Term, terms } from './terms'

/**
* TF-IDF is a way of measuring the relevance of a term to a document in a corpus. See
Expand Down Expand Up @@ -115,18 +116,3 @@ export function calculateTFIDF({
}): number {
return (termOccurrencesInChunk / chunkTermLength) * Math.log((1 + totalChunks) / (1 + termChunkFrequency))
}

type Term = string

/**
* All terms in the text, with normalization and stemming applied.
*/
function terms(text: string): Term[] {
return (
text
.toLowerCase()
.split(/[^a-zA-Z0-9-_]+/)
// TODO(sqs): get a real stemmer
.map(term => term.replace(/(.*)(?:es|ed|ing|s|ed|ing)$/, '$1'))
)
}
3 changes: 2 additions & 1 deletion provider/docs/src/e2e.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ describe('e2e', () => {
expect(results).toEqual<CorpusSearchResult[]>([
{
docID: 1,
chunk: 3,
excerpt: 'Audio URL parsing\n\nTo parse an audio URL, use the `parseAudioURL` function.',
score: 0.685,
score: 0.755,
},
])
})
Expand Down

0 comments on commit 3cba4c7

Please sign in to comment.