Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 25, 2023
1 parent a265d76 commit f5da0cc
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 16 deletions.
6 changes: 3 additions & 3 deletions provider/docs/src/corpus/corpus.test.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { describe, expect, test } from 'vitest'
import { createCorpus, Document } from './corpus'
import { createCorpus, DocID, Document } from './corpus'

export function doc(docID: string | number, text: string): Document {
return { docID: typeof docID === 'string' ? docID : docID.toString(), text }
export function doc(docID: DocID | number, text: string): Document {
return { docID, text }
}

describe('Corpus', () => {
Expand Down
19 changes: 13 additions & 6 deletions provider/docs/src/corpus/corpus.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,32 @@ export interface Corpus {
length: number
}

export type DocID = number

export interface Document {
docID: string
docID: DocID
text: string
}

export interface CorpusSearchResult {
docID: string
docID: DocID
score: number
excerpt: string
}

interface StoredDocument {
export interface StoredCorpus {
docs: StoredDocument[]
}

export interface StoredDocument {
doc: Document
chunks: Chunk[]
}

export interface StoredCorpus {
docs: StoredDocument[]
}
/**
 * Index of a {@link Chunk} in a {@link StoredDocument}.
 */
// Was the literal type `0`, which only permits chunk index 0 and makes every other
// chunk index a compile error; a chunk index is any non-negative integer.
export type ChunkIndex = number

export function createStoredCorpus(docs: Document[]): StoredCorpus {
const storage: StoredCorpus = { docs: [] }
Expand Down
2 changes: 1 addition & 1 deletion provider/docs/src/corpus/search/embeddings.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ describe('embeddingsSearch', () => {
test('finds matches', async () => {
expect(await embeddingsSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<
CorpusSearchResult[]
>([{ docID: '2', score: 1, excerpt: 'b' }])
>([{ docID: 2, score: 1, excerpt: 'b' }])
})
})

Expand Down
2 changes: 1 addition & 1 deletion provider/docs/src/corpus/search/keyword.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { keywordSearch } from './keyword'
describe('keywordSearch', () => {
test('finds matches', () => {
expect(keywordSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<CorpusSearchResult[]>([
{ docID: '2', score: 1, excerpt: 'b' },
{ docID: 2, score: 1, excerpt: 'b' },
])
})
})
8 changes: 5 additions & 3 deletions provider/docs/src/corpus/search/keyword.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ import { CorpusSearchResult, StoredCorpus } from '../corpus'
export function keywordSearch(storage: StoredCorpus, query: string): CorpusSearchResult[] {
const terms = query.split(/\s+/)
const results: CorpusSearchResult[] = []
for (const { doc } of storage.docs) {
if (terms.some(term => doc.text.includes(term))) {
results.push({ docID: doc.docID, score: 1, excerpt: doc.text })
for (const { doc, chunks } of storage.docs) {
for (const chunk of chunks) {
if (terms.some(term => chunk.text.includes(term))) {
results.push({ docID: doc.docID, score: 1, excerpt: chunk.text })
}
}
}
return results
Expand Down
33 changes: 33 additions & 0 deletions provider/docs/src/corpus/search/tfidf.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import { describe, expect, test } from 'vitest'
import { createStoredCorpus } from '../corpus'
import { calculateTFIDF, createIndexForTFIDF } from './tfidf'

describe('createIndexForTFIDF', () => {
    // Fixture: term 'a' appears only in doc 1, 'c' appears in every doc, 'x' in none.
    const corpus = createStoredCorpus([
        { docID: 1, text: `a b c c c` },
        { docID: 2, text: `b c d` },
        { docID: 3, text: `c d e` },
    ])
    const docIDs = corpus.docs.map(({ doc: { docID } }) => docID)
    const tfidf = createIndexForTFIDF(corpus)

    // Was `test.only`, a leftover focus marker that made vitest skip the other
    // two tests in this file whenever it ran.
    test('term in 1 doc', () => {
        expect(docIDs.map(docID => tfidf('a', docID, 0))).toEqual([
            calculateTFIDF({ termOccurrencesInChunk: 1, chunkTermLength: 5, totalChunks: 3, termChunkFrequency: 1 }),
            0,
            0,
        ])
    })

    test('term in all docs', () => {
        expect(docIDs.map(docID => tfidf('c', docID, 0))).toEqual([
            calculateTFIDF({ termOccurrencesInChunk: 3, chunkTermLength: 5, totalChunks: 3, termChunkFrequency: 3 }),
            calculateTFIDF({ termOccurrencesInChunk: 1, chunkTermLength: 3, totalChunks: 3, termChunkFrequency: 3 }),
            calculateTFIDF({ termOccurrencesInChunk: 1, chunkTermLength: 3, totalChunks: 3, termChunkFrequency: 3 }),
        ])
    })

    test('unknown term', () => {
        // A term absent from the corpus has TF = 0 everywhere, so TF-IDF is 0.
        expect(docIDs.map(docID => tfidf('x', docID, 0))).toEqual([0, 0, 0])
    })
})
132 changes: 132 additions & 0 deletions provider/docs/src/corpus/search/tfidf.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import { ChunkIndex, DocID, StoredCorpus } from '../corpus'

/**
* TF-IDF is a way of measuring the relevance of a term to a document in a corpus. See
* https://en.wikipedia.org/wiki/Tf%E2%80%93idf.
*
* TF-IDF = TF * IDF
* - TF = number of occurrences of term in the chunk / number of (non-unique) terms in the chunk
* - IDF = log(number of chunks / number of chunks containing the term)
*/
export type TFIDF = (term: Term, docID: DocID, chunk: ChunkIndex) => number

/**
 * Index the corpus for fast computation of TF-IDF. @see {TFIDF}
 */
export function createIndexForTFIDF(storage: StoredCorpus): TFIDF {
    /**
     * Document -> chunk index -> term -> number of occurrences of the term in that chunk.
     *
     * "TF" in "TF-IDF" (with chunks instead of documents as the unit of analysis).
     */
    const termFrequency = new Map<DocID, Map<Term, number>[]>()

    /**
     * Document -> chunk index -> number of (non-unique) terms in the chunk.
     */
    const termLength = new Map<DocID, number[]>()

    /**
     * Term -> number of chunks containing the term.
     *
     * "DF" in "IDF" in "TF-IDF" (with chunks instead of documents as the unit of analysis).
     */
    const chunkFrequency = new Map<Term, number>()

    let totalChunks = 0

    for (const { doc, chunks } of storage.docs) {
        const countsPerChunk: Map<Term, number>[] = []
        const lengthPerChunk: number[] = []

        for (const chunk of chunks) {
            const chunkTerms = terms(chunk.text)

            // Document-frequency side: count each distinct term at most once per chunk.
            new Set<Term>(chunkTerms).forEach(uniqueTerm => {
                chunkFrequency.set(uniqueTerm, (chunkFrequency.get(uniqueTerm) ?? 0) + 1)
            })

            // Term-frequency side: tally every occurrence within this chunk.
            const counts = new Map<Term, number>()
            for (const t of chunkTerms) {
                counts.set(t, (counts.get(t) ?? 0) + 1)
            }
            countsPerChunk.push(counts)

            // Denominator for TF: total (non-unique) terms in the chunk.
            lengthPerChunk.push(chunkTerms.length)

            totalChunks++
        }

        termFrequency.set(doc.docID, countsPerChunk)
        termLength.set(doc.docID, lengthPerChunk)
    }

    // The returned closure looks everything up in the precomputed maps and fails
    // loudly on unknown docs/chunks rather than silently returning 0.
    return (termRaw: string, doc: DocID, chunk: ChunkIndex): number => {
        const parsed = terms(termRaw)
        if (parsed.length !== 1) {
            throw new Error(`term ${JSON.stringify(termRaw)} is not a single term`)
        }
        const term = parsed[0]

        const lengthsForDoc = termLength.get(doc)
        if (!lengthsForDoc) {
            throw new Error(`doc ${doc} not found in termLength`)
        }
        if (typeof lengthsForDoc[chunk] !== 'number') {
            throw new Error(`chunk ${chunk} not found in termLength for doc ${doc}`)
        }

        const countsForDoc = termFrequency.get(doc)
        if (!countsForDoc) {
            throw new Error(`doc ${doc} not found in termFrequency`)
        }
        if (!(countsForDoc[chunk] instanceof Map)) {
            throw new Error(`chunk ${chunk} not found in termFrequency for doc ${doc}`)
        }

        return calculateTFIDF({
            termOccurrencesInChunk: countsForDoc[chunk].get(term) ?? 0,
            chunkTermLength: lengthsForDoc[chunk],
            totalChunks,
            termChunkFrequency: chunkFrequency.get(term) ?? 0,
        })
    }
}

/**
 * Calculate TF-IDF given the formula inputs. @see {TFIDF}
 *
 * Use {@link createIndexForTFIDF} instead of calling this directly.
 */
export function calculateTFIDF({
    termOccurrencesInChunk,
    chunkTermLength,
    totalChunks,
    termChunkFrequency,
}: {
    termOccurrencesInChunk: number
    chunkTermLength: number
    totalChunks: number
    termChunkFrequency: number
}): number {
    // TF: relative frequency of the term within the chunk.
    const tf = termOccurrencesInChunk / chunkTermLength
    // IDF: inverse chunk frequency, "+1"-smoothed on both sides so a term that
    // occurs in no chunk does not divide by zero.
    const idf = Math.log((1 + totalChunks) / (1 + termChunkFrequency))
    return tf * idf
}

type Term = string

/**
 * All terms in the text, with normalization and stemming applied.
 *
 * Normalization: lowercase, then split on any run of characters outside [a-zA-Z0-9-_]
 * ('-' and '_' are kept inside terms).
 * Stemming: a crude suffix strip of a trailing "es", "ed", "ing", or "s". Because
 * `(.*)` is greedy, the SHORTEST matching suffix is removed (e.g. "classes" -> "classe",
 * not "class").
 *
 * NOTE(review): leading/trailing separators yield empty-string terms (String.prototype.split
 * keeps empty edge fields) — confirm whether callers should filter those out.
 */
function terms(text: string): Term[] {
    return (
        text
            .toLowerCase()
            .split(/[^a-zA-Z0-9-_]+/)
            // TODO(sqs): get a real stemmer. The previous pattern repeated the "ed" and
            // "ing" alternatives; alternation is first-match, so the duplicates were dead
            // and are removed (behavior unchanged).
            .map(term => term.replace(/(.*)(?:es|ed|ing|s)$/, '$1'))
    )
}
4 changes: 2 additions & 2 deletions provider/docs/src/e2e.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ describe('e2e', () => {
const docFile = await fs.readFile(path.join(__dirname, 'testdata/corpus/urlParsing.md'), 'utf8')
const codeFile = await fs.readFile(path.join(__dirname, 'testdata/code/urlParsing.ts'), 'utf8')

const corpus = createCorpus([{ docID: '1', text: docFile }])
const corpus = createCorpus([{ docID: 1, text: docFile }])
const results = await corpus.search(codeFile, false)
roundScores(results)
expect(results).toEqual<CorpusSearchResult[]>([
{
docID: '1',
docID: 1,
excerpt: 'Audio URL parsing\n\nTo parse an audio URL, use the `parseAudioURL` function.',
score: 0.685,
},
Expand Down

0 comments on commit f5da0cc

Please sign in to comment.