Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 26, 2023
1 parent 51c46be commit adca3b2
Show file tree
Hide file tree
Showing 11 changed files with 76 additions and 39 deletions.
20 changes: 11 additions & 9 deletions provider/docs/src/corpus/corpus.ts
Original file line number Diff line number Diff line change
@@ -1,38 +1,40 @@
import { indexCorpus } from '.'
import { noopCache, type CorpusCache } from './cache/cache'
import { corpusData } from './data'
import { type Chunk, type ChunkIndex } from './doc/chunks'
import { type ChunkIndex } from './doc/chunks'
import { type Doc, type DocID } from './doc/doc'
import { multiSearch } from './search/multi'

/**
* A documentation corpus.
* A corpus of documents.
*/
export interface Corpus {
/**
* Search the corpus for `query`, resolving to the list of matching results.
*/
search(query: string): Promise<CorpusSearchResult[]>
/**
* Size of the corpus. In `createCorpus` this is the number of documents the corpus was created from.
*/
length: number
}

/**
* A search result from searching a corpus.
*
* Each result refers to one chunk of one document.
*/
export interface CorpusSearchResult {
/** ID of the document that the result refers to. */
doc: DocID
/** Index of the chunk, within that document's chunks, that the result refers to. */
chunk: ChunkIndex
/** Score assigned by the search method. NOTE(review): presumably higher means more relevant; confirm. */
score: number
/** Text shown for the result. NOTE(review): appears to be taken from the chunk text; confirm. */
excerpt: string
}

export interface StoredDocument {
doc: Doc
chunks: Chunk[]
}

/**
* Options for creating a corpus index.
*/
interface CorpusOptions {
/**
* Cache handed to the search functions. `createCorpus` uses `noopCache` when this is omitted.
*/
cache?: CorpusCache
}

export function createCorpus(docs: Doc[], { cache = noopCache }: CorpusOptions = { cache: noopCache }): Corpus {
const storage = corpusData(docs)
const index = indexCorpus(corpusData(docs))

return {
search: query => multiSearch(storage, query, cache),
search: query => multiSearch(index, query, cache),
get length(): number {
return docs.length
},
Expand Down
16 changes: 8 additions & 8 deletions provider/docs/src/corpus/data.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
import { type StoredDocument } from './corpus'
import { chunk } from './doc/chunks'
import { type Doc } from './doc/doc'

export function corpusData(docs: Doc[]): CorpusData {
const data: CorpusData = { docs: [] }

const seenIDs = new Set<number>()
for (const doc of docs) {
const chunks = chunk(doc.text, { isMarkdown: doc.text.includes('##') })
data.docs.push({ doc, chunks })
if (seenIDs.has(doc.id)) {
throw new Error(`duplicate doc ID: ${doc.id}`)
}
seenIDs.add(doc.id)
}

return data
return { docs }
}

export interface CorpusData {
docs: StoredDocument[]
docs: Doc[]
}
10 changes: 10 additions & 0 deletions provider/docs/src/corpus/doc/doc.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { type Chunk } from './chunks'

/**
* A unique identifier for a document in a corpus.
*/
Expand All @@ -12,3 +14,11 @@ export interface Doc {

url?: string
}

/**
* An indexed document.
*
* Pairs a document with the chunks computed from it.
*/
export interface IndexedDoc {
/** The source document. */
doc: Doc
/** The chunks computed from the document (`indexCorpus` derives them from `doc.text`). */
chunks: Chunk[]
}
20 changes: 20 additions & 0 deletions provider/docs/src/corpus/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import { type CorpusData } from './data'
import { chunk } from './doc/chunks'
import { type IndexedDoc } from './doc/doc'

/**
* An index over a corpus, as returned by `indexCorpus`.
*/
export interface CorpusIndex {
/** The corpus data that the index was built from. */
data: CorpusData

/** One entry per document in `data.docs`, each holding the document and its chunks. */
docs: IndexedDoc[]
}

/**
 * Build a {@link CorpusIndex} for the given corpus data.
 *
 * Every document in `data.docs` is split into chunks with `chunk`; a document is
 * chunked as Markdown when its text contains `##`.
 *
 * @param data The corpus data to index. It is stored on the result as `data`.
 * @returns The index, with one `{ doc, chunks }` entry per input document, in input order.
 */
export function indexCorpus(data: CorpusData): CorpusIndex {
    const docs = data.docs.map(doc => ({
        doc,
        chunks: chunk(doc.text, { isMarkdown: doc.text.includes('##') }),
    }))

    return { data, docs }
}
7 changes: 4 additions & 3 deletions provider/docs/src/corpus/search/embeddings.test.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import { describe, expect, test } from 'vitest'
import { indexCorpus } from '..'
import { type CorpusSearchResult } from '../corpus'
import { doc } from '../corpus.test'
import { corpusData } from '../data'
import { embeddingsSearch, embedText, similarity } from './embeddings'

describe('embeddingsSearch', () => {
test('finds matches', async () => {
expect(await embeddingsSearch(corpusData([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<CorpusSearchResult[]>([
{ doc: 2, chunk: 0, score: 1, excerpt: 'b' },
])
expect(await embeddingsSearch(indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')])), 'b')).toEqual<
CorpusSearchResult[]
>([{ doc: 2, chunk: 0, score: 1, excerpt: 'b' }])
})
})

Expand Down
6 changes: 3 additions & 3 deletions provider/docs/src/corpus/search/embeddings.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { cos_sim, env, pipeline } from '@xenova/transformers'
import * as onnxWeb from 'onnxruntime-web'
import { type CorpusIndex } from '..'
import { memo, noopCache, type CorpusCache } from '../cache/cache'
import { type CorpusSearchResult } from '../corpus'
import { type CorpusData } from '../data'

// eslint-disable-next-line @typescript-eslint/prefer-optional-chain
if (typeof process !== 'undefined' && process.env.FORCE_WASM) {
Expand All @@ -19,14 +19,14 @@ if (typeof process !== 'undefined' && process.env.FORCE_WASM) {
env.allowLocalModels = false

export async function embeddingsSearch(
storage: CorpusData,
index: CorpusIndex,
query: string,
cache: CorpusCache = noopCache
): Promise<CorpusSearchResult[]> {
const queryVec = await embedText(query)

const results: CorpusSearchResult[] = []
for (const { doc, chunks } of storage.docs) {
for (const { doc, chunks } of index.docs) {
for (const [i, chunk] of chunks.entries()) {
const chunkVec = await cachedEmbedText(chunk.text, cache)
const score = cos_sim(queryVec, chunkVec)
Expand Down
5 changes: 4 additions & 1 deletion provider/docs/src/corpus/search/keyword.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { describe, expect, test } from 'vitest'
import { indexCorpus } from '..'
import { type CorpusSearchResult } from '../corpus'
import { doc } from '../corpus.test'
import { corpusData } from '../data'
Expand All @@ -7,7 +8,9 @@ import { calculateTFIDF } from './tfidf'

describe('keywordSearch', () => {
test('finds matches', () => {
expect(keywordSearch(corpusData([doc(1, 'aaa'), doc(2, 'bbb')]), 'bbb')).toEqual<CorpusSearchResult[]>([
expect(keywordSearch(indexCorpus(corpusData([doc(1, 'aaa'), doc(2, 'bbb')])), 'bbb')).toEqual<
CorpusSearchResult[]
>([
{
doc: 2,
chunk: 0,
Expand Down
8 changes: 4 additions & 4 deletions provider/docs/src/corpus/search/keyword.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import { type CorpusIndex } from '..'
import { type CorpusSearchResult } from '../corpus'
import { type CorpusData } from '../data'
import { terms } from './terms'
import { createIndexForTFIDF } from './tfidf'

export function keywordSearch(storage: CorpusData, query: string): CorpusSearchResult[] {
export function keywordSearch(index: CorpusIndex, query: string): CorpusSearchResult[] {
const queryTerms = terms(query).filter(term => term.length >= 3)
const tfidf = createIndexForTFIDF(storage)
const tfidf = createIndexForTFIDF(index.docs)

const results: CorpusSearchResult[] = []
for (const { doc, chunks } of storage.docs) {
for (const { doc, chunks } of index.docs) {
for (const [i, chunk] of chunks.entries()) {
const score = queryTerms.reduce((score, term) => score + tfidf(term, doc.id, i), 0)
if (score > 0) {
Expand Down
8 changes: 4 additions & 4 deletions provider/docs/src/corpus/search/multi.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { type CorpusIndex } from '..'
import { scopedCache, type CorpusCache } from '../cache/cache'
import { type CorpusSearchResult } from '../corpus'
import { type CorpusData } from '../data'
import { type ChunkIndex } from '../doc/chunks'
import { type DocID } from '../doc/doc'
import { embeddingsSearch } from './embeddings'
Expand All @@ -10,13 +10,13 @@ import { keywordSearch } from './keyword'
* Search using multiple search methods.
*/
export async function multiSearch(
storage: CorpusData,
index: CorpusIndex,
query: string,
cache: CorpusCache
): Promise<CorpusSearchResult[]> {
const allResults = (
await Promise.all(
Object.entries(SEARCH_METHODS).map(([name, searchFn]) => searchFn(storage, query, scopedCache(cache, name)))
Object.entries(SEARCH_METHODS).map(([name, searchFn]) => searchFn(index, query, scopedCache(cache, name)))
)
).flat()

Expand Down Expand Up @@ -44,5 +44,5 @@ export async function multiSearch(

const SEARCH_METHODS: Record<
string,
(storage: CorpusData, query: string, cache: CorpusCache) => CorpusSearchResult[] | Promise<CorpusSearchResult[]>
(index: CorpusIndex, query: string, cache: CorpusCache) => CorpusSearchResult[] | Promise<CorpusSearchResult[]>
> = { keywordSearch, embeddingsSearch }
8 changes: 5 additions & 3 deletions provider/docs/src/corpus/search/tfidf.test.ts
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import { describe, expect, test } from 'vitest'
import { indexCorpus } from '..'
import { corpusData } from '../data'
import { calculateTFIDF, createIndexForTFIDF } from './tfidf'

describe('createIndexForTFIDF', () => {
const corpus = corpusData([
const data = corpusData([
{ id: 1, text: 'a b c c c' },
{ id: 2, text: 'b c d' },
{ id: 3, text: 'c d e' },
])
const docIDs = corpus.docs.map(({ doc: { id } }) => id)
const tfidf = createIndexForTFIDF(corpus)
const docIDs = data.docs.map(({ id }) => id)
const index = indexCorpus(data)
const tfidf = createIndexForTFIDF(index.docs)

test('term in 1 doc', () => {
expect(docIDs.map(docID => tfidf('a', docID, 0))).toEqual([
Expand Down
7 changes: 3 additions & 4 deletions provider/docs/src/corpus/search/tfidf.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { type CorpusData } from '../data'
import { type ChunkIndex } from '../doc/chunks'
import { type DocID } from '../doc/doc'
import { type DocID, type IndexedDoc } from '../doc/doc'
import { terms, type Term } from './terms'

/**
Expand All @@ -16,7 +15,7 @@ export type TFIDF = (term: Term, doc: DocID, chunk: ChunkIndex) => number
/**
* Index the corpus for fast computation of TF-IDF. @see {TFIDF}
*/
export function createIndexForTFIDF(storage: CorpusData): TFIDF {
export function createIndexForTFIDF(docs: IndexedDoc[]): TFIDF {
/**
* Document -> chunk index -> term -> number of occurrences of term in the chunk.
*
Expand All @@ -38,7 +37,7 @@ export function createIndexForTFIDF(storage: CorpusData): TFIDF {

let totalChunks = 0

for (const { doc, chunks } of storage.docs) {
for (const { doc, chunks } of docs) {
const docTermFrequency: Map<Term, number>[] = new Array<Map<Term, number>>(chunks.length)
termFrequency.set(doc.id, docTermFrequency)

Expand Down

0 comments on commit adca3b2

Please sign in to comment.