Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 26, 2023
1 parent c9a1d71 commit 51c46be
Show file tree
Hide file tree
Showing 9 changed files with 42 additions and 34 deletions.
20 changes: 3 additions & 17 deletions provider/docs/src/corpus/corpus.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { noopCache, type CorpusCache } from './cache/cache'
import { chunk, type Chunk, type ChunkIndex } from './doc/chunks'
import { corpusData } from './data'
import { type Chunk, type ChunkIndex } from './doc/chunks'
import { type Doc, type DocID } from './doc/doc'
import { multiSearch } from './search/multi'

Expand All @@ -18,10 +19,6 @@ export interface CorpusSearchResult {
excerpt: string
}

/**
 * The stored form of a corpus: a list of documents, each kept together with
 * its chunks (see {@link StoredDocument}).
 */
export interface StoredCorpus {
/** One entry per stored document. */
docs: StoredDocument[]
}

export interface StoredDocument {
doc: Doc
chunks: Chunk[]
Expand All @@ -31,19 +28,8 @@ interface CorpusOptions {
cache?: CorpusCache
}

/**
 * Build a {@link StoredCorpus} from a list of documents.
 *
 * Each document's text is passed to `chunk`, and the document is stored
 * alongside the resulting chunks. The `isMarkdown` option given to `chunk` is
 * true exactly when the text contains the substring `##`.
 *
 * @param docs The documents to store, in the order they should appear.
 * @returns A new stored corpus with one entry per input document.
 */
export function createStoredCorpus(docs: Doc[]): StoredCorpus {
    // Array.from walks the iterable in order, just like a for...of loop would.
    const stored = Array.from(docs, current => {
        const looksLikeMarkdown = current.text.includes('##')
        return { doc: current, chunks: chunk(current.text, { isMarkdown: looksLikeMarkdown }) }
    })

    return { docs: stored }
}

export function createCorpus(docs: Doc[], { cache = noopCache }: CorpusOptions = { cache: noopCache }): Corpus {
const storage = createStoredCorpus(docs)
const storage = corpusData(docs)

return {
search: query => multiSearch(storage, query, cache),
Expand Down
17 changes: 17 additions & 0 deletions provider/docs/src/corpus/data.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import { type StoredDocument } from './corpus'
import { chunk } from './doc/chunks'
import { type Doc } from './doc/doc'

/**
 * Build the {@link CorpusData} for a list of documents.
 *
 * Each document's text is passed to `chunk`, and the document is recorded
 * together with the chunks that call returns. The `isMarkdown` option given to
 * `chunk` is true exactly when the text contains the substring `##`.
 *
 * @param docs The documents to include, in the order they should appear.
 * @returns A new data object with one entry per input document.
 */
export function corpusData(docs: Doc[]): CorpusData {
    // Array.from walks the iterable in order, just like a for...of loop would.
    const entries = Array.from(docs, current => {
        const looksLikeMarkdown = current.text.includes('##')
        return { doc: current, chunks: chunk(current.text, { isMarkdown: looksLikeMarkdown }) }
    })

    return { docs: entries }
}
/**
 * The data a corpus is searched over: a list of documents, each kept together
 * with its chunks (see `StoredDocument`). Produced by `corpusData`.
 */
export interface CorpusData {
/** One entry per document in the corpus. */
docs: StoredDocument[]
}
9 changes: 5 additions & 4 deletions provider/docs/src/corpus/search/embeddings.test.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import { describe, expect, test } from 'vitest'
import { createStoredCorpus, type CorpusSearchResult } from '../corpus'
import { type CorpusSearchResult } from '../corpus'
import { doc } from '../corpus.test'
import { corpusData } from '../data'
import { embeddingsSearch, embedText, similarity } from './embeddings'

describe('embeddingsSearch', () => {
test('finds matches', async () => {
expect(await embeddingsSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<
CorpusSearchResult[]
>([{ doc: 2, chunk: 0, score: 1, excerpt: 'b' }])
expect(await embeddingsSearch(corpusData([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<CorpusSearchResult[]>([
{ doc: 2, chunk: 0, score: 1, excerpt: 'b' },
])
})
})

Expand Down
5 changes: 3 additions & 2 deletions provider/docs/src/corpus/search/embeddings.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import { cos_sim, env, pipeline } from '@xenova/transformers'
import * as onnxWeb from 'onnxruntime-web'
import { memo, noopCache, type CorpusCache } from '../cache/cache'
import { type CorpusSearchResult, type StoredCorpus } from '../corpus'
import { type CorpusSearchResult } from '../corpus'
import { type CorpusData } from '../data'

// eslint-disable-next-line @typescript-eslint/prefer-optional-chain
if (typeof process !== 'undefined' && process.env.FORCE_WASM) {
Expand All @@ -18,7 +19,7 @@ if (typeof process !== 'undefined' && process.env.FORCE_WASM) {
env.allowLocalModels = false

export async function embeddingsSearch(
storage: StoredCorpus,
storage: CorpusData,
query: string,
cache: CorpusCache = noopCache
): Promise<CorpusSearchResult[]> {
Expand Down
5 changes: 3 additions & 2 deletions provider/docs/src/corpus/search/keyword.test.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import { describe, expect, test } from 'vitest'
import { createStoredCorpus, type CorpusSearchResult } from '../corpus'
import { type CorpusSearchResult } from '../corpus'
import { doc } from '../corpus.test'
import { corpusData } from '../data'
import { keywordSearch } from './keyword'
import { calculateTFIDF } from './tfidf'

describe('keywordSearch', () => {
test('finds matches', () => {
expect(keywordSearch(createStoredCorpus([doc(1, 'aaa'), doc(2, 'bbb')]), 'bbb')).toEqual<CorpusSearchResult[]>([
expect(keywordSearch(corpusData([doc(1, 'aaa'), doc(2, 'bbb')]), 'bbb')).toEqual<CorpusSearchResult[]>([
{
doc: 2,
chunk: 0,
Expand Down
5 changes: 3 additions & 2 deletions provider/docs/src/corpus/search/keyword.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import { type CorpusSearchResult, type StoredCorpus } from '../corpus'
import { type CorpusSearchResult } from '../corpus'
import { type CorpusData } from '../data'
import { terms } from './terms'
import { createIndexForTFIDF } from './tfidf'

export function keywordSearch(storage: StoredCorpus, query: string): CorpusSearchResult[] {
export function keywordSearch(storage: CorpusData, query: string): CorpusSearchResult[] {
const queryTerms = terms(query).filter(term => term.length >= 3)
const tfidf = createIndexForTFIDF(storage)

Expand Down
7 changes: 4 additions & 3 deletions provider/docs/src/corpus/search/multi.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { scopedCache, type CorpusCache } from '../cache/cache'
import { type CorpusSearchResult, type StoredCorpus } from '../corpus'
import { type CorpusSearchResult } from '../corpus'
import { type CorpusData } from '../data'
import { type ChunkIndex } from '../doc/chunks'
import { type DocID } from '../doc/doc'
import { embeddingsSearch } from './embeddings'
Expand All @@ -9,7 +10,7 @@ import { keywordSearch } from './keyword'
* Search using multiple search methods.
*/
export async function multiSearch(
storage: StoredCorpus,
storage: CorpusData,
query: string,
cache: CorpusCache
): Promise<CorpusSearchResult[]> {
Expand Down Expand Up @@ -43,5 +44,5 @@ export async function multiSearch(

const SEARCH_METHODS: Record<
string,
(storage: StoredCorpus, query: string, cache: CorpusCache) => CorpusSearchResult[] | Promise<CorpusSearchResult[]>
(storage: CorpusData, query: string, cache: CorpusCache) => CorpusSearchResult[] | Promise<CorpusSearchResult[]>
> = { keywordSearch, embeddingsSearch }
4 changes: 2 additions & 2 deletions provider/docs/src/corpus/search/tfidf.test.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import { describe, expect, test } from 'vitest'
import { createStoredCorpus } from '../corpus'
import { corpusData } from '../data'
import { calculateTFIDF, createIndexForTFIDF } from './tfidf'

describe('createIndexForTFIDF', () => {
const corpus = createStoredCorpus([
const corpus = corpusData([
{ id: 1, text: 'a b c c c' },
{ id: 2, text: 'b c d' },
{ id: 3, text: 'c d e' },
Expand Down
4 changes: 2 additions & 2 deletions provider/docs/src/corpus/search/tfidf.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { type StoredCorpus } from '../corpus'
import { type CorpusData } from '../data'
import { type ChunkIndex } from '../doc/chunks'
import { type DocID } from '../doc/doc'
import { terms, type Term } from './terms'
Expand All @@ -16,7 +16,7 @@ export type TFIDF = (term: Term, doc: DocID, chunk: ChunkIndex) => number
/**
* Index the corpus for fast computation of TF-IDF. @see {TFIDF}
*/
export function createIndexForTFIDF(storage: StoredCorpus): TFIDF {
export function createIndexForTFIDF(storage: CorpusData): TFIDF {
/**
* Document -> chunk index -> term -> number of occurrences of term in the chunk.
*
Expand Down

0 comments on commit 51c46be

Please sign in to comment.