From 4fe1b06a203f277bcec2c547264776ac5314df5e Mon Sep 17 00:00:00 2001 From: Quinn Slack Date: Tue, 26 Dec 2023 00:02:49 -0600 Subject: [PATCH] wip --- provider/docs/bin/docs-query.ts | 2 +- provider/docs/src/corpus/corpus.test.ts | 13 ------ provider/docs/src/corpus/corpus.ts | 45 ------------------- provider/docs/src/corpus/index.test.ts | 14 ++++++ provider/docs/src/corpus/index.ts | 35 +++++++++++++-- .../docs/src/corpus/search/embeddings.test.ts | 5 +-- provider/docs/src/corpus/search/embeddings.ts | 3 +- .../docs/src/corpus/search/keyword.test.ts | 5 +-- provider/docs/src/corpus/search/keyword.ts | 3 +- provider/docs/src/corpus/search/multi.ts | 3 +- .../src/corpus/source/web/webCorpusSource.ts | 4 +- provider/docs/src/e2e.test.ts | 5 ++- provider/docs/src/logger.ts | 1 + provider/docs/src/provider/provider.ts | 2 +- 14 files changed, 60 insertions(+), 80 deletions(-) delete mode 100644 provider/docs/src/corpus/corpus.test.ts delete mode 100644 provider/docs/src/corpus/corpus.ts create mode 100644 provider/docs/src/corpus/index.test.ts create mode 100644 provider/docs/src/logger.ts diff --git a/provider/docs/bin/docs-query.ts b/provider/docs/bin/docs-query.ts index 0733591a..2334b56b 100644 --- a/provider/docs/bin/docs-query.ts +++ b/provider/docs/bin/docs-query.ts @@ -1,8 +1,8 @@ import { readFile } from 'fs/promises' import path from 'path' import envPaths from 'env-paths' +import { createCorpus } from '../src/corpus' import { createFileSystemCorpusCache } from '../src/corpus/cache/fs' -import { createCorpus } from '../src/corpus/corpus' import { type CorpusData } from '../src/corpus/data' import { extractContentUsingMozillaReadability } from '../src/corpus/doc/contentExtractor' diff --git a/provider/docs/src/corpus/corpus.test.ts b/provider/docs/src/corpus/corpus.test.ts deleted file mode 100644 index 788aced8..00000000 --- a/provider/docs/src/corpus/corpus.test.ts +++ /dev/null @@ -1,13 +0,0 @@ -import { describe, expect, test } from 'vitest' -import { 
createCorpus } from './corpus' -import { type Doc, type DocID } from './doc/doc' - -export function doc(id: DocID, text: string): Doc { - return { id, text } -} - -describe('Corpus', () => { - test('#length', () => { - expect(createCorpus([doc(1, 'a'), doc(2, 'b')]).length).toBe(2) - }) -}) diff --git a/provider/docs/src/corpus/corpus.ts b/provider/docs/src/corpus/corpus.ts deleted file mode 100644 index d5a2cf3f..00000000 --- a/provider/docs/src/corpus/corpus.ts +++ /dev/null @@ -1,45 +0,0 @@ -import { indexCorpus, type IndexOptions } from '.' -import { noopCache, type CorpusCache } from './cache/cache' -import { corpusData } from './data' -import { type ChunkIndex } from './doc/chunks' -import { type Doc, type DocID } from './doc/doc' -import { multiSearch } from './search/multi' - -/** - * A corpus of documents. - */ -export interface Corpus { - search(query: string): Promise<CorpusSearchResult[]> - length: number -} - -/** - * A search result from searching a corpus. - */ -export interface CorpusSearchResult { - doc: DocID - chunk: ChunkIndex - score: number - excerpt: string -} - -/** - * Options for creating a corpus index. - */ -interface CorpusOptions extends IndexOptions { - cache?: CorpusCache -} - -export function createCorpus( - docs: Doc[], - { cache = noopCache, ...rest }: CorpusOptions = { cache: noopCache } -): Corpus { - const index = indexCorpus(corpusData(docs), rest) - - return { - search: query => multiSearch(index, query, cache), - get length(): number { - return docs.length - }, - } -} diff --git a/provider/docs/src/corpus/index.test.ts b/provider/docs/src/corpus/index.test.ts new file mode 100644 index 00000000..24310c0d --- /dev/null +++ b/provider/docs/src/corpus/index.test.ts @@ -0,0 +1,14 @@ +import { describe, expect, test } from 'vitest' +import { indexCorpus } from '.' 
+import { corpusData } from './data' +import { type Doc, type DocID } from './doc/doc' + +export function doc(id: DocID, text: string): Doc { + return { id, text } +} + +describe('indexCorpus', () => { + test('#docs', () => { + expect(indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')])).docs.length).toBe(2) + }) +}) diff --git a/provider/docs/src/corpus/index.ts b/provider/docs/src/corpus/index.ts index 2e6398b6..b9679c80 100644 --- a/provider/docs/src/corpus/index.ts +++ b/provider/docs/src/corpus/index.ts @@ -1,7 +1,9 @@ +import { noopCache, type CorpusCache } from './cache/cache' import { type CorpusData } from './data' -import { chunk, type Chunk } from './doc/chunks' +import { chunk, type Chunk, type ChunkIndex } from './doc/chunks' import { type Content, type ContentExtractor } from './doc/contentExtractor' -import { type Doc } from './doc/doc' +import { type Doc, type DocID } from './doc/doc' +import { multiSearch } from './search/multi' /** * An index of a corpus. @@ -10,6 +12,8 @@ export interface CorpusIndex { data: CorpusData docs: IndexedDoc[] + + search(query: string): Promise<CorpusSearchResult[]> } /** @@ -21,11 +25,32 @@ export interface IndexedDoc { chunks: Chunk[] } +/** + * A search result from searching a corpus. + */ + +export interface CorpusSearchResult { + doc: DocID + chunk: ChunkIndex + score: number + excerpt: string +} + +/** + * Options for indexing a corpus. + */ export interface IndexOptions { + cache?: CorpusCache contentExtractor?: ContentExtractor } -export function indexCorpus(data: CorpusData, { contentExtractor }: IndexOptions = {}): CorpusIndex { +/** + * Index a corpus. + */ +export function indexCorpus( + data: CorpusData, + { cache = noopCache, contentExtractor }: IndexOptions = { cache: noopCache } +): CorpusIndex { const indexedDocs: IndexedDoc[] = [] for (const doc of data.docs) { const content = contentExtractor?.(doc) ?? 
undefined @@ -34,5 +59,7 @@ export function indexCorpus(data: CorpusData, { contentExtractor }: IndexOptions indexedDocs.push({ doc, content, chunks }) } - return { data, docs: indexedDocs } + + const index: CorpusIndex = { data, docs: indexedDocs, search: query => multiSearch(index, query, cache) } + return index } diff --git a/provider/docs/src/corpus/search/embeddings.test.ts b/provider/docs/src/corpus/search/embeddings.test.ts index 9414560a..f97c6241 100644 --- a/provider/docs/src/corpus/search/embeddings.test.ts +++ b/provider/docs/src/corpus/search/embeddings.test.ts @@ -1,8 +1,7 @@ import { describe, expect, test } from 'vitest' -import { indexCorpus } from '..' -import { type CorpusSearchResult } from '../corpus' -import { doc } from '../corpus.test' +import { indexCorpus, type CorpusSearchResult } from '..' import { corpusData } from '../data' +import { doc } from '../index.test' import { embeddingsSearch, embedText, similarity } from './embeddings' describe('embeddingsSearch', () => { diff --git a/provider/docs/src/corpus/search/embeddings.ts b/provider/docs/src/corpus/search/embeddings.ts index 1cb59944..82bbdf55 100644 --- a/provider/docs/src/corpus/search/embeddings.ts +++ b/provider/docs/src/corpus/search/embeddings.ts @@ -1,8 +1,7 @@ import { cos_sim, env, pipeline } from '@xenova/transformers' import * as onnxWeb from 'onnxruntime-web' -import { type CorpusIndex } from '..' +import { type CorpusIndex, type CorpusSearchResult } from '..' 
import { memo, noopCache, type CorpusCache } from '../cache/cache' -import { type CorpusSearchResult } from '../corpus' // eslint-disable-next-line @typescript-eslint/prefer-optional-chain if (typeof process !== 'undefined' && process.env.FORCE_WASM) { diff --git a/provider/docs/src/corpus/search/keyword.test.ts b/provider/docs/src/corpus/search/keyword.test.ts index f2ecabae..6538d7f4 100644 --- a/provider/docs/src/corpus/search/keyword.test.ts +++ b/provider/docs/src/corpus/search/keyword.test.ts @@ -1,8 +1,7 @@ import { describe, expect, test } from 'vitest' -import { indexCorpus } from '..' -import { type CorpusSearchResult } from '../corpus' -import { doc } from '../corpus.test' +import { indexCorpus, type CorpusSearchResult } from '..' import { corpusData } from '../data' +import { doc } from '../index.test' import { keywordSearch } from './keyword' import { calculateTFIDF } from './tfidf' diff --git a/provider/docs/src/corpus/search/keyword.ts b/provider/docs/src/corpus/search/keyword.ts index 1084e1d0..2e015547 100644 --- a/provider/docs/src/corpus/search/keyword.ts +++ b/provider/docs/src/corpus/search/keyword.ts @@ -1,5 +1,4 @@ -import { type CorpusIndex } from '..' -import { type CorpusSearchResult } from '../corpus' +import { type CorpusIndex, type CorpusSearchResult } from '..' import { terms } from './terms' import { createIndexForTFIDF } from './tfidf' diff --git a/provider/docs/src/corpus/search/multi.ts b/provider/docs/src/corpus/search/multi.ts index 9bc5e94d..2de0446f 100644 --- a/provider/docs/src/corpus/search/multi.ts +++ b/provider/docs/src/corpus/search/multi.ts @@ -1,6 +1,5 @@ -import { type CorpusIndex } from '..' +import { type CorpusIndex, type CorpusSearchResult } from '..' 
import { scopedCache, type CorpusCache } from '../cache/cache' -import { type CorpusSearchResult } from '../corpus' import { type ChunkIndex } from '../doc/chunks' import { type DocID } from '../doc/doc' import { embeddingsSearch } from './embeddings' diff --git a/provider/docs/src/corpus/source/web/webCorpusSource.ts b/provider/docs/src/corpus/source/web/webCorpusSource.ts index d0318f0f..c2d44c79 100644 --- a/provider/docs/src/corpus/source/web/webCorpusSource.ts +++ b/provider/docs/src/corpus/source/web/webCorpusSource.ts @@ -1,4 +1,5 @@ import { JSDOM } from 'jsdom' +import { Logger } from '../../../logger' import { Doc } from '../../doc/doc' import { CorpusSource } from '../source' import { createCrawlQueue } from './crawlQueue' @@ -20,8 +21,6 @@ interface WebCorpusSourceOptions { logger?: Logger } -type Logger = (message: string) => void - export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSourceOptions): CorpusSource { return { documents: async () => { @@ -88,6 +87,7 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo enqueueURL(new URL(link.href)) } } + return documents }, } diff --git a/provider/docs/src/e2e.test.ts b/provider/docs/src/e2e.test.ts index 26da4e59..b710b20a 100644 --- a/provider/docs/src/e2e.test.ts +++ b/provider/docs/src/e2e.test.ts @@ -1,14 +1,15 @@ import fs from 'node:fs/promises' import path from 'node:path' import { describe, expect, test } from 'vitest' -import { createCorpus, type CorpusSearchResult } from './corpus/corpus' +import { indexCorpus, type CorpusSearchResult } from './corpus' +import { corpusData } from './corpus/data' describe('e2e', () => { test('urlParsing', async () => { const docFile = await fs.readFile(path.join(__dirname, 'testdata/corpus/urlParsing.md'), 'utf8') const codeFile = await fs.readFile(path.join(__dirname, 'testdata/code/urlParsing.ts'), 'utf8') - const corpus = createCorpus([{ id: 1, text: docFile }]) + const corpus = indexCorpus(corpusData([{ 
id: 1, text: docFile }])) const results = await corpus.search(codeFile) roundScores(results) expect(results).toEqual([ diff --git a/provider/docs/src/logger.ts b/provider/docs/src/logger.ts new file mode 100644 index 00000000..a42e4737 --- /dev/null +++ b/provider/docs/src/logger.ts @@ -0,0 +1 @@ +export type Logger = (message: string) => void diff --git a/provider/docs/src/provider/provider.ts b/provider/docs/src/provider/provider.ts index fb85b93c..ecd29ecc 100644 --- a/provider/docs/src/provider/provider.ts +++ b/provider/docs/src/provider/provider.ts @@ -5,8 +5,8 @@ import { type CapabilitiesParams, type CapabilitiesResult, } from '@opencodegraph/provider' +import { createCorpus } from '../corpus' import { createWebStorageCorpusCache } from '../corpus/cache/localStorage' -import { createCorpus } from '../corpus/corpus' import { multiplex } from './multiplex' /** Settings for the docs OpenCodeGraph provider. */