From e7dd5d487c64fae5ff4b219e0b42e8a594b50a6b Mon Sep 17 00:00:00 2001 From: Quinn Slack Date: Sun, 24 Dec 2023 00:44:00 -0600 Subject: [PATCH] wip --- provider/docs/src/corpus/corpus.ts | 44 ++++++++++++---- .../{search.test.ts => embeddings.test.ts} | 12 ++--- .../search/{search.ts => embeddings.ts} | 20 +++++-- .../docs/src/corpus/search/keyword.test.ts | 12 +++++ provider/docs/src/corpus/search/keyword.ts | 12 +++++ provider/docs/src/doc/chunker.test.ts | 52 +++++++++++++++++++ provider/docs/src/doc/chunker.ts | 52 +++++++++++++++++++ provider/docs/src/e2e.test.ts | 28 ++++++++++ provider/docs/src/testdata/code/urlParsing.ts | 6 +++ .../docs/src/testdata/corpus/urlParsing.md | 15 ++++++ provider/docs/tsconfig.json | 2 +- tsconfig.json | 2 +- 12 files changed, 234 insertions(+), 23 deletions(-) rename provider/docs/src/corpus/search/{search.test.ts => embeddings.test.ts} (61%) rename provider/docs/src/corpus/search/{search.ts => embeddings.ts} (58%) create mode 100644 provider/docs/src/corpus/search/keyword.test.ts create mode 100644 provider/docs/src/corpus/search/keyword.ts create mode 100644 provider/docs/src/doc/chunker.test.ts create mode 100644 provider/docs/src/doc/chunker.ts create mode 100644 provider/docs/src/e2e.test.ts create mode 100644 provider/docs/src/testdata/code/urlParsing.ts create mode 100644 provider/docs/src/testdata/corpus/urlParsing.md diff --git a/provider/docs/src/corpus/corpus.ts b/provider/docs/src/corpus/corpus.ts index 8cc9df34..d3d68330 100644 --- a/provider/docs/src/corpus/corpus.ts +++ b/provider/docs/src/corpus/corpus.ts @@ -1,10 +1,12 @@ -import { CorpusSearchResult } from './search/search' +import { Chunk, chunk } from '../doc/chunker' +import { embeddingsSearch } from './search/embeddings' +import { keywordSearch } from './search/keyword' /** * A documentation corpus. 
*/ export interface Corpus { - search(query: string): Promise<CorpusSearchResult[]> + search(query: string, keyword: boolean): Promise<CorpusSearchResult[]> length: number } @@ -13,17 +15,37 @@ export interface Document { text: string } +export interface CorpusSearchResult { + docID: string + score: number + excerpt: string +} + +interface StoredDocument { + doc: Document + chunks: Chunk[] +} + +export interface StoredCorpus { + docs: StoredDocument[] +} + +export function createStoredCorpus(docs: Document[]): StoredCorpus { + const storage: StoredCorpus = { docs: [] } + for (const doc of docs) { + const chunks = chunk(doc.text, { isMarkdown: doc.text.includes('##') }) + storage.docs.push({ doc, chunks }) + } + + return storage +} + export function createCorpus(docs: Document[]): Corpus { + const storage = createStoredCorpus(docs) + return { - search: query => { - const terms = query.split(/\s+/) - const results: CorpusSearchResult[] = [] - for (const doc of docs) { - if (terms.some(term => doc.text.includes(term))) { - results.push({ docID: doc.docID, score: 1, excerpt: doc.text }) - } - } - return Promise.resolve(results) + search: (query, keyword) => { + return Promise.resolve(keyword ? 
keywordSearch(storage, query) : embeddingsSearch(storage, query)) }, get length(): number { return docs.length diff --git a/provider/docs/src/corpus/search/search.test.ts b/provider/docs/src/corpus/search/embeddings.test.ts similarity index 61% rename from provider/docs/src/corpus/search/search.test.ts rename to provider/docs/src/corpus/search/embeddings.test.ts index 82a4af0e..5def3423 100644 --- a/provider/docs/src/corpus/search/search.test.ts +++ b/provider/docs/src/corpus/search/embeddings.test.ts @@ -1,13 +1,13 @@ import { describe, expect, test } from 'vitest' -import { createCorpus } from '../corpus' +import { CorpusSearchResult, createStoredCorpus } from '../corpus' import { doc } from '../corpus.test' -import { CorpusSearchResult, embedText, similarity } from './search' +import { embeddingsSearch, embedText, similarity } from './embeddings' -describe('Corpus#search', () => { +describe('embeddingsSearch', () => { test('finds matches', async () => { - expect(await createCorpus([doc(1, 'a'), doc(2, 'b')]).search('b')).toEqual([ - { docID: '2', score: 1, excerpt: 'b' }, - ]) + expect(await embeddingsSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual< + CorpusSearchResult[] + >([{ docID: '2', score: 1, excerpt: 'b' }]) }) }) diff --git a/provider/docs/src/corpus/search/search.ts b/provider/docs/src/corpus/search/embeddings.ts similarity index 58% rename from provider/docs/src/corpus/search/search.ts rename to provider/docs/src/corpus/search/embeddings.ts index b77cbc4d..cd50be97 100644 --- a/provider/docs/src/corpus/search/search.ts +++ b/provider/docs/src/corpus/search/embeddings.ts @@ -1,9 +1,21 @@ import { cos_sim, pipeline } from '@xenova/transformers' +import { CorpusSearchResult, StoredCorpus } from '../corpus' -export interface CorpusSearchResult { - docID: string - score: number - excerpt: string +export async function embeddingsSearch(storage: StoredCorpus, query: string): Promise<CorpusSearchResult[]> { + const queryVec = await embedText(query) + + const 
results: CorpusSearchResult[] = [] + for (const { doc, chunks } of storage.docs) { + for (const chunk of chunks) { + const chunkVec = await embedText(chunk.text) + const score = cos_sim(queryVec, chunkVec) + results.push({ docID: doc.docID, score, excerpt: chunk.text }) + } + } + + results.sort((a, b) => b.score - a.score) + + return results.slice(0, 1) } /** diff --git a/provider/docs/src/corpus/search/keyword.test.ts b/provider/docs/src/corpus/search/keyword.test.ts new file mode 100644 index 00000000..794136f9 --- /dev/null +++ b/provider/docs/src/corpus/search/keyword.test.ts @@ -0,0 +1,12 @@ +import { describe, expect, test } from 'vitest' +import { CorpusSearchResult, createStoredCorpus } from '../corpus' +import { doc } from '../corpus.test' +import { keywordSearch } from './keyword' + +describe('keywordSearch', () => { + test('finds matches', () => { + expect(keywordSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual([ + { docID: '2', score: 1, excerpt: 'b' }, + ]) + }) +}) diff --git a/provider/docs/src/corpus/search/keyword.ts b/provider/docs/src/corpus/search/keyword.ts new file mode 100644 index 00000000..ea67739d --- /dev/null +++ b/provider/docs/src/corpus/search/keyword.ts @@ -0,0 +1,12 @@ +import { CorpusSearchResult, StoredCorpus } from '../corpus' + +export function keywordSearch(storage: StoredCorpus, query: string): CorpusSearchResult[] { + const terms = query.split(/\s+/) + const results: CorpusSearchResult[] = [] + for (const { doc } of storage.docs) { + if (terms.some(term => doc.text.includes(term))) { + results.push({ docID: doc.docID, score: 1, excerpt: doc.text }) + } + } + return results +} diff --git a/provider/docs/src/doc/chunker.test.ts b/provider/docs/src/doc/chunker.test.ts new file mode 100644 index 00000000..990e081f --- /dev/null +++ b/provider/docs/src/doc/chunker.test.ts @@ -0,0 +1,52 @@ +import { describe, expect, test } from 'vitest' +import { Chunk, chunk } from './chunker' + +describe('chunker', () => { + 
test('empty', () => expect(chunk('', {})).toEqual([])) + + test('fallback', () => expect(chunk('a', {})).toEqual([{ range: { start: 0, end: 1 }, text: 'a' }])) + + describe('Markdown', () => { + test('by section', () => + expect( + chunk( + ` +# Title + +Intro + +## Section 1 + +Body 1 + +## Section 2 + +Body 2 +`.trim(), + { isMarkdown: true } + ) + ).toEqual([ + { + range: { + start: 2, + end: 16, + }, + text: 'Title\n\nIntro', + }, + { + range: { + start: 19, + end: 38, + }, + text: 'Section 1\n\nBody 1', + }, + { + range: { + start: 41, + end: 58, + }, + text: 'Section 2\n\nBody 2', + }, + ])) + }) +}) diff --git a/provider/docs/src/doc/chunker.ts b/provider/docs/src/doc/chunker.ts new file mode 100644 index 00000000..d010cdb8 --- /dev/null +++ b/provider/docs/src/doc/chunker.ts @@ -0,0 +1,52 @@ +/** + * Information about the document to help the chunker know how to split the content into logical + * chunks. + */ +export interface ChunkerHints { + isMarkdown?: boolean +} + +export interface Chunk { + /** + * The text of the chunk, stripped of semantically meaningless markup, punctuation, and content. + * This text need not be present in the original document. + */ + text: string + + /** + * The range in the original document (as character offsets) represented by this chunk. 
+ */ + range: { start: number; end: number } +} + +export function chunk(text: string, hints: ChunkerHints): Chunk[] { + if (hints.isMarkdown) { + return chunkMarkdown(text) + } + if (text.length === 0) { + return [] + } + return [{ text, range: { start: 0, end: text.length } }] +} + +function chunkMarkdown(text: string): Chunk[] { + const chunks: Chunk[] = [] + + const sections = text.split(/^(#+\s*)/m) + let pos = 0 + for (const section of sections) { + // Advance past every piece (heading markers and bodies alike) so that + // each chunk's range is a real offset into the original document. + const start = pos + pos += section.length + if (section.length === 0 || section.startsWith('#')) { + continue + } + chunks.push({ + text: section.trim(), + range: { start, end: pos }, + }) + } + + return chunks +} diff --git a/provider/docs/src/e2e.test.ts b/provider/docs/src/e2e.test.ts new file mode 100644 index 00000000..b075cd16 --- /dev/null +++ b/provider/docs/src/e2e.test.ts @@ -0,0 +1,28 @@ +import fs from 'node:fs/promises' +import path from 'node:path' +import { describe, expect, test } from 'vitest' +import { CorpusSearchResult, createCorpus } from './corpus/corpus' + +describe('e2e', () => { + test('urlParsing', async () => { + const docFile = await fs.readFile(path.join(__dirname, 'testdata/corpus/urlParsing.md'), 'utf8') + const codeFile = await fs.readFile(path.join(__dirname, 'testdata/code/urlParsing.ts'), 'utf8') + + const corpus = createCorpus([{ docID: '1', text: docFile }]) + const results = await corpus.search(codeFile, false) + roundScores(results) + expect(results).toEqual([ + { + docID: '1', + excerpt: 'Audio URL parsing\n\nTo parse an audio URL, use the `parseAudioURL` function.', + score: 0.685, + }, + ]) + }) +}) + +function roundScores(results: CorpusSearchResult[]) { + for (const result of results) { + result.score = Math.round(result.score * 1000) / 1000 + } +} diff --git a/provider/docs/src/testdata/code/urlParsing.ts b/provider/docs/src/testdata/code/urlParsing.ts new file mode 100644 index 00000000..8a7c652f --- /dev/null +++ 
b/provider/docs/src/testdata/code/urlParsing.ts @@ -0,0 +1,6 @@ +// @ts-nocheck + +function getAudio(title: string): URL { + const audioFile = searchAudioFiles(title) + return parseAudioURL(audioFile.url) +} diff --git a/provider/docs/src/testdata/corpus/urlParsing.md b/provider/docs/src/testdata/corpus/urlParsing.md new file mode 100644 index 00000000..864548eb --- /dev/null +++ b/provider/docs/src/testdata/corpus/urlParsing.md @@ -0,0 +1,15 @@ +# URL parsing + +To parse a URL, use the `parseURL` function. + +## Image URL parsing + +To parse an image URL, use the `parseImageURL` function. + +## Video URL parsing + +To parse a video URL, use the `parseVideoURL` function. + +## Audio URL parsing + +To parse an audio URL, use the `parseAudioURL` function. diff --git a/provider/docs/tsconfig.json b/provider/docs/tsconfig.json index 13999016..87e1b4d1 100644 --- a/provider/docs/tsconfig.json +++ b/provider/docs/tsconfig.json @@ -7,6 +7,6 @@ "lib": ["ESNext"], }, "include": ["src"], - "exclude": ["dist", "vitest.config.ts"], + "exclude": ["dist", "src/testdata", "vitest.config.ts"], "references": [{ "path": "../../lib/provider" }], } diff --git a/tsconfig.json b/tsconfig.json index e7821777..f5eb76d8 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -4,7 +4,7 @@ "noEmit": true }, "files": [], - "exclude": ["**/dist", "**/testdata", "client/browser"], + "exclude": ["**/dist", "**/testdata", "client/browser"], "references": [ { "path": "lib/client" }, { "path": "lib/protocol" },