Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 24, 2023
1 parent adf66e5 commit e7dd5d4
Show file tree
Hide file tree
Showing 12 changed files with 234 additions and 23 deletions.
44 changes: 33 additions & 11 deletions provider/docs/src/corpus/corpus.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import { CorpusSearchResult } from './search/search'
import { Chunk, chunk } from '../doc/chunker'
import { embeddingsSearch } from './search/embeddings'
import { keywordSearch } from './search/keyword'

/**
* A documentation corpus.
*/
export interface Corpus {
search(query: string): Promise<CorpusSearchResult[]>
search(query: string, keyword: boolean): Promise<CorpusSearchResult[]>
length: number
}

Expand All @@ -13,17 +15,37 @@ export interface Document {
text: string
}

/**
 * A single result returned by a corpus search.
 */
export interface CorpusSearchResult {
/** The `docID` of the document that the result was found in. */
docID: string
/** The relevance score assigned by the search implementation that produced the result. */
score: number
/** The text reported for the match. */
excerpt: string
}

/**
 * A document together with the chunks that its text was split into.
 */
interface StoredDocument {
/** The original document. */
doc: Document
/** The chunks produced from the document's text. */
chunks: Chunk[]
}

/**
 * The stored form of a corpus that the search implementations operate on.
 */
export interface StoredCorpus {
/** One entry per document in the corpus. */
docs: StoredDocument[]
}

/**
 * Builds the stored form of a corpus by chunking every document.
 *
 * A document is chunked as Markdown when its text contains `##`; otherwise the chunker is called
 * without the Markdown hint set.
 *
 * @param docs The documents to store, in corpus order.
 * @returns A stored corpus with one entry (document plus its chunks) per input document.
 */
export function createStoredCorpus(docs: Document[]): StoredCorpus {
    return {
        docs: docs.map(doc => ({
            doc,
            chunks: chunk(doc.text, { isMarkdown: doc.text.includes('##') }),
        })),
    }
}

export function createCorpus(docs: Document[]): Corpus {
const storage = createStoredCorpus(docs)

return {
search: query => {
const terms = query.split(/\s+/)
const results: CorpusSearchResult[] = []
for (const doc of docs) {
if (terms.some(term => doc.text.includes(term))) {
results.push({ docID: doc.docID, score: 1, excerpt: doc.text })
}
}
return Promise.resolve(results)
search: (query, keyword) => {
return Promise.resolve(keyword ? keywordSearch(storage, query) : embeddingsSearch(storage, query))
},
get length(): number {
return docs.length
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import { describe, expect, test } from 'vitest'
import { createCorpus } from '../corpus'
import { CorpusSearchResult, createStoredCorpus } from '../corpus'
import { doc } from '../corpus.test'
import { CorpusSearchResult, embedText, similarity } from './search'
import { embeddingsSearch, embedText, similarity } from './embeddings'

describe('Corpus#search', () => {
describe('embeddingsSearch', () => {
test('finds matches', async () => {
expect(await createCorpus([doc(1, 'a'), doc(2, 'b')]).search('b')).toEqual<CorpusSearchResult[]>([
{ docID: '2', score: 1, excerpt: 'b' },
])
expect(await embeddingsSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<
CorpusSearchResult[]
>([{ docID: '2', score: 1, excerpt: 'b' }])
})
})

Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
import { cos_sim, pipeline } from '@xenova/transformers'
import { CorpusSearchResult, StoredCorpus } from '../corpus'

export interface CorpusSearchResult {
docID: string
score: number
excerpt: string
/**
 * Searches the stored corpus by embedding similarity.
 *
 * The query and each chunk of each stored document are embedded with `embedText`, and each chunk
 * is scored with `cos_sim` against the query vector. Only the single highest-scoring chunk is
 * returned.
 *
 * @param storage The stored corpus whose chunks are searched.
 * @param query The text to search for.
 * @returns An array with the best-scoring chunk, or an empty array if the corpus has no chunks.
 */
export async function embeddingsSearch(storage: StoredCorpus, query: string): Promise<CorpusSearchResult[]> {
    const queryEmbedding = await embedText(query)

    // Flatten to (docID, excerpt) pairs up front; the embedding below still runs one chunk at a
    // time, in document order.
    const candidates = storage.docs.flatMap(({ doc, chunks }) =>
        chunks.map(({ text }) => ({ docID: doc.docID, excerpt: text }))
    )

    const scored: CorpusSearchResult[] = []
    for (const { docID, excerpt } of candidates) {
        const score = cos_sim(queryEmbedding, await embedText(excerpt))
        scored.push({ docID, score, excerpt })
    }

    // Highest score first, then keep only the top hit.
    scored.sort((left, right) => right.score - left.score)
    return scored.slice(0, 1)
}

/**
Expand Down
12 changes: 12 additions & 0 deletions provider/docs/src/corpus/search/keyword.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import { describe, expect, test } from 'vitest'
import { CorpusSearchResult, createStoredCorpus } from '../corpus'
import { doc } from '../corpus.test'
import { keywordSearch } from './keyword'

// Checks that keywordSearch returns only the document whose text contains the query term, with a
// score of 1 and the document text as the excerpt.
describe('keywordSearch', () => {
test('finds matches', () => {
expect(keywordSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<CorpusSearchResult[]>([
{ docID: '2', score: 1, excerpt: 'b' },
])
})
})
12 changes: 12 additions & 0 deletions provider/docs/src/corpus/search/keyword.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import { CorpusSearchResult, StoredCorpus } from '../corpus'

/**
 * Finds the documents whose text contains at least one whitespace-separated term of the query.
 *
 * Matching is a case-sensitive substring test against the full document text. Every match gets a
 * score of 1 and the whole document text as its excerpt. Results are in corpus order.
 *
 * @param storage The stored corpus to search.
 * @param query The search query; it is split on runs of whitespace into terms.
 * @returns One result per matching document, or an empty array when the query contains no terms.
 */
export function keywordSearch(storage: StoredCorpus, query: string): CorpusSearchResult[] {
    // Splitting on whitespace yields '' for an empty query and for leading/trailing whitespace.
    // Every string includes '', so an empty term would match all documents; drop such terms.
    const terms = query.split(/\s+/).filter(term => term.length > 0)
    if (terms.length === 0) {
        return []
    }

    return storage.docs
        .filter(({ doc }) => terms.some(term => doc.text.includes(term)))
        .map(({ doc }) => ({ docID: doc.docID, score: 1, excerpt: doc.text }))
}
52 changes: 52 additions & 0 deletions provider/docs/src/doc/chunker.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import { describe, expect, test } from 'vitest'
import { Chunk, chunk } from './chunker'

// Tests for `chunk`: the empty-input and single-chunk fallback paths, and Markdown section
// splitting.
describe('chunker', () => {
test('empty', () => expect(chunk('', {})).toEqual<Chunk[]>([]))

test('fallback', () => expect(chunk('a', {})).toEqual<Chunk[]>([{ range: { start: 0, end: 1 }, text: 'a' }]))

describe('Markdown', () => {
// NOTE(review): the expected start offsets below (2, 5, 8) equal the running total of the
// heading-marker lengths ('# ', '## ', '## ') only, not the sections' positions in the input.
// Confirm whether `range` is meant to hold true document offsets.
test('by section', () =>
expect(
chunk(
`
# Title
Intro
## Section 1
Body 1
## Section 2
Body 2
`.trim(),
{ isMarkdown: true }
)
).toEqual<Chunk[]>([
{
range: {
start: 2,
end: 16,
},
text: 'Title\n\nIntro',
},
{
range: {
start: 5,
end: 24,
},
text: 'Section 1\n\nBody 1',
},
{
range: {
start: 8,
end: 25,
},
text: 'Section 2\n\nBody 2',
},
]))
})
})
52 changes: 52 additions & 0 deletions provider/docs/src/doc/chunker.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/**
 * Information about the document to help the chunker know how to split the content into logical
 * chunks.
 */
export interface ChunkerHints {
/** When true, the text is chunked as Markdown instead of being returned as a single chunk. */
isMarkdown?: boolean
}

/**
 * A logical piece of a document produced by the chunker.
 */
export interface Chunk {
/**
 * The text of the chunk, stripped of semantically meaningless markup, punctuation, and content.
 * This text need not be present in the original document.
 */
text: string

/**
 * The range in the original document (as character offsets) represented by this chunk. The
 * single-chunk fallback uses `start: 0` and `end: text.length`, i.e. `end` is exclusive there.
 */
range: { start: number; end: number }
}

/**
 * Splits a document's text into chunks, using the hints to pick a strategy.
 *
 * @param text The full text of the document.
 * @param hints Information about the document; `isMarkdown` selects Markdown chunking.
 * @returns The Markdown chunks when `hints.isMarkdown` is set; otherwise no chunks for empty text,
 * or a single chunk covering the whole text.
 */
export function chunk(text: string, hints: ChunkerHints): Chunk[] {
    const { isMarkdown } = hints
    if (isMarkdown) {
        return chunkMarkdown(text)
    }

    // Fallback: the whole (non-empty) text is one chunk.
    const wholeText: Chunk = { text, range: { start: 0, end: text.length } }
    return text.length === 0 ? [] : [wholeText]
}

/**
 * Splits Markdown text into one chunk per heading-delimited section.
 *
 * The text is split at heading markers (one or more `#` at the start of a line, plus any
 * whitespace that follows). Each chunk's `text` is the trimmed content after a marker (the
 * heading title and the body up to the next marker), and its `range` gives the offsets of that
 * untrimmed content in `text`. Content that is empty after trimming produces no chunk.
 *
 * @param text The Markdown text to split.
 * @returns The chunks in document order.
 */
function chunkMarkdown(text: string): Chunk[] {
    const chunks: Chunk[] = []

    // Because the pattern has a capture group, `split` keeps the heading markers in its output:
    // even indexes hold content and odd indexes hold the captured marker (e.g. '## ').
    const parts = text.split(/^(#+\s*)/m)
    let pos = 0
    for (const [index, part] of parts.entries()) {
        const start = pos
        // Advance past every part, markers and content alike, so that `pos` always is the real
        // offset of the next part in `text`.
        pos += part.length

        const isHeadingMarker = index % 2 === 1
        if (isHeadingMarker) {
            continue
        }

        const trimmed = part.trim()
        if (trimmed.length === 0) {
            // Nothing but whitespace (e.g. blank lines before the first heading).
            continue
        }
        chunks.push({ text: trimmed, range: { start, end: pos } })
    }

    return chunks
}
28 changes: 28 additions & 0 deletions provider/docs/src/e2e.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import fs from 'node:fs/promises'
import path from 'node:path'
import { describe, expect, test } from 'vitest'
import { CorpusSearchResult, createCorpus } from './corpus/corpus'

// End-to-end check: the contents of a code file are used as the query against a corpus holding one
// Markdown document, with `keyword` set to false. The expected single result is the section that
// documents `parseAudioURL`.
describe('e2e', () => {
test('urlParsing', async () => {
const docFile = await fs.readFile(path.join(__dirname, 'testdata/corpus/urlParsing.md'), 'utf8')
const codeFile = await fs.readFile(path.join(__dirname, 'testdata/code/urlParsing.ts'), 'utf8')

const corpus = createCorpus([{ docID: '1', text: docFile }])
const results = await corpus.search(codeFile, false)
// Round the scores so they can be compared against a three-decimal literal.
roundScores(results)
expect(results).toEqual<CorpusSearchResult[]>([
{
docID: '1',
excerpt: 'Audio URL parsing\n\nTo parse an audio URL, use the `parseAudioURL` function.',
score: 0.685,
},
])
})
})

/**
 * Rounds the score of every result to three decimal places.
 *
 * The results are modified in place; nothing is returned.
 */
function roundScores(results: CorpusSearchResult[]) {
    const scale = 1000
    results.forEach(entry => {
        const scaled = Math.round(entry.score * scale)
        entry.score = scaled / scale
    })
}
6 changes: 6 additions & 0 deletions provider/docs/src/testdata/code/urlParsing.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// @ts-nocheck

function getAudio(title: string): URL {
const audioFile = searchAudioFiles(title)
return parseAudioURL(audioFile.url)
}
15 changes: 15 additions & 0 deletions provider/docs/src/testdata/corpus/urlParsing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# URL parsing

To parse a URL, use the `parseURL` function.

## Image URL parsing

To parse an image URL, use the `parseImageURL` function.

## Video URL parsing

To parse a video URL, use the `parseVideoURL` function.

## Audio URL parsing

To parse an audio URL, use the `parseAudioURL` function.
2 changes: 1 addition & 1 deletion provider/docs/tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@
"lib": ["ESNext"],
},
"include": ["src"],
"exclude": ["dist", "vitest.config.ts"],
"exclude": ["dist", "src/testdata", "vitest.config.ts"],
"references": [{ "path": "../../lib/provider" }],
}
2 changes: 1 addition & 1 deletion tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"noEmit": true
},
"files": [],
"exclude": ["**/dist", "client/browser"],
"exclude": ["**/dist", "**/testdata", "client/browser"],
"references": [
{ "path": "lib/client" },
{ "path": "lib/protocol" },
Expand Down

0 comments on commit e7dd5d4

Please sign in to comment.