-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
234 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
12 changes: 6 additions & 6 deletions
12
...der/docs/src/corpus/search/search.test.ts → ...docs/src/corpus/search/embeddings.test.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
20 changes: 16 additions & 4 deletions
20
provider/docs/src/corpus/search/search.ts → ...ider/docs/src/corpus/search/embeddings.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import { describe, expect, test } from 'vitest' | ||
import { CorpusSearchResult, createStoredCorpus } from '../corpus' | ||
import { doc } from '../corpus.test' | ||
import { keywordSearch } from './keyword' | ||
|
||
// Verifies that keywordSearch returns only the documents whose text contains the query term.
describe('keywordSearch', () => {
    test('finds matches', () => {
        const corpus = createStoredCorpus([doc(1, 'a'), doc(2, 'b')])
        const expected: CorpusSearchResult[] = [{ docID: '2', score: 1, excerpt: 'b' }]

        expect(keywordSearch(corpus, 'b')).toEqual<CorpusSearchResult[]>(expected)
    })
})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import { CorpusSearchResult, StoredCorpus } from '../corpus' | ||
|
||
/**
 * Naive keyword search over a stored corpus.
 *
 * The query is split on whitespace into terms; a document matches when its text contains at
 * least one term as a (case-sensitive) substring. Every match gets a fixed score of 1 and the
 * whole document text as its excerpt.
 *
 * @param storage The corpus whose `docs` are scanned.
 * @param query Whitespace-separated search terms.
 * @returns One result per matching document, in corpus order. Empty when the query contains no
 * non-whitespace characters.
 */
export function keywordSearch(storage: StoredCorpus, query: string): CorpusSearchResult[] {
    // Splitting an empty or whitespace-padded query yields '' entries, and `includes('')` is
    // true for every string, which would make every document match. Drop the empty terms.
    const terms = query.split(/\s+/).filter(term => term.length > 0)
    if (terms.length === 0) {
        return []
    }

    const results: CorpusSearchResult[] = []
    for (const { doc } of storage.docs) {
        if (terms.some(term => doc.text.includes(term))) {
            results.push({ docID: doc.docID, score: 1, excerpt: doc.text })
        }
    }
    return results
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import { describe, expect, test } from 'vitest' | ||
import { Chunk, chunk } from './chunker' | ||
|
||
// Tests for `chunk`: empty input, the non-Markdown fallback, and Markdown split by heading. | ||
describe('chunker', () => { | ||
// Empty text with no hints yields no chunks. | ||
test('empty', () => expect(chunk('', {})).toEqual<Chunk[]>([])) | ||
|
||
// Without hints, the whole text comes back as one chunk spanning offsets 0..length. | ||
test('fallback', () => expect(chunk('a', {})).toEqual<Chunk[]>([{ range: { start: 0, end: 1 }, text: 'a' }])) | ||
|
||
// NOTE(review): the expected `range` offsets in the Markdown case are not positions of the | ||
// sections in the input (e.g. 'Section 1' cannot begin at offset 5, because '# Title' alone | ||
// is longer than that). Confirm whether the expectations or the chunker's offsets are wrong. | ||
describe('Markdown', () => { | ||
test('by section', () => | ||
expect( | ||
chunk( | ||
` | ||
# Title | ||
Intro | ||
## Section 1 | ||
Body 1 | ||
## Section 2 | ||
Body 2 | ||
`.trim(), | ||
{ isMarkdown: true } | ||
) | ||
).toEqual<Chunk[]>([ | ||
{ | ||
range: { | ||
start: 2, | ||
end: 16, | ||
}, | ||
text: 'Title\n\nIntro', | ||
}, | ||
{ | ||
range: { | ||
start: 5, | ||
end: 24, | ||
}, | ||
text: 'Section 1\n\nBody 1', | ||
}, | ||
{ | ||
range: { | ||
start: 8, | ||
end: 25, | ||
}, | ||
text: 'Section 2\n\nBody 2', | ||
}, | ||
])) | ||
}) | ||
}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
/** | ||
* Information about the document to help the chunker know how to split the content into logical | ||
* chunks. All hints are optional; `chunk` accepts an empty object. | ||
*/ | ||
export interface ChunkerHints { | ||
/** When truthy, `chunk` splits the text with its Markdown splitter instead of returning it whole. */ | ||
isMarkdown?: boolean | ||
} | ||
|
||
/** One piece of a document, as returned by `chunk`. */ | ||
export interface Chunk { | ||
/** | ||
* The text of the chunk, stripped of semantically meaningless markup, punctuation, and content. | ||
* This text need not be present in the original document. | ||
*/ | ||
text: string | ||
|
||
/** | ||
* The range in the original document (as character offsets) represented by this chunk. | ||
* `end` is exclusive: the whole-document fallback chunk uses `{ start: 0, end: text.length }`. | ||
*/ | ||
range: { start: number; end: number } | ||
} | ||
|
||
/**
 * Splits a document into chunks.
 *
 * Markdown documents (per `hints.isMarkdown`) are handed to the Markdown splitter. Any other
 * document is returned as a single chunk covering the whole text, or as no chunks at all when
 * the text is empty.
 *
 * @param text The full document text.
 * @param hints Optional information about the document's format.
 * @returns The chunks of the document, in document order.
 */
export function chunk(text: string, hints: ChunkerHints): Chunk[] {
    if (!hints.isMarkdown) {
        const wholeDocument: Chunk = { text, range: { start: 0, end: text.length } }
        return text.length > 0 ? [wholeDocument] : []
    }
    return chunkMarkdown(text)
}
|
||
/**
 * Splits Markdown text into one chunk per heading-delimited section.
 *
 * Each chunk's `text` is the section content (heading title plus body) with surrounding
 * whitespace trimmed. Its `range` covers the untrimmed content in the original text, starting
 * just after the heading marker (`#`s and following whitespace) and ending where the next
 * heading marker begins, so trailing whitespace is inside the range even though it is trimmed
 * from `text`. Sections that contain only whitespace produce no chunk.
 *
 * @param text The Markdown document.
 * @returns The section chunks, in document order.
 */
function chunkMarkdown(text: string): Chunk[] {
    const chunks: Chunk[] = []

    // Splitting on a regexp with a single capture group yields alternating pieces:
    // [content, marker, content, marker, ...]. Concatenated, they reproduce `text` exactly,
    // so a running sum of piece lengths gives each piece's offset in the original document.
    const pieces = text.split(/^(#+\s*)/m)
    let pos = 0
    pieces.forEach((piece, index) => {
        const start = pos
        // Advance past every piece, content as well as markers, to keep offsets in sync.
        pos += piece.length

        // Odd indexes are the captured heading markers. Classify by position rather than by a
        // leading '#', because section content may itself begin with '#' (e.g. '# #tag').
        if (index % 2 === 1) {
            return
        }

        const trimmed = piece.trim()
        if (trimmed.length === 0) {
            return
        }
        chunks.push({ text: trimmed, range: { start, end: pos } })
    })

    return chunks
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import fs from 'node:fs/promises' | ||
import path from 'node:path' | ||
import { describe, expect, test } from 'vitest' | ||
import { CorpusSearchResult, createCorpus } from './corpus/corpus' | ||
|
||
// End-to-end check: searching a one-document corpus with a code file as the query returns the
// documentation section about the function that the code calls.
describe('e2e', () => {
    test('urlParsing', async () => {
        const readTestdata = (relativePath: string): Promise<string> =>
            fs.readFile(path.join(__dirname, relativePath), 'utf8')

        const docFile = await readTestdata('testdata/corpus/urlParsing.md')
        const codeFile = await readTestdata('testdata/code/urlParsing.ts')

        const expected: CorpusSearchResult[] = [
            {
                docID: '1',
                excerpt: 'Audio URL parsing\n\nTo parse an audio URL, use the `parseAudioURL` function.',
                score: 0.685,
            },
        ]

        const results = await createCorpus([{ docID: '1', text: docFile }]).search(codeFile, false)
        // Scores are floating-point; round them so the comparison is stable.
        roundScores(results)
        expect(results).toEqual<CorpusSearchResult[]>(expected)
    })
})
|
||
/**
 * Rounds the `score` of every result to three decimal places, mutating the results in place.
 *
 * @param results The search results whose scores are rounded.
 */
function roundScores(results: CorpusSearchResult[]) {
    const scale = 1000
    results.forEach(entry => {
        entry.score = Math.round(entry.score * scale) / scale
    })
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
// @ts-nocheck | ||
|
||
// Looks up the audio file for `title` with searchAudioFiles and returns its `url` passed through parseAudioURL. | ||
// NOTE(review): neither helper is defined here and type checking is disabled above; presumably this is | ||
// fixture text used as a search query rather than runnable code, so edits may shift test scores. Confirm. | ||
function getAudio(title: string): URL { | ||
const audioFile = searchAudioFiles(title) | ||
return parseAudioURL(audioFile.url) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# URL parsing | ||
|
||
To parse a URL, use the `parseURL` function. | ||
|
||
## Image URL parsing | ||
|
||
To parse an image URL, use the `parseImageURL` function. | ||
|
||
## Video URL parsing | ||
|
||
To parse a video URL, use the `parseVideoURL` function. |
|
||
## Audio URL parsing | ||
|
||
To parse an audio URL, use the `parseAudioURL` function. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters