Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 26, 2023
1 parent c18e229 commit 4267153
Show file tree
Hide file tree
Showing 9 changed files with 47 additions and 25 deletions.
2 changes: 1 addition & 1 deletion provider/docs/bin/docs-query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ const corpusData = JSON.parse(await readFile(corpusDataFile, 'utf8')) as CorpusD
const cacheDir = envPaths('opencodegraph-provider-docs').cache
const fsCache = createFileSystemCorpusCache(cacheDir)

const corpus = indexCorpus(corpusData, {
const corpus = await indexCorpus(corpusData, {
cache: fsCache,
contentExtractor: extractContentUsingMozillaReadability,
})
Expand Down
2 changes: 1 addition & 1 deletion provider/docs/src/corpus/doc/contentExtractor.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { Content, extractContentUsingMozillaReadability } from './contentExtract
describe('extractContentUsingMozillaReadability', () => {
test('extracts content', () =>
expect(
extractContentUsingMozillaReadability({
extractContentUsingMozillaReadability.extractContent({
id: 1,
text: '<html><head><title>Bar - MySite</title></head><body><aside><nav><h1><a href="/">MySite</a></h1> <a href="/foo">foo</a></nav></aside><main><h1>Bar</h1>\n<p>Baz</p></main></body>',
})
Expand Down
28 changes: 18 additions & 10 deletions provider/docs/src/corpus/doc/contentExtractor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,23 @@ export interface Content {
textContent: string
}

export type ContentExtractor = (doc: Doc) => Content | null
/**
 * A strategy for extracting the main content from a document.
 */
export interface ContentExtractor {
/**
 * A string that identifies this extractor (for example, `'mozillaReadability'`).
 *
 * NOTE(review): the corpus indexer appears to include this in the cache key when memoizing
 * extraction results, so it should be distinct per extractor — confirm against the caching code.
 */
id: string
/**
 * Extracts content from the given document.
 *
 * @param doc The document to extract content from.
 * @returns The extracted content, or `null` when the extractor yields no content for the document.
 */
extractContent(doc: Doc): Content | null
}

export const extractContentUsingMozillaReadability: ContentExtractor = doc => {
const info = new Readability(new JSDOM(doc.text, { url: doc.url }).window.document, { charThreshold: 500 }).parse()
return info
? {
title: info.title,
content: info.content,
textContent: info.textContent,
}
: null
/**
 * A {@link ContentExtractor} that parses the document text with JSDOM and runs Mozilla's
 * Readability over the resulting DOM (with a `charThreshold` of 500).
 *
 * `extractContent` returns the title, content, and text content reported by Readability, or
 * `null` when Readability's `parse()` yields no result.
 */
export const extractContentUsingMozillaReadability: ContentExtractor = {
    id: 'mozillaReadability',
    extractContent(doc) {
        // Build a DOM from the raw document text; the doc's URL is passed through to JSDOM.
        const { document } = new JSDOM(doc.text, { url: doc.url }).window
        const article = new Readability(document, { charThreshold: 500 }).parse()
        if (!article) {
            return null
        }

        // Only expose the fields that make up `Content`.
        const { title, content, textContent } = article
        return { title, content, textContent }
    },
}
4 changes: 2 additions & 2 deletions provider/docs/src/corpus/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ export function doc(id: DocID, text: string): Doc {
}

describe('indexCorpus', () => {
test('#docs', () => {
expect(indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')])).docs.length).toBe(2)
test('#docs', async () => {
expect((await indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')]))).docs.length).toBe(2)
})
})
24 changes: 19 additions & 5 deletions provider/docs/src/corpus/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { noopCache, type CorpusCache } from './cache/cache'
import { memo, noopCache, type CorpusCache } from './cache/cache'
import { type CorpusData } from './data'
import { chunk, type Chunk, type ChunkIndex } from './doc/chunks'
import { type Content, type ContentExtractor } from './doc/contentExtractor'
Expand All @@ -21,7 +21,7 @@ export interface CorpusIndex {
*/
export interface IndexedDoc {
doc: Doc
content?: Content
content: Content | null
chunks: Chunk[]
}

Expand All @@ -47,13 +47,14 @@ export interface IndexOptions {
/**
* Index a corpus.
*/
export function indexCorpus(
export async function indexCorpus(
data: CorpusData,
{ cache = noopCache, contentExtractor }: IndexOptions = { cache: noopCache }
): CorpusIndex {
): Promise<CorpusIndex> {
const indexedDocs: IndexedDoc[] = []

for (const doc of data.docs) {
const content = contentExtractor?.(doc) ?? undefined
const content = await cachedExtractContent(cache, contentExtractor, doc)

const chunks = chunk(content?.content ?? doc.text, { isMarkdown: doc.text.includes('##') })

Expand All @@ -63,3 +64,16 @@ export function indexCorpus(
const index: CorpusIndex = { data, docs: indexedDocs, search: query => multiSearch(index, query, cache) }
return index
}

/**
 * Runs the content extractor on a document, going through `memo` with the given cache.
 *
 * @param cache The cache handed to `memo`.
 * @param extractor The extractor to run; when omitted, the result is `null`.
 * @param doc The document to extract content from.
 * @returns A promise for the extracted content, or for `null` if there is no extractor or the
 * extractor returned `null`.
 */
function cachedExtractContent(
    cache: CorpusCache,
    extractor: ContentExtractor | undefined,
    doc: Doc
): Promise<Content | null> {
    if (extractor) {
        // The key is derived from the doc's URL and text; the extractor's id is part of the
        // second string so that it is distinguished per extractor.
        const docKey = `${doc.url}:${doc.text}`
        const extractorKey = `extractContent:${extractor.id}`
        const runExtractor = (): Promise<Content | null> => Promise.resolve(extractor.extractContent(doc))
        return memo(cache, docKey, extractorKey, runExtractor)
    }
    return Promise.resolve(null)
}
2 changes: 1 addition & 1 deletion provider/docs/src/corpus/search/embeddings.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { embeddingsSearch, embedText, similarity } from './embeddings'

describe('embeddingsSearch', () => {
test('finds matches', async () => {
expect(await embeddingsSearch(indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')])), 'b')).toEqual<
expect(await embeddingsSearch(await indexCorpus(corpusData([doc(1, 'a'), doc(2, 'b')])), 'b')).toEqual<
CorpusSearchResult[]
>([{ doc: 2, chunk: 0, score: 1, excerpt: 'b' }])
})
Expand Down
4 changes: 2 additions & 2 deletions provider/docs/src/corpus/search/keyword.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ import { keywordSearch } from './keyword'
import { calculateTFIDF } from './tfidf'

describe('keywordSearch', () => {
test('finds matches', () => {
expect(keywordSearch(indexCorpus(corpusData([doc(1, 'aaa'), doc(2, 'bbb')])), 'bbb')).toEqual<
test('finds matches', async () => {
expect(keywordSearch(await indexCorpus(corpusData([doc(1, 'aaa'), doc(2, 'bbb')])), 'bbb')).toEqual<
CorpusSearchResult[]
>([
{
Expand Down
4 changes: 2 additions & 2 deletions provider/docs/src/corpus/search/tfidf.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@ import { indexCorpus } from '..'
import { corpusData } from '../data'
import { calculateTFIDF, createIndexForTFIDF } from './tfidf'

describe('createIndexForTFIDF', () => {
describe('createIndexForTFIDF', async () => {
const data = corpusData([
{ id: 1, text: 'a b c c c' },
{ id: 2, text: 'b c d' },
{ id: 3, text: 'c d e' },
])
const docIDs = data.docs.map(({ id }) => id)
const index = indexCorpus(data)
const index = await indexCorpus(data)
const tfidf = createIndexForTFIDF(index.docs)

test('term in 1 doc', () => {
Expand Down
2 changes: 1 addition & 1 deletion provider/docs/src/e2e.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ describe('e2e', () => {
const docFile = await fs.readFile(path.join(__dirname, 'testdata/corpus/urlParsing.md'), 'utf8')
const codeFile = await fs.readFile(path.join(__dirname, 'testdata/code/urlParsing.ts'), 'utf8')

const corpus = indexCorpus(corpusData([{ id: 1, text: docFile }]))
const corpus = await indexCorpus(corpusData([{ id: 1, text: docFile }]))
const results = await corpus.search(codeFile)
roundScores(results)
expect(results).toEqual<CorpusSearchResult[]>([
Expand Down

0 comments on commit 4267153

Please sign in to comment.