From 23825f21265443a14cd479421957694c761b6cf9 Mon Sep 17 00:00:00 2001 From: Quinn Slack Date: Mon, 25 Dec 2023 23:44:51 -0600 Subject: [PATCH] wip --- provider/docs/bin/docs-query.ts | 6 +++++- provider/docs/src/corpus/corpus.ts | 11 +++++++---- provider/docs/src/corpus/doc/contentExtractor.test.ts | 6 +++--- provider/docs/src/corpus/doc/contentExtractor.ts | 4 ++-- provider/docs/src/corpus/index.ts | 10 ++++++---- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/provider/docs/bin/docs-query.ts b/provider/docs/bin/docs-query.ts index 996bbdf7..0733591a 100644 --- a/provider/docs/bin/docs-query.ts +++ b/provider/docs/bin/docs-query.ts @@ -4,6 +4,7 @@ import envPaths from 'env-paths' import { createFileSystemCorpusCache } from '../src/corpus/cache/fs' import { createCorpus } from '../src/corpus/corpus' import { type CorpusData } from '../src/corpus/data' +import { extractContentUsingMozillaReadability } from '../src/corpus/doc/contentExtractor' const args = process.argv.slice(2) @@ -32,7 +33,10 @@ const corpusData = JSON.parse(await readFile(corpusDataFile, 'utf8')) as CorpusD const cacheDir = envPaths('opencodegraph-provider-docs').cache const fsCache = createFileSystemCorpusCache(cacheDir) -const corpus = createCorpus(corpusData.docs, { cache: fsCache }) +const corpus = createCorpus(corpusData.docs, { + cache: fsCache, + contentExtractor: extractContentUsingMozillaReadability, +}) const results = await corpus.search(query) console.error(`# ${corpus.length} docs in corpus`) console.error(`# Query: ${JSON.stringify(query)}`) diff --git a/provider/docs/src/corpus/corpus.ts b/provider/docs/src/corpus/corpus.ts index 378d88e8..d5a2cf3f 100644 --- a/provider/docs/src/corpus/corpus.ts +++ b/provider/docs/src/corpus/corpus.ts @@ -1,4 +1,4 @@ -import { indexCorpus } from '.' +import { indexCorpus, type IndexOptions } from '.' import { noopCache, type CorpusCache } from './cache/cache' import { corpusData } from './data' import { type ChunkIndex } from './doc/chunks' @@ -26,12 +26,15 @@ export interface CorpusSearchResult { /** * Options for creating a corpus index. */ -interface CorpusOptions { +interface CorpusOptions extends IndexOptions { cache?: CorpusCache } -export function createCorpus(docs: Doc[], { cache = noopCache }: CorpusOptions = { cache: noopCache }): Corpus { - const index = indexCorpus(corpusData(docs)) +export function createCorpus( + docs: Doc[], + { cache = noopCache, ...rest }: CorpusOptions = { cache: noopCache } +): Corpus { + const index = indexCorpus(corpusData(docs), rest) return { search: query => multiSearch(index, query, cache), diff --git a/provider/docs/src/corpus/doc/contentExtractor.test.ts b/provider/docs/src/corpus/doc/contentExtractor.test.ts index 612cdda3..fea242e2 100644 --- a/provider/docs/src/corpus/doc/contentExtractor.test.ts +++ b/provider/docs/src/corpus/doc/contentExtractor.test.ts @@ -6,11 +6,11 @@ describe('extractContentUsingMozillaReadability', () => { expect( extractContentUsingMozillaReadability({ id: 1, - text: 'Bar - MySite

Bar

Baz

', + text: 'Bar - MySite

Bar

\n

Baz

', }) ).toEqual({ title: 'Bar - MySite', - content: '

Bar

Baz

', - textContent: 'Bar Baz', + content: '

Bar

\n

Baz

', + textContent: 'Bar\nBaz', })) }) diff --git a/provider/docs/src/corpus/doc/contentExtractor.ts b/provider/docs/src/corpus/doc/contentExtractor.ts index 1c01e2e4..330e0165 100644 --- a/provider/docs/src/corpus/doc/contentExtractor.ts +++ b/provider/docs/src/corpus/doc/contentExtractor.ts @@ -24,11 +24,11 @@ export interface Content { export type ContentExtractor = (doc: Doc) => Content | null export const extractContentUsingMozillaReadability: ContentExtractor = doc => { - const info = new Readability(new JSDOM(doc.text, { url: doc.url }).window.document).parse() + const info = new Readability(new JSDOM(doc.text, { url: doc.url }).window.document, { charThreshold: 500 }).parse() return info ? { title: info.title, - content: info.content.replace(/^
(.*)<\/div>$/, '$1'), + content: info.content, textContent: info.textContent, } : null diff --git a/provider/docs/src/corpus/index.ts b/provider/docs/src/corpus/index.ts index 099cb648..2e6398b6 100644 --- a/provider/docs/src/corpus/index.ts +++ b/provider/docs/src/corpus/index.ts @@ -17,20 +17,22 @@ export interface CorpusIndex { */ export interface IndexedDoc { doc: Doc - chunks: Chunk[] content?: Content + chunks: Chunk[] } -interface IndexOptions { +export interface IndexOptions { contentExtractor?: ContentExtractor } export function indexCorpus(data: CorpusData, { contentExtractor }: IndexOptions = {}): CorpusIndex { const indexedDocs: IndexedDoc[] = [] for (const doc of data.docs) { - const chunks = chunk(doc.text, { isMarkdown: doc.text.includes('##') }) const content = contentExtractor?.(doc) ?? undefined - indexedDocs.push({ doc, chunks, content }) + + const chunks = chunk(content?.content ?? doc.text, { isMarkdown: doc.text.includes('##') }) + + indexedDocs.push({ doc, content, chunks }) } return { data, docs: indexedDocs } }