Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 26, 2023
1 parent 67bcb87 commit 23825f2
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 14 deletions.
6 changes: 5 additions & 1 deletion provider/docs/bin/docs-query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import envPaths from 'env-paths'
import { createFileSystemCorpusCache } from '../src/corpus/cache/fs'
import { createCorpus } from '../src/corpus/corpus'
import { type CorpusData } from '../src/corpus/data'
import { extractContentUsingMozillaReadability } from '../src/corpus/doc/contentExtractor'

const args = process.argv.slice(2)

Expand Down Expand Up @@ -32,7 +33,10 @@ const corpusData = JSON.parse(await readFile(corpusDataFile, 'utf8')) as CorpusD
const cacheDir = envPaths('opencodegraph-provider-docs').cache
const fsCache = createFileSystemCorpusCache(cacheDir)

// Build the corpus from the loaded docs. The file-system cache is handed to the corpus, and
// Mozilla Readability is used as the content extractor for each doc.
const corpus = createCorpus(corpusData.docs, {
    cache: fsCache,
    contentExtractor: extractContentUsingMozillaReadability,
})
const results = await corpus.search(query)
console.error(`# ${corpus.length} docs in corpus`)
console.error(`# Query: ${JSON.stringify(query)}`)
Expand Down
11 changes: 7 additions & 4 deletions provider/docs/src/corpus/corpus.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { indexCorpus } from '.'
import { indexCorpus, type IndexOptions } from '.'
import { noopCache, type CorpusCache } from './cache/cache'
import { corpusData } from './data'
import { type ChunkIndex } from './doc/chunks'
Expand Down Expand Up @@ -26,12 +26,15 @@ export interface CorpusSearchResult {
/**
 * Options for creating a corpus index.
 *
 * Extends {@link IndexOptions}, so indexing options (such as `contentExtractor`) can be given
 * alongside the corpus-level options declared here.
 */
interface CorpusOptions extends IndexOptions {
    /** Cache used when searching the corpus. `createCorpus` falls back to `noopCache` when omitted. */
    cache?: CorpusCache
}

export function createCorpus(docs: Doc[], { cache = noopCache }: CorpusOptions = { cache: noopCache }): Corpus {
const index = indexCorpus(corpusData(docs))
export function createCorpus(
docs: Doc[],
{ cache = noopCache, ...rest }: CorpusOptions = { cache: noopCache }
): Corpus {
const index = indexCorpus(corpusData(docs), rest)

return {
search: query => multiSearch(index, query, cache),
Expand Down
6 changes: 3 additions & 3 deletions provider/docs/src/corpus/doc/contentExtractor.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ describe('extractContentUsingMozillaReadability', () => {
expect(
extractContentUsingMozillaReadability({
id: 1,
text: '<html><head><title>Bar - MySite</title></head><body><aside><nav><h1><a href="/">MySite</a></h1> <a href="/foo">foo</a></nav></aside><main><h1>Bar</h1> <p>Baz</p></main></body>',
text: '<html><head><title>Bar - MySite</title></head><body><aside><nav><h1><a href="/">MySite</a></h1> <a href="/foo">foo</a></nav></aside><main><h1>Bar</h1>\n<p>Baz</p></main></body>',
})
).toEqual<Content>({
title: 'Bar - MySite',
content: '<main><h2>Bar</h2> <p>Baz</p></main>',
textContent: 'Bar Baz',
content: '<div id="readability-page-1" class="page"><main><h2>Bar</h2>\n<p>Baz</p></main></div>',
textContent: 'Bar\nBaz',
}))
})
4 changes: 2 additions & 2 deletions provider/docs/src/corpus/doc/contentExtractor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ export interface Content {
export type ContentExtractor = (doc: Doc) => Content | null

export const extractContentUsingMozillaReadability: ContentExtractor = doc => {
const info = new Readability(new JSDOM(doc.text, { url: doc.url }).window.document).parse()
const info = new Readability(new JSDOM(doc.text, { url: doc.url }).window.document, { charThreshold: 500 }).parse()
return info
? {
title: info.title,
content: info.content.replace(/^<div id="readability-page-1" class="page">(.*)<\/div>$/, '$1'),
content: info.content,
textContent: info.textContent,
}
: null
Expand Down
10 changes: 6 additions & 4 deletions provider/docs/src/corpus/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,22 @@ export interface CorpusIndex {
*/
export interface IndexedDoc {
    /** The original document. */
    doc: Doc

    /**
     * Content extracted from the document, as populated by `indexCorpus`. Absent when no content
     * extractor was configured or when the extractor returned null for this document.
     */
    content?: Content

    /**
     * Chunks of the document, as populated by `indexCorpus`: produced from the extracted content
     * when there is any, and from the raw document text otherwise.
     */
    chunks: Chunk[]
}

/**
 * Options for indexing a corpus.
 *
 * Exported so that other modules (e.g. the corpus options) can extend it.
 */
export interface IndexOptions {
    /**
     * Optional function that extracts the main content from each document before it is chunked.
     * When omitted, documents are chunked from their raw text.
     */
    contentExtractor?: ContentExtractor
}

/**
 * Indexes every document in the corpus data.
 *
 * For each document the optional `contentExtractor` runs first. The document is then chunked from
 * the extracted content when there is any, and from the raw document text otherwise.
 *
 * @param data The corpus data whose `docs` are indexed.
 * @param options Indexing options; `contentExtractor` is optional.
 * @returns An index holding the original `data` and one {@link IndexedDoc} per input document,
 * in input order.
 */
export function indexCorpus(data: CorpusData, { contentExtractor }: IndexOptions = {}): CorpusIndex {
    const indexedDocs: IndexedDoc[] = []
    for (const doc of data.docs) {
        // The extractor may return null; normalize that to undefined to match the optional
        // `content` field of IndexedDoc.
        const content = contentExtractor?.(doc) ?? undefined

        // NOTE(review): markdown detection inspects the raw doc text even when the extracted
        // content is what gets chunked. Confirm this is intended.
        const chunks = chunk(content?.content ?? doc.text, { isMarkdown: doc.text.includes('##') })

        indexedDocs.push({ doc, content, chunks })
    }
    return { data, docs: indexedDocs }
}

0 comments on commit 23825f2

Please sign in to comment.