Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 26, 2023
1 parent 5f89832 commit 67bcb87
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 28 deletions.
30 changes: 30 additions & 0 deletions provider/docs/bin/create-file-corpus.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import { readFile } from 'fs/promises'
import path from 'path'
import { corpusData } from '../src/corpus/data'
import { Doc } from '../src/corpus/doc/doc'

// Every command-line argument is the path of a file that becomes one document in the corpus.
const corpusFiles = process.argv.slice(2)

const USAGE = `\nUsage: ${path.basename(process.argv[1])} <corpus-files>`
if (!corpusFiles.length) {
    console.error('Error: no corpus files specified')
    console.error(USAGE)
    process.exit(1)
}

/**
 * Reads one file as UTF-8 text and wraps it as a document.
 *
 * @param file Path of the file to read.
 * @param index Zero-based position of the file among the command-line arguments; the
 * document ID is this position plus one.
 */
async function readDoc(file: string, index: number) {
    const text = await readFile(file, 'utf8')
    return { id: index + 1, text } satisfies Doc
}

// All files are read concurrently; Promise.all keeps the results in argument order.
const data = corpusData(await Promise.all(corpusFiles.map((file, index) => readDoc(file, index))))

// The summary goes to stderr so that stdout carries only the JSON document.
console.error(`# ${data.docs.length} docs`)
console.log(JSON.stringify(data, null, 2))
10 changes: 6 additions & 4 deletions provider/docs/bin/create-web-corpus.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import path from 'path'
import { corpusData } from '../src/corpus/data'
import { createWebCorpusSource } from '../src/corpus/source/web/webCorpusSource'

// Command-line arguments, without the node executable and script path.
const args = process.argv.slice(2)

// First argument: the entry page; second argument: the prefix (both described as URLs in USAGE).
const entryPage = args[0]
const prefix = args[1]

// Usage text shown on invalid invocation. Declared exactly once: the earlier
// "<query> <corpus-files>" variant described a different tool and redeclared this constant.
const USAGE = `\nUsage: ${path.basename(process.argv[1])} <entry-page-url> <prefix-url>`
if (!entryPage || !prefix || args.length !== 2) {
console.error('Error: invalid arguments')
console.error(USAGE)
Expand All @@ -19,6 +20,7 @@ const corpusSource = createWebCorpusSource({
logger: message => console.error('# ' + message),
})

// Fetch the documents once and pass them to corpusData, so the JSON written here has the same
// top-level shape (an object with a `docs` array) that create-file-corpus emits.
const data = corpusData(await corpusSource.documents())

// The summary goes to stderr; stdout carries exactly one JSON document.
console.error(`# ${data.docs.length} docs`)
console.log(JSON.stringify(data, null, 2))
33 changes: 15 additions & 18 deletions provider/docs/bin/docs-query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,50 +3,47 @@ import path from 'path'
import envPaths from 'env-paths'
import { createFileSystemCorpusCache } from '../src/corpus/cache/fs'
import { createCorpus } from '../src/corpus/corpus'
import { type Doc } from '../src/corpus/doc/doc'
import { type CorpusData } from '../src/corpus/data'

// Command-line arguments, without the node executable and script path.
const args = process.argv.slice(2)

// First argument: path of a corpus data JSON file; second argument: the search query.
const corpusDataFile = args[0]
const query = args[1]

const USAGE = `\nUsage: ${path.basename(process.argv[1])} <corpus-data-file> <query>`
if (!corpusDataFile) {
    console.error('Error: no corpus data file specified (use create-file-corpus or create-web-corpus to create one)')
    console.error(USAGE)
    process.exit(1)
}
if (!query) {
    console.error('Error: no query specified')
    console.error(USAGE)
    process.exit(1)
}
if (args.length !== 2) {
    console.error('Error: invalid arguments')
    console.error(USAGE)
    process.exit(1)
}

// JSON.parse returns untyped data, so check the one property this script relies on before
// treating the value as CorpusData. A bare JSON array of docs, for example, has no `docs`
// property and would otherwise fail later with a less helpful error.
const parsedCorpusData: unknown = JSON.parse(await readFile(corpusDataFile, 'utf8'))
if (
    typeof parsedCorpusData !== 'object' ||
    parsedCorpusData === null ||
    !Array.isArray((parsedCorpusData as { docs?: unknown }).docs)
) {
    console.error('Error: corpus data file does not contain a JSON object with a "docs" array')
    console.error(USAGE)
    process.exit(1)
}
const corpusData = parsedCorpusData as CorpusData

// The corpus is given a file system cache located in the cache directory reported by env-paths.
const cacheDir = envPaths('opencodegraph-provider-docs').cache
const fsCache = createFileSystemCorpusCache(cacheDir)

const corpus = createCorpus(corpusData.docs, { cache: fsCache })
const results = await corpus.search(query)

// Diagnostics (prefixed with "#") go to stderr; the results themselves go to stdout.
console.error(`# ${corpus.length} docs in corpus`)
console.error(`# Query: ${JSON.stringify(query)}`)
const MAX_RESULTS = 5
console.error(`# ${results.length} results${results.length > MAX_RESULTS ? ` (showing top ${MAX_RESULTS})` : ''}`)
for (const [i, result] of results.slice(0, MAX_RESULTS).entries()) {
    // NOTE(review): assumes result.doc is the 1-based position of the doc in corpusData.docs
    // (create-file-corpus numbers docs that way) — confirm this also holds for web corpora.
    const doc = corpusData.docs[result.doc - 1]
    if (i !== 0) {
        // Blank line between consecutive results.
        console.log()
    }
    // Header line: rank, score, then the doc's URL (or "doc<id>" when it has none) and chunk.
    console.log(`#${i + 1} [${result.score.toFixed(3)}] ${doc.url ?? `doc${doc.id}`}#chunk${result.chunk}`)
    // Excerpt with blank lines collapsed, cut to 500 via truncate, indented with a tab.
    console.log(`${indent(truncate(result.excerpt.replaceAll('\n\n', '\n'), 500), '\t')}`)
}

Expand Down
1 change: 1 addition & 0 deletions provider/docs/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"build": "tsc --build",
"test": "vitest",
"docs-query": "node --no-warnings=ExperimentalWarning --experimental-specifier-resolution=node --loader ts-node/esm/transpile-only bin/docs-query.ts",
"create-file-corpus": "node --no-warnings=ExperimentalWarning --experimental-specifier-resolution=node --loader ts-node/esm/transpile-only bin/create-file-corpus.ts",
"create-web-corpus": "node --no-warnings=ExperimentalWarning --experimental-specifier-resolution=node --loader ts-node/esm/transpile-only bin/create-web-corpus.ts"
},
"dependencies": {
Expand Down
3 changes: 2 additions & 1 deletion provider/docs/src/corpus/doc/contentExtractor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ export interface Content {
content: string

/**
* Text content of the document, with all markup removed. Omits all non-content-related
* elements.
*/
textContent: string
}
Expand Down
8 changes: 3 additions & 5 deletions provider/docs/src/corpus/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,11 @@ interface IndexOptions {
}

/**
 * Builds an index over every document in the corpus data.
 *
 * Each document's text is passed to `chunk`, with `isMarkdown` set when the text contains
 * `##`. If a content extractor is supplied, it is also run on the document.
 *
 * @param data The corpus data whose `docs` are indexed.
 * @param options Optional settings; `contentExtractor` is called once per document when given.
 * @returns The original `data` together with one indexed entry per document, in input order.
 */
export function indexCorpus(data: CorpusData, { contentExtractor }: IndexOptions = {}): CorpusIndex {
    // Single accumulator: the result object is assembled once at the end instead of also
    // pushing into a pre-built index and leaving an unreachable second return.
    const indexedDocs: IndexedDoc[] = []
    for (const doc of data.docs) {
        const chunks = chunk(doc.text, { isMarkdown: doc.text.includes('##') })
        // Normalize a missing extractor or a null result to undefined.
        const content = contentExtractor?.(doc) ?? undefined
        indexedDocs.push({ doc, chunks, content })
    }
    return { data, docs: indexedDocs }
}

0 comments on commit 67bcb87

Please sign in to comment.