Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 26, 2023
1 parent ae6a54d commit e187100
Show file tree
Hide file tree
Showing 7 changed files with 94 additions and 32 deletions.
1 change: 1 addition & 0 deletions provider/docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ time p run -s docs-query 'making provider work in vscode' $(find ../../web/conte

TODOs:

- simplify cache interface
- deal with different content types (markdown/html) differently
- make it slurp up gdocs/confluence/markdown in repos
- show OpenCodeGraph (OCG) annotations (but more passively, in a way that doesn't overlay lines in the file?)
Expand Down
8 changes: 5 additions & 3 deletions provider/docs/bin/create-web-corpus.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ const args = process.argv.slice(2)

const entryPage = args[0]
const prefix = args[1]
const ignore = args.slice(2)

const USAGE = `\nUsage: ${path.basename(process.argv[1])} <entry-page-url> <prefix-url>`
if (!entryPage || !prefix || args.length !== 2) {
const USAGE = `\nUsage: ${path.basename(process.argv[1])} <entry-page-url> <prefix-url> [ignore]`
if (!entryPage || !prefix || args.length < 2) {
console.error('Error: invalid arguments')
console.error(USAGE)
process.exit(1)
Expand All @@ -17,10 +18,11 @@ if (!entryPage || !prefix || args.length !== 2) {
const corpusSource = createWebCorpusSource({
entryPage: new URL(entryPage),
prefix: new URL(prefix),
ignore,
logger: message => console.error('# ' + message),
})

const data = corpusData(await corpusSource.documents())
const data = corpusData(await corpusSource.docs())

console.error(`# ${data.docs.length} docs`)
console.log(JSON.stringify(data, null, 2))
16 changes: 15 additions & 1 deletion provider/docs/src/corpus/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export interface CorpusIndex {

docs: IndexedDoc[]

doc(id: DocID): IndexedDoc
search(query: string): Promise<CorpusSearchResult[]>
}

Expand Down Expand Up @@ -61,7 +62,20 @@ export async function indexCorpus(
indexedDocs.push({ doc, content, chunks })
}

const index: CorpusIndex = { data, docs: indexedDocs, search: query => multiSearch(index, query, cache) }
const index: CorpusIndex = {
data,
docs: indexedDocs,
doc(id) {
const doc = indexedDocs.find(d => d.doc.id === id)
if (!doc) {
throw new Error(`no document with id ${id} in corpus`)
}
return doc
},
search(query) {
return multiSearch(index, query, cache)
},
}
return index
}

Expand Down
20 changes: 19 additions & 1 deletion provider/docs/src/corpus/source/source.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,23 @@
import { CorpusData } from '../data'
import { Doc } from '../doc/doc'

export interface CorpusSource {
documents(): Promise<Doc[]>
docs(): Promise<Doc[]>
}

/**
 * Create a {@link CorpusSource} backed by corpus data that is either already available or still
 * being loaded.
 *
 * @param data The corpus data, or a promise that resolves to it.
 * @returns A source whose `docs()` resolves to the `docs` of the (awaited) corpus data.
 */
export function corpusDataSource(data: CorpusData | Promise<CorpusData>): CorpusSource {
    // `await` accepts both the plain value and the promise form of `data`.
    const docs = async () => {
        const resolved = await data
        return resolved.docs
    }
    return { docs }
}

/**
 * Create a {@link CorpusSource} whose corpus data is fetched as JSON from a URL.
 *
 * The fetch is started as soon as this function is called, not lazily on the first `docs()` call.
 * If the response status is not OK, the returned source's `docs()` rejects with an error that
 * includes the URL and the HTTP status.
 *
 * @param url The URL to fetch the corpus data from.
 */
export function corpusDataURLSource(url: string): CorpusSource {
    const load = async (): Promise<CorpusData> => {
        const resp = await fetch(url)
        if (!resp.ok) {
            throw new Error(`failed to fetch corpus data from ${url}: ${resp.status} ${resp.statusText}`)
        }
        return resp.json()
    }
    // Invoke the loader right away so the request is issued eagerly.
    return corpusDataSource(load())
}
4 changes: 2 additions & 2 deletions provider/docs/src/corpus/source/web/webCorpusSource.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ describe('createWebCorpusSource', () => {
entryPage: new URL('https://example.com/docs/entry'),
prefix: new URL('https://example.com/docs'),
})
expect(await source.documents()).toEqual<Doc[]>([
expect(await source.docs()).toEqual<Doc[]>([
{ id: 1, text: mockPages['/docs/entry'].body, url: 'https://example.com/docs/entry' },
{ id: 2, text: mockPages['/docs/foo'].body, url: 'https://example.com/docs/foo' },
{ id: 3, text: mockPages['/docs/bar'].body, url: 'https://example.com/docs/bar' },
Expand All @@ -90,7 +90,7 @@ describe('createWebCorpusSource', () => {
entryPage: new URL('https://example.com/a/'),
prefix: new URL('https://example.com'),
})
expect(await source.documents()).toMatchObject<Omit<Doc, 'text'>[]>([{ id: 1, url: 'https://example.com/a' }])
expect(await source.docs()).toMatchObject<Omit<Doc, 'text'>[]>([{ id: 1, url: 'https://example.com/a' }])
})
})

Expand Down
45 changes: 32 additions & 13 deletions provider/docs/src/corpus/source/web/webCorpusSource.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,23 @@ interface WebCorpusSourceOptions {
*/
prefix: URL

/**
* Exclude pages whose URL contains any of these strings.
*/
ignore?: string[]

/**
* Called to print a log message.
*/
logger?: Logger
}

export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSourceOptions): CorpusSource {
export function createWebCorpusSource({ entryPage, prefix, ignore, logger }: WebCorpusSourceOptions): CorpusSource {
return {
documents: async () => {
const { nextURL, enqueueURL, shouldCrawlURL } = createCrawlQueue(url => urlHasPrefix(url, prefix))
docs: async () => {
const { nextURL, enqueueURL, shouldCrawlURL } = createCrawlQueue(
url => urlHasPrefix(url, prefix) && !ignore?.some(ignore => url.href.includes(ignore))
)

if (!shouldCrawlURL(entryPage)) {
throw new Error(`web corpus entryPage (${entryPage}) does not start with prefix (${prefix})`)
Expand Down Expand Up @@ -61,15 +68,16 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo

const canonicalURLStr = dom.querySelector<HTMLLinkElement>("head > link[rel='canonical']")?.href
if (canonicalURLStr && canonicalURLStr !== url.href) {
const canonicalURL = new URL(canonicalURLStr)

// Only trust the canonical URL if it's same-origin, to avoid letting other
// sites pollute this corpus.
if (canonicalURL.origin === url.origin) {
logger?.(`- Found canonical URL: ${canonicalURL}`)
url = canonicalURL
if (!shouldCrawlURL(url)) {
continue
const canonicalURL = parseURL(canonicalURLStr)
if (canonicalURL) {
// Only trust the canonical URL if it's same-origin, to avoid letting other
// sites pollute this corpus.
if (canonicalURL.origin === url.origin) {
logger?.(`- Found canonical URL: ${canonicalURL}`)
url = canonicalURL
if (!shouldCrawlURL(url)) {
continue
}
}
}
}
Expand All @@ -83,7 +91,10 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo
const pageLinks = dom.querySelectorAll<HTMLAnchorElement>('a[href]')
logger?.(`- Found ${pageLinks.length} links on page`)
for (const link of pageLinks) {
enqueueURL(new URL(link.href))
const linkURL = parseURL(link.href)
if (linkURL) {
enqueueURL(linkURL)
}
}
}

Expand All @@ -92,6 +103,14 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo
}
}

/**
 * Parse a URL string without throwing.
 *
 * @param urlStr The string to parse as a URL.
 * @returns The parsed URL, or `undefined` if the `URL` constructor rejects the string.
 */
function parseURL(urlStr: string): URL | undefined {
    let parsed: URL | undefined
    try {
        parsed = new URL(urlStr)
    } catch {
        // Invalid URLs are reported to the caller as undefined rather than as an exception.
        parsed = undefined
    }
    return parsed
}

export function urlHasPrefix(url: URL, prefix: URL): boolean {
// Disallow username and password.
if (url.username || url.password) {
Expand Down
32 changes: 20 additions & 12 deletions provider/docs/src/provider/provider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,19 @@ import { indexCorpus } from '../corpus'
import { createWebStorageCorpusCache } from '../corpus/cache/localStorage'
import { corpusData } from '../corpus/data'
import { extractContentUsingMozillaReadability } from '../corpus/doc/contentExtractor'
import { corpusDataURLSource } from '../corpus/source/source'
import { createWebCorpusSource } from '../corpus/source/web/webCorpusSource'
import { multiplex } from './multiplex'

/** Settings for the docs OpenCodeGraph provider. */
export interface Settings {
entryPage: string
prefix: string
corpus:
| { url: string }
| {
entryPage: string
prefix: string
ignore?: string[]
}
}

const CORPUS_CACHE =
Expand All @@ -26,14 +32,16 @@ const CORPUS_CACHE =
* code from an existing documentation corpus.
*/
export default multiplex<Settings>(async settings => {
const data = corpusData(
await createWebCorpusSource({
entryPage: new URL(settings.entryPage),
prefix: new URL(settings.prefix),
logger: message => console.log(message),
}).documents()
)
const index = await indexCorpus(data, {
const source =
'url' in settings.corpus
? corpusDataURLSource(settings.corpus.url)
: createWebCorpusSource({
entryPage: new URL(settings.corpus.entryPage),
prefix: new URL(settings.corpus.prefix),
ignore: settings.corpus.ignore,
logger: message => console.log(message),
})
const index = await indexCorpus(corpusData(await source.docs()), {
cache: CORPUS_CACHE,
contentExtractor: extractContentUsingMozillaReadability,
})
Expand All @@ -50,10 +58,10 @@ export default multiplex<Settings>(async settings => {

const result: AnnotationsResult = { items: [], annotations: [] }
for (const [i, sr] of searchResults.entries()) {
console.log(i, sr)
const doc = index.doc(sr.doc)
const item: OpenCodeGraphItem = {
id: i.toString(),
title: truncate(sr.excerpt, 50),
title: truncate(doc.content?.title ?? doc.doc.url ?? 'Untitled', 50),
detail: sr.excerpt,
}
result.items.push(item)
Expand Down

0 comments on commit e187100

Please sign in to comment.