From e1871009f701f6519ae24c0ee97b7fbbe880c475 Mon Sep 17 00:00:00 2001
From: Quinn Slack <quinn@slack.org>
Date: Tue, 26 Dec 2023 01:31:11 -0600
Subject: [PATCH] wip: docs provider: corpus URL source, crawl ignore list, doc titles

---
 provider/docs/README.md                       |  1 +
 provider/docs/bin/create-web-corpus.ts        |  8 ++--
 provider/docs/src/corpus/index.ts             | 16 ++++++-
 provider/docs/src/corpus/source/source.ts     | 20 ++++++++-
 .../corpus/source/web/webCorpusSource.test.ts |  4 +-
 .../src/corpus/source/web/webCorpusSource.ts  | 45 +++++++++++++------
 provider/docs/src/provider/provider.ts        | 32 ++++++++-----
 7 files changed, 94 insertions(+), 32 deletions(-)
diff --git a/provider/docs/README.md b/provider/docs/README.md
index 26b6ac12..66cc349d 100644
--- a/provider/docs/README.md
+++ b/provider/docs/README.md
@@ -56,6 +56,7 @@ time p run -s docs-query 'making provider work in vscode' $(find ../../web/conte
 
 TODOs:
 
+- simplify cache interface
 - deal with different content types (markdown/html) differently
 - make it slurp up gdocs/confluence/markdown in repos
 - show OCG annotations (but in a way that doesn't overlay lines in the file, is more passive?)
diff --git a/provider/docs/bin/create-web-corpus.ts b/provider/docs/bin/create-web-corpus.ts
index f45b4a6c..35a72fee 100644
--- a/provider/docs/bin/create-web-corpus.ts
+++ b/provider/docs/bin/create-web-corpus.ts
@@ -6,9 +6,10 @@ const args = process.argv.slice(2)
 
 const entryPage = args[0]
 const prefix = args[1]
+const ignore = args.slice(2)
 
-const USAGE = `\nUsage: ${path.basename(process.argv[1])} <entry-page-url> <prefix-url>`
-if (!entryPage || !prefix || args.length !== 2) {
+const USAGE = `\nUsage: ${path.basename(process.argv[1])} <entry-page-url> <prefix-url> [ignore]`
+if (!entryPage || !prefix || args.length < 2) {
     console.error('Error: invalid arguments')
     console.error(USAGE)
     process.exit(1)
@@ -17,10 +18,11 @@ if (!entryPage || !prefix || args.length !== 2) {
 const corpusSource = createWebCorpusSource({
     entryPage: new URL(entryPage),
     prefix: new URL(prefix),
+    ignore,
     logger: message => console.error('# ' + message),
 })
 
-const data = corpusData(await corpusSource.documents())
+const data = corpusData(await corpusSource.docs())
 
 console.error(`# ${data.docs.length} docs`)
 console.log(JSON.stringify(data, null, 2))
diff --git a/provider/docs/src/corpus/index.ts b/provider/docs/src/corpus/index.ts
index dda55855..411797ee 100644
--- a/provider/docs/src/corpus/index.ts
+++ b/provider/docs/src/corpus/index.ts
@@ -13,6 +13,7 @@ export interface CorpusIndex {
 
     docs: IndexedDoc[]
 
+    doc(id: DocID): IndexedDoc
     search(query: string): Promise<CorpusSearchResult[]>
 }
 
@@ -61,7 +62,20 @@ export async function indexCorpus(
         indexedDocs.push({ doc, content, chunks })
     }
 
-    const index: CorpusIndex = { data, docs: indexedDocs, search: query => multiSearch(index, query, cache) }
+    const index: CorpusIndex = {
+        data,
+        docs: indexedDocs,
+        doc(id) {
+            const doc = indexedDocs.find(d => d.doc.id === id)
+            if (!doc) {
+                throw new Error(`no document with id ${id} in corpus`)
+            }
+            return doc
+        },
+        search(query) {
+            return multiSearch(index, query, cache)
+        },
+    }
     return index
 }
 
diff --git a/provider/docs/src/corpus/source/source.ts b/provider/docs/src/corpus/source/source.ts
index 2eddd418..43dd91c0 100644
--- a/provider/docs/src/corpus/source/source.ts
+++ b/provider/docs/src/corpus/source/source.ts
@@ -1,5 +1,23 @@
+import { CorpusData } from '../data'
 import { Doc } from '../doc/doc'
 
 export interface CorpusSource {
-    documents(): Promise<Doc[]>
+    docs(): Promise<Doc[]>
+}
+
+export function corpusDataSource(data: CorpusData | Promise<CorpusData>): CorpusSource {
+    return {
+        docs: async () => (await data).docs,
+    }
+}
+
+export function corpusDataURLSource(url: string): CorpusSource {
+    return corpusDataSource(
+        fetch(url).then(resp => {
+            if (!resp.ok) {
+                throw new Error(`failed to fetch corpus data from ${url}: ${resp.status} ${resp.statusText}`)
+            }
+            return resp.json()
+        })
+    )
 }
diff --git a/provider/docs/src/corpus/source/web/webCorpusSource.test.ts b/provider/docs/src/corpus/source/web/webCorpusSource.test.ts
index 6083da4e..2778b0e8 100644
--- a/provider/docs/src/corpus/source/web/webCorpusSource.test.ts
+++ b/provider/docs/src/corpus/source/web/webCorpusSource.test.ts
@@ -63,7 +63,7 @@ describe('createWebCorpusSource', () => {
             entryPage: new URL('https://example.com/docs/entry'),
             prefix: new URL('https://example.com/docs'),
         })
-        expect(await source.documents()).toEqual<Doc[]>([
+        expect(await source.docs()).toEqual<Doc[]>([
             { id: 1, text: mockPages['/docs/entry'].body, url: 'https://example.com/docs/entry' },
             { id: 2, text: mockPages['/docs/foo'].body, url: 'https://example.com/docs/foo' },
             { id: 3, text: mockPages['/docs/bar'].body, url: 'https://example.com/docs/bar' },
@@ -90,7 +90,7 @@ describe('createWebCorpusSource', () => {
             entryPage: new URL('https://example.com/a/'),
             prefix: new URL('https://example.com'),
         })
-        expect(await source.documents()).toMatchObject<Omit<Doc, 'text'>[]>([{ id: 1, url: 'https://example.com/a' }])
+        expect(await source.docs()).toMatchObject<Omit<Doc, 'text'>[]>([{ id: 1, url: 'https://example.com/a' }])
     })
 })
 
diff --git a/provider/docs/src/corpus/source/web/webCorpusSource.ts b/provider/docs/src/corpus/source/web/webCorpusSource.ts
index f238942c..535baf12 100644
--- a/provider/docs/src/corpus/source/web/webCorpusSource.ts
+++ b/provider/docs/src/corpus/source/web/webCorpusSource.ts
@@ -15,16 +15,23 @@ interface WebCorpusSourceOptions {
      */
     prefix: URL
 
+    /**
+     * Exclude pages whose full URL (`href`) contains any of these strings as a substring.
+     */
+    ignore?: string[]
+
     /**
      * Called to print a log message.
      */
     logger?: Logger
 }
 
-export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSourceOptions): CorpusSource {
+export function createWebCorpusSource({ entryPage, prefix, ignore, logger }: WebCorpusSourceOptions): CorpusSource {
     return {
-        documents: async () => {
-            const { nextURL, enqueueURL, shouldCrawlURL } = createCrawlQueue(url => urlHasPrefix(url, prefix))
+        docs: async () => {
+            const { nextURL, enqueueURL, shouldCrawlURL } = createCrawlQueue(
+                url => urlHasPrefix(url, prefix) && !ignore?.some(ignore => url.href.includes(ignore))
+            )
 
             if (!shouldCrawlURL(entryPage)) {
                 throw new Error(`web corpus entryPage (${entryPage}) does not start with prefix (${prefix})`)
@@ -61,15 +68,16 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo
 
                 const canonicalURLStr = dom.querySelector<HTMLLinkElement>("head > link[rel='canonical']")?.href
                 if (canonicalURLStr && canonicalURLStr !== url.href) {
-                    const canonicalURL = new URL(canonicalURLStr)
-
-                    // Only trust the canonical URL if it's same-origin, to avoid letting other
-                    // sites pollute this corpus.
-                    if (canonicalURL.origin === url.origin) {
-                        logger?.(`- Found canonical URL: ${canonicalURL}`)
-                        url = canonicalURL
-                        if (!shouldCrawlURL(url)) {
-                            continue
+                    const canonicalURL = parseURL(canonicalURLStr)
+                    if (canonicalURL) {
+                        // Only trust the canonical URL if it's same-origin, to avoid letting other
+                        // sites pollute this corpus.
+                        if (canonicalURL.origin === url.origin) {
+                            logger?.(`- Found canonical URL: ${canonicalURL}`)
+                            url = canonicalURL
+                            if (!shouldCrawlURL(url)) {
+                                continue
+                            }
                         }
                     }
                 }
@@ -83,7 +91,10 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo
                 const pageLinks = dom.querySelectorAll<HTMLAnchorElement>('a[href]')
                 logger?.(`- Found ${pageLinks.length} links on page`)
                 for (const link of pageLinks) {
-                    enqueueURL(new URL(link.href))
+                    const linkURL = parseURL(link.href)
+                    if (linkURL) {
+                        enqueueURL(linkURL)
+                    }
                 }
             }
 
@@ -92,6 +103,14 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo
     }
 }
 
+function parseURL(urlStr: string): URL | undefined {
+    try {
+        return new URL(urlStr)
+    } catch {
+        return undefined
+    }
+}
+
 export function urlHasPrefix(url: URL, prefix: URL): boolean {
     // Disallow username and password.
     if (url.username || url.password) {
diff --git a/provider/docs/src/provider/provider.ts b/provider/docs/src/provider/provider.ts
index 90b2cd38..22978a16 100644
--- a/provider/docs/src/provider/provider.ts
+++ b/provider/docs/src/provider/provider.ts
@@ -9,13 +9,19 @@ import { indexCorpus } from '../corpus'
 import { createWebStorageCorpusCache } from '../corpus/cache/localStorage'
 import { corpusData } from '../corpus/data'
 import { extractContentUsingMozillaReadability } from '../corpus/doc/contentExtractor'
+import { corpusDataURLSource } from '../corpus/source/source'
 import { createWebCorpusSource } from '../corpus/source/web/webCorpusSource'
 import { multiplex } from './multiplex'
 
 /** Settings for the docs OpenCodeGraph provider. */
 export interface Settings {
-    entryPage: string
-    prefix: string
+    corpus:
+        | { url: string }
+        | {
+              entryPage: string
+              prefix: string
+              ignore?: string[]
+          }
 }
 
 const CORPUS_CACHE =
@@ -26,14 +32,16 @@ const CORPUS_CACHE =
  * code from an existing documentation corpus.
  */
 export default multiplex<Settings>(async settings => {
-    const data = corpusData(
-        await createWebCorpusSource({
-            entryPage: new URL(settings.entryPage),
-            prefix: new URL(settings.prefix),
-            logger: message => console.log(message),
-        }).documents()
-    )
-    const index = await indexCorpus(data, {
+    const source =
+        'url' in settings.corpus
+            ? corpusDataURLSource(settings.corpus.url)
+            : createWebCorpusSource({
+                  entryPage: new URL(settings.corpus.entryPage),
+                  prefix: new URL(settings.corpus.prefix),
+                  ignore: settings.corpus.ignore,
+                  logger: message => console.log(message),
+              })
+    const index = await indexCorpus(corpusData(await source.docs()), {
         cache: CORPUS_CACHE,
         contentExtractor: extractContentUsingMozillaReadability,
     })
@@ -50,10 +58,10 @@ export default multiplex<Settings>(async settings => {
 
             const result: AnnotationsResult = { items: [], annotations: [] }
             for (const [i, sr] of searchResults.entries()) {
-                console.log(i, sr)
+                const doc = index.doc(sr.doc)
                 const item: OpenCodeGraphItem = {
                     id: i.toString(),
-                    title: truncate(sr.excerpt, 50),
+                    title: truncate(doc.content?.title ?? doc.doc.url ?? 'Untitled', 50),
                     detail: sr.excerpt,
                 }
                 result.items.push(item)