From e7dd5d487c64fae5ff4b219e0b42e8a594b50a6b Mon Sep 17 00:00:00 2001
From: Quinn Slack <quinn@slack.org>
Date: Sun, 24 Dec 2023 00:44:00 -0600
Subject: [PATCH] wip: add doc chunker; split corpus search into keyword and embeddings

---
 provider/docs/src/corpus/corpus.ts            | 44 ++++++++++++----
 .../{search.test.ts => embeddings.test.ts}    | 12 ++---
 .../search/{search.ts => embeddings.ts}       | 20 +++++--
 .../docs/src/corpus/search/keyword.test.ts    | 12 +++++
 provider/docs/src/corpus/search/keyword.ts    | 12 +++++
 provider/docs/src/doc/chunker.test.ts         | 52 +++++++++++++++++++
 provider/docs/src/doc/chunker.ts              | 52 +++++++++++++++++++
 provider/docs/src/e2e.test.ts                 | 28 ++++++++++
 provider/docs/src/testdata/code/urlParsing.ts |  6 +++
 .../docs/src/testdata/corpus/urlParsing.md    | 15 ++++++
 provider/docs/tsconfig.json                   |  2 +-
 tsconfig.json                                 |  2 +-
 12 files changed, 234 insertions(+), 23 deletions(-)
 rename provider/docs/src/corpus/search/{search.test.ts => embeddings.test.ts} (61%)
 rename provider/docs/src/corpus/search/{search.ts => embeddings.ts} (58%)
 create mode 100644 provider/docs/src/corpus/search/keyword.test.ts
 create mode 100644 provider/docs/src/corpus/search/keyword.ts
 create mode 100644 provider/docs/src/doc/chunker.test.ts
 create mode 100644 provider/docs/src/doc/chunker.ts
 create mode 100644 provider/docs/src/e2e.test.ts
 create mode 100644 provider/docs/src/testdata/code/urlParsing.ts
 create mode 100644 provider/docs/src/testdata/corpus/urlParsing.md

diff --git a/provider/docs/src/corpus/corpus.ts b/provider/docs/src/corpus/corpus.ts
index 8cc9df34..d3d68330 100644
--- a/provider/docs/src/corpus/corpus.ts
+++ b/provider/docs/src/corpus/corpus.ts
@@ -1,10 +1,12 @@
-import { CorpusSearchResult } from './search/search'
+import { Chunk, chunk } from '../doc/chunker'
+import { embeddingsSearch } from './search/embeddings'
+import { keywordSearch } from './search/keyword'
 
 /**
  * A documentation corpus.
  */
 export interface Corpus {
-    search(query: string): Promise<CorpusSearchResult[]>
+    search(query: string, keyword: boolean): Promise<CorpusSearchResult[]>
     length: number
 }
 
@@ -13,17 +15,37 @@ export interface Document {
     text: string
 }
 
+export interface CorpusSearchResult {
+    docID: string
+    score: number
+    excerpt: string
+}
+
+interface StoredDocument {
+    doc: Document
+    chunks: Chunk[]
+}
+
+export interface StoredCorpus {
+    docs: StoredDocument[]
+}
+
+export function createStoredCorpus(docs: Document[]): StoredCorpus {
+    const storage: StoredCorpus = { docs: [] }
+    for (const doc of docs) {
+        const chunks = chunk(doc.text, { isMarkdown: doc.text.includes('##') })
+        storage.docs.push({ doc, chunks })
+    }
+
+    return storage
+}
+
 export function createCorpus(docs: Document[]): Corpus {
+    const storage = createStoredCorpus(docs)
+
     return {
-        search: query => {
-            const terms = query.split(/\s+/)
-            const results: CorpusSearchResult[] = []
-            for (const doc of docs) {
-                if (terms.some(term => doc.text.includes(term))) {
-                    results.push({ docID: doc.docID, score: 1, excerpt: doc.text })
-                }
-            }
-            return Promise.resolve(results)
+        search: (query, keyword) => {
+            return Promise.resolve(keyword ? keywordSearch(storage, query) : embeddingsSearch(storage, query))
         },
         get length(): number {
             return docs.length
diff --git a/provider/docs/src/corpus/search/search.test.ts b/provider/docs/src/corpus/search/embeddings.test.ts
similarity index 61%
rename from provider/docs/src/corpus/search/search.test.ts
rename to provider/docs/src/corpus/search/embeddings.test.ts
index 82a4af0e..5def3423 100644
--- a/provider/docs/src/corpus/search/search.test.ts
+++ b/provider/docs/src/corpus/search/embeddings.test.ts
@@ -1,13 +1,13 @@
 import { describe, expect, test } from 'vitest'
-import { createCorpus } from '../corpus'
+import { CorpusSearchResult, createStoredCorpus } from '../corpus'
 import { doc } from '../corpus.test'
-import { CorpusSearchResult, embedText, similarity } from './search'
+import { embeddingsSearch, embedText, similarity } from './embeddings'
 
-describe('Corpus#search', () => {
+describe('embeddingsSearch', () => {
     test('finds matches', async () => {
-        expect(await createCorpus([doc(1, 'a'), doc(2, 'b')]).search('b')).toEqual<CorpusSearchResult[]>([
-            { docID: '2', score: 1, excerpt: 'b' },
-        ])
+        expect(await embeddingsSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<
+            CorpusSearchResult[]
+        >([{ docID: '2', score: 1, excerpt: 'b' }])
     })
 })
 
diff --git a/provider/docs/src/corpus/search/search.ts b/provider/docs/src/corpus/search/embeddings.ts
similarity index 58%
rename from provider/docs/src/corpus/search/search.ts
rename to provider/docs/src/corpus/search/embeddings.ts
index b77cbc4d..cd50be97 100644
--- a/provider/docs/src/corpus/search/search.ts
+++ b/provider/docs/src/corpus/search/embeddings.ts
@@ -1,9 +1,21 @@
 import { cos_sim, pipeline } from '@xenova/transformers'
+import { CorpusSearchResult, StoredCorpus } from '../corpus'
 
-export interface CorpusSearchResult {
-    docID: string
-    score: number
-    excerpt: string
+export async function embeddingsSearch(storage: StoredCorpus, query: string): Promise<CorpusSearchResult[]> {
+    const queryVec = await embedText(query)
+
+    const results: CorpusSearchResult[] = []
+    for (const { doc, chunks } of storage.docs) {
+        for (const chunk of chunks) {
+            const chunkVec = await embedText(chunk.text)
+            const score = cos_sim(queryVec, chunkVec)
+            results.push({ docID: doc.docID, score, excerpt: chunk.text })
+        }
+    }
+
+    results.sort((a, b) => b.score - a.score)
+
+    return results.slice(0, 1)
 }
 
 /**
diff --git a/provider/docs/src/corpus/search/keyword.test.ts b/provider/docs/src/corpus/search/keyword.test.ts
new file mode 100644
index 00000000..794136f9
--- /dev/null
+++ b/provider/docs/src/corpus/search/keyword.test.ts
@@ -0,0 +1,12 @@
+import { describe, expect, test } from 'vitest'
+import { CorpusSearchResult, createStoredCorpus } from '../corpus'
+import { doc } from '../corpus.test'
+import { keywordSearch } from './keyword'
+
+describe('keywordSearch', () => {
+    test('finds matches', () => {
+        expect(keywordSearch(createStoredCorpus([doc(1, 'a'), doc(2, 'b')]), 'b')).toEqual<CorpusSearchResult[]>([
+            { docID: '2', score: 1, excerpt: 'b' },
+        ])
+    })
+})
diff --git a/provider/docs/src/corpus/search/keyword.ts b/provider/docs/src/corpus/search/keyword.ts
new file mode 100644
index 00000000..ea67739d
--- /dev/null
+++ b/provider/docs/src/corpus/search/keyword.ts
@@ -0,0 +1,12 @@
+import { CorpusSearchResult, StoredCorpus } from '../corpus'
+
+export function keywordSearch(storage: StoredCorpus, query: string): CorpusSearchResult[] {
+    const terms = query.split(/\s+/)
+    const results: CorpusSearchResult[] = []
+    for (const { doc } of storage.docs) {
+        if (terms.some(term => doc.text.includes(term))) {
+            results.push({ docID: doc.docID, score: 1, excerpt: doc.text })
+        }
+    }
+    return results
+}
diff --git a/provider/docs/src/doc/chunker.test.ts b/provider/docs/src/doc/chunker.test.ts
new file mode 100644
index 00000000..990e081f
--- /dev/null
+++ b/provider/docs/src/doc/chunker.test.ts
@@ -0,0 +1,52 @@
+import { describe, expect, test } from 'vitest'
+import { Chunk, chunk } from './chunker'
+
+describe('chunker', () => {
+    test('empty', () => expect(chunk('', {})).toEqual<Chunk[]>([]))
+
+    test('fallback', () => expect(chunk('a', {})).toEqual<Chunk[]>([{ range: { start: 0, end: 1 }, text: 'a' }]))
+
+    describe('Markdown', () => {
+        test('by section', () =>
+            expect(
+                chunk(
+                    `
+# Title
+
+Intro
+
+## Section 1
+
+Body 1
+
+## Section 2
+
+Body 2
+`.trim(),
+                    { isMarkdown: true }
+                )
+            ).toEqual<Chunk[]>([
+                {
+                    range: {
+                        start: 2,
+                        end: 16,
+                    },
+                    text: 'Title\n\nIntro',
+                },
+                {
+                    range: {
+                        start: 5,
+                        end: 24,
+                    },
+                    text: 'Section 1\n\nBody 1',
+                },
+                {
+                    range: {
+                        start: 8,
+                        end: 25,
+                    },
+                    text: 'Section 2\n\nBody 2',
+                },
+            ]))
+    })
+})
diff --git a/provider/docs/src/doc/chunker.ts b/provider/docs/src/doc/chunker.ts
new file mode 100644
index 00000000..d010cdb8
--- /dev/null
+++ b/provider/docs/src/doc/chunker.ts
@@ -0,0 +1,52 @@
+/**
+ * Information about the document to help the chunker know how to split the content into logical
+ * chunks.
+ */
+export interface ChunkerHints {
+    isMarkdown?: boolean
+}
+
+export interface Chunk {
+    /**
+     * The text of the chunk, stripped of semantically meaningless markup, punctuation, and content.
+     * This text need not be present in the original document.
+     */
+    text: string
+
+    /**
+     * The range in the original document (as character offsets) represented by this chunk.
+     */
+    range: { start: number; end: number }
+}
+
+export function chunk(text: string, hints: ChunkerHints): Chunk[] {
+    if (hints.isMarkdown) {
+        return chunkMarkdown(text)
+    }
+    if (text.length === 0) {
+        return []
+    }
+    return [{ text, range: { start: 0, end: text.length } }]
+}
+
+function chunkMarkdown(text: string): Chunk[] {
+    const chunks: Chunk[] = []
+
+    const sections = text.split(/^(#+\s*)/m)
+    let pos = 0
+    for (const section of sections) {
+        if (section.length === 0) {
+            continue
+        }
+        if (section.startsWith('#')) {
+            pos += section.length
+            continue
+        }
+        chunks.push({
+            text: section.trim(),
+            range: { start: pos, end: pos + section.length },
+        })
+    }
+
+    return chunks
+}
diff --git a/provider/docs/src/e2e.test.ts b/provider/docs/src/e2e.test.ts
new file mode 100644
index 00000000..b075cd16
--- /dev/null
+++ b/provider/docs/src/e2e.test.ts
@@ -0,0 +1,28 @@
+import fs from 'node:fs/promises'
+import path from 'node:path'
+import { describe, expect, test } from 'vitest'
+import { CorpusSearchResult, createCorpus } from './corpus/corpus'
+
+describe('e2e', () => {
+    test('urlParsing', async () => {
+        const docFile = await fs.readFile(path.join(__dirname, 'testdata/corpus/urlParsing.md'), 'utf8')
+        const codeFile = await fs.readFile(path.join(__dirname, 'testdata/code/urlParsing.ts'), 'utf8')
+
+        const corpus = createCorpus([{ docID: '1', text: docFile }])
+        const results = await corpus.search(codeFile, false)
+        roundScores(results)
+        expect(results).toEqual<CorpusSearchResult[]>([
+            {
+                docID: '1',
+                excerpt: 'Audio URL parsing\n\nTo parse an audio URL, use the `parseAudioURL` function.',
+                score: 0.685,
+            },
+        ])
+    })
+})
+
+function roundScores(results: CorpusSearchResult[]) {
+    for (const result of results) {
+        result.score = Math.round(result.score * 1000) / 1000
+    }
+}
diff --git a/provider/docs/src/testdata/code/urlParsing.ts b/provider/docs/src/testdata/code/urlParsing.ts
new file mode 100644
index 00000000..8a7c652f
--- /dev/null
+++ b/provider/docs/src/testdata/code/urlParsing.ts
@@ -0,0 +1,6 @@
+// @ts-nocheck
+
+function getAudio(title: string): URL {
+    const audioFile = searchAudioFiles(title)
+    return parseAudioURL(audioFile.url)
+}
diff --git a/provider/docs/src/testdata/corpus/urlParsing.md b/provider/docs/src/testdata/corpus/urlParsing.md
new file mode 100644
index 00000000..864548eb
--- /dev/null
+++ b/provider/docs/src/testdata/corpus/urlParsing.md
@@ -0,0 +1,15 @@
+# URL parsing
+
+To parse a URL, use the `parseURL` function.
+
+## Image URL parsing
+
+To parse an image URL, use the `parseImageURL` function.
+
+## Video URL parsing
+
+To parse a video URL, use the `parseVideoURL` function.
+
+## Audio URL parsing
+
+To parse an audio URL, use the `parseAudioURL` function.
diff --git a/provider/docs/tsconfig.json b/provider/docs/tsconfig.json
index 13999016..87e1b4d1 100644
--- a/provider/docs/tsconfig.json
+++ b/provider/docs/tsconfig.json
@@ -7,6 +7,6 @@
     "lib": ["ESNext"],
   },
   "include": ["src"],
-  "exclude": ["dist", "vitest.config.ts"],
+  "exclude": ["dist", "src/testdata", "vitest.config.ts"],
   "references": [{ "path": "../../lib/provider" }],
 }
diff --git a/tsconfig.json b/tsconfig.json
index e7821777..f5eb76d8 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -4,7 +4,7 @@
     "noEmit": true
   },
   "files": [],
-  "exclude": ["**/dist", "client/browser"],
+  "exclude": ["**/dist", "**/testdata", "client/browser"],
   "references": [
     { "path": "lib/client" },
     { "path": "lib/protocol" },