Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 25, 2023
1 parent 8dca16c commit 786958b
Show file tree
Hide file tree
Showing 17 changed files with 110 additions and 31 deletions.
8 changes: 8 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 9 additions & 4 deletions provider/docs/bin/docs-query.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import { readFile } from 'fs/promises'
import path from 'path'
import { createCorpus, Document } from '../src/corpus/corpus'
import envPaths from 'env-paths'
import { createFileSystemCorpusCache } from '../src/corpus/cache/fs'
import { createCorpus, type Document } from '../src/corpus/corpus'

const args = process.argv.slice(2)

Expand Down Expand Up @@ -29,7 +31,10 @@ const docs: Document[] = await Promise.all(
})
)

const corpus = createCorpus(docs)
const cacheDir = envPaths('opencodegraph-provider-docs').cache
const fsCache = createFileSystemCorpusCache(cacheDir)

const corpus = createCorpus(docs, { cache: fsCache })
const results = await corpus.search(query)
console.error(`# ${corpus.length} docs in corpus`)
console.error(`# Query: ${JSON.stringify(query)}`)
Expand All @@ -41,7 +46,7 @@ for (const [i, result] of results.slice(0, MAX_RESULTS).entries()) {
console.log()
}
console.log(`#${i + 1} [${result.score.toFixed(3)}] ${docFile}#chunk${result.chunk}`)
console.log(`${indent(truncate(result.excerpt.replace(/\n\n/g, '\n'), 500), '\t')}`)
console.log(`${indent(truncate(result.excerpt.replaceAll('\n\n', '\n'), 500), '\t')}`)
}

function truncate(text: string, maxLength: number): string {
Expand All @@ -55,5 +60,5 @@ function indent(text: string, indent: string): string {
if (text === '') {
return ''
}
return indent + text.replace(/\n/g, '\n' + indent)
return indent + text.replaceAll('\n', '\n' + indent)
}
3 changes: 2 additions & 1 deletion provider/docs/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
},
"dependencies": {
"@opencodegraph/provider": "workspace:*",
"@xenova/transformers": "^2.12.1"
"@xenova/transformers": "^2.12.1",
"env-paths": "^3.0.0"
}
}
30 changes: 30 additions & 0 deletions provider/docs/src/corpus/cache/cache.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import { type webcrypto } from 'node:crypto'
import { type StoredDocument } from '../corpus'

/**
* A unique identifier for a document's content (based on a hash of the document's text).
*/
export type DocContentID = string

/**
 * Compute the {@link DocContentID} for a document's text: the SHA-256 digest of the text's UTF-8
 * encoding, rendered as a lowercase hexadecimal string (64 characters).
 *
 * @param text The full text of the document.
 * @returns The hex-encoded SHA-256 hash of `text`.
 */
export async function docContentID(text: string): Promise<DocContentID> {
    // Prefer the global Web Crypto object when the runtime provides one; otherwise load it from
    // `node:crypto`. The named `webcrypto` export is used (rather than going through `.default`) so
    // the fallback does not depend on default-export interop of the module system in use.
    const crypto: webcrypto.Crypto =
        (globalThis as { crypto?: webcrypto.Crypto }).crypto ?? (await import('node:crypto')).webcrypto

    // TextEncoder always produces UTF-8 bytes.
    const hashBuffer = await crypto.subtle.digest('SHA-256', new TextEncoder().encode(text))

    // Render each byte as two hex digits so that the result has a fixed width.
    return Array.from(new Uint8Array(hashBuffer), byte => byte.toString(16).padStart(2, '0')).join('')
}

/**
 * The value stored in a {@link CorpusCache} for one document: every property of
 * {@link StoredDocument} except `docID`.
 */
export interface CachedDocument extends Omit<StoredDocument, 'docID'> {}

/**
 * An asynchronous key-value store of {@link CachedDocument} values, keyed by {@link DocContentID}
 * (the hash of the document's text).
 */
export interface CorpusCache {
/**
 * Look up the cached data for the given content ID. The result may be `null`; the file system
 * implementation returns `null` when it has no entry for the ID.
 */
get(docContentID: DocContentID): Promise<CachedDocument | null>
/** Store `doc` as the cached data for the given content ID. */
set(docContentID: DocContentID, doc: CachedDocument): Promise<void>
}
31 changes: 31 additions & 0 deletions provider/docs/src/corpus/cache/fs.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import { mkdir, readFile, writeFile } from 'fs/promises'
import path from 'path'
import { type CachedDocument, type CorpusCache, type DocContentID } from './cache'

/**
* Create a {@link CorpusCache} that stores cache data in the file system.
*/
export function createFileSystemCorpusCache(basePath: string): CorpusCache {
    /** Path of the JSON file that holds the cache entry for the given content ID. */
    function docPath(docContentID: DocContentID): string {
        return path.join(basePath, `doc-${docContentID}.json`)
    }

    return {
        /**
         * Read and parse the cache entry for `docContentID`.
         *
         * @returns The parsed entry, or `null` if no file exists for this ID.
         * @throws Any other file system error, or a `SyntaxError` if the file is not valid JSON.
         */
        async get(docContentID) {
            try {
                const data = await readFile(docPath(docContentID), 'utf8')
                return JSON.parse(data) as CachedDocument
            } catch (error: unknown) {
                // A missing file just means a cache miss; everything else is a real failure.
                if (error instanceof Error && 'code' in error && error.code === 'ENOENT') {
                    return null
                }
                throw error
            }
        },
        /**
         * Write `doc` as pretty-printed JSON to the cache file for `docContentID`, creating the cache
         * directory (and any missing parents) first.
         */
        async set(docContentID, doc) {
            const filePath = docPath(docContentID)
            // Create the directory that will CONTAIN the file. (`path.basename` would yield the file
            // name itself and create a stray `doc-<id>.json` directory in the working directory.)
            await mkdir(path.dirname(filePath), { recursive: true, mode: 0o700 })
            await writeFile(filePath, JSON.stringify(doc, null, 2))
        },
    }
}
6 changes: 3 additions & 3 deletions provider/docs/src/corpus/corpus.test.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import { describe, expect, test } from 'vitest'
import { createCorpus, DocID, Document } from './corpus'
import { createCorpus, type DocID, type Document } from './corpus'

export function doc(docID: DocID | number, text: string): Document {
export function doc(docID: DocID, text: string): Document {
return { docID, text }
}

describe('Corpus', () => {
test('#length', () => {
expect(createCorpus([doc(1, 'a'), doc(2, 'b')]).length).toEqual
expect(createCorpus([doc(1, 'a'), doc(2, 'b')]).length).toBe(2)
})
})
18 changes: 11 additions & 7 deletions provider/docs/src/corpus/corpus.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { Chunk, chunk } from '../doc/chunker'
import { chunk, type Chunk } from '../doc/chunker'
import { type CorpusCache } from './cache/cache'
import { multiSearch } from './search/multi'

/**
Expand Down Expand Up @@ -37,8 +38,13 @@ export interface StoredDocument {
*/
export type ChunkIndex = number

export function createStoredCorpus(docs: Document[]): StoredCorpus {
interface CorpusCreationOptions {
cache?: CorpusCache
}

export function createStoredCorpus(docs: Document[], options: CorpusCreationOptions = {}): StoredCorpus {
const storage: StoredCorpus = { docs: [] }

for (const doc of docs) {
const chunks = chunk(doc.text, { isMarkdown: doc.text.includes('##') })
storage.docs.push({ doc, chunks })
Expand All @@ -47,13 +53,11 @@ export function createStoredCorpus(docs: Document[]): StoredCorpus {
return storage
}

export function createCorpus(docs: Document[]): Corpus {
const storage = createStoredCorpus(docs)
export function createCorpus(docs: Document[], options: CorpusCreationOptions = {}): Corpus {
const storage = createStoredCorpus(docs, options)

return {
search: query => {
return multiSearch(storage, query)
},
search: query => multiSearch(storage, query),
get length(): number {
return docs.length
},
Expand Down
2 changes: 1 addition & 1 deletion provider/docs/src/corpus/search/embeddings.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { describe, expect, test } from 'vitest'
import { CorpusSearchResult, createStoredCorpus } from '../corpus'
import { createStoredCorpus, type CorpusSearchResult } from '../corpus'
import { doc } from '../corpus.test'
import { embeddingsSearch, embedText, similarity } from './embeddings'

Expand Down
2 changes: 1 addition & 1 deletion provider/docs/src/corpus/search/embeddings.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { cos_sim, pipeline } from '@xenova/transformers'
import { CorpusSearchResult, StoredCorpus } from '../corpus'
import { type CorpusSearchResult, type StoredCorpus } from '../corpus'

export async function embeddingsSearch(storage: StoredCorpus, query: string): Promise<CorpusSearchResult[]> {
const queryVec = await embedText(query)
Expand Down
2 changes: 1 addition & 1 deletion provider/docs/src/corpus/search/keyword.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { describe, expect, test } from 'vitest'
import { CorpusSearchResult, createStoredCorpus } from '../corpus'
import { createStoredCorpus, type CorpusSearchResult } from '../corpus'
import { doc } from '../corpus.test'
import { keywordSearch } from './keyword'
import { calculateTFIDF } from './tfidf'
Expand Down
2 changes: 1 addition & 1 deletion provider/docs/src/corpus/search/keyword.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { CorpusSearchResult, StoredCorpus } from '../corpus'
import { type CorpusSearchResult, type StoredCorpus } from '../corpus'
import { terms } from './terms'
import { createIndexForTFIDF } from './tfidf'

Expand Down
2 changes: 1 addition & 1 deletion provider/docs/src/corpus/search/multi.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { ChunkIndex, CorpusSearchResult, DocID, StoredCorpus } from '../corpus'
import { type ChunkIndex, type CorpusSearchResult, type DocID, type StoredCorpus } from '../corpus'
import { embeddingsSearch } from './embeddings'
import { keywordSearch } from './keyword'

Expand Down
4 changes: 2 additions & 2 deletions provider/docs/src/corpus/search/terms.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ export function terms(text: string): Term[] {
return (
text
.toLowerCase()
.split(/[^a-zA-Z0-9-_]+/)
.split(/[^\w-]+/)
// TODO(sqs): get a real stemmer
.map(term => term.replace(/(.*)(?:es|ed|ing|s|ed|er|ing)$/, '$1'))
.map(term => term.replace(/(.*)(?:es|ed|ing|s|er)$/, '$1'))
)
}
6 changes: 3 additions & 3 deletions provider/docs/src/corpus/search/tfidf.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@ import { calculateTFIDF, createIndexForTFIDF } from './tfidf'

describe('createIndexForTFIDF', () => {
const corpus = createStoredCorpus([
{ docID: 1, text: `a b c c c` },
{ docID: 2, text: `b c d` },
{ docID: 3, text: `c d e` },
{ docID: 1, text: 'a b c c c' },
{ docID: 2, text: 'b c d' },
{ docID: 3, text: 'c d e' },
])
const docIDs = corpus.docs.map(({ doc: { docID } }) => docID)
const tfidf = createIndexForTFIDF(corpus)
Expand Down
8 changes: 4 additions & 4 deletions provider/docs/src/corpus/search/tfidf.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { ChunkIndex, DocID, StoredCorpus } from '../corpus'
import { Term, terms } from './terms'
import { type ChunkIndex, type DocID, type StoredCorpus } from '../corpus'
import { terms, type Term } from './terms'

/**
* TF-IDF is a way of measuring the relevance of a term to a document in a corpus. See
Expand Down Expand Up @@ -78,15 +78,15 @@ export function createIndexForTFIDF(storage: StoredCorpus): TFIDF {
throw new Error(`doc ${doc} not found in termLength`)
}
if (typeof docTermLength[chunk] !== 'number') {
throw new Error(`chunk ${chunk} not found in termLength for doc ${doc}`)
throw new TypeError(`chunk ${chunk} not found in termLength for doc ${doc}`)
}

const docTermFrequency = termFrequency.get(doc)
if (!docTermFrequency) {
throw new Error(`doc ${doc} not found in termFrequency`)
}
if (!(docTermFrequency[chunk] instanceof Map)) {
throw new Error(`chunk ${chunk} not found in termFrequency for doc ${doc}`)
throw new TypeError(`chunk ${chunk} not found in termFrequency for doc ${doc}`)
}

return calculateTFIDF({
Expand Down
2 changes: 1 addition & 1 deletion provider/docs/src/doc/chunker.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { describe, expect, test } from 'vitest'
import { Chunk, chunk } from './chunker'
import { chunk, type Chunk } from './chunker'

describe('chunker', () => {
test('empty', () => expect(chunk('', {})).toEqual<Chunk[]>([]))
Expand Down
2 changes: 1 addition & 1 deletion provider/docs/src/e2e.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import fs from 'node:fs/promises'
import path from 'node:path'
import { describe, expect, test } from 'vitest'
import { CorpusSearchResult, createCorpus } from './corpus/corpus'
import { createCorpus, type CorpusSearchResult } from './corpus/corpus'

describe('e2e', () => {
test('urlParsing', async () => {
Expand Down

0 comments on commit 786958b

Please sign in to comment.