diff --git a/client/vscode/test/fixtures/workspace/.vscode/settings.json b/client/vscode/test/fixtures/workspace/.vscode/settings.json index f9e34508..247d8282 100644 --- a/client/vscode/test/fixtures/workspace/.vscode/settings.json +++ b/client/vscode/test/fixtures/workspace/.vscode/settings.json @@ -3,8 +3,9 @@ "opencodegraph.debug": true, "cody.autocomplete.experimental.graphContext": "opencodegraph", "opencodegraph.providers": { - "../../../../../../provider/docs/dist/provider.js": { - "index": "http://localhost:5900/@fs/home/sqs/tmp/ocg-provider-docs/vite-docs-web.index.json", + "../../../../../../provider/docs/dist/provider.cjs": { + // "index": "http://localhost:5900/@fs/home/sqs/tmp/ocg-provider-docs/vite-docs-web.index.json", + "index": "file:///home/sqs/tmp/ocg-provider-docs/vite-docs-web.index.json", }, }, "opencodegraph.providers2": { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 78014fc9..70acf0bd 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -489,17 +489,11 @@ importers: specifier: workspace:* version: link:../../lib/provider '@xenova/transformers': - specifier: ^2.12.1 - version: 2.12.1 - better-localstorage: - specifier: ^1.0.5 - version: 1.0.5 + specifier: ^2.13.4 + version: 2.13.4 buffer: specifier: ^6.0.3 version: 6.0.3 - env-paths: - specifier: ^3.0.0 - version: 3.0.0 jsdom: specifier: ^23.0.1 version: 23.0.1 @@ -6319,8 +6313,8 @@ packages: resolution: {integrity: sha512-CqTpxOlUCPWRNUPZDxT5v2NnHXA4oox612iUGnmTUGQFhZ1Gkj8kirtl/2wcF6MqX7+PqqicZzOCBKKfIn0dww==} dev: true - /@xenova/transformers@2.12.1: - resolution: {integrity: sha512-b++KSLKezi9Vic9VPXBc/egE5dTw11fvqCcWRg+AQgS+hLGNc7E/sL6JRNhnZ4NmKW0Sx/2gKs33rYllTC1xKA==} + /@xenova/transformers@2.13.4: + resolution: {integrity: sha512-yk8yDvQaCTEZJsasoUj+FWEM9dVcNdDXlushzJ0KjFe2oUOJ3XqICAJz1Htz/vWtM20ErGt509EKogACGYlpxA==} dependencies: '@huggingface/jinja': 0.1.2 onnxruntime-web: 1.14.0 @@ -6804,10 +6798,6 @@ packages: is-stream: 2.0.0 dev: true - /better-localstorage@1.0.5: - resolution: {integrity: sha512-fRS8BjU/Fl2Aeq4HLY7QPN5qaasYpZ9B4GB3ljEoBuo46gj36gDw6Yfi33ZHEklI1n707c1JmdZI9BK9kAebPA==} - dev: false - /better-opn@3.0.2: resolution: {integrity: sha512-aVNobHnJqLiUelTaHat9DZ1qM2w0C0Eym4LPI/3JxOnSokGVdsl1T1kN7TFvsEAD8G47A6VKQ0TVHqbBnYMJlQ==} engines: {node: '>=12.0.0'} @@ -7948,11 +7938,6 @@ packages: resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==} engines: {node: '>=0.12'} - /env-paths@3.0.0: - resolution: {integrity: sha512-dtJUTepzMW3Lm/NPxRf3wP4642UWhjL2sQxc+ym2YMj1m/H2zDNQOlezafzkHwn6sMstjHTwG6iQQsctDW/b1A==} - engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} - dev: false - /envinfo@7.10.0: resolution: {integrity: sha512-ZtUjZO6l5mwTHvc1L9+1q5p/R3wTopcfqMW8r5t8SJSKqeVI/LtajORwRFEKpEFuekjD0VBjwu1HMxL4UalIRw==} engines: {node: '>=4'} diff --git a/provider/docs/bin/create-archive.ts b/provider/docs/bin/create-archive.ts index 536e192a..86c992fb 100644 --- a/provider/docs/bin/create-archive.ts +++ b/provider/docs/bin/create-archive.ts @@ -1,8 +1,8 @@ import { readFile } from 'fs/promises' import path from 'path' -import { CorpusArchive, createCorpusArchive } from '../src/corpus/archive/corpusArchive' -import { createWebCorpusArchive, WebCorpusArchiveOptions } from '../src/corpus/archive/web/webCorpusArchive' -import { Doc } from '../src/corpus/doc/doc' +import { createCorpusArchive, type CorpusArchive } from '../src/corpus/archive/corpusArchive.ts' +import { createWebCorpusArchive, type WebCorpusArchiveOptions } from '../src/corpus/archive/web/webCorpusArchive.ts' +import { type Doc } from '../src/corpus/doc/doc.ts' type ArchiveKind = 'web' | 'file' const ARCHIVE_KINDS: Record< diff --git a/provider/docs/bin/create-index.ts b/provider/docs/bin/create-index.ts index 6c5795da..490d6675 100644 --- a/provider/docs/bin/create-index.ts +++ b/provider/docs/bin/create-index.ts @@ -1,7 +1,7 @@ import path from 'path' -import { CorpusArchive } from '../src/corpus/archive/corpusArchive' -import { extractContentUsingMozillaReadability } from '../src/corpus/doc/contentExtractor' -import { createCorpusIndex } from '../src/corpus/index/corpusIndex' +import { type CorpusArchive } from '../src/corpus/archive/corpusArchive.ts' +import { extractContentUsingMozillaReadability } from '../src/corpus/doc/contentExtractor.ts' +import { createCorpusIndex } from '../src/corpus/index/corpusIndex.ts' function usage(): void { console.error() diff --git a/provider/docs/bin/search.ts b/provider/docs/bin/search.ts index 378f258a..cd92dee0 100644 --- a/provider/docs/bin/search.ts +++ b/provider/docs/bin/search.ts @@ -1,7 +1,7 @@ import { readFile } from 'fs/promises' import path from 'path' -import { createClient } from '../src/client/client' -import { fromJSON } from '../src/corpus/index/corpusIndex' +import { createClient } from '../src/client/client.ts' +import { fromJSON } from '../src/corpus/index/corpusIndex.ts' const args = process.argv.slice(2) diff --git a/provider/docs/package.json b/provider/docs/package.json index cf7f8749..56be901e 100644 --- a/provider/docs/package.json +++ b/provider/docs/package.json @@ -17,7 +17,7 @@ ], "sideEffects": false, "scripts": { - "build": "tsc --build && esbuild src/provider/provider.ts --bundle --outfile=dist/provider.js --platform=browser --format=esm --sourcemap --external:node:crypto --external:fs --external:node:os --define:self=global --loader:.node=file --banner:js=\"const require = (await import('node:module')).createRequire(import.meta.url);const __filename = (await import('node:url')).fileURLToPath(import.meta.url);const __dirname = (await import('node:path')).dirname(__filename);\"", + "build": "tsc --build && esbuild src/provider/provider.ts --bundle --outfile=dist/provider.cjs --platform=node --format=cjs --sourcemap --define:self=global --loader:.node=file --alias:sharp=/dev/null --define:import.meta.url=import_meta_url --inject:./polyfill1.js", "test": "vitest", "create-archive": "node --no-warnings=ExperimentalWarning --experimental-specifier-resolution=node --loader ts-node/esm/transpile-only bin/create-archive.ts", "create-index": "node --no-warnings=ExperimentalWarning --experimental-specifier-resolution=node --loader ts-node/esm/transpile-only bin/create-index.ts", @@ -26,10 +26,8 @@ "dependencies": { "@mozilla/readability": "^0.5.0", "@opencodegraph/provider": "workspace:*", - "@xenova/transformers": "^2.12.1", - "better-localstorage": "^1.0.5", + "@xenova/transformers": "^2.13.4", "buffer": "^6.0.3", - "env-paths": "^3.0.0", "jsdom": "^23.0.1", "lru-cache": "^10.1.0", "onnxruntime-web": "*" diff --git a/provider/docs/polyfill1.js b/provider/docs/polyfill1.js new file mode 100644 index 00000000..7b94bceb --- /dev/null +++ b/provider/docs/polyfill1.js @@ -0,0 +1,3 @@ +export const import_meta_url = + typeof document === 'undefined' ? new (require('url'.replace('', '')).URL)('file:' + __filename).href : + (document.currentScript && document.currentScript.src || new URL('main.js', document.baseURI).href) diff --git a/provider/docs/src/client/client.ts b/provider/docs/src/client/client.ts index 4ff5a76b..685d4647 100644 --- a/provider/docs/src/client/client.ts +++ b/provider/docs/src/client/client.ts @@ -1,8 +1,8 @@ -import { type DocID } from '../corpus/doc/doc' -import { type CorpusIndex, type IndexedDoc } from '../corpus/index/corpusIndex' -import { type Logger } from '../logger' -import { type Query, type SearchResult } from '../search/types' -import { search } from './search' +import { type DocID } from '../corpus/doc/doc.ts' +import { type CorpusIndex, type IndexedDoc } from '../corpus/index/corpusIndex.ts' +import { type Logger } from '../logger.ts' +import { type Query, type SearchResult } from '../search/types.ts' +import { search } from './search.ts' /** * A client for searching a {@link CorpusIndex}. diff --git a/provider/docs/src/client/search.ts b/provider/docs/src/client/search.ts index 92b0c77b..87834cc0 100644 --- a/provider/docs/src/client/search.ts +++ b/provider/docs/src/client/search.ts @@ -1,10 +1,10 @@ -import { type ChunkIndex } from '../corpus/doc/chunks' -import { type DocID } from '../corpus/doc/doc' -import { type CorpusIndex } from '../corpus/index/corpusIndex' -import { type Logger } from '../logger' -import { embeddingsSearch } from '../search/embeddings' -import { keywordSearch } from '../search/keyword' -import { type Query, type SearchResult } from '../search/types' +import { type ChunkIndex } from '../corpus/doc/chunks.ts' +import { type DocID } from '../corpus/doc/doc.ts' +import { type CorpusIndex } from '../corpus/index/corpusIndex.ts' +import { type Logger } from '../logger.ts' +import { embeddingsSearch } from '../search/embeddings.ts' +import { keywordSearch } from '../search/keyword.ts' +import { type Query, type SearchResult } from '../search/types.ts' export interface SearchOptions { logger?: Logger diff --git a/provider/docs/src/corpus/archive/corpusArchive.ts b/provider/docs/src/corpus/archive/corpusArchive.ts index db7eea8b..2d353414 100644 --- a/provider/docs/src/corpus/archive/corpusArchive.ts +++ b/provider/docs/src/corpus/archive/corpusArchive.ts @@ -1,5 +1,5 @@ -import { contentID } from '../cache/contentID' -import { type Doc } from '../doc/doc' +import { contentID } from '../cache/contentID.ts' +import { type Doc } from '../doc/doc.ts' export interface CorpusArchive { /** diff --git a/provider/docs/src/corpus/archive/web/webCorpusArchive.test.ts b/provider/docs/src/corpus/archive/web/webCorpusArchive.test.ts index b71550c0..c4f09dbd 100644 --- a/provider/docs/src/corpus/archive/web/webCorpusArchive.test.ts +++ b/provider/docs/src/corpus/archive/web/webCorpusArchive.test.ts @@ -1,7 +1,7 @@ import { afterAll, afterEach, beforeAll, describe, expect, test, vi } from 'vitest' import createFetchMock from 'vitest-fetch-mock' -import { type Doc } from '../../doc/doc' -import { createWebCorpusArchive, urlHasPrefix } from './webCorpusArchive' +import { type Doc } from '../../doc/doc.ts' +import { createWebCorpusArchive, urlHasPrefix } from './webCorpusArchive.ts' describe('createWebCorpusSource', () => { const fetchMocker = createFetchMock(vi) diff --git a/provider/docs/src/corpus/archive/web/webCorpusArchive.ts b/provider/docs/src/corpus/archive/web/webCorpusArchive.ts index aa122586..d2fcdd29 100644 --- a/provider/docs/src/corpus/archive/web/webCorpusArchive.ts +++ b/provider/docs/src/corpus/archive/web/webCorpusArchive.ts @@ -1,8 +1,8 @@ -import { parseDOM } from '../../../dom' -import { type Logger } from '../../../logger' -import { type Doc } from '../../doc/doc' -import { createCorpusArchive, type CorpusArchive } from '../corpusArchive' -import { createCrawlQueue } from './crawlQueue' +import { parseDOM } from '../../../dom.ts' +import { type Logger } from '../../../logger.ts' +import { type Doc } from '../../doc/doc.ts' +import { createCorpusArchive, type CorpusArchive } from '../corpusArchive.ts' +import { createCrawlQueue } from './crawlQueue.ts' export interface WebCorpusArchiveOptions { /** diff --git a/provider/docs/src/corpus/cache/cache.test.ts b/provider/docs/src/corpus/cache/cache.test.ts deleted file mode 100644 index 9e783fcf..00000000 --- a/provider/docs/src/corpus/cache/cache.test.ts +++ /dev/null @@ -1,26 +0,0 @@ -import { describe, expect, test } from 'vitest' -import { createCache, type Cache, type CacheStore } from './cache' - -export function createTestCache(): { data: Map; store: CacheStore; cache: Cache } { - const data = new Map() - const store: CacheStore = { - get: key => Promise.resolve(data.get(key) ?? null), - set: (key, value) => { - data.set(key, value) - return Promise.resolve(undefined) - }, - } - const cache = createCache(store) - return { data, store, cache } -} - -describe('Cache', () => { - test('get and set', async () => { - const { data, cache } = createTestCache() - await cache.set('k0', 'v0') - await cache.set('k1', 'v1') - await cache.set('k0', 'v2') - expect(data.get('k0')).toBe('v2') - expect(data.get('k1')).toBe('v1') - }) -}) diff --git a/provider/docs/src/corpus/cache/cache.ts b/provider/docs/src/corpus/cache/cache.ts deleted file mode 100644 index 43bc0335..00000000 --- a/provider/docs/src/corpus/cache/cache.ts +++ /dev/null @@ -1,35 +0,0 @@ -/** - * The low-level storage backend for a {@link Cache}. - */ -export interface CacheStore { - /** @returns null if no cache entry found */ - get(key: string): Promise - - set(key: string, value: unknown): Promise -} - -/** - * The high-level interface for a cache. - */ -export interface Cache { - get(key: string): Promise - set(key: string, value: T): Promise -} - -/** - * Create a high-level {@link Cache} from an underlying {@link CacheStore} storage backend. - */ -export function createCache(store: CacheStore): Cache { - return { - get: async key => store.get(key), - set: async (key, value) => store.set(key, value), - } -} - -/** - * A no-op {@link Cache} that always misses and never stores. - */ -export const noopCache: Cache = { - get: async () => Promise.resolve(null), - set: async () => {}, -} diff --git a/provider/docs/src/corpus/cache/contentID.test.ts b/provider/docs/src/corpus/cache/contentID.test.ts index 4ea16e49..8d0a8977 100644 --- a/provider/docs/src/corpus/cache/contentID.test.ts +++ b/provider/docs/src/corpus/cache/contentID.test.ts @@ -1,5 +1,5 @@ import { describe, expect, test } from 'vitest' -import { contentID } from './contentID' +import { contentID } from './contentID.ts' describe('contentID', () => { test('returns the content ID', async () => { diff --git a/provider/docs/src/corpus/cache/memo.test.ts b/provider/docs/src/corpus/cache/memo.test.ts deleted file mode 100644 index 6e2c1317..00000000 --- a/provider/docs/src/corpus/cache/memo.test.ts +++ /dev/null @@ -1,24 +0,0 @@ -import { describe, expect, test } from 'vitest' -import { createTestCache } from './cache.test' -import { memo } from './memo' - -describe('memo', () => { - test('memoizes', async () => { - let calls = 0 - const fn = () => { - calls++ - return Promise.resolve(true) - } - const { data, cache } = createTestCache() - - const v0 = await memo(cache, 'k', fn) - expect(v0).toBe(true) - expect(calls).toBe(1) - expect(data.size).toBe(1) - - const v1 = await memo(cache, 'k', fn) - expect(v1).toBe(true) - expect(calls).toBe(1) - expect(data.size).toBe(1) - }) -}) diff --git a/provider/docs/src/corpus/cache/memo.ts b/provider/docs/src/corpus/cache/memo.ts deleted file mode 100644 index c6f50df7..00000000 --- a/provider/docs/src/corpus/cache/memo.ts +++ /dev/null @@ -1,26 +0,0 @@ -import { type Cache } from './cache' - -/** - * Memoize an operation using the cache. - */ -export async function memo(cache: Cache, key: string, fn: () => Promise): Promise { - // Check if cache entry exists. - const memoized = await (cache as Cache).get(key) - if (memoized !== null) { - log(key, 'HIT') - return memoized - } - - log(key, 'MISS') - const result = await fn() - await (cache as Cache).set(key, result) - return result -} - -const VERBOSE_MEMO = false - -function log(key: string, message: string): void { - if (VERBOSE_MEMO) { - console.debug(`cache:${key} ${message}`) - } -} diff --git a/provider/docs/src/corpus/cache/store/fs.ts b/provider/docs/src/corpus/cache/store/fs.ts deleted file mode 100644 index 1324c407..00000000 --- a/provider/docs/src/corpus/cache/store/fs.ts +++ /dev/null @@ -1,32 +0,0 @@ -import { mkdir, readFile, writeFile } from 'fs/promises' -import path from 'path' -import { type CacheStore } from '../cache' - -/** - * Create a {@link CacheStore} that stores cache data in the file system. - */ -export function createFileSystemCacheStore(basePath: string): CacheStore { - function cacheFilePath(key: string): string { - return path.join(basePath, `${key.replaceAll('/', '_').replaceAll('\\', '_')}.json`) - } - - return { - async get(key) { - try { - const data = await readFile(cacheFilePath(key), 'utf8') - return JSON.parse(data) - } catch (error: any) { - // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access - if ('code' in error && error.code === 'ENOENT') { - return null - } - throw error - } - }, - async set(key, value) { - const filePath = cacheFilePath(key) - await mkdir(path.dirname(filePath), { recursive: true, mode: 0o700 }) - await writeFile(filePath, JSON.stringify(value, null, 2)) - }, - } -} diff --git a/provider/docs/src/corpus/cache/store/indexedDB.ts b/provider/docs/src/corpus/cache/store/indexedDB.ts deleted file mode 100644 index 18e22436..00000000 --- a/provider/docs/src/corpus/cache/store/indexedDB.ts +++ /dev/null @@ -1,35 +0,0 @@ -/// - -import IndexedDBStorage from 'better-localstorage' -import { type CacheStore } from '../cache' - -/** - * Create a {@link CacheStore} that stores cache data using IndexedDB. - */ -export function createIndexedDBCacheStore(keyPrefix: string): CacheStore { - // Use keyPrefix as the IndexedDB database name, which means we don't need to use it as the - // actual entry key prefix in the DB. - const storage = new IndexedDBStorage(keyPrefix, 'cache') - - return { - async get(key) { - // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment - const data = await storage.getItem(key) - try { - return data ?? null - } catch (error) { - // TODO(sqs): cast because https://github.com/dreamsavior/Better-localStorage/pull/1 - // eslint-disable-next-line @typescript-eslint/no-unsafe-call, @typescript-eslint/no-unsafe-member-access - await (storage as any).delete(key) - throw error - } - }, - async set(key, value) { - try { - await storage.setItem(key, value) - } catch (error) { - console.error(`failed to store data for ${key}`, error) - } - }, - } -} diff --git a/provider/docs/src/corpus/cache/store/localStorage.ts b/provider/docs/src/corpus/cache/store/localStorage.ts deleted file mode 100644 index 43e122a9..00000000 --- a/provider/docs/src/corpus/cache/store/localStorage.ts +++ /dev/null @@ -1,34 +0,0 @@ -/// - -import { type CacheStore } from '../cache' - -/** - * Create a {@link CacheStore} that stores cache data in localStorage (using the Web Storage API). - */ -export function createWebStorageCacheStore(storage: Storage, keyPrefix: string): CacheStore { - function storageKey(key: string): string { - return `${keyPrefix}:${key}` - } - - return { - get(key) { - const k = storageKey(key) - const data = storage.getItem(k) - try { - return Promise.resolve(data === null ? null : JSON.parse(data)) - } catch (error) { - storage.removeItem(k) - throw error - } - }, - set(key, value) { - const valueData = JSON.stringify(value) - try { - storage.setItem(storageKey(key), valueData) - } catch { - // console.error(`failed to store data for ${contentID}:${key}`, error) - } - return Promise.resolve() - }, - } -} diff --git a/provider/docs/src/corpus/doc/chunks.test.ts b/provider/docs/src/corpus/doc/chunks.test.ts index 428fae8a..71140cb5 100644 --- a/provider/docs/src/corpus/doc/chunks.test.ts +++ b/provider/docs/src/corpus/doc/chunks.test.ts @@ -1,5 +1,5 @@ import { describe, expect, test } from 'vitest' -import { chunk, type Chunk } from './chunks' +import { chunk, type Chunk } from './chunks.ts' describe('chunker', () => { test('empty', () => expect(chunk('', {})).toEqual([])) diff --git a/provider/docs/src/corpus/doc/contentExtractor.test.ts b/provider/docs/src/corpus/doc/contentExtractor.test.ts index 08bfcdf1..e9ac904b 100644 --- a/provider/docs/src/corpus/doc/contentExtractor.test.ts +++ b/provider/docs/src/corpus/doc/contentExtractor.test.ts @@ -1,5 +1,5 @@ import { describe, expect, test } from 'vitest' -import { extractContentUsingMozillaReadability, type Content } from './contentExtractor' +import { extractContentUsingMozillaReadability, type Content } from './contentExtractor.ts' describe('extractContentUsingMozillaReadability', () => { test('extracts content', async () => diff --git a/provider/docs/src/corpus/doc/contentExtractor.ts b/provider/docs/src/corpus/doc/contentExtractor.ts index e25925aa..471f7233 100644 --- a/provider/docs/src/corpus/doc/contentExtractor.ts +++ b/provider/docs/src/corpus/doc/contentExtractor.ts @@ -1,6 +1,6 @@ import { Readability } from '@mozilla/readability' -import { parseDOM } from '../../dom' -import { type Doc } from './doc' +import { parseDOM } from '../../dom.ts' +import { type Doc } from './doc.ts' export interface Content { /** diff --git a/provider/docs/src/corpus/index/corpusIndex.test.ts b/provider/docs/src/corpus/index/corpusIndex.test.ts index d5997e5a..255f46ae 100644 --- a/provider/docs/src/corpus/index/corpusIndex.test.ts +++ b/provider/docs/src/corpus/index/corpusIndex.test.ts @@ -1,7 +1,7 @@ import { describe, expect, test } from 'vitest' -import { createCorpusArchive } from '../archive/corpusArchive' -import { type Doc, type DocID } from '../doc/doc' -import { createCorpusIndex, fromJSON } from './corpusIndex' +import { createCorpusArchive } from '../archive/corpusArchive.ts' +import { type Doc, type DocID } from '../doc/doc.ts' +import { createCorpusIndex, fromJSON } from './corpusIndex.ts' export function doc(id: DocID, text: string): Doc { return { id, text } @@ -14,7 +14,7 @@ describe('indexCorpus', async () => { expect(INDEX.docs.length).toBe(2) }) - test('JSON-serializable', async () => { + test('JSON-serializable', () => { const serialized = fromJSON(JSON.parse(JSON.stringify(INDEX))) const indexWithoutToJSON = { ...INDEX } delete (indexWithoutToJSON as any).toJSON diff --git a/provider/docs/src/corpus/index/corpusIndex.ts b/provider/docs/src/corpus/index/corpusIndex.ts index 96c48706..e9a731ef 100644 --- a/provider/docs/src/corpus/index/corpusIndex.ts +++ b/provider/docs/src/corpus/index/corpusIndex.ts @@ -1,10 +1,10 @@ -import { embedText } from '../../search/embeddings' -import { createTFIDFIndex, type TFIDFIndex } from '../../search/tfidf' -import { type CorpusArchive } from '../archive/corpusArchive' -import { contentID } from '../cache/contentID' -import { chunk, type Chunk } from '../doc/chunks' -import { type Content, type ContentExtractor } from '../doc/contentExtractor' -import { type Doc } from '../doc/doc' +import { embedText } from '../../search/embeddings.ts' +import { createTFIDFIndex, type TFIDFIndex } from '../../search/tfidf.ts' +import { type CorpusArchive } from '../archive/corpusArchive.ts' +import { contentID } from '../cache/contentID.ts' +import { chunk, type Chunk } from '../doc/chunks.ts' +import { type Content, type ContentExtractor } from '../doc/contentExtractor.ts' +import { type Doc } from '../doc/doc.ts' /** * An index of a corpus. diff --git a/provider/docs/src/e2e.test.ts b/provider/docs/src/e2e.test.ts index 793f61df..2cfb6070 100644 --- a/provider/docs/src/e2e.test.ts +++ b/provider/docs/src/e2e.test.ts @@ -1,10 +1,10 @@ import fs from 'node:fs/promises' import path from 'node:path' import { describe, expect, test } from 'vitest' -import { createClient } from './client/client' -import { createCorpusArchive } from './corpus/archive/corpusArchive' -import { createCorpusIndex } from './corpus/index/corpusIndex' -import { type SearchResult } from './search/types' +import { createClient } from './client/client.ts' +import { createCorpusArchive } from './corpus/archive/corpusArchive.ts' +import { createCorpusIndex } from './corpus/index/corpusIndex.ts' +import { type SearchResult } from './search/types.ts' describe('e2e', () => { test('urlParsing', async () => { diff --git a/provider/docs/src/provider/provider.ts b/provider/docs/src/provider/provider.ts index 9a526049..61d96f29 100644 --- a/provider/docs/src/provider/provider.ts +++ b/provider/docs/src/provider/provider.ts @@ -1,4 +1,5 @@ /* eslint-disable import/no-default-export */ +import { readFile } from 'node:fs/promises' import { createFilePositionCalculator, type AnnotationsParams, @@ -88,10 +89,15 @@ export default multiplex(async settings => { } }) -async function fetchIndex(url: string): Promise { - const resp = await fetch(url) +async function fetchIndex(urlStr: string): Promise { + const url = new URL(urlStr) + if (url.protocol === 'file:') { + return fromJSON(JSON.parse(await readFile(url.pathname, 'utf-8'))) + } + + const resp = await fetch(urlStr) if (!resp.ok) { - throw new Error(`Failed to fetch corpus index from ${url} with HTTP status ${resp.status}`) + throw new Error(`Failed to fetch corpus index from ${urlStr} with HTTP status ${resp.status}`) } return fromJSON(await resp.json()) } diff --git a/provider/docs/src/search/embeddings.test.ts b/provider/docs/src/search/embeddings.test.ts index ed0b3b67..090f02e6 100644 --- a/provider/docs/src/search/embeddings.test.ts +++ b/provider/docs/src/search/embeddings.test.ts @@ -1,8 +1,8 @@ import { describe, expect, test } from 'vitest' -import { createCorpusArchive } from '../corpus/archive/corpusArchive' -import { createCorpusIndex } from '../corpus/index/corpusIndex' -import { doc } from '../corpus/index/corpusIndex.test' -import { embeddingsSearch, embedTextInThisScope, similarity } from './embeddings' +import { createCorpusArchive } from '../corpus/archive/corpusArchive.ts' +import { doc } from '../corpus/index/corpusIndex.test.ts' +import { createCorpusIndex } from '../corpus/index/corpusIndex.ts' +import { embeddingsSearch, embedTextInThisScope, similarity } from './embeddings.ts' describe('embeddingsSearch', () => { test('finds matches', async () => { diff --git a/provider/docs/src/search/embeddings.ts b/provider/docs/src/search/embeddings.ts index 6f5ba365..dad05998 100644 --- a/provider/docs/src/search/embeddings.ts +++ b/provider/docs/src/search/embeddings.ts @@ -1,11 +1,11 @@ import { cos_sim, dot, env, magnitude, pipeline } from '@xenova/transformers' import * as onnxWeb from 'onnxruntime-web' -import { type CorpusIndex } from '../corpus/index/corpusIndex' -import { isWebWindowRuntime, useWebWorker } from '../env' -import { type Logger } from '../logger' -import { embedTextOnWorker } from '../worker/webWorkerClient' -import { withoutCodeStopwords } from './terms' -import { type Query, type SearchResult } from './types' +import { type CorpusIndex } from '../corpus/index/corpusIndex.ts' +import { isWebWindowRuntime, useWebWorker } from '../env.ts' +import { type Logger } from '../logger.ts' +import { embedTextOnWorker } from '../worker/webWorkerClient.ts' +import { withoutCodeStopwords } from './terms.ts' +import { type Query, type SearchResult } from './types.ts' // eslint-disable-next-line @typescript-eslint/prefer-optional-chain if (typeof process !== 'undefined' && process.env.VITEST) { @@ -40,6 +40,9 @@ if (isWebWindowRuntime) { // // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access env.backends.onnx.wasm.wasmPaths = import.meta.resolve('../../node_modules/@xenova/transformers/dist/') +} else { + // TODO(sqs): seems to be triggered when running in vscode + env.backends.onnx.wasm.wasmPaths = __dirname + '/../node_modules/@xenova/transformers/dist/' } env.allowLocalModels = false @@ -68,7 +71,7 @@ export async function embeddingsSearch(index: CorpusIndex, query: Query): Promis return results } -const pipe = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {}) +const pipe = pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {}) /** * Embed the text and return the vector. Run in a worker in some environments. @@ -84,10 +87,10 @@ export async function embedTextInThisScope(text: string, logger?: Logger): Promi try { const t0 = performance.now() // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment - const out = await pipe(text, { pooling: 'mean', normalize: true }) + const out = await (await pipe)(text, { pooling: 'mean', normalize: true }) logger?.(`embedText (${text.length} chars) took ${Math.round(performance.now() - t0)}ms`) // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access - return out.data + return out.data as Float32Array // TODO(sqs): cast } catch (error) { console.log(error) throw error diff --git a/provider/docs/src/search/keyword.test.ts b/provider/docs/src/search/keyword.test.ts index 7914523a..5328224c 100644 --- a/provider/docs/src/search/keyword.test.ts +++ b/provider/docs/src/search/keyword.test.ts @@ -1,10 +1,10 @@ import { describe, expect, test } from 'vitest' -import { createCorpusArchive } from '../corpus/archive/corpusArchive' -import { createCorpusIndex } from '../corpus/index/corpusIndex' -import { doc } from '../corpus/index/corpusIndex.test' -import { keywordSearch } from './keyword' -import { calculateTFIDF } from './tfidf' -import { type SearchResult } from './types' +import { createCorpusArchive } from '../corpus/archive/corpusArchive.ts' +import { doc } from '../corpus/index/corpusIndex.test.ts' +import { createCorpusIndex } from '../corpus/index/corpusIndex.ts' +import { keywordSearch } from './keyword.ts' +import { calculateTFIDF } from './tfidf.ts' +import { type SearchResult } from './types.ts' describe('keywordSearch', () => { test('finds matches', async () => { diff --git a/provider/docs/src/search/keyword.ts b/provider/docs/src/search/keyword.ts index 10aafc65..5ac05311 100644 --- a/provider/docs/src/search/keyword.ts +++ b/provider/docs/src/search/keyword.ts @@ -1,7 +1,7 @@ -import { type CorpusIndex } from '../corpus/index/corpusIndex' -import { terms } from './terms' -import { computeTFIDF } from './tfidf' -import { type Query, type SearchResult } from './types' +import { type CorpusIndex } from '../corpus/index/corpusIndex.ts' +import { terms } from './terms.ts' +import { computeTFIDF } from './tfidf.ts' +import { type Query, type SearchResult } from './types.ts' export function keywordSearch(index: CorpusIndex, query: Query): Omit[] { const queryTerms = terms(query.text).filter(term => term.length >= 3) diff --git a/provider/docs/src/search/terms.test.ts b/provider/docs/src/search/terms.test.ts index 73ea0f87..631bb5a3 100644 --- a/provider/docs/src/search/terms.test.ts +++ b/provider/docs/src/search/terms.test.ts @@ -1,5 +1,5 @@ import { describe, expect, test } from 'vitest' -import { terms } from './terms' +import { terms } from './terms.ts' describe('terms', () => { test('splits, stems, normalizes', () => { diff --git a/provider/docs/src/search/tfidf.test.ts b/provider/docs/src/search/tfidf.test.ts index 880047b0..669830be 100644 --- a/provider/docs/src/search/tfidf.test.ts +++ b/provider/docs/src/search/tfidf.test.ts @@ -1,7 +1,7 @@ import { describe, expect, test } from 'vitest' -import { createCorpusArchive } from '../corpus/archive/corpusArchive' -import { createCorpusIndex } from '../corpus/index/corpusIndex' -import { calculateTFIDF, computeTFIDF, createTFIDFIndex } from './tfidf' +import { createCorpusArchive } from '../corpus/archive/corpusArchive.ts' +import { createCorpusIndex } from '../corpus/index/corpusIndex.ts' +import { calculateTFIDF, computeTFIDF, createTFIDFIndex } from './tfidf.ts' describe('createTFIDFIndex', async () => { const data = await createCorpusArchive([ diff --git a/provider/docs/src/search/tfidf.ts b/provider/docs/src/search/tfidf.ts index ff64cdd1..e367d545 100644 --- a/provider/docs/src/search/tfidf.ts +++ b/provider/docs/src/search/tfidf.ts @@ -1,7 +1,7 @@ -import { type ChunkIndex } from '../corpus/doc/chunks' -import { type DocID } from '../corpus/doc/doc' -import { type IndexedDoc } from '../corpus/index/corpusIndex' -import { terms, type Term } from './terms' +import { type ChunkIndex } from '../corpus/doc/chunks.ts' +import { type DocID } from '../corpus/doc/doc.ts' +import { type IndexedDoc } from '../corpus/index/corpusIndex.ts' +import { terms, type Term } from './terms.ts' /** * Index the corpus for fast computation of TF-IDF. diff --git a/provider/docs/src/search/types.ts b/provider/docs/src/search/types.ts index 071ed834..a03d9137 100644 --- a/provider/docs/src/search/types.ts +++ b/provider/docs/src/search/types.ts @@ -1,5 +1,5 @@ -import { type ChunkIndex } from '../corpus/doc/chunks' -import { type DocID } from '../corpus/doc/doc' +import { type ChunkIndex } from '../corpus/doc/chunks.ts' +import { type DocID } from '../corpus/doc/doc.ts' /** A search query. */ export interface Query { diff --git a/provider/docs/src/worker/webWorker.ts b/provider/docs/src/worker/webWorker.ts index 0864fb21..a403f1a3 100644 --- a/provider/docs/src/worker/webWorker.ts +++ b/provider/docs/src/worker/webWorker.ts @@ -1,7 +1,7 @@ /// -import { embedTextInThisScope } from '../search/embeddings' -import { type WorkerEmbedTextMessage, type WorkerMessagePair } from './api' +import { embedTextInThisScope } from '../search/embeddings.ts' +import { type WorkerEmbedTextMessage, type WorkerMessagePair } from './api.ts' declare let self: DedicatedWorkerGlobalScope diff --git a/provider/docs/src/worker/webWorkerClient.ts b/provider/docs/src/worker/webWorkerClient.ts index 2c0bc011..c5abaa0b 100644 --- a/provider/docs/src/worker/webWorkerClient.ts +++ b/provider/docs/src/worker/webWorkerClient.ts @@ -1,5 +1,6 @@ -import { type embedTextInThisScope } from '../search/embeddings' -import { type WorkerEmbedTextMessage, type WorkerMessagePair } from './api' +import os from 'node:os' +import { type embedTextInThisScope } from '../search/embeddings.ts' +import { type WorkerEmbedTextMessage, type WorkerMessagePair } from './api.ts' export const embedTextOnWorker: typeof embedTextInThisScope = async (text: string): Promise => sendMessage('embedText', text) @@ -25,17 +26,16 @@ async function sendMessage

( const NUM_WORKERS: number = Math.min( 8, - (await (async (): Promise => { + ((): number => { if (typeof navigator !== 'undefined') { return navigator.hardwareConcurrency } try { - const os = await import('node:os') return os.cpus().length // eslint-disable-next-line no-empty } catch {} return 1 - })()) || 1 + })() || 1 ) const workers: (Promise | undefined)[] = []