Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 26, 2023
1 parent 05d9d97 commit 4fe1b06
Show file tree
Hide file tree
Showing 14 changed files with 60 additions and 80 deletions.
2 changes: 1 addition & 1 deletion provider/docs/bin/docs-query.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import { readFile } from 'fs/promises'
import path from 'path'
import envPaths from 'env-paths'
import { createCorpus } from '../src/corpus'
import { createFileSystemCorpusCache } from '../src/corpus/cache/fs'
import { createCorpus } from '../src/corpus/corpus'
import { type CorpusData } from '../src/corpus/data'
import { extractContentUsingMozillaReadability } from '../src/corpus/doc/contentExtractor'

Expand Down
13 changes: 0 additions & 13 deletions provider/docs/src/corpus/corpus.test.ts

This file was deleted.

45 changes: 0 additions & 45 deletions provider/docs/src/corpus/corpus.ts

This file was deleted.

14 changes: 14 additions & 0 deletions provider/docs/src/corpus/index.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import { describe, expect, test } from 'vitest'
import { indexCorpus } from '.'
import { corpusData } from './data'
import { type Doc, type DocID } from './doc/doc'

/**
 * Test helper that builds a {@link Doc} from an ID and its text.
 *
 * @param id - The ID to assign to the document.
 * @param text - The document's text.
 * @returns A new object holding exactly the given `id` and `text`.
 */
export function doc(id: DocID, text: string): Doc {
    const created: Doc = { id, text }
    return created
}

describe('indexCorpus', () => {
    // Indexing a corpus of two docs should yield exactly two indexed docs.
    test('#docs', () => {
        const data = corpusData([doc(1, 'a'), doc(2, 'b')])
        const index = indexCorpus(data)
        expect(index.docs.length).toBe(2)
    })
})
35 changes: 31 additions & 4 deletions provider/docs/src/corpus/index.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import { noopCache, type CorpusCache } from './cache/cache'
import { type CorpusData } from './data'
import { chunk, type Chunk } from './doc/chunks'
import { chunk, type Chunk, type ChunkIndex } from './doc/chunks'
import { type Content, type ContentExtractor } from './doc/contentExtractor'
import { type Doc } from './doc/doc'
import { type Doc, type DocID } from './doc/doc'
import { multiSearch } from './search/multi'

/**
* An index of a corpus.
Expand All @@ -10,6 +12,8 @@ export interface CorpusIndex {
data: CorpusData

docs: IndexedDoc[]

search(query: string): Promise<CorpusSearchResult[]>
}

/**
Expand All @@ -21,11 +25,32 @@ export interface IndexedDoc {
chunks: Chunk[]
}

/**
* A search result from searching a corpus.
*/

export interface CorpusSearchResult {
/** ID of the document this result refers to. */
doc: DocID
/** Index of the chunk this result refers to. NOTE(review): presumably an index into that document's chunks; confirm against the search implementations. */
chunk: ChunkIndex
/** Numeric score of the match. NOTE(review): scale and ordering are not visible here; confirm against the search implementations. */
score: number
/** Text excerpt for the result. NOTE(review): presumably taken from the matched chunk; confirm. */
excerpt: string
}

/**
* Options for indexing a corpus.
*/
export interface IndexOptions {
/** Cache handed to the index's search function. `indexCorpus` falls back to `noopCache` when omitted. */
cache?: CorpusCache
/** Optional extractor that `indexCorpus` calls on each document to obtain its content; when omitted, content is left undefined. */
contentExtractor?: ContentExtractor
}

export function indexCorpus(data: CorpusData, { contentExtractor }: IndexOptions = {}): CorpusIndex {
/**
* Index a corpus.
*/
export function indexCorpus(
data: CorpusData,
{ cache = noopCache, contentExtractor }: IndexOptions = { cache: noopCache }
): CorpusIndex {
const indexedDocs: IndexedDoc[] = []
for (const doc of data.docs) {
const content = contentExtractor?.(doc) ?? undefined
Expand All @@ -34,5 +59,7 @@ export function indexCorpus(data: CorpusData, { contentExtractor }: IndexOptions

indexedDocs.push({ doc, content, chunks })
}
return { data, docs: indexedDocs }

const index: CorpusIndex = { data, docs: indexedDocs, search: query => multiSearch(index, query, cache) }
return index
}
5 changes: 2 additions & 3 deletions provider/docs/src/corpus/search/embeddings.test.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import { describe, expect, test } from 'vitest'
import { indexCorpus } from '..'
import { type CorpusSearchResult } from '../corpus'
import { doc } from '../corpus.test'
import { indexCorpus, type CorpusSearchResult } from '..'
import { corpusData } from '../data'
import { doc } from '../index.test'
import { embeddingsSearch, embedText, similarity } from './embeddings'

describe('embeddingsSearch', () => {
Expand Down
3 changes: 1 addition & 2 deletions provider/docs/src/corpus/search/embeddings.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import { cos_sim, env, pipeline } from '@xenova/transformers'
import * as onnxWeb from 'onnxruntime-web'
import { type CorpusIndex } from '..'
import { type CorpusIndex, type CorpusSearchResult } from '..'
import { memo, noopCache, type CorpusCache } from '../cache/cache'
import { type CorpusSearchResult } from '../corpus'

// eslint-disable-next-line @typescript-eslint/prefer-optional-chain
if (typeof process !== 'undefined' && process.env.FORCE_WASM) {
Expand Down
5 changes: 2 additions & 3 deletions provider/docs/src/corpus/search/keyword.test.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import { describe, expect, test } from 'vitest'
import { indexCorpus } from '..'
import { type CorpusSearchResult } from '../corpus'
import { doc } from '../corpus.test'
import { indexCorpus, type CorpusSearchResult } from '..'
import { corpusData } from '../data'
import { doc } from '../index.test'
import { keywordSearch } from './keyword'
import { calculateTFIDF } from './tfidf'

Expand Down
3 changes: 1 addition & 2 deletions provider/docs/src/corpus/search/keyword.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import { type CorpusIndex } from '..'
import { type CorpusSearchResult } from '../corpus'
import { type CorpusIndex, type CorpusSearchResult } from '..'
import { terms } from './terms'
import { createIndexForTFIDF } from './tfidf'

Expand Down
3 changes: 1 addition & 2 deletions provider/docs/src/corpus/search/multi.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { type CorpusIndex } from '..'
import { type CorpusIndex, type CorpusSearchResult } from '..'
import { scopedCache, type CorpusCache } from '../cache/cache'
import { type CorpusSearchResult } from '../corpus'
import { type ChunkIndex } from '../doc/chunks'
import { type DocID } from '../doc/doc'
import { embeddingsSearch } from './embeddings'
Expand Down
4 changes: 2 additions & 2 deletions provider/docs/src/corpus/source/web/webCorpusSource.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { JSDOM } from 'jsdom'
import { Logger } from '../../../logger'
import { Doc } from '../../doc/doc'
import { CorpusSource } from '../source'
import { createCrawlQueue } from './crawlQueue'
Expand All @@ -20,8 +21,6 @@ interface WebCorpusSourceOptions {
logger?: Logger
}

type Logger = (message: string) => void

export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSourceOptions): CorpusSource {
return {
documents: async () => {
Expand Down Expand Up @@ -88,6 +87,7 @@ export function createWebCorpusSource({ entryPage, prefix, logger }: WebCorpusSo
enqueueURL(new URL(link.href))
}
}

return documents
},
}
Expand Down
5 changes: 3 additions & 2 deletions provider/docs/src/e2e.test.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import fs from 'node:fs/promises'
import path from 'node:path'
import { describe, expect, test } from 'vitest'
import { createCorpus, type CorpusSearchResult } from './corpus/corpus'
import { indexCorpus, type CorpusSearchResult } from './corpus'
import { corpusData } from './corpus/data'

describe('e2e', () => {
test('urlParsing', async () => {
const docFile = await fs.readFile(path.join(__dirname, 'testdata/corpus/urlParsing.md'), 'utf8')
const codeFile = await fs.readFile(path.join(__dirname, 'testdata/code/urlParsing.ts'), 'utf8')

const corpus = createCorpus([{ id: 1, text: docFile }])
const corpus = indexCorpus(corpusData([{ id: 1, text: docFile }]))
const results = await corpus.search(codeFile)
roundScores(results)
expect(results).toEqual<CorpusSearchResult[]>([
Expand Down
1 change: 1 addition & 0 deletions provider/docs/src/logger.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/** A logging callback: receives a single message string and returns nothing. */
export type Logger = (message: string) => void
2 changes: 1 addition & 1 deletion provider/docs/src/provider/provider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ import {
type CapabilitiesParams,
type CapabilitiesResult,
} from '@opencodegraph/provider'
import { createCorpus } from '../corpus'
import { createWebStorageCorpusCache } from '../corpus/cache/localStorage'
import { createCorpus } from '../corpus/corpus'
import { multiplex } from './multiplex'

/** Settings for the docs OpenCodeGraph provider. */
Expand Down

0 comments on commit 4fe1b06

Please sign in to comment.