Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
sqs committed Dec 26, 2023
1 parent adca3b2 commit 5f89832
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 32 deletions.
16 changes: 16 additions & 0 deletions provider/docs/src/corpus/doc/contentExtractor.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import { describe, expect, test } from 'vitest'
import { Content, extractContentUsingMozillaReadability } from './contentExtractor'

describe('extractContentUsingMozillaReadability', () => {
    test('extracts content', () => {
        // Input page: site navigation lives in an <aside>, the actual content in <main>.
        const html =
            '<html><head><title>Bar - MySite</title></head><body><aside><nav><h1><a href="/">MySite</a></h1> <a href="/foo">foo</a></nav></aside><main><h1>Bar</h1> <p>Baz</p></main></body>'

        // Expected result: only the <main> content remains, with its heading as an <h2>.
        const want: Content = {
            title: 'Bar - MySite',
            content: '<main><h2>Bar</h2> <p>Baz</p></main>',
            textContent: 'Bar Baz',
        }

        const got = extractContentUsingMozillaReadability({ id: 1, text: html })

        expect(got).toEqual(want)
    })
})
34 changes: 34 additions & 0 deletions provider/docs/src/corpus/doc/contentExtractor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import { Readability } from '@mozilla/readability'
import { JSDOM } from 'jsdom'
import { type Doc } from './doc'

/**
 * The result of extracting the meaningful content from a document: its title plus the content in
 * both marked-up and plain-text form.
 */
export interface Content {
/**
 * Title of the document.
 */
title: string

/**
 * Content of the document as a string that still includes some markup. Omits
 * non-content-related elements (header, footer, navigation, etc.).
 */
content: string

/**
 * Text content of the document, with all markup removed. Omits all non-content-related
 * elements.
 */
textContent: string
}

/**
 * A function that takes a document and returns its extracted {@link Content}. The return type
 * allows `null` in place of a `Content` (presumably meaning no content could be extracted).
 */
export type ContentExtractor = (doc: Doc) => Content | null

/**
 * Extracts a document's title, content markup, and plain text by parsing `doc.text` as HTML with
 * jsdom and running Mozilla's Readability over the resulting DOM document.
 *
 * @param doc The document to extract from. `doc.text` is parsed as HTML, and `doc.url` (which may
 * be undefined) is passed to jsdom as the page URL.
 * @returns The extracted content, or `null` if Readability's `parse()` yields no result.
 */
export const extractContentUsingMozillaReadability: ContentExtractor = doc => {
    const { document } = new JSDOM(doc.text, { url: doc.url }).window
    const info = new Readability(document).parse()
    if (!info) {
        return null
    }

    return {
        title: info.title,
        // Strip the wrapper <div> that encloses the whole content string. `[\s\S]` is used instead
        // of `.` because `.` does not match line terminators, which would leave the wrapper in
        // place for any content spanning more than one line.
        content: info.content.replace(/^<div id="readability-page-1" class="page">([\s\S]*)<\/div>$/, '$1'),
        textContent: info.textContent,
    }
}
10 changes: 0 additions & 10 deletions provider/docs/src/corpus/doc/doc.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import { type Chunk } from './chunks'

/**
* A unique identifier for a document in a corpus.
*/
Expand All @@ -14,11 +12,3 @@ export interface Doc {

url?: string
}

/**
 * An indexed document: a document together with its chunks.
 */
export interface IndexedDoc {
/** The document that was indexed. */
doc: Doc
/** The document's chunks. */
chunks: Chunk[]
}
17 changes: 0 additions & 17 deletions provider/docs/src/corpus/doc/summary.ts

This file was deleted.

26 changes: 22 additions & 4 deletions provider/docs/src/corpus/index.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,37 @@
import { type CorpusData } from './data'
import { chunk } from './doc/chunks'
import { type IndexedDoc } from './doc/doc'
import { chunk, type Chunk } from './doc/chunks'
import { type Content, type ContentExtractor } from './doc/contentExtractor'
import { type Doc } from './doc/doc'

/**
 * An index of a corpus: the corpus data plus the per-document index entries built from it.
 */
export interface CorpusIndex {
/** The corpus data that this index was built from. */
data: CorpusData

/** The indexed documents. */
docs: IndexedDoc[]
}

export function indexCorpus(data: CorpusData): CorpusIndex {
/**
 * An indexed document: a document together with its chunks and, optionally, its extracted
 * content.
 */
export interface IndexedDoc {
/** The document that was indexed. */
doc: Doc
/** The chunks that the document's text was split into. */
chunks: Chunk[]
/**
 * The content extracted from the document. Absent when no content extractor was used or the
 * extractor returned `null` for this document.
 */
content?: Content
}

/**
 * Options for indexing a corpus.
 */
interface IndexOptions {
/**
 * Optional function called on each document to extract its content. When omitted, indexed
 * documents have no extracted content.
 */
contentExtractor?: ContentExtractor
}

export function indexCorpus(data: CorpusData, { contentExtractor }: IndexOptions = {}): CorpusIndex {
const index: CorpusIndex = { data, docs: [] }

for (const doc of data.docs) {
const chunks = chunk(doc.text, { isMarkdown: doc.text.includes('##') })
index.docs.push({ doc, chunks })
const content = contentExtractor?.(doc) ?? undefined
index.docs.push({ doc, chunks, content })
}

return index
Expand Down
3 changes: 2 additions & 1 deletion provider/docs/src/corpus/search/tfidf.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { type IndexedDoc } from '..'
import { type ChunkIndex } from '../doc/chunks'
import { type DocID, type IndexedDoc } from '../doc/doc'
import { type DocID } from '../doc/doc'
import { terms, type Term } from './terms'

/**
Expand Down

0 comments on commit 5f89832

Please sign in to comment.