Skip to content

Commit

Permalink
feat: use crawlee for docs service
Browse files: browse the repository at this point in the history
  • Loading branch information
Patrick-Erichsen committed Aug 21, 2024
1 parent 4bb1846 commit a51c520
Show file tree
Hide file tree
Showing 17 changed files with 2,652 additions and 640 deletions.
12 changes: 12 additions & 0 deletions core/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ import type { IMessenger, Message } from "./util/messenger";
import { editConfigJson } from "./util/paths";
import { Telemetry } from "./util/posthog";
import { streamDiffLines } from "./util/verticalEdit";
import {
installChromium,
isChromiumInstalled,
} from "./indexing/docs/installChromium";

export class Core {
// implements IMessenger<ToCoreProtocol, FromCoreProtocol>
Expand Down Expand Up @@ -167,6 +171,14 @@ export class Core {
(..._) => Promise.resolve([]),
);

// Calls installChromium() only when isChromiumInstalled() returns a falsy
// value. Anything thrown synchronously by either call is logged with
// console.debug and not rethrown, so execution continues past this block.
// NOTE(review): neither call is awaited. If isChromiumInstalled() returns a
// promise, `!promise` is always false and the install never runs; if
// installChromium() returns a promise, a rejection bypasses this try/catch.
// Confirm against ./indexing/docs/installChromium.
try {
if (!isChromiumInstalled()) {
installChromium();
}
} catch (err) {
console.debug(`Failed to install Chromium: ${err}`);
}

const on = this.messenger.on.bind(this.messenger);

this.messenger.onError((err) => {
Expand Down
40 changes: 33 additions & 7 deletions core/indexing/docs/DocsService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import {
import { Telemetry } from "../../util/posthog.js";
import TransformersJsEmbeddingsProvider from "../embeddings/TransformersJsEmbeddingsProvider.js";
import { Article, chunkArticle, pageToArticle } from "./article.js";
import { crawlPage } from "./crawl.js";
import { crawlSite } from "./crawlSite.js";
import { runLanceMigrations, runSqliteMigrations } from "./migrations.js";
import {
downloadFromS3,
Expand Down Expand Up @@ -202,11 +202,36 @@ export default class DocsService {
let processedPages = 0;
let maxKnownPages = 1;

// crawlSite(
// new URL(startUrl),
// siteIndexingConfig.maxDepth,
// async function* (page) {
// processedPages++;

// const article = pageToArticle(page);

// if (article) {
// articles.push(article);
// }

// // Use a heuristic approach for progress calculation
// const progress = Math.min(processedPages / maxKnownPages, 1);

// yield {
// progress, // Yield the heuristic progress
// desc: `Finding subpages (${page.path})`,
// status: "indexing",
// };

// // Increase maxKnownPages to delay progress reaching 100% too soon
// if (processedPages === maxKnownPages) {
// maxKnownPages *= 2;
// }
// },
// );

// Crawl pages and retrieve info as articles
for await (const page of crawlPage(
new URL(startUrl),
siteIndexingConfig.maxDepth,
)) {
for await (const page of crawlSite(startUrl)) {
processedPages++;

const article = pageToArticle(page);
Expand Down Expand Up @@ -521,8 +546,9 @@ export default class DocsService {
private async getLanceTableNameFromEmbeddingsProvider(
isPreIndexedDoc: boolean,
) {
const embeddingsProvider =
await this.getEmbeddingsProvider(isPreIndexedDoc);
const embeddingsProvider = await this.getEmbeddingsProvider(
isPreIndexedDoc,
);
const embeddingsProviderId = this.removeInvalidLanceTableNameChars(
embeddingsProvider.id,
);
Expand Down
4 changes: 2 additions & 2 deletions core/indexing/docs/article.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";
import { Chunk } from "../../index.js";
import { cleanFragment, cleanHeader } from "../chunk/markdown.js";
import { PageData } from "./crawl.js";
import { PageData } from "./crawlSite.js";

export type ArticleComponent = {
title: string;
Expand Down Expand Up @@ -148,7 +148,7 @@ export function stringToArticle(

export function pageToArticle(page: PageData): Article | undefined {
try {
return stringToArticle(page.url, page.html, page.path);
return stringToArticle(page.url, page.content, page.path);
} catch (err) {
console.error("Error converting URL to article components", err);
return undefined;
Expand Down
103 changes: 0 additions & 103 deletions core/indexing/docs/crawl.test.ts

This file was deleted.

Loading

0 comments on commit a51c520

Please sign in to comment.