Skip to content

Commit

Permalink
Add URL Fetching Provider (#47)
Browse files Browse the repository at this point in the history
This PR ports the URL fetcher provider to OpenCtx. The provider was
originally implemented in Cody; this change copies the work from
#43, rebased on main.

---------

Co-authored-by: Keegan Carruthers-Smith <[email protected]>
  • Loading branch information
thenamankumar and keegancsmith authored May 17, 2024
1 parent 30c23cc commit 51177bd
Show file tree
Hide file tree
Showing 7 changed files with 185 additions and 1 deletion.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
},
"engines": {
"node": "^20.10.0",
"pnpm": "^8.12.1"
"pnpm": "^8.6.7"
},
"packageManager": "[email protected]",
"scripts": {
Expand Down
10 changes: 10 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions provider/url-fetcher/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# URL Fetcher context provider for OpenCtx


This is a context provider for [OpenCtx](https://openctx.org) that fetches URLs for use as context items.

## Usage

Add the following to your settings in any OpenCtx client:

```json
"openctx.providers": {
    // ...other providers...
    "https://openctx.org/npm/@openctx/provider-url-fetcher": true
},
```

## Development

- [Source code](https://sourcegraph.com/github.com/sourcegraph/openctx/-/tree/provider/url-fetcher)
- [Docs](https://openctx.org/docs/providers/url-fetcher)
- License: Apache 2.0
107 changes: 107 additions & 0 deletions provider/url-fetcher/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import type {
CapabilitiesParams,
CapabilitiesResult,
ItemsParams,
ItemsResult,
Provider,
} from '@openctx/provider'

/** Settings bag accepted by the provider: arbitrary string keys with untyped values. */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- value types are intentionally left open
type UrlFetcherSettings = Record<string, any>

/**
 * OpenCtx provider that turns the URL given as the query into a single item holding the
 * fetched page content.
 */
const urlFetcher: Provider<UrlFetcherSettings> = {
    // No annotations are provided, so the selector list is left empty.
    capabilities: (params: CapabilitiesParams, settings: UrlFetcherSettings): CapabilitiesResult => ({
        selector: [],
    }),

    // All the work is delegated to fetchItem; the settings are not consulted.
    items: async (params: ItemsParams, settings: UrlFetcherSettings): Promise<ItemsResult> =>
        fetchItem(params),
}

/**
 * Build the item list for a query that is expected to be a URL.
 *
 * @param params - Item request; only `params.query` is read.
 * @param timeoutMs - Optional fetch timeout in milliseconds; no timeout when omitted.
 * @returns A single item describing the fetched page, or an empty list when there is no
 * query, no content could be obtained, or anything threw along the way.
 */
async function fetchItem(params: ItemsParams, timeoutMs?: number): Promise<ItemsResult> {
    const target = params.query
    if (!target) {
        return []
    }

    try {
        const abortSignal = timeoutSignal(timeoutMs)
        const pageText = await fetchContentForURLContextItem(target.toString(), abortSignal)
        if (pageText === null) {
            return []
        }

        // Fall back to the URL itself when the page has no recognizable title.
        const pageTitle = tryGetHTMLDocumentTitle(pageText) ?? target
        const item = {
            url: target,
            title: pageTitle,
            ui: { hover: { text: `Fetched from ${target}` } },
            ai: { content: pageText },
        }
        return [item]
    } catch (error) {
        // Deliberately swallowed: the user may still be typing, so the URL may not be valid yet.
        return []
    }
}

/**
 * Fetch a web page and return a lightly cleaned-up, length-limited version of its body.
 *
 * @param urlStr - Absolute URL to fetch. Only `http:` and `https:` URLs whose hostname is
 * `localhost` or ends in a dot followed by at least two word characters are fetched.
 * @param signal - Optional abort signal forwarded to `fetch`.
 * @returns The processed body text, or `null` when the URL is not eligible or the response
 * status is not OK.
 * @throws If `urlStr` cannot be parsed as a URL, or if `fetch` / reading the body fails
 * (including when the request is aborted).
 */
async function fetchContentForURLContextItem(
    urlStr: string,
    signal?: AbortSignal
): Promise<string | null> {
    const url = new URL(urlStr)
    // `URL.protocol` includes the trailing colon (e.g. 'https:'), so compare against that form.
    if (url.protocol !== 'http:' && url.protocol !== 'https:') {
        return null
    }
    // Skip hostnames that do not look complete yet (no TLD-like suffix and not localhost).
    if (!/(localhost|\.\w{2,})$/.test(url.hostname)) {
        return null
    }

    const resp = await fetch(urlStr, { signal })
    if (!resp.ok) {
        return null
    }
    const body = await resp.text()

    // HACK(sqs): Rudimentarily strip HTML tags, script, and other unneeded elements from body using
    // regexp. This is NOT intending to be a general-purpose HTML parser and is NOT sanitizing the
    // value for security.
    const bodyWithoutTags = body
        .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
        .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
        .replace(/<svg\b[^<]*(?:(?!<\/svg>)<[^<]*)*<\/svg>/gi, '')
        .replace(/<!--.*?-->/gs, '')
        .replace(/\s(?:class|style)=["'][^"']*["']/gi, '')
        .replace(/\sdata-[\w-]+(=["'][^"']*["'])?/gi, '')

    // TODO(sqs): Arbitrarily trim the response text to avoid overflowing the context window for the
    // LLM. Ideally we would make the prompt builder prioritize this context item over other context
    // because it is explicitly from the user.
    const MAX_LENGTH = 14000
    return bodyWithoutTags.length > MAX_LENGTH
        ? `${bodyWithoutTags.slice(0, MAX_LENGTH)}... (web page content was truncated)`
        : bodyWithoutTags
}

/**
 * Try to get the title of an HTML document, using incomplete regexp parsing for simplicity (because
 * this feature is experimental and we don't need robustness yet).
 *
 * The `<title>` tag is matched case-insensitively and may carry attributes. Surrounding
 * whitespace is trimmed from the result.
 *
 * @param html - HTML source text to search.
 * @returns The trimmed title text, or `undefined` when no non-empty title is found.
 */
function tryGetHTMLDocumentTitle(html: string): string | undefined {
    const rawTitle = html.match(/<title\b[^>]*>(?<title>[^<]+)<\/title\s*>/i)?.groups?.title
    const title = rawTitle?.trim()
    // Treat a whitespace-only title as missing so callers can apply their own fallback.
    return title ? title : undefined
}

/**
 * Create an abort signal that fires after the given delay.
 *
 * @param ms - Delay in milliseconds; when omitted, no signal is created.
 * @returns A signal that is aborted with reason `'timeout'` once the delay elapses, or
 * `undefined` when `ms` is `undefined`.
 */
function timeoutSignal(ms?: number): AbortSignal | undefined {
    if (ms !== undefined) {
        const aborter = new AbortController()
        setTimeout(() => {
            aborter.abort('timeout')
        }, ms)
        return aborter.signal
    }
    return undefined
}

export default urlFetcher
32 changes: 32 additions & 0 deletions provider/url-fetcher/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"name": "@openctx/provider-url-fetcher",
"version": "0.0.1",
"description": "URL Fetcher (OpenCtx provider)",
"license": "Apache-2.0",
"repository": {
"type": "git",
"url": "https://github.com/sourcegraph/openctx",
"directory": "provider/url-fetcher"
},
"type": "module",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"files": [
"dist",
"index.ts",
"!**/*.test.*",
"README.md"
],
"sideEffects": false,
"scripts": {
"build": "tsc --build",
"test": "vitest",
"bundle": "esbuild --bundle --format=esm --outfile=dist/bundle.js index.ts"
},
"dependencies": {
"@openctx/provider": "workspace:*"
},
"devDependencies": {
"esbuild": "^0.19.11"
}
}
12 changes: 12 additions & 0 deletions provider/url-fetcher/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"extends": "../../.config/tsconfig.base.json",
"compilerOptions": {
"module": "ESNext",
"rootDir": ".",
"outDir": "dist",
"lib": ["ESNext"],
},
"include": ["*.ts"],
"exclude": ["dist", "vitest.config.ts"],
"references": [{ "path": "../../lib/provider" }],
}
3 changes: 3 additions & 0 deletions provider/url-fetcher/vitest.config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import { defineConfig } from 'vitest/config'

// Vitest configuration for this package: an empty options object, i.e. no overrides are set here.
export default defineConfig({})

0 comments on commit 51177bd

Please sign in to comment.