-
Notifications
You must be signed in to change notification settings - Fork 22
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This PR ports the URL fetcher provider to OpenCtx. Originally implemented in Cody and currently copying the works from #43 but rebasing on main. --------- Co-authored-by: Keegan Carruthers-Smith <[email protected]>
- Loading branch information
1 parent
30c23cc
commit 51177bd
Showing
7 changed files
with
185 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,7 @@ | |
}, | ||
"engines": { | ||
"node": "^20.10.0", | ||
"pnpm": "^8.12.1" | ||
"pnpm": "^8.6.7" | ||
}, | ||
"packageManager": "[email protected]", | ||
"scripts": { | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# URL Fetcher context provider for OpenCtx | ||
|
||
|
||
This is a context provider for [OpenCtx](https://openctx.org) that fetches URLs for use as context items. | ||
|
||
## Usage | ||
|
||
Add the following to your settings in any OpenCtx client: | ||
|
||
```json | ||
"openctx.providers": { | ||
// ...other providers... | ||
"https://openctx.org/npm/@openctx/provider-url-fetcher": true | ||
}, | ||
``` | ||
|
||
## Development | ||
|
||
- [Source code](https://sourcegraph.com/github.com/sourcegraph/openctx/-/tree/provider/url-fetcher) | ||
- [Docs](https://openctx.org/docs/providers/url-fetcher) | ||
- License: Apache 2.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
import type { | ||
CapabilitiesParams, | ||
CapabilitiesResult, | ||
ItemsParams, | ||
ItemsResult, | ||
Provider, | ||
} from '@openctx/provider' | ||
|
||
/** Settings for the URL fetcher provider: an open-ended bag of string-keyed values. */
type UrlFetcherSettings = Record<string, any>
|
||
/**
 * OpenCtx provider that fetches the content of the URL given as the item query and returns it as
 * a context item.
 */
const urlFetcher: Provider<UrlFetcherSettings> = {
    capabilities(params: CapabilitiesParams, settings: UrlFetcherSettings): CapabilitiesResult {
        // This provider supplies no annotations, hence the empty selector list.
        const result: CapabilitiesResult = { selector: [] }
        return result
    },

    async items(params: ItemsParams, settings: UrlFetcherSettings): Promise<ItemsResult> {
        // Settings are not consulted; the query alone determines what is fetched.
        const fetched = await fetchItem(params)
        return fetched
    },
}
|
||
/**
 * Builds the context item for the URL carried in `params.query`.
 *
 * @param params Item request; its `query` is interpreted as the URL to fetch.
 * @param timeoutMs Optional time limit in milliseconds for the fetch; no limit when omitted.
 * @returns A single-item list holding the fetched content, or an empty list when there is no
 * query, no content could be obtained, or fetching threw.
 */
async function fetchItem(params: ItemsParams, timeoutMs?: number): Promise<ItemsResult> {
    const target = params.query
    if (!target) {
        return []
    }

    try {
        const pageText = await fetchContentForURLContextItem(
            target.toString(),
            timeoutSignal(timeoutMs)
        )
        return pageText === null
            ? []
            : [
                  {
                      url: target,
                      // Fall back to the URL itself when the page has no recognizable title.
                      title: tryGetHTMLDocumentTitle(pageText) ?? target,
                      ui: { hover: { text: `Fetched from ${target}` } },
                      ai: { content: pageText },
                  },
              ]
    } catch {
        // Suppress errors because the user might be typing a URL that is not yet valid.
        return []
    }
}
|
||
/**
 * Fetches a web page and returns its body with scripts, styles, SVGs, comments, and noisy
 * attributes stripped, truncated to a fixed maximum length.
 *
 * @param urlStr Absolute URL to fetch. Only `http:` and `https:` URLs whose hostname is
 * `localhost` or ends in a dot followed by at least two word characters are fetched.
 * @param signal Optional abort signal passed through to `fetch`.
 * @returns The cleaned (and possibly truncated) body text, or `null` when the URL is not
 * eligible or the response status is not OK.
 * @throws If `urlStr` cannot be parsed as a URL, or if `fetch` / reading the body rejects
 * (including when `signal` aborts).
 */
async function fetchContentForURLContextItem(
    urlStr: string,
    signal?: AbortSignal
): Promise<string | null> {
    const url = new URL(urlStr)
    // `URL.protocol` includes the trailing colon (e.g. 'https:'), so compare against that form.
    if (url.protocol !== 'http:' && url.protocol !== 'https:') {
        return null
    }
    // Skip hostnames that do not look complete yet (the user may still be typing the URL).
    if (!/(localhost|\.\w{2,})$/.test(url.hostname)) {
        return null
    }

    const resp = await fetch(urlStr, { signal })
    if (!resp.ok) {
        return null
    }
    const body = await resp.text()

    // HACK(sqs): Rudimentarily strip HTML tags, script, and other unneeded elements from body using
    // regexp. This is NOT intending to be a general-purpose HTML parser and is NOT sanitizing the
    // value for security.
    const bodyWithoutTags = body
        .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
        .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
        .replace(/<svg\b[^<]*(?:(?!<\/svg>)<[^<]*)*<\/svg>/gi, '')
        .replace(/<!--.*?-->/gs, '')
        .replace(/\s(?:class|style)=["'][^"']*["']/gi, '')
        .replace(/\sdata-[\w-]+(=["'][^"']*["'])?/gi, '')

    // TODO(sqs): Arbitrarily trim the response text to avoid overflowing the context window for the
    // LLM. Ideally we would make the prompt builder prioritize this context item over other context
    // because it is explicitly from the user.
    const MAX_LENGTH = 14000
    return bodyWithoutTags.length > MAX_LENGTH
        ? `${bodyWithoutTags.slice(0, MAX_LENGTH)}... (web page content was truncated)`
        : bodyWithoutTags
}
|
||
/**
 * Try to get the title of an HTML document, using incomplete regexp parsing for simplicity (because
 * this feature is experimental and we don't need robustness yet).
 *
 * The `<title>` tag is matched case-insensitively and may carry attributes. Surrounding whitespace
 * is trimmed from the result.
 *
 * @param html HTML source to search.
 * @returns The trimmed title text, or `undefined` when no non-empty title is found.
 */
function tryGetHTMLDocumentTitle(html: string): string | undefined {
    const title = html
        .match(/<title(?:\s[^>]*)?>(?<title>[^<]+)<\/title\s*>/i)
        ?.groups?.title?.trim()
    // Treat an empty (whitespace-only) title as missing so callers can apply their fallback.
    return title || undefined
}
|
||
/**
 * Creates an abort signal that aborts with the reason 'timeout' after the given delay.
 *
 * @param ms Delay in milliseconds; when omitted, no signal is created.
 * @returns The signal, or `undefined` when `ms` is `undefined`.
 */
function timeoutSignal(ms?: number): AbortSignal | undefined {
    if (ms !== undefined) {
        const aborter = new AbortController()
        setTimeout(() => {
            aborter.abort('timeout')
        }, ms)
        return aborter.signal
    }
    return undefined
}
|
||
export default urlFetcher |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
{ | ||
"name": "@openctx/provider-url-fetcher", | ||
"version": "0.0.1", | ||
"description": "URL Fetcher (OpenCtx provider)", | ||
"license": "Apache-2.0", | ||
"repository": { | ||
"type": "git", | ||
"url": "https://github.com/sourcegraph/openctx", | ||
"directory": "provider/url-fetcher" | ||
}, | ||
"type": "module", | ||
"main": "dist/index.js", | ||
"types": "dist/index.d.ts", | ||
"files": [ | ||
"dist", | ||
"index.ts", | ||
"!**/*.test.*", | ||
"README.md" | ||
], | ||
"sideEffects": false, | ||
"scripts": { | ||
"build": "tsc --build", | ||
"test": "vitest", | ||
"bundle": "esbuild --bundle --format=esm --outfile=dist/bundle.js index.ts" | ||
}, | ||
"dependencies": { | ||
"@openctx/provider": "workspace:*" | ||
}, | ||
"devDependencies": { | ||
"esbuild": "^0.19.11" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
{ | ||
"extends": "../../.config/tsconfig.base.json", | ||
"compilerOptions": { | ||
"module": "ESNext", | ||
"rootDir": ".", | ||
"outDir": "dist", | ||
"lib": ["ESNext"], | ||
}, | ||
"include": ["*.ts"], | ||
"exclude": ["dist", "vitest.config.ts"], | ||
"references": [{ "path": "../../lib/provider" }], | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
import { defineConfig } from 'vitest/config' | ||
|
||
// Vitest configuration for this package; the empty object specifies no overrides.
export default defineConfig({})