Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "enhance: optimistic search page loading and improved content refinement" #424

Merged
merged 1 commit into from
Feb 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 3 additions & 26 deletions google/search/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions google/search/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@
"express": "^4.18.2",
"global-cache-dir": "^6.0.0",
"playwright": "^1.46.0",
"prettier": "^3.4.2",
"tiktoken": "^1.0.18",
"ts-node-dev": "^2.0.0",
"turndown": "^7.1.3"
}
Expand Down
38 changes: 0 additions & 38 deletions google/search/src/headers.ts

This file was deleted.

218 changes: 59 additions & 159 deletions google/search/src/refine.ts
Original file line number Diff line number Diff line change
@@ -1,139 +1,39 @@
import { encoding_for_model } from "tiktoken"
import {GPTScript, type ToolDef} from "@gptscript-ai/gptscript"
import {type SearchResult, type SearchResults} from "./search.ts"
import {type ModelProviderCredentials} from "./headers.ts"

// Max number of tokens in the search results
const MAX_RESULTS_TOKENS = 50000

const gptscript = new GPTScript()

export async function refine (creds: ModelProviderCredentials | undefined, unrefined: SearchResults): Promise<SearchResults> {
const totalUnrefinedTokens = tokenCount(unrefined.results.reduce((acc, result) => acc + result.content, ''))
if (totalUnrefinedTokens <= MAX_RESULTS_TOKENS) {
console.info(`Total tokens (${totalUnrefinedTokens}) are within the limit (${MAX_RESULTS_TOKENS}), skipping refinement`)
return unrefined
}

if (!creds) {
console.warn('No model provider credentials provided, skipping refinement')
return unrefined
}

console.info(`Total tokens (${totalUnrefinedTokens}) are above the limit (${MAX_RESULTS_TOKENS}), calling GPTScript to refine results`)

const now = userDateTime()
let refined = await Promise.all(
export async function refine (unrefined: SearchResults): Promise<SearchResults> {
const now = new Date().toISOString()
const refined = await Promise.all(
unrefined.results.map(async (result) => {
const refinedContent = await refineContent(creds, now, unrefined.query, result)
const refinedTokens = tokenCount(refinedContent.content)
return {
...result,
...refinedContent,
refinedTokens
// Skip refinement for small results. The fallback must be parenthesized:
// `<=` binds tighter than `??`, so `len ?? 0 <= 10000` is `len ?? true`,
// which is truthy for every non-empty result and never compares the size.
if ((result.content?.length ?? 0) <= 10000) {
  // Don't refine content whose length is 10k or less
  // (this measures `.length`, not model tokens)
  return result
}

return await refineResult(now, unrefined.query, result)
})
)

const totalRefinedTokens = refined.reduce((sum, r) => sum + r.refinedTokens, 0)
if (totalRefinedTokens <= MAX_RESULTS_TOKENS) {
// If the refined tokens already fit the limit, return as is.
return { query: unrefined.query, results: refined }
}

// Filter zero score or zero tokens
refined = refined.filter(r => r.score > 0 && r.refinedTokens > 0)

// Sort by "value density" = score / tokens (descending)
refined.sort((a, b) => (b.score / b.refinedTokens) - (a.score / a.refinedTokens))

const pruned: SearchResult[] = []
let tokenBudget = MAX_RESULTS_TOKENS

for (const r of refined) {
if (tokenBudget < 1) break

if (r.refinedTokens >= tokenBudget) {
// If the result is too long, truncate it to fit the budget
const truncated = truncateContent(r.content, tokenBudget)
pruned.push({
...r,
content: truncated.content,
})

// Consume the tokens from the budget
tokenBudget -= truncated.tokenCount
continue
}

// The entire result fits in the budget, so add it to the pruned results
pruned.push(r)
tokenBudget -= r.refinedTokens
}

return { query: unrefined.query, results: pruned }
}

/**
 * Counts the tokens in `content` using the encoding for the 'gpt-4o-mini' model.
 *
 * @param content - Text to measure; may be omitted.
 * @returns The number of encoded tokens, or 0 when the content is missing/empty
 *          or when encoding throws (the error is logged as a warning).
 */
function tokenCount (content?: string): number {
  const isEmpty = !content || content.length === 0
  if (isEmpty) return 0

  const encoder = encoding_for_model('gpt-4o-mini')
  let total = 0
  try {
    total = encoder.encode(content).length
  } catch (err) {
    console.warn('Error encoding content', err)
  } finally {
    // Release the encoder's resources whether or not encoding succeeded
    encoder.free()
  }

  return total
}


function truncateContent (content: string, maxTokens: number): {
content: string,
tokenCount: number
} {
const codec = encoding_for_model('gpt-4o-mini');
try {
const tokens = codec.encode(content)
const truncated = tokens.slice(0, maxTokens)
return {
content: new TextDecoder().decode(truncated),
tokenCount: truncated.length
}
} finally {
codec.free()
return {
...unrefined,
results: refined.filter(result => hasContent(result.content))
}
}


function userDateTime (): string {
const tz = process.env.TIMEZONE || 'UTC';
try {
new Intl.DateTimeFormat('en-US', { timeZone: tz });
} catch {
return new Date().toLocaleString('en-US', { timeZone: 'UTC', timeZoneName: 'short' });
}
return new Date().toLocaleString('en-US', { timeZone: tz, timeZoneName: 'short' });
/**
 * Reports whether a search result carries any non-blank content.
 *
 * @param content - A single string or a list of strings; may be omitted.
 * @returns `true` when at least one non-whitespace character is present;
 *          `false` for missing content, blank strings, empty arrays, and
 *          arrays that contain only blank strings.
 */
function hasContent (content?: string | string[]): boolean {
  // Missing content must count as "no content": `undefined?.trim() === ''`
  // is false, so a negated emptiness check would report it as present.
  if (content == null) {
    return false
  }

  if (Array.isArray(content)) {
    // Entries may come from parsed JSON, so guard against non-string items
    return content.some(part => typeof part === 'string' && part.trim() !== '')
  }

  return content.trim() !== ''
}


async function refineContent (
creds: ModelProviderCredentials,
async function refineResult (
time: string,
query: string,
result: SearchResult): Promise<{
content: string,
score: number
}> {
result: SearchResult): Promise<SearchResult> {

const tool: ToolDef = {
chat: false,
jsonResponse: false,
jsonResponse: true,
modelName: process.env.OBOT_DEFAULT_LLM_MINI_MODEL ?? 'gpt-4o-mini',
temperature: 0.0,
arguments: {
Expand All @@ -143,54 +43,38 @@ async function refineContent (
type: 'string',
description: 'Current date and time that the search was requested at'
},
topic: {
query: {
type: 'string',
description: 'Topic to extract excerpts for'
description: 'query or subject matter to generate citations for'
},
url: {
type: 'string',
description: 'URL that the markdown content was sourced from'
description: 'URL that the content was sourced from'
},
content: {
type: 'string',
description: 'Markdown document created by exporting an HTML web page to markdown'
description: 'Markdown content to cite'
}
},
required: ['time', 'topic', 'url', 'content']
required: ['query', 'url', 'content']
},
instructions: refineInstructions
}

const run = await gptscript.evaluate(tool, {
BaseURL: creds.baseUrl,
APIKey: creds.apiKey,
input: minify({
time,
topic: query,
url: result.url,
content: result.content
input: JSON.stringify({
query,
...result,
time
})
})

// Parse the output into a score and content
const output = await run.text()
const [firstLine, ...restLines] = output?.split('\n') ?? []
const score = Math.max(1, Math.min(10, parseInt(firstLine, 10))) || 0
const content = restLines.join('\n')

return { score, content }
return await run.json()
}

// Note: Tools can't introspect their parameters schema, so we provide it in the instructions as well
const refineInstructions = `
Do not respond with any additional dialog or commentary.

You are a research assistant tasked with extracting excerpts from a markdown document that will
be used as notes to conduct detailed research about a given topic.

The document is the result of exporting an HTML webpage to markdown.

When given an object with the following JSON schema:
Given an object with the following JSON schema:

${minify({
type: 'object',
Expand All @@ -199,35 +83,51 @@ ${minify({
type: 'string',
description: 'Current date and time that the search was requested at'
},
topic: {
query: {
type: 'string',
description: 'Topic to extract excerpts for'
description: 'Query or subject matter to generate citations for'
},
url: {
type: 'string',
description: 'URL that the markdown content was sourced from'
description: 'URL that the content was sourced from'
},
content: {
type: 'string',
description: 'Markdown document created by exporting an HTML web page to markdown'
description: 'Markdown content to cite'
}
},
required: ['time', 'topic', 'url', 'content', 'time']
required: ['query', 'url', 'content', 'time']
})}

Perform the following steps in order:
1. Refine the markdown content by removing all:
- boilerplate and unintelligable text
- unrelated advertisements, links, and web page structure
2. Select excerpts from the refined content that you think would make good notes for conducting detailed research about the topic
3. Compose a concise markdown document containing the excerpts organized in descending order of importance to understanding the topic. Do not paraphrase, summarize, or reword the excerpts. The goal is to preserve as much of the original content as possible.
4. Grade the corpus of excerpts as a whole based how well it covers the topic on a scale of 0-10, where high scores are good and low scores contain no relevant information
Select all markdown from \${CONTENT} containing information useful to cite when researching \${QUERY}.
Selected markdown should contain the most useful and relevant information to \${QUERY} available in \${CONTENT}.
Don't select markdown that is not helpful or related to \${QUERY}.

Respond with a single object containing all of the selected markdown that adheres to the following JSON schema:

Afterwards, respond with the grade followed by the markdown document on a new line.
${minify({
type: 'object',
properties: {
url: {
type: 'string',
description: 'URL that the content was sourced from'
},
title: {
type: 'string',
description: 'Main title of the source content'
},
content: {
type: 'array',
description: 'Cleaned up markdown from the original content that can be cited to research the query',
items: {
type: 'string'
}
}
},
required: ['url', 'title', 'content']
})}

EXAMPLE
5
<content of markdown document>
Do not respond with any additional dialog or commentary.
`

function minify (obj: object): string {
Expand Down
Loading