obot-platform · njhale · Feb 7, 2025 · Feb 7, 2025
diff --git a/google/search/package-lock.json b/google/search/package-lock.json
diff --git a/google/search/package.json b/google/search/package.json
@@ -31,8 +31,6 @@
     "express": "^4.18.2",
     "global-cache-dir": "^6.0.0",
     "playwright": "^1.46.0",
-    "prettier": "^3.4.2",
-    "tiktoken": "^1.0.18",
     "ts-node-dev": "^2.0.0",
     "turndown": "^7.1.3"
   }

diff --git a/google/search/src/headers.ts b/google/search/src/headers.ts
diff --git a/google/search/src/refine.ts b/google/search/src/refine.ts
@@ -1,139 +1,39 @@
-import { encoding_for_model } from "tiktoken"
 import {GPTScript, type ToolDef} from "@gptscript-ai/gptscript"
 import {type SearchResult, type SearchResults} from "./search.ts"
-import {type ModelProviderCredentials} from "./headers.ts"
-
-// Max number of tokens in the search results
-const MAX_RESULTS_TOKENS = 50000
 
 const gptscript = new GPTScript()
 
-export async function refine (creds: ModelProviderCredentials | undefined, unrefined: SearchResults): Promise<SearchResults> {
-  const totalUnrefinedTokens = tokenCount(unrefined.results.reduce((acc, result) => acc + result.content, ''))
-  if (totalUnrefinedTokens <= MAX_RESULTS_TOKENS) {
-    console.info(`Total tokens (${totalUnrefinedTokens}) are within the limit (${MAX_RESULTS_TOKENS}), skipping refinement`)
-    return unrefined
-  }
-
-  if (!creds) {
-    console.warn('No model provider credentials provided, skipping refinement')
-    return unrefined
-  }
-
-  console.info(`Total tokens (${totalUnrefinedTokens}) are above the limit (${MAX_RESULTS_TOKENS}), calling GPTScript to refine results`)
-
-  const now = userDateTime()
-  let refined = await Promise.all(
+export async function refine (unrefined: SearchResults): Promise<SearchResults> {
+  const now = new Date().toISOString()
+  const refined = await Promise.all(
     unrefined.results.map(async (result) => {
-      const refinedContent = await refineContent(creds, now, unrefined.query, result)
-      const refinedTokens = tokenCount(refinedContent.content)
-      return {
-        ...result,
-        ...refinedContent,
-        refinedTokens 
+      if ((result.content?.length ?? 0) <= 10000) { // parentheses required: `??` binds looser than `<=`
+        // Don't refine content whose length is 10k or less (this measures length, not tokens); missing content counts as 0
+        return result
       }
+
+      return await refineResult(now, unrefined.query, result)
     })
   )
 
-  const totalRefinedTokens = refined.reduce((sum, r) => sum + r.refinedTokens, 0)
-  if (totalRefinedTokens <= MAX_RESULTS_TOKENS) {
-    // If the refined tokens already fit the limit, return as is.
-    return { query: unrefined.query, results: refined }
-  }
-
-  // Filter zero score or zero tokens
-  refined = refined.filter(r => r.score > 0 && r.refinedTokens > 0)
-
-  // Sort by "value density" = score / tokens (descending)
-  refined.sort((a, b) => (b.score / b.refinedTokens) - (a.score / a.refinedTokens))
-
-  const pruned: SearchResult[] = []
-  let tokenBudget = MAX_RESULTS_TOKENS
-
-  for (const r of refined) {
-    if (tokenBudget < 1) break
-
-    if (r.refinedTokens >= tokenBudget) {
-      // If the result is too long, truncate it to fit the budget
-      const truncated = truncateContent(r.content, tokenBudget)
-      pruned.push({
-        ...r,
-        content: truncated.content,
-      })
-
-      // Consume the tokens from the budget
-      tokenBudget -= truncated.tokenCount
-      continue
-    }
-
-    // The entire result fits in the budget, so add it to the pruned results
-    pruned.push(r)
-    tokenBudget -= r.refinedTokens
-  }
-
-  return { query: unrefined.query, results: pruned }
-}
-
-function tokenCount (content?: string): number {
-  if (!content || content.length === 0) {
-    return 0
-  }
-
-  const enc = encoding_for_model('gpt-4o-mini');
-  try {
-    return enc.encode(content).length;
-  } catch (e) {
-    console.warn('Error encoding content', e);
-  } finally {
-    // Free encoding resources when done
-    enc.free()
-  }
-
-  return 0
-}
-
-
-function truncateContent (content: string, maxTokens: number): {
-  content: string,
-  tokenCount: number
-} {
-  const codec = encoding_for_model('gpt-4o-mini');
-  try {
-    const tokens = codec.encode(content)
-    const truncated = tokens.slice(0, maxTokens)
-    return {
-      content: new TextDecoder().decode(truncated),
-      tokenCount: truncated.length
-    }
-  } finally {
-    codec.free()
+  return {
+    ...unrefined,
+    results: refined.filter(result => hasContent(result.content))
   }
 }
 
-
-function userDateTime (): string {
-  const tz = process.env.TIMEZONE || 'UTC';
-  try {
-    new Intl.DateTimeFormat('en-US', { timeZone: tz });
-  } catch {
-    return new Date().toLocaleString('en-US', { timeZone: 'UTC', timeZoneName: 'short' });
-  }
-  return new Date().toLocaleString('en-US', { timeZone: tz, timeZoneName: 'short' });
+function hasContent (content?: string | string[]): boolean { // true only when at least one non-blank string is present; undefined counts as empty
+  return Array.isArray(content) ? content.some(part => part.trim() !== '') : (content?.trim() ?? '') !== ''
 }
 
-
-async function refineContent (
-  creds: ModelProviderCredentials,
+async function refineResult (
   time: string,
   query: string,
-  result: SearchResult): Promise<{
-    content: string,
-    score: number
-  }> {
+  result: SearchResult): Promise<SearchResult> {
 
   const tool: ToolDef = {
     chat: false,
-    jsonResponse: false,
+    jsonResponse: true,
     modelName: process.env.OBOT_DEFAULT_LLM_MINI_MODEL ?? 'gpt-4o-mini',
     temperature: 0.0,
     arguments: {
@@ -143,54 +43,38 @@ async function refineContent (
           type: 'string',
           description: 'Current date and time that the search was requested at'
         },
-        topic: {
+        query: {
           type: 'string',
-          description: 'Topic to extract excerpts for'
+          description: 'query or subject matter to generate citations for'
         },
         url: {
           type: 'string',
-          description: 'URL that the markdown content was sourced from'
+          description: 'URL that the content was sourced from'
         },
         content: {
           type: 'string',
-          description: 'Markdown document created by exporting an HTML web page to markdown'
+          description: 'Markdown content to cite'
         }
       },
-      required: ['time', 'topic', 'url', 'content']
+      required: ['query', 'url', 'content', 'time'] // keep in sync with the schema embedded in refineInstructions
     },
     instructions: refineInstructions
   }
 
   const run = await gptscript.evaluate(tool, {
-    BaseURL: creds.baseUrl,
-    APIKey: creds.apiKey,
-    input: minify({
-      time,
-      topic: query,
-      url: result.url,
-      content: result.content
+    input: JSON.stringify({
+      query,
+      ...result,
+      time
     })
   })
 
-  // Parse the output into a score and content
-  const output = await run.text()
-  const [firstLine, ...restLines] = output?.split('\n') ?? []
-  const score = Math.max(1, Math.min(10, parseInt(firstLine, 10))) || 0
-  const content = restLines.join('\n')
-
-  return { score, content }
+  return await run.json()
 }
 
 // Note: Tools can't introspect their parameters schema, so we provide it in the instructions as well
 const refineInstructions = `
-Do not respond with any additional dialog or commentary.
-
-You are a research assistant tasked with extracting excerpts from a markdown document that will
-be used as notes to conduct detailed research about a given topic.
-
-The document is the result of exporting an HTML webpage to markdown.
-
-When given an object with the following JSON schema:
+Given an object with the following JSON schema:
 
 ${minify({
   type: 'object',
@@ -199,35 +83,51 @@ ${minify({
       type: 'string',
       description: 'Current date and time that the search was requested at'
     },
-    topic: {
+    query: {
       type: 'string',
-      description: 'Topic to extract excerpts for'
+      description: 'Query or subject matter to generate citations for'
     },
     url: {
       type: 'string',
-      description: 'URL that the markdown content was sourced from'
+      description: 'URL that the content was sourced from'
     },
     content: {
       type: 'string',
-      description: 'Markdown document created by exporting an HTML web page to markdown'
+      description: 'Markdown content to cite'
     }
   },
-  required: ['time', 'topic', 'url', 'content', 'time']
+  required: ['query', 'url', 'content', 'time']
 })}
 
-Perform the following steps in order:
-1. Refine the markdown content by removing all:
-  - boilerplate and unintelligable text
-  - unrelated advertisements, links, and web page structure
-2. Select excerpts from the refined content that you think would make good notes for conducting detailed research about the topic
-3. Compose a concise markdown document containing the excerpts organized in descending order of importance to understanding the topic. Do not paraphrase, summarize, or reword the excerpts. The goal is to preserve as much of the original content as possible.
-4. Grade the corpus of excerpts as a whole based how well it covers the topic on a scale of 0-10, where high scores are good and low scores contain no relevant information
+Select all markdown from \${CONTENT} containing information useful to cite when researching \${QUERY}.
+Selected markdown should contain the most useful and relevant information to \${QUERY} available in \${CONTENT}.
+Don't select markdown that is not helpful or related to \${QUERY}.
+
+Respond with a single object containing all of the selected markdown that adheres to the following JSON schema:
 
-Afterwards, respond with the grade followed by the markdown document on a new line.
+${minify({
+  type: 'object',
+  properties: {
+    url: {
+      type: 'string',
+      description: 'URL that the content was sourced from'
+    },
+    title: {
+      type: 'string',
+      description: 'Main title of the source content'
+    },
+    content: {
+      type: 'array',
+      description: 'Cleaned up markdown from the original content that can be cited to research the query',
+      items: {
+        type: 'string'
+      }
+    }
+  },
+  required: ['url', 'title', 'content']
+})}
 
-EXAMPLE
-5
-<content of markdown document>
+Do not respond with any additional dialog or commentary.
 `
 
 function minify (obj: object): string {