Google search page contents #2419

Draft · wants to merge 6 commits into base: main
103 changes: 88 additions & 15 deletions api/app/clients/tools/structured/GoogleSearch.js
@@ -1,6 +1,11 @@
const axios = require('axios').default;
const { z } = require('zod');
const { Tool } = require('@langchain/core/tools');
const { getEnvironmentVariable } = require('@langchain/core/utils/env');
const { JSDOM, VirtualConsole } = require('jsdom');
const { Readability } = require('@mozilla/readability');
const { logger } = require('~/config');
const { ProxyAgent } = require('proxy-agent');

class GoogleSearchResults extends Tool {
static lc_name() {
@@ -31,6 +36,46 @@ class GoogleSearchResults extends Tool {
.describe('The maximum number of search results to return. Defaults to 10.'),
// Note: Google API has its own parameters for search customization, adjust as needed.
});

// this will get proxy configuration from the standard environment variables
// https://github.com/TooTallNate/proxy-agents/tree/main/packages/proxy-agent
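// e.g. HTTPS_PROXY=http://proxy.example.com:8080 and NO_PROXY=localhost,127.0.0.1
// (illustrative values; when no proxy variables are set, requests go out directly)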
this.agent = new ProxyAgent();
}

async handleItem(item) {
// fetch the page at item.link and parse it into a DOM for Readability,
// but first set up a virtual console that swallows jsdom errors, since jsdom
// often fails to parse style sheets; see https://stackoverflow.com/a/69958999/532513
const virtualConsole = new VirtualConsole();
virtualConsole.on('error', () => {
// No-op to skip console errors.
});

// fetch the raw page HTML with axios, routing through the proxy agent if one is configured
let resp;
try {
resp = await axios.get(item.link, { httpAgent: this.agent, httpsAgent: this.agent });
} catch (error) {
// catch the error here, otherwise one failed page would reject the whole Promise.all
// note: error.response is undefined for network-level failures, hence the optional chaining
logger.error(`Error fetching page ${item.link}: ${error} - ${error.response?.data ?? ''}`);
return null;
}
const dom = new JSDOM(resp.data, { virtualConsole }).window.document;

// parse the DOM using Readability; the result exposes title, content,
// textContent (the plain text we want), length (characters), excerpt, etc.
const article = new Readability(dom).parse();

// this means Readability could not extract anything
if (!article) {
return null;
}

// blank out whitespace-only lines in textContent (the m flag anchors ^ and $ to each line)
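// the returned string looks like (illustrative):
//   Title: <page title>
//
//   Link: <page URL>
//
//   <extracted article text>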
return `Title: ${item.title}\n\nLink: ${item.link}\n\n${article.textContent.replace(
/^[\s]*$/gm,
'',
)}\n`;
}

async _call(input) {
@@ -41,24 +86,52 @@ class GoogleSearchResults extends Tool {

const { query, max_results = 5 } = validationResult.data;

const response = await fetch(
`https://www.googleapis.com/customsearch/v1?key=${this.apiKey}&cx=${
this.searchEngineId
}&q=${encodeURIComponent(query)}&num=${max_results}`,
{
method: 'GET',
headers: {
'Content-Type': 'application/json',
let response;
try {
response = await axios.get(
`https://www.googleapis.com/customsearch/v1?key=${this.apiKey}&cx=${
this.searchEngineId
}&q=${encodeURIComponent(query)}&num=${max_results}`,
{
headers: {
'Content-Type': 'application/json',
},
},
},
);

const json = await response.json();
if (!response.ok) {
throw new Error(`Request failed with status ${response.status}: ${json.error.message}`);
);
} catch (error) {
// the default axios error only shows the status code; include the response body,
// which usually carries more information (error.response is absent on network failures)
throw new Error(
`Request failed with status ${error.response?.status}: ${error.response?.data ?? error.message}`,
);
}

return JSON.stringify(json);
// retrieve the text contents of all search result pages in parallel
// (items may be missing from the response when the query returns no results)
const webPages = (
await Promise.all((response.data.items ?? []).map((item) => this.handleItem(item)))
)
// drop the nulls, i.e. pages that could not be fetched or that Readability could not parse
.filter((item) => item !== null)
// add a sequentially numbered heading to each remaining page
.map((item, idx) => `## Web page ${idx + 1}\n${item}`);

// truncate each webPage to truncWords words
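// (1300 words is roughly 1700-2000 tokens of English text, assuming ~1.3-1.5 tokens per word;
// this is an illustrative estimate, so tune truncWords to the target model's context window)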
const truncWords = 1300;
const truncWebPages = webPages.map((webPage) => {
const words = webPage.split(' ');
if (words.length <= truncWords) {
return webPage;
} else {
return words.slice(0, truncWords).join(' ') + ' ...';
}
});

const res =
'When writing your response, please cite the web pages inline using "[source N](LINK)" ' +
'with the number N from the "## Web page N" headings and LINK from the "Link:" at the start of the relevant web page. \n\n' +
`${truncWebPages.join('\n\n')}`;
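// illustrative example of the citation style this instruction asks the model to produce:
//   "LibreChat can route these requests through a proxy [source 1](https://example.com/page)."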
return res;
}
}

6 changes: 6 additions & 0 deletions api/package.json
@@ -41,6 +41,7 @@
"@langchain/community": "^0.0.46",
"@langchain/google-genai": "^0.0.11",
"@langchain/google-vertexai": "^0.0.5",
"@mozilla/readability": "^0.5.0",
"axios": "^1.3.4",
"bcryptjs": "^2.4.3",
"cheerio": "^1.0.0-rc.12",
@@ -58,6 +59,8 @@
"googleapis": "^126.0.1",
"handlebars": "^4.7.7",
"html": "^1.0.0",
"http-proxy-agent": "^7.0.2",
"https-proxy-agent": "^7.0.4",
"ioredis": "^5.3.2",
"js-yaml": "^4.1.0",
"jsonwebtoken": "^9.0.0",
@@ -86,7 +89,9 @@
"passport-jwt": "^4.0.1",
"passport-local": "^1.0.0",
"pino": "^8.12.1",
"proxy-agent": "^6.4.0",
"sharp": "^0.32.6",
"socks-proxy-agent": "^8.0.3",
"tiktoken": "^1.0.10",
"traverse": "^0.6.7",
"ua-parser-js": "^1.0.36",
@@ -95,6 +100,7 @@
"zod": "^3.22.4"
},
"devDependencies": {
"@types/mozilla-readability": "^0.2.1",
"jest": "^29.5.0",
"nodemon": "^3.0.1",
"supertest": "^6.3.3"