Commit 612a7e1

[FEAT] Website depth scraping data connector (#1191)

* WIP website depth scraping (sort of works)
* website depth data connector stable + add maxLinks option
* linting + loading small ui tweak
* refactor website depth data connector for stability, speed, & readability
* patch: remove console log; guard clause on URL validity check; reasonable overrides

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>

1 parent: b6be43b
Showing 8 changed files with 356 additions and 0 deletions.
@@ -0,0 +1,153 @@
const { v4 } = require("uuid");
const {
  PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer");
const { default: slugify } = require("slugify");
const { parse } = require("node-html-parser");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const path = require("path");
const fs = require("fs");

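// Collect same-origin links reachable from startUrl, bounded by depth and maxLinks.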
async function discoverLinks(startUrl, depth = 1, maxLinks = 20) {
  const baseUrl = new URL(startUrl).origin;
  const discoveredLinks = new Set();
  const pendingLinks = [startUrl];
  let currentLevel = 0;
  depth = depth < 1 ? 1 : depth;
  maxLinks = maxLinks < 1 ? 1 : maxLinks;

  // Check depth and if there are any links left to scrape
  while (currentLevel < depth && pendingLinks.length > 0) {
    const newLinks = await getPageLinks(pendingLinks[0], baseUrl);
    pendingLinks.shift();

    for (const link of newLinks) {
      if (!discoveredLinks.has(link)) {
        discoveredLinks.add(link);
        pendingLinks.push(link);
      }

      // Exit out if we reach maxLinks
      if (discoveredLinks.size >= maxLinks) {
        return Array.from(discoveredLinks).slice(0, maxLinks);
      }
    }

    if (pendingLinks.length === 0) {
      currentLevel++;
    }
  }

  return Array.from(discoveredLinks);
}

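// Load a single page with Puppeteer and return the links found in its HTML.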
async function getPageLinks(url, baseUrl) {
  try {
    const loader = new PuppeteerWebBaseLoader(url, {
      launchOptions: { headless: "new" },
      gotoOptions: { waitUntil: "domcontentloaded" },
    });
    const docs = await loader.load();
    const html = docs[0].pageContent;
    const links = extractLinks(html, baseUrl);
    return links;
  } catch (error) {
    console.error(`Failed to get page links from ${url}.`, error);
    return [];
  }
}

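// Parse anchor tags from the HTML and keep absolute URLs that stay on the base origin.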
function extractLinks(html, baseUrl) {
  const root = parse(html);
  const links = root.querySelectorAll("a");
  const extractedLinks = new Set();

  for (const link of links) {
    const href = link.getAttribute("href");
    if (href) {
      const absoluteUrl = new URL(href, baseUrl).href;
      if (absoluteUrl.startsWith(baseUrl)) {
        extractedLinks.add(absoluteUrl);
      }
    }
  }

  return Array.from(extractedLinks);
}

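// Scrape each link's visible text, write it to the output folder as a document, and collect the results.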
async function bulkScrapePages(links, outputFolder) {
  const scrapedData = [];

  for (let i = 0; i < links.length; i++) {
    const link = links[i];
    console.log(`Scraping ${i + 1}/${links.length}: ${link}`);

    try {
      const loader = new PuppeteerWebBaseLoader(link, {
        launchOptions: { headless: "new" },
        gotoOptions: { waitUntil: "domcontentloaded" },
        async evaluate(page, browser) {
          const result = await page.evaluate(() => document.body.innerText);
          await browser.close();
          return result;
        },
      });
      const docs = await loader.load();
      const content = docs[0].pageContent;

      if (!content.length) {
        console.warn(`Empty content for ${link}. Skipping.`);
        continue;
      }

      const url = new URL(link);
      const filename = (url.host + "-" + url.pathname).replace(".", "_");

      const data = {
        id: v4(),
        url: "file://" + slugify(filename) + ".html",
        title: slugify(filename) + ".html",
        docAuthor: "no author found",
        description: "No description found.",
        docSource: "URL link uploaded by the user.",
        chunkSource: `link://${link}`,
        published: new Date().toLocaleString(),
        wordCount: content.split(" ").length,
        pageContent: content,
        token_count_estimate: tokenizeString(content).length,
      };

      writeToServerDocuments(data, data.title, outputFolder);
      scrapedData.push(data);

      console.log(`Successfully scraped ${link}.`);
    } catch (error) {
      console.error(`Failed to scrape ${link}.`, error);
    }
  }

  return scrapedData;
}

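// Entry point: discover links from startUrl, then scrape each page into server/storage/documents/<slugified hostname>.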
async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
  const websiteName = new URL(startUrl).hostname;
  const outputFolder = path.resolve(
    __dirname,
    `../../../../server/storage/documents/${slugify(websiteName)}`
  );

  fs.mkdirSync(outputFolder, { recursive: true });

  console.log("Discovering links...");
  const linksToScrape = await discoverLinks(startUrl, depth, maxLinks);
  console.log(`Found ${linksToScrape.length} links to scrape.`);

  console.log("Starting bulk scraping...");
  const scrapedData = await bulkScrapePages(linksToScrape, outputFolder);
  console.log(`Scraped ${scrapedData.length} pages.`);

  return scrapedData;
}

module.exports = websiteScraper;
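
For reference, a minimal sketch of how the exported function might be called from elsewhere in the collector (the require path and URL are illustrative, not part of this commit):

  const websiteScraper = require("./index");

  (async () => {
    // Follow links one level deep and stop after 10 same-origin pages.
    const docs = await websiteScraper("https://example.com", 1, 10);
    console.log(`Stored ${docs.length} documents.`);
  })();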