
Commit 612a7e1

[FEAT] Website depth scraping data connector (#1191)
* WIP website depth scraping (sort of works)

* website depth data connector stable + add maxLinks option

* linting + small loading UI tweak

* refactor website depth data connector for stability, speed, & readability

* patch: remove console log
  Guard clause on URL validity check
  Reasonable overrides

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
shatfield4 and timothycarambat committed May 14, 2024
1 parent b6be43b commit 612a7e1
Showing 8 changed files with 356 additions and 0 deletions.
20 changes: 20 additions & 0 deletions collector/extensions/index.js
@@ -1,5 +1,6 @@
const { verifyPayloadIntegrity } = require("../middleware/verifyIntegrity");
const { reqBody } = require("../utils/http");
const { validURL } = require("../utils/url");

function extensions(app) {
  if (!app) return;
@@ -86,6 +87,25 @@ function extensions(app) {
    }
  );

  app.post(
    "/ext/website-depth",
    [verifyPayloadIntegrity],
    async function (request, response) {
      try {
        const websiteDepth = require("../utils/extensions/WebsiteDepth");
        const { url, depth = 1, maxLinks = 20 } = reqBody(request);
        if (!validURL(url))
          return response
            .status(400)
            .json({ success: false, reason: "Not a valid URL." });

        const scrapedData = await websiteDepth(url, depth, maxLinks);
        response.status(200).json({ success: true, data: scrapedData });
      } catch (e) {
        console.error(e);
        response.status(400).json({ success: false, reason: e.message });
      }
      return;
    }
  );

  app.post(
    "/ext/confluence",
    [verifyPayloadIntegrity],
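For reference, the new route only requires url; depth and maxLinks fall back to 1 and 20. The snippet below is a hypothetical client sketch, not code from this commit: it assumes the collector is reachable on localhost:8888 and leaves out whatever signed header verifyPayloadIntegrity expects, so a real request would need that header added.

// Hypothetical client sketch (not part of this commit).
// Assumes the collector listens on localhost:8888; the integrity header
// required by verifyPayloadIntegrity must be supplied for a real request.
async function requestWebsiteDepthScrape() {
  const res = await fetch("http://localhost:8888/ext/website-depth", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      url: "https://example.com", // starting page
      depth: 2, // follow same-origin links two levels deep
      maxLinks: 20, // stop discovery after 20 unique links
    }),
  });
  return await res.json(); // { success: true, data: [...] } on success
}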
153 changes: 153 additions & 0 deletions collector/utils/extensions/WebsiteDepth/index.js
@@ -0,0 +1,153 @@
const { v4 } = require("uuid");
const {
  PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer");
const { default: slugify } = require("slugify");
const { parse } = require("node-html-parser");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const path = require("path");
const fs = require("fs");

async function discoverLinks(startUrl, depth = 1, maxLinks = 20) {
  const baseUrl = new URL(startUrl).origin;
  const discoveredLinks = new Set();
  const pendingLinks = [startUrl];
  let currentLevel = 0;
  depth = depth < 1 ? 1 : depth;
  maxLinks = maxLinks < 1 ? 1 : maxLinks;

  // Check depth and if there are any links left to scrape
  while (currentLevel < depth && pendingLinks.length > 0) {
    const newLinks = await getPageLinks(pendingLinks[0], baseUrl);
    pendingLinks.shift();

    for (const link of newLinks) {
      if (!discoveredLinks.has(link)) {
        discoveredLinks.add(link);
        pendingLinks.push(link);
      }

      // Exit out if we reach maxLinks
      if (discoveredLinks.size >= maxLinks) {
        return Array.from(discoveredLinks).slice(0, maxLinks);
      }
    }

    if (pendingLinks.length === 0) {
      currentLevel++;
    }
  }

  return Array.from(discoveredLinks);
}

// Load a single page with Puppeteer and return the same-origin links found in it.
async function getPageLinks(url, baseUrl) {
  try {
    const loader = new PuppeteerWebBaseLoader(url, {
      launchOptions: { headless: "new" },
      gotoOptions: { waitUntil: "domcontentloaded" },
    });
    const docs = await loader.load();
    const html = docs[0].pageContent;
    const links = extractLinks(html, baseUrl);
    return links;
  } catch (error) {
    console.error(`Failed to get page links from ${url}.`, error);
    return [];
  }
}

function extractLinks(html, baseUrl) {
  const root = parse(html);
  const links = root.querySelectorAll("a");
  const extractedLinks = new Set();

  for (const link of links) {
    const href = link.getAttribute("href");
    if (href) {
      const absoluteUrl = new URL(href, baseUrl).href;
      if (absoluteUrl.startsWith(baseUrl)) {
        extractedLinks.add(absoluteUrl);
      }
    }
  }

  return Array.from(extractedLinks);
}

// Scrape each discovered link, convert the page text into a document object,
// and persist it to the output folder.
async function bulkScrapePages(links, outputFolder) {
  const scrapedData = [];

  for (let i = 0; i < links.length; i++) {
    const link = links[i];
    console.log(`Scraping ${i + 1}/${links.length}: ${link}`);

    try {
      const loader = new PuppeteerWebBaseLoader(link, {
        launchOptions: { headless: "new" },
        gotoOptions: { waitUntil: "domcontentloaded" },
        async evaluate(page, browser) {
          const result = await page.evaluate(() => document.body.innerText);
          await browser.close();
          return result;
        },
      });
      const docs = await loader.load();
      const content = docs[0].pageContent;

      if (!content.length) {
        console.warn(`Empty content for ${link}. Skipping.`);
        continue;
      }

      const url = new URL(link);
      const filename = (url.host + "-" + url.pathname).replace(".", "_");

      const data = {
        id: v4(),
        url: "file://" + slugify(filename) + ".html",
        title: slugify(filename) + ".html",
        docAuthor: "no author found",
        description: "No description found.",
        docSource: "URL link uploaded by the user.",
        chunkSource: `link://${link}`,
        published: new Date().toLocaleString(),
        wordCount: content.split(" ").length,
        pageContent: content,
        token_count_estimate: tokenizeString(content).length,
      };

      writeToServerDocuments(data, data.title, outputFolder);
      scrapedData.push(data);

      console.log(`Successfully scraped ${link}.`);
    } catch (error) {
      console.error(`Failed to scrape ${link}.`, error);
    }
  }

  return scrapedData;
}

async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
  const websiteName = new URL(startUrl).hostname;
  const outputFolder = path.resolve(
    __dirname,
    `../../../../server/storage/documents/${slugify(websiteName)}`
  );

  fs.mkdirSync(outputFolder, { recursive: true });

  console.log("Discovering links...");
  const linksToScrape = await discoverLinks(startUrl, depth, maxLinks);
  console.log(`Found ${linksToScrape.length} links to scrape.`);

  console.log("Starting bulk scraping...");
  const scrapedData = await bulkScrapePages(linksToScrape, outputFolder);
  console.log(`Scraped ${scrapedData.length} pages.`);

  return scrapedData;
}

module.exports = websiteScraper;
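As a rough illustration of how the exported scraper is meant to be driven, the snippet below calls it directly from a small Node script. This is a sketch, not commit code: the relative require path assumes the script sits in the collector package root, with the langchain/puppeteer dependencies already installed there.

// Hypothetical standalone usage (not part of this commit).
// Assumes this script lives in collector/, next to utils/.
const websiteDepth = require("./utils/extensions/WebsiteDepth");

(async () => {
  // Crawl two levels deep, capped at 10 unique same-origin links; pages are
  // written under server/storage/documents/<hostname>/ by the scraper itself.
  const docs = await websiteDepth("https://example.com", 2, 10);
  console.log(`Scraped ${docs.length} pages`);
})();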
2 changes: 2 additions & 0 deletions frontend/src/components/DataConnectorOption/media/index.js
@@ -1,10 +1,12 @@
import Github from "./github.svg";
import YouTube from "./youtube.svg";
import Link from "./link.svg";
import Confluence from "./confluence.jpeg";

const ConnectorImages = {
  github: Github,
  youtube: YouTube,
  websiteDepth: Link,
  confluence: Confluence,
};

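On the frontend, the new websiteDepth key simply reuses the existing link icon. A hypothetical lookup could look like the following; the "@/" import alias and a default export of ConnectorImages are assumptions, since neither appears in this hunk.

// Hypothetical usage sketch (not part of this commit).
// Assumes ConnectorImages is the module's default export and that the
// frontend resolves "@/" to its src/ directory.
import ConnectorImages from "@/components/DataConnectorOption/media";

export default function WebsiteDepthIcon() {
  return <img src={ConnectorImages.websiteDepth} alt="Website depth connector" />;
}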
1 change: 1 addition & 0 deletions frontend/src/components/DataConnectorOption/media/link.svg
(SVG icon file; contents not rendered in the diff view.)
