Commit 612a7e1

[FEAT] Website depth scraping data connector (#1191)

* WIP website depth scraping (sort of works)
* website depth data connector stable + add maxLinks option
* linting + loading small ui tweak
* refactor website depth data connector for stability, speed, & readability
* patch: remove console log; guard clause on URL validity check; reasonable overrides

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>

1 parent: b6be43b
Showing 8 changed files with 356 additions and 0 deletions.
@@ -0,0 +1,153 @@
const { v4 } = require("uuid");
const {
  PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer");
const { default: slugify } = require("slugify");
const { parse } = require("node-html-parser");
const { writeToServerDocuments } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const path = require("path");
const fs = require("fs");

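// Collect same-origin links reachable from startUrl, bounded by depth and maxLinks.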
async function discoverLinks(startUrl, depth = 1, maxLinks = 20) {
  const baseUrl = new URL(startUrl).origin;
  const discoveredLinks = new Set();
  const pendingLinks = [startUrl];
  let currentLevel = 0;
  depth = depth < 1 ? 1 : depth;
  maxLinks = maxLinks < 1 ? 1 : maxLinks;

  // Check depth and if there are any links left to scrape
  while (currentLevel < depth && pendingLinks.length > 0) {
    const newLinks = await getPageLinks(pendingLinks[0], baseUrl);
    pendingLinks.shift();

    for (const link of newLinks) {
      if (!discoveredLinks.has(link)) {
        discoveredLinks.add(link);
        pendingLinks.push(link);
      }

      // Exit out if we reach maxLinks
      if (discoveredLinks.size >= maxLinks) {
        return Array.from(discoveredLinks).slice(0, maxLinks);
      }
    }

    if (pendingLinks.length === 0) {
      currentLevel++;
    }
  }

  return Array.from(discoveredLinks);
}

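// Load a single page with Puppeteer and return the links found in its HTML.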
async function getPageLinks(url, baseUrl) {
  try {
    const loader = new PuppeteerWebBaseLoader(url, {
      launchOptions: { headless: "new" },
      gotoOptions: { waitUntil: "domcontentloaded" },
    });
    const docs = await loader.load();
    const html = docs[0].pageContent;
    const links = extractLinks(html, baseUrl);
    return links;
  } catch (error) {
    console.error(`Failed to get page links from ${url}.`, error);
    return [];
  }
}

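// Parse anchor tags from the HTML and keep absolute URLs that stay on the base origin.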
function extractLinks(html, baseUrl) {
  const root = parse(html);
  const links = root.querySelectorAll("a");
  const extractedLinks = new Set();

  for (const link of links) {
    const href = link.getAttribute("href");
    if (href) {
      const absoluteUrl = new URL(href, baseUrl).href;
      if (absoluteUrl.startsWith(baseUrl)) {
        extractedLinks.add(absoluteUrl);
      }
    }
  }

  return Array.from(extractedLinks);
}

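// Scrape each link's visible text, write it to the output folder as a document, and collect the results.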
async function bulkScrapePages(links, outputFolder) {
  const scrapedData = [];

  for (let i = 0; i < links.length; i++) {
    const link = links[i];
    console.log(`Scraping ${i + 1}/${links.length}: ${link}`);

    try {
      const loader = new PuppeteerWebBaseLoader(link, {
        launchOptions: { headless: "new" },
        gotoOptions: { waitUntil: "domcontentloaded" },
        async evaluate(page, browser) {
          const result = await page.evaluate(() => document.body.innerText);
          await browser.close();
          return result;
        },
      });
      const docs = await loader.load();
      const content = docs[0].pageContent;

      if (!content.length) {
        console.warn(`Empty content for ${link}. Skipping.`);
        continue;
      }

      const url = new URL(link);
      const filename = (url.host + "-" + url.pathname).replace(".", "_");

      const data = {
        id: v4(),
        url: "file://" + slugify(filename) + ".html",
        title: slugify(filename) + ".html",
        docAuthor: "no author found",
        description: "No description found.",
        docSource: "URL link uploaded by the user.",
        chunkSource: `link://${link}`,
        published: new Date().toLocaleString(),
        wordCount: content.split(" ").length,
        pageContent: content,
        token_count_estimate: tokenizeString(content).length,
      };

      writeToServerDocuments(data, data.title, outputFolder);
      scrapedData.push(data);

      console.log(`Successfully scraped ${link}.`);
    } catch (error) {
      console.error(`Failed to scrape ${link}.`, error);
    }
  }

  return scrapedData;
}

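// Entry point: discover links from startUrl, then scrape each page into server/storage/documents/<slugified hostname>.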
async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
  const websiteName = new URL(startUrl).hostname;
  const outputFolder = path.resolve(
    __dirname,
    `../../../../server/storage/documents/${slugify(websiteName)}`
  );

  fs.mkdirSync(outputFolder, { recursive: true });

  console.log("Discovering links...");
  const linksToScrape = await discoverLinks(startUrl, depth, maxLinks);
  console.log(`Found ${linksToScrape.length} links to scrape.`);

  console.log("Starting bulk scraping...");
  const scrapedData = await bulkScrapePages(linksToScrape, outputFolder);
  console.log(`Scraped ${scrapedData.length} pages.`);

  return scrapedData;
}

module.exports = websiteScraper;
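
For reference, a minimal sketch of how the exported function might be called from elsewhere in the collector (the require path and URL are illustrative, not part of this commit):

  const websiteScraper = require("./index");

  (async () => {
    // Follow links one level deep and stop after 10 same-origin pages.
    const docs = await websiteScraper("https://example.com", 1, 10);
    console.log(`Stored ${docs.length} documents.`);
  })();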