feat(web): Use crawlee/playwright to retrieve web content in quality mode. It retrieves content more reliably than fetch + JSDOM, at the expense of speed.
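
Both helpers return a LangChain Document (or null). A minimal caller sketch of the intended trade-off (the qualityMode flag, fetchSource name, and import path are illustrative, not part of this commit):

import { getWebContent, getWebContentLite } from './utils/documents'; // path assumed

const fetchSource = async (url: string, qualityMode: boolean) => {
  // Quality mode: headless browser via Crawlee/Playwright + Readability (slower, handles dynamic pages).
  // Otherwise: plain fetch + JSDOM + Readability (faster, static HTML only).
  const doc = qualityMode ? await getWebContent(url) : await getWebContentLite(url);
  return doc?.pageContent ?? '';
};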

This commit is contained in:
Willie Zutz 2025-05-24 14:37:19 -06:00
parent 044f30a547
commit 87a7ffb445
10 changed files with 4580 additions and 549 deletions


@@ -22,7 +22,7 @@ import LineOutputParser from '../outputParsers/lineOutputParser';
import LineListOutputParser from '../outputParsers/listLineOutputParser';
import { searchSearxng } from '../searxng';
import computeSimilarity from '../utils/computeSimilarity';
import { getDocumentsFromLinks, getWebContent } from '../utils/documents';
import { getDocumentsFromLinks, getWebContent, getWebContentLite } from '../utils/documents';
import formatChatHistoryAsString from '../utils/formatHistory';
import { getModelName } from '../utils/modelUtils';
@@ -483,7 +483,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
this.emitProgress(emitter, 60, `Enriching sources`);
sortedDocs = await Promise.all(
sortedDocs.map(async (doc) => {
const webContent = await getWebContent(doc.metadata.url);
const webContent = await getWebContentLite(doc.metadata.url);
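// Break the fetched page content into chunks of up to 500 characters each.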
const chunks =
webContent?.pageContent
.match(/.{1,500}/g)
@@ -610,7 +610,7 @@ ${docs[index].metadata?.url.toLowerCase().includes('file') ? '' : '\n<url>' + do
</${index + 1}>\n`,
)
.join('\n');
// console.log('Processed docs:', fullDocs);
console.log('Processed docs:', fullDocs);
return fullDocs;
}


@@ -3,8 +3,9 @@ import { htmlToText } from 'html-to-text';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { Document } from '@langchain/core/documents';
import pdfParse from 'pdf-parse';
import { JSDOM } from 'jsdom';
import { Configuration, Dataset, PlaywrightCrawler } from 'crawlee';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import fetch from 'node-fetch';
export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
@@ -101,12 +102,114 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
return docs;
};
interface CrawledContent {
text: string;
title: string;
html?: string;
}
/**
* Fetches web content from a given URL using Crawlee and Playwright. Parses it using Readability.
* Returns a Document object containing the parsed text and metadata.
*
* @param url - The URL to fetch content from.
* @param getHtml - Whether to include the HTML content in the metadata.
* @returns A Promise that resolves to a Document object or null if parsing fails.
*/
export const getWebContent = async (
url: string,
getHtml: boolean = false,
): Promise<Document | null> => {
let crawledContent: CrawledContent | null = null;
const crawler = new PlaywrightCrawler({
async requestHandler({ page }) {
// Wait for the content to load
await page.waitForLoadState('networkidle', { timeout: 10000 });
// Allow some time for dynamic content to load
await page.waitForTimeout(3000);
console.log(`Crawling URL: ${url}`);
// Get the page title
const title = await page.title();
try {
// Use Readability to parse the page content
const content = await page.content();
const dom = new JSDOM(content, { url });
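// charThreshold: 25 lowers Readability's minimum article length (default 500) so short pages still parse.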
const reader = new Readability(dom.window.document, { charThreshold: 25 }).parse();
const crawleeContent: CrawledContent = {
text: reader?.textContent || '',
title,
html: getHtml ? reader?.content || await page.content() : undefined,
};
crawledContent = crawleeContent;
} catch (error) {
console.error(`Failed to parse content with Readability for URL: ${url}`, error);
}
},
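// Crawl only the single requested URL, retrying a couple of times and rotating the browser session if the request appears blocked.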
maxRequestsPerCrawl: 1,
maxRequestRetries: 2,
retryOnBlocked: true,
maxSessionRotations: 3,
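// persistStorage: false keeps Crawlee's request queue and datasets in memory instead of writing them to disk.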
}, new Configuration({ persistStorage: false }));
try {
const response = await fetch(url, { timeout: 5000 });
await crawler.run([url]);
if (!crawledContent) {
console.warn(`Failed to parse article content for URL: ${url}`);
return null;
}
const content = crawledContent as CrawledContent;
// Normalize the text content
const normalizedText = content?.text
?.split('\n')
.map((line: string) => line.trim())
.filter((line: string) => line.length > 0)
.join('\n') || '';
// Create a Document with the parsed content
const returnDoc = new Document({
pageContent: normalizedText,
metadata: {
html: content?.html,
title: content?.title,
url: url,
},
});
console.log(`Got content with Crawlee and Readability, URL: ${url}, Text Length: ${returnDoc.pageContent.length}, html Length: ${returnDoc.metadata.html?.length || 0}`);
return returnDoc;
} catch (error) {
console.error(`Error fetching/parsing URL ${url}:`, error);
return null;
} finally {
await crawler.teardown();
}
};
/**
* Fetches web content from a given URL and parses it using Readability.
* Returns a Document object containing the parsed text and metadata.
*
* @param {string} url - The URL to fetch content from.
* @param {boolean} getHtml - Whether to include the HTML content in the metadata.
* @returns {Promise<Document | null>} A Promise that resolves to a Document object or null if parsing fails.
*/
export const getWebContentLite = async (
url: string,
getHtml: boolean = false,
): Promise<Document | null> => {
try {
const response = await fetch(url, {timeout: 5000});
const html = await response.text();
// Create a DOM from the fetched HTML
@@ -124,7 +227,6 @@ export const getWebContent = async (
return null;
}
// Normalize the text content by removing extra spaces and newlines. Iterate through the lines one by one and throw out the ones that are empty or contain only whitespace.
const normalizedText =
article?.textContent
?.split('\n')
@@ -139,16 +241,10 @@ export const getWebContent = async (
html: getHtml ? article.content : undefined,
title: article.title || originalTitle,
url: url,
excerpt: article.excerpt || undefined,
byline: article.byline || undefined,
siteName: article.siteName || undefined,
readingTime: article.length
? Math.ceil(article.length / 1000)
: undefined,
},
});
} catch (error) {
console.error(`Error fetching/parsing URL ${url}:`); //, error);
return null;
}
};