// Source: Perplexica/src/lib/utils/documents.ts (TypeScript, 277 lines, 8 KiB)

import { CheerioWebBaseLoader } from '@langchain/community/document_loaders/web/cheerio';
import { PlaywrightWebBaseLoader } from '@langchain/community/document_loaders/web/playwright';
import { Document } from '@langchain/core/documents';
import { Readability } from '@mozilla/readability';
import axios from 'axios';
import { JSDOM } from 'jsdom';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import fetch from 'node-fetch';
import pdfParse from 'pdf-parse';
import type { Browser, Page } from 'playwright';
/**
 * Downloads every link concurrently and returns its text split into chunked
 * Documents.
 *
 * Links without an `http://` or `https://` prefix are treated as `https://`.
 * A HEAD request decides whether the target is a PDF (parsed with pdf-parse)
 * or a web page (loaded with CheerioWebBaseLoader, `body` selector). Each
 * chunk produced by the text splitter becomes one Document carrying `title`
 * and `url` metadata.
 *
 * A link that fails at any step does not reject the call: it contributes a
 * single placeholder Document describing the failure.
 *
 * @param links - URLs (with or without scheme) to fetch.
 * @returns All chunks, grouped by link in the same order as `links`.
 */
export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
  const splitter = new RecursiveCharacterTextSplitter();

  // Content-Type may carry parameters ("application/pdf; charset=binary") and
  // media types are case-insensitive, so match only the media-type token.
  const pdfMediaType = /^\s*application\/pdf\s*(;|$)/i;

  // Wraps every text chunk of one link in a Document with shared metadata.
  const toDocuments = (
    chunks: string[],
    title: string,
    url: string,
  ): Document[] =>
    chunks.map(
      (text) =>
        new Document({
          pageContent: text,
          metadata: {
            title: title,
            url: url,
          },
        }),
    );

  // Each callback returns its own list instead of pushing into a shared
  // array, so the final order follows `links` rather than network timing.
  const docsPerLink = await Promise.all(
    links.map(async (rawLink): Promise<Document[]> => {
      const link =
        rawLink.startsWith('http://') || rawLink.startsWith('https://')
          ? rawLink
          : `https://${rawLink}`;

      try {
        // First, check if it's a PDF
        const headRes = await axios.head(link);
        const contentType = headRes.headers['content-type'];
        const isPdf =
          typeof contentType === 'string' && pdfMediaType.test(contentType);

        if (isPdf) {
          // Handle PDF files
          const res = await axios.get(link, {
            responseType: 'arraybuffer',
          });
          const pdfText = await pdfParse(res.data);

          // Collapse line breaks and whitespace runs into single spaces.
          const parsedText = pdfText.text
            .replace(/(\r\n|\n|\r)/gm, ' ')
            .replace(/\s+/g, ' ')
            .trim();

          const pdfChunks = await splitter.splitText(parsedText);
          return toDocuments(pdfChunks, 'PDF Document', link);
        }

        // Handle web pages using CheerioWebBaseLoader
        const loader = new CheerioWebBaseLoader(link, {
          selector: 'body',
        });
        const webDocs = await loader.load();
        if (!webDocs || webDocs.length === 0) {
          return [];
        }

        const webDoc = webDocs[0];
        const pageChunks = await splitter.splitText(webDoc.pageContent);
        // Fall back to the link itself when the loader supplied no title.
        return toDocuments(pageChunks, webDoc.metadata.title || link, link);
      } catch (err) {
        console.error(
          'An error occurred while getting documents from links: ',
          err,
        );
        return [
          new Document({
            pageContent: `Failed to retrieve content from the link: ${err}`,
            metadata: {
              title: 'Failed to retrieve content',
              url: link,
            },
          }),
        ];
      }
    }),
  );

  return docsPerLink.flat();
};
/**
 * Fetches web content from a given URL using LangChain's
 * PlaywrightWebBaseLoader, then extracts the main article with Readability.
 *
 * The page is considered ready once the network goes idle (up to 10 s) plus a
 * fixed 3 s grace period; if the network never goes idle the DOM rendered so
 * far is used instead of abandoning the page. When the Playwright path throws,
 * the URL is retried with CheerioWebBaseLoader and that document is returned
 * with `url` and `title` metadata added for consistency with the main path.
 *
 * @param url - The URL to fetch content from.
 * @param getHtml - Whether to include Readability's article HTML in the
 *   metadata (`html`). Not populated on the Cheerio fallback path.
 * @returns A Promise that resolves to a Document, or null if both the
 *   Playwright path and the Cheerio fallback yield nothing.
 */
export const getWebContent = async (
  url: string,
  getHtml: boolean = false,
): Promise<Document | null> => {
  try {
    console.log(`Fetching content from URL: ${url}`);

    const loader = new PlaywrightWebBaseLoader(url, {
      launchOptions: {
        headless: true,
        timeout: 30000,
      },
      gotoOptions: {
        waitUntil: 'domcontentloaded',
        timeout: 10000,
      },
      async evaluate(page: Page, _browser: Browser) {
        // Wait for the content to load properly. Pages with long-polling or
        // analytics beacons may never reach network idle; that should not
        // throw away a DOM that has already loaded.
        try {
          await page.waitForLoadState('networkidle', { timeout: 10000 });
        } catch (idleError) {
          console.warn(
            `Network did not become idle for URL: ${url}, continuing with current DOM`,
            idleError,
          );
        }
        // Allow some time for dynamic content to load
        await page.waitForTimeout(3000);
        return await page.content();
      },
    });

    const docs = await loader.load();
    if (!docs || docs.length === 0) {
      console.warn(`Failed to load content for URL: ${url}`);
      return null;
    }

    const doc = docs[0];
    const dom = new JSDOM(doc.pageContent, { url });
    try {
      const reader = new Readability(dom.window.document, {
        charThreshold: 25,
      });
      const article = reader.parse();

      // Normalize the text content: trim every line and drop empty ones.
      const normalizedText =
        article?.textContent
          ?.split('\n')
          .map((line: string) => line.trim())
          .filter((line: string) => line.length > 0)
          .join('\n') || '';

      const returnDoc = new Document({
        pageContent: normalizedText,
        metadata: {
          title: article?.title || doc.metadata.title || '',
          url: url,
          html: getHtml ? article?.content : undefined,
        },
      });

      console.log(
        `Got content with LangChain Playwright, URL: ${url}, Text Length: ${returnDoc.pageContent.length}`,
      );
      return returnDoc;
    } finally {
      // Release the jsdom window; everything needed has been copied out.
      dom.window.close();
    }
  } catch (error) {
    console.error(`Error fetching/parsing URL ${url}:`, error);

    // Fallback to CheerioWebBaseLoader for simpler content extraction
    try {
      console.log(`Fallback to Cheerio for URL: ${url}`);
      const cheerioLoader = new CheerioWebBaseLoader(url);
      const fallbackDocs = await cheerioLoader.load();
      if (fallbackDocs && fallbackDocs.length > 0) {
        const fallbackDoc = fallbackDocs[0];
        // Keep the loader's own metadata and add the `url`/`title` fields
        // that the primary path always provides.
        return new Document({
          pageContent: fallbackDoc.pageContent,
          metadata: {
            ...fallbackDoc.metadata,
            title: fallbackDoc.metadata.title || '',
            url: url,
          },
        });
      }
    } catch (fallbackError) {
      console.error(
        `Cheerio fallback also failed for URL ${url}:`,
        fallbackError,
      );
    }
    return null;
  }
};
/**
 * Fetches web content from a given URL using CheerioWebBaseLoader for faster,
 * lighter extraction (no browser is launched).
 *
 * When `getHtml` is true the URL is additionally fetched directly and run
 * through Readability; if that yields an article, the article's text, HTML
 * and title are returned. If the fetch returns a non-2xx status, Readability
 * finds no article, or anything in that step throws, the Cheerio-extracted
 * text is returned instead.
 *
 * @param url - The URL to fetch content from.
 * @param getHtml - Whether to include HTML content in the metadata (`html`).
 * @returns A Promise that resolves to a Document, or null if loading fails.
 */
export const getWebContentLite = async (
  url: string,
  getHtml: boolean = false,
): Promise<Document | null> => {
  try {
    console.log(`Fetching content (lite) from URL: ${url}`);

    const loader = new CheerioWebBaseLoader(url);
    const docs = await loader.load();
    if (!docs || docs.length === 0) {
      console.warn(`Failed to load content for URL: ${url}`);
      return null;
    }

    const doc = docs[0];

    // Try to use Readability for better content extraction if possible
    if (getHtml) {
      try {
        const response = await fetch(url, { timeout: 5000 });
        // Do not hand an HTTP error page to Readability as if it were the
        // article; fall through to the Cheerio result instead.
        if (!response.ok) {
          throw new Error(
            `Unexpected HTTP status ${response.status} ${response.statusText}`,
          );
        }

        const html = await response.text();
        const dom = new JSDOM(html, { url });
        try {
          // Captured before parse(), which works on (and may alter) the
          // document.
          const originalTitle = dom.window.document.title;
          const reader = new Readability(dom.window.document, {
            charThreshold: 25,
          });
          const article = reader.parse();

          if (article) {
            // Trim every line and drop empty ones.
            const articleText =
              article.textContent
                ?.split('\n')
                .map((line) => line.trim())
                .filter((line) => line.length > 0)
                .join('\n') || '';

            return new Document({
              pageContent: articleText,
              metadata: {
                html: article.content,
                title: article.title || originalTitle,
                url: url,
              },
            });
          }
        } finally {
          // Release the jsdom window once the article has been copied out.
          dom.window.close();
        }
      } catch (readabilityError) {
        console.warn(
          `Readability parsing failed for ${url}, using Cheerio fallback`,
          readabilityError,
        );
      }
    }

    // Normalize the text content from Cheerio
    const normalizedText = doc.pageContent
      .split('\n')
      .map((line: string) => line.trim())
      .filter((line: string) => line.length > 0)
      .join('\n');

    return new Document({
      pageContent: normalizedText,
      metadata: {
        title: doc.metadata.title || 'Web Page',
        url: url,
        // NOTE(review): this is the loader's un-normalized pageContent, which
        // is presumably extracted text rather than markup — confirm callers
        // of `html` can cope with that.
        html: getHtml ? doc.pageContent : undefined,
      },
    });
  } catch (error) {
    console.error(`Error fetching/parsing URL ${url}:`, error);
    return null;
  }
};