feat(web): Use crawlee/playwright to retrieve web content in quality mode. It retrieves content more reliably than fetch + JSDOM, at the expense of speed.
parent 044f30a547
commit 87a7ffb445

10 changed files with 4580 additions and 549 deletions
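At a glance, the commit splits page retrieval into a heavy path and a light path. A minimal sketch of how the two could be dispatched on the optimization mode (the import path and the dispatch function below are illustrative assumptions; the actual wiring lives in MetaSearchAgent, changed further down):

```ts
import { Document } from '@langchain/core/documents';
// Hypothetical relative path; the helpers live in the utils/documents module changed below.
import { getWebContent, getWebContentLite } from './utils/documents';

type OptimizationMode = 'speed' | 'balanced' | 'quality';

const retrieve = async (url: string, mode: OptimizationMode): Promise<Document | null> => {
  if (mode === 'quality') {
    // Quality: headless Chromium via Crawlee/Playwright, parsed with Readability.
    return getWebContent(url);
  }
  if (mode === 'balanced') {
    // Balanced: plain fetch + JSDOM + Readability (faster, less reliable on JS-heavy pages).
    return getWebContentLite(url);
  }
  // Speed: no page fetch at all; only SearXNG result previews are used.
  return null;
};
```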
@@ -1,9 +1,10 @@
-FROM node:20.18.0-slim AS builder
+FROM --platform=linux/amd64 node:20-slim AS builder
 
 WORKDIR /home/perplexica
 
 COPY package.json yarn.lock ./
 RUN yarn install --frozen-lockfile --network-timeout 600000
+ENV NEXT_TELEMETRY_DISABLED=1
 
 COPY tsconfig.json next.config.mjs next-env.d.ts postcss.config.js drizzle.config.ts tailwind.config.ts ./
 COPY src ./src
@@ -12,7 +13,9 @@ COPY public ./public
 RUN mkdir -p /home/perplexica/data
 RUN yarn build
 
-FROM node:20.18.0-slim
+FROM --platform=linux/amd64 node:20-slim
+
+ENV NEXT_TELEMETRY_DISABLED=1
 
 WORKDIR /home/perplexica
 
@@ -22,6 +25,11 @@ COPY --from=builder /home/perplexica/.next/static ./public/_next/static
 COPY --from=builder /home/perplexica/.next/standalone ./
 COPY --from=builder /home/perplexica/data ./data
 
-RUN mkdir /home/perplexica/uploads
+RUN mkdir /home/perplexica/uploads && \
+    npx -y playwright install chromium --with-deps && \
+    npm install playwright && \
+    apt-get update && \
+    apt-get install -y procps && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
 
 CMD ["node", "server.js"]
@@ -14,6 +14,8 @@ services:
     build:
       context: .
       dockerfile: app.dockerfile
+      platforms:
+        - linux/amd64
     environment:
       - SEARXNG_API_URL=http://searxng:8080
     ports:
@@ -59,8 +59,9 @@ The API accepts a JSON object in the request body, where you define the focus mode
 
 - **`optimizationMode`** (string, optional): Specifies the optimization mode to control the balance between performance and quality. Available modes:
 
-  - `speed`: Prioritize speed and return the fastest answer.
-  - `balanced`: Provide a balanced answer with good speed and reasonable quality.
+  - `speed`: Prioritize speed and get the quickest possible answer. Minimum effort retrieving web content. Only uses SearXNG result previews.
+  - `balanced`: Find the right balance between speed and accuracy. Medium effort retrieving web content. Uses web scraping technologies to retrieve partial content from full web pages.
+  - `quality`: Get the most thorough and accurate answer. High effort retrieving web content. Requires a good AI model. May take a long time. Uses web scraping technologies to retrieve and summarize full web content.
 
 - **`query`** (string, required): The search query or question.
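For illustration, a request that opts into the new quality mode might look like the following (a sketch: the endpoint path and the focusMode value are assumptions based on the surrounding API documentation, not part of this diff):

```ts
const res = await fetch('http://localhost:3000/api/search', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    query: 'What are the latest advances in retrieval-augmented generation?',
    focusMode: 'webSearch',      // assumed focus mode value
    optimizationMode: 'quality', // 'speed' | 'balanced' | 'quality'
  }),
});
const answer = await res.json();
console.log(answer);
```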
@@ -8,7 +8,7 @@ const nextConfig = {
       },
     ],
   },
-  serverExternalPackages: ['pdf-parse'],
+  serverExternalPackages: ['pdf-parse', 'crawlee', 'playwright'],
 };
 
 export default nextConfig;
package-lock.json (generated): 3051 lines changed; diff suppressed because it is too large.
@@ -31,6 +31,7 @@
     "clsx": "^2.1.0",
     "compute-cosine-similarity": "^1.1.0",
     "compute-dot": "^1.1.0",
+    "crawlee": "^3.13.5",
     "drizzle-orm": "^0.40.1",
     "html-to-text": "^9.0.5",
     "jsdom": "^26.1.0",
@@ -40,6 +41,7 @@
     "next": "^15.2.2",
     "next-themes": "^0.3.0",
     "pdf-parse": "^1.1.1",
+    "playwright": "*",
     "react": "^18",
     "react-dom": "^18",
     "react-syntax-highlighter": "^15.6.1",
@@ -11,19 +11,19 @@ const OptimizationModes = [
   {
     key: 'speed',
     title: 'Speed',
-    description: 'Prioritize speed and get the quickest possible answer.',
+    description: 'Prioritize speed and get the quickest possible answer. Minimum effort retrieving web content.',
     icon: <Zap size={20} className="text-[#FF9800]" />,
   },
   {
     key: 'balanced',
     title: 'Balanced',
-    description: 'Find the right balance between speed and accuracy',
+    description: 'Find the right balance between speed and accuracy. Medium effort retrieving web content.',
     icon: <Sliders size={20} className="text-[#4CAF50]" />,
   },
   {
     key: 'quality',
     title: 'Quality',
-    description: 'Get the most thorough and accurate answer',
+    description: 'Get the most thorough and accurate answer. High effort retrieving web content. Requires a good AI model. May take a long time.',
     icon: (
       <Star
         size={16}
@@ -22,7 +22,7 @@ import LineOutputParser from '../outputParsers/lineOutputParser';
 import LineListOutputParser from '../outputParsers/listLineOutputParser';
 import { searchSearxng } from '../searxng';
 import computeSimilarity from '../utils/computeSimilarity';
-import { getDocumentsFromLinks, getWebContent } from '../utils/documents';
+import { getDocumentsFromLinks, getWebContent, getWebContentLite } from '../utils/documents';
 import formatChatHistoryAsString from '../utils/formatHistory';
 import { getModelName } from '../utils/modelUtils';
 
@@ -483,7 +483,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
       this.emitProgress(emitter, 60, `Enriching sources`);
       sortedDocs = await Promise.all(
         sortedDocs.map(async (doc) => {
-          const webContent = await getWebContent(doc.metadata.url);
+          const webContent = await getWebContentLite(doc.metadata.url);
           const chunks =
             webContent?.pageContent
               .match(/.{1,500}/g)
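The regex in this hunk cuts the fetched page text into fixed 500-character chunks before further processing. A self-contained illustration of that chunking (the helper name is for illustration only):

```ts
// Cut text into fixed-size pieces, mirroring the `.match(/.{1,500}/g)` call above.
// Note that `.` does not match across newlines, so each chunk stays within one
// line unless the text has already been normalized.
const chunkText = (text: string, size = 500): string[] =>
  text.match(new RegExp(`.{1,${size}}`, 'g')) ?? [];

// chunkText('a'.repeat(1200)).map((c) => c.length)  ->  [500, 500, 200]
```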
@@ -610,7 +610,7 @@ ${docs[index].metadata?.url.toLowerCase().includes('file') ? '' : '\n<url>' + do
       </${index + 1}>\n`,
     )
     .join('\n');
-    // console.log('Processed docs:', fullDocs);
+    console.log('Processed docs:', fullDocs);
     return fullDocs;
   }
 
@@ -3,8 +3,9 @@ import { htmlToText } from 'html-to-text';
 import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 import { Document } from '@langchain/core/documents';
 import pdfParse from 'pdf-parse';
-import { JSDOM } from 'jsdom';
+import { Configuration, Dataset, PlaywrightCrawler } from 'crawlee';
 import { Readability } from '@mozilla/readability';
+import { JSDOM } from 'jsdom';
 import fetch from 'node-fetch';
 
 export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
@@ -101,9 +102,111 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
   return docs;
 };
 
+interface CrawledContent {
+  text: string;
+  title: string;
+  html?: string;
+}
+
+/**
+ * Fetches web content from a given URL using Crawlee and Playwright. Parses it using Readability.
+ * Returns a Document object containing the parsed text and metadata.
+ *
+ * @param url - The URL to fetch content from.
+ * @param getHtml - Whether to include the HTML content in the metadata.
+ * @returns A Promise that resolves to a Document object or null if parsing fails.
+ */
 export const getWebContent = async (
   url: string,
   getHtml: boolean = false,
+): Promise<Document | null> => {
+  let crawledContent: CrawledContent | null = null;
+  const crawler = new PlaywrightCrawler({
+    async requestHandler({ page }) {
+      // Wait for the content to load
+      await page.waitForLoadState('networkidle', {timeout: 10000});
+
+      // Allow some time for dynamic content to load
+      await page.waitForTimeout(3000);
+
+      console.log(`Crawling URL: ${url}`);
+
+      // Get the page title
+      const title = await page.title();
+
+      try {
+        // Use Readability to parse the page content
+        const content = await page.content();
+        const dom = new JSDOM(content, { url });
+        const reader = new Readability(dom.window.document, { charThreshold: 25 }).parse();
+        const crawleeContent: CrawledContent = {
+          text: reader?.textContent || '',
+          title,
+          html: getHtml ? reader?.content || await page.content() : undefined,
+        };
+
+        crawledContent = crawleeContent;
+      } catch (error) {
+        console.error(`Failed to parse content with Readability for URL: ${url}`, error);
+      }
+    },
+    maxRequestsPerCrawl: 1,
+    maxRequestRetries: 2,
+    retryOnBlocked: true,
+    maxSessionRotations: 3,
+  }, new Configuration({ persistStorage: false }));
+
+  try {
+    await crawler.run([url]);
+
+    if (!crawledContent) {
+      console.warn(`Failed to parse article content for URL: ${url}`);
+      return null;
+    }
+
+    const content = crawledContent as CrawledContent;
+
+    // Normalize the text content
+    const normalizedText = content?.text
+      ?.split('\n')
+      .map((line: string) => line.trim())
+      .filter((line: string) => line.length > 0)
+      .join('\n') || '';
+
+    // Create a Document with the parsed content
+    const returnDoc = new Document({
+      pageContent: normalizedText,
+      metadata: {
+        html: content?.html,
+        title: content?.title,
+        url: url,
+      },
+    });
+
+    console.log(`Got content with Crawlee and Readability, URL: ${url}, Text Length: ${returnDoc.pageContent.length}, html Length: ${returnDoc.metadata.html?.length || 0}`);
+    return returnDoc;
+  } catch (error) {
+    console.error(`Error fetching/parsing URL ${url}:`, error);
+    return null;
+  } finally {
+    await crawler.teardown();
+  }
+};
+
+/**
+ * Fetches web content from a given URL and parses it using Readability.
+ * Returns a Document object containing the parsed text and metadata.
+ *
+ * @param {string} url - The URL to fetch content from.
+ * @param {boolean} getHtml - Whether to include the HTML content in the metadata.
+ * @returns {Promise<Document | null>} A Promise that resolves to a Document object or null if parsing fails.
+ */
+export const getWebContentLite = async (
+  url: string,
+  getHtml: boolean = false,
 ): Promise<Document | null> => {
   try {
     const response = await fetch(url, {timeout: 5000});
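A brief sketch of what a successful call to the new getWebContent returns, based on the Document constructed in the hunk above (the URL is a placeholder and the call assumes an async context):

```ts
const doc = await getWebContent('https://example.com/article', true);
if (doc) {
  console.log(doc.pageContent);     // normalized article text, blank lines removed
  console.log(doc.metadata.title);  // page title reported by Playwright
  console.log(doc.metadata.url);    // the requested URL
  console.log(doc.metadata.html);   // Readability article HTML, only when getHtml is true
}
```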
@@ -124,7 +227,6 @@ export const getWebContent = async (
       return null;
     }
 
-    // Normalize the text content by removing extra spaces and newlines. Iterate through the lines one by one and throw out the ones that are empty or contain only whitespace.
     const normalizedText =
       article?.textContent
         ?.split('\n')
@@ -139,12 +241,6 @@ export const getWebContent = async (
         html: getHtml ? article.content : undefined,
         title: article.title || originalTitle,
         url: url,
-        excerpt: article.excerpt || undefined,
-        byline: article.byline || undefined,
-        siteName: article.siteName || undefined,
-        readingTime: article.length
-          ? Math.ceil(article.length / 1000)
-          : undefined,
       },
     });
   } catch (error) {