feat(optimization): improve quality mode performance by limiting source gathering
This commit is contained in:
parent 6343dd5303
commit 8ce50b48f0

4 changed files with 298 additions and 186 deletions
@@ -11,19 +11,22 @@ const OptimizationModes = [
   {
     key: 'speed',
     title: 'Speed',
-    description: 'Prioritize speed and get the quickest possible answer. Minimum effort retrieving web content.',
+    description:
+      'Prioritize speed and get the quickest possible answer. Minimum effort retrieving web content.',
     icon: <Zap size={20} className="text-[#FF9800]" />,
   },
   {
     key: 'balanced',
     title: 'Balanced',
-    description: 'Find the right balance between speed and accuracy. Medium effort retrieving web content.',
+    description:
+      'Find the right balance between speed and accuracy. Medium effort retrieving web content.',
     icon: <Sliders size={20} className="text-[#4CAF50]" />,
   },
   {
     key: 'quality',
     title: 'Quality',
-    description: 'Get the most thorough and accurate answer. High effort retrieving web content. Requires a good AI model. May take a long time.',
+    description:
+      'Get the most thorough and accurate answer. High effort retrieving web content. Requires a good AI model. May take a long time.',
     icon: (
       <Star
         size={16}
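The three mode entries above share one shape; a minimal sketch of the type they conform to (the `OptimizationMode` name and the `ReactNode` icon type are assumptions for illustration, not part of this diff):

```ts
import type { ReactNode } from 'react';

// Hypothetical type capturing the shape of each OptimizationModes entry
// shown in the hunk above; the name is illustrative, not from the diff.
type OptimizationMode = {
  key: 'speed' | 'balanced' | 'quality'; // the three modes visible in this diff
  title: string;
  description: string;
  icon: ReactNode; // e.g. <Zap />, <Sliders />, <Star />
};
```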
@@ -7,21 +7,34 @@ import { BaseChatModel } from '@langchain/core/language_models/chat_models';
 import { ChatOpenAI } from '@langchain/openai';
 
 const suggestionGeneratorPrompt = `
-You are an AI suggestion generator for an AI powered search engine. You will be given a conversation below. You need to generate 4-5 suggestions based on the conversation. The suggestion should be relevant to the conversation that can be used by the user to ask the chat model for more information.
-You need to make sure the suggestions are relevant to the conversation and are helpful to the user. Keep a note that the user might use these suggestions to ask a chat model for more information.
-Make sure the suggestions are medium in length and are informative and relevant to the conversation.
-If you are a thinking or reasoning AI, you should avoid using \`<suggestions>\` and \`</suggestions>\` tags in your thinking. Those tags should only be used in the final output.
-
-Provide these suggestions separated by newlines between the XML tags <suggestions> and </suggestions>. For example:
+You are an AI suggestion generator for an AI powered search engine.
 
+# Instructions
+- You will be given a conversation below
+- Generate 5 total suggestions based on the conversation
+- Three of the suggestions should be relevant to the conversation so it can be used by the user to ask the chat model for more information
+- Two of the suggestions should still be relevant to the conversation but could optionally steer the conversation in a different direction
+- The suggestions should be in the form of questions
+- The suggestions should not be something that is already in the conversation
+- The conversation history is provided in the conversation section below
+
+# Output Format
+- If you are a thinking or reasoning AI, you should avoid using \`<suggestions>\` and \`</suggestions>\` tags in your thinking. Those tags should only be used in the final output.
+- Provide these suggestions separated by newlines between the XML tags <suggestions> and </suggestions>. For example:
+- Make sure each suggestion is a single line and does not contain any newlines or any formatting
+- Example output is provided in the example section below
+
+<example>
 <suggestions>
 Tell me more about SpaceX and their recent projects
 What is the latest news on SpaceX?
 Who is the CEO of SpaceX?
 </suggestions>
+</example>
 
-Conversation:
+<conversation>
 {chat_history}
+</conversation>
 `;
 
 type SuggestionGeneratorInput = {
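The rewritten prompt asks for one suggestion per line inside `<suggestions>` tags, which lines up with the line-list parsing used elsewhere in this diff. A minimal sketch of how that output could be consumed (the parser import path mirrors the one in the metaSearchAgent hunk below; treating this usage as an assumption, since the suggestion generator's own parsing code is not in this diff):

```ts
import LineListOutputParser from '../outputParsers/listLineOutputParser';

// Assumed usage: parse the model output produced by suggestionGeneratorPrompt.
// A LineListOutputParser with key 'suggestions' should yield one string per
// line between <suggestions> and </suggestions>.
const suggestionParser = new LineListOutputParser({ key: 'suggestions' });

const raw = `<suggestions>
Tell me more about SpaceX and their recent projects
What is the latest news on SpaceX?
</suggestions>`;

const suggestions = await suggestionParser.parse(raw);
// ['Tell me more about SpaceX and their recent projects',
//  'What is the latest news on SpaceX?']
```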
@@ -22,7 +22,11 @@ import LineOutputParser from '../outputParsers/lineOutputParser';
 import LineListOutputParser from '../outputParsers/listLineOutputParser';
 import { searchSearxng } from '../searxng';
 import computeSimilarity from '../utils/computeSimilarity';
-import { getDocumentsFromLinks, getWebContent, getWebContentLite } from '../utils/documents';
+import {
+  getDocumentsFromLinks,
+  getWebContent,
+  getWebContentLite,
+} from '../utils/documents';
 import formatChatHistoryAsString from '../utils/formatHistory';
 import { getModelName } from '../utils/modelUtils';
@@ -99,70 +103,71 @@ class MetaSearchAgent implements MetaSearchAgentType {
       llm,
       this.strParser,
       RunnableLambda.from(async (input: string) => {
-        //console.log(`LLM response for initial web search:"${input}"`);
-        const linksOutputParser = new LineListOutputParser({
-          key: 'links',
-        });
-
-        const questionOutputParser = new LineOutputParser({
-          key: 'answer',
-        });
-
-        const links = await linksOutputParser.parse(input);
-        let question = await questionOutputParser.parse(input);
-
-        //console.log('question', question);
-
-        if (question === 'not_needed') {
-          return { query: '', docs: [] };
-        }
-
-        if (links.length > 0) {
-          if (question.length === 0) {
-            question = 'summarize';
-          }
-
-          let docs: Document[] = [];
-
-          const linkDocs = await getDocumentsFromLinks({ links });
-
-          const docGroups: Document[] = [];
-
-          linkDocs.map((doc) => {
-            const URLDocExists = docGroups.find(
-              (d) =>
-                d.metadata.url === doc.metadata.url &&
-                d.metadata.totalDocs < 10,
-            );
-
-            if (!URLDocExists) {
-              docGroups.push({
-                ...doc,
-                metadata: {
-                  ...doc.metadata,
-                  totalDocs: 1,
-                },
-              });
-            }
-
-            const docIndex = docGroups.findIndex(
-              (d) =>
-                d.metadata.url === doc.metadata.url &&
-                d.metadata.totalDocs < 10,
-            );
-
-            if (docIndex !== -1) {
-              docGroups[docIndex].pageContent =
-                docGroups[docIndex].pageContent + `\n\n` + doc.pageContent;
-              docGroups[docIndex].metadata.totalDocs += 1;
-            }
-          });
-
-          this.emitProgress(emitter, 20, `Summarizing content`);
-
-          await Promise.all(
-            docGroups.map(async (doc) => {
-              const res = await llm.invoke(`
+        try {
+          //console.log(`LLM response for initial web search:"${input}"`);
+          const linksOutputParser = new LineListOutputParser({
+            key: 'links',
+          });
+
+          const questionOutputParser = new LineOutputParser({
+            key: 'answer',
+          });
+
+          const links = await linksOutputParser.parse(input);
+          let question = await questionOutputParser.parse(input);
+
+          //console.log('question', question);
+
+          if (question === 'not_needed') {
+            return { query: '', docs: [] };
+          }
+
+          if (links.length > 0) {
+            if (question.length === 0) {
+              question = 'summarize';
+            }
+
+            let docs: Document[] = [];
+
+            const linkDocs = await getDocumentsFromLinks({ links });
+
+            const docGroups: Document[] = [];
+
+            linkDocs.map((doc) => {
+              const URLDocExists = docGroups.find(
+                (d) =>
+                  d.metadata.url === doc.metadata.url &&
+                  d.metadata.totalDocs < 10,
+              );
+
+              if (!URLDocExists) {
+                docGroups.push({
+                  ...doc,
+                  metadata: {
+                    ...doc.metadata,
+                    totalDocs: 1,
+                  },
+                });
+              }
+
+              const docIndex = docGroups.findIndex(
+                (d) =>
+                  d.metadata.url === doc.metadata.url &&
+                  d.metadata.totalDocs < 10,
+              );
+
+              if (docIndex !== -1) {
+                docGroups[docIndex].pageContent =
+                  docGroups[docIndex].pageContent + `\n\n` + doc.pageContent;
+                docGroups[docIndex].metadata.totalDocs += 1;
+              }
+            });
+
+            this.emitProgress(emitter, 20, `Summarizing content`);
+
+            await Promise.all(
+              docGroups.map(async (doc) => {
+                const res = await llm.invoke(`
 You are a web search summarizer, tasked with summarizing a piece of text retrieved from a web search. Your job is to summarize the
 text into a detailed, 2-4 paragraph explanation that captures the main ideas and provides a comprehensive answer to the query.
 If the query is \"summarize\", you should provide a detailed summary of the text. If the query is a specific question, you should answer it in the summary.
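The grouping pass in this hunk merges chunks fetched from the same URL, capping each merged group at 10 chunks. A condensed sketch of the same idea (the narrowed `Doc` type and the `groupByUrl` name are illustrative, not from the diff). Note that the hunk's push-then-findIndex sequence also matches the group it just pushed, so the first chunk of each URL is appended to itself once; the if/else below keeps the two paths separate:

```ts
// Minimal stand-in for the LangChain Document fields this logic touches.
type Doc = {
  pageContent: string;
  metadata: { url: string; totalDocs?: number };
};

// Merge chunks sharing a URL, allowing at most 10 chunks per group; a
// condensed restatement of the docGroups logic in the hunk above.
function groupByUrl(linkDocs: Doc[]): Doc[] {
  const docGroups: Doc[] = [];
  for (const doc of linkDocs) {
    const group = docGroups.find(
      (d) =>
        d.metadata.url === doc.metadata.url && (d.metadata.totalDocs ?? 0) < 10,
    );
    if (!group) {
      // First chunk for this URL: start a fresh group.
      docGroups.push({ ...doc, metadata: { ...doc.metadata, totalDocs: 1 } });
    } else {
      // Later chunk: append its content and bump the count.
      group.pageContent += `\n\n` + doc.pageContent;
      group.metadata.totalDocs! += 1;
    }
  }
  return docGroups;
}
```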
@@ -223,50 +228,55 @@ class MetaSearchAgent implements MetaSearchAgentType {
 Make sure to answer the query in the summary.
 `);
 
-              const document = new Document({
-                pageContent: res.content as string,
-                metadata: {
-                  title: doc.metadata.title,
-                  url: doc.metadata.url,
-                },
-              });
-
-              docs.push(document);
-            }),
-          );
-
-          return { query: question, docs: docs };
-        } else {
-          this.emitProgress(emitter, 20, `Searching the web`);
-          if (this.config.additionalSearchCriteria) {
-            question = `${question} ${this.config.additionalSearchCriteria}`;
-          }
-
-          const searxngResult = await searchSearxng(question, {
-            language: 'en',
-            engines: this.config.activeEngines,
-          });
-
-          // Store the SearXNG URL for later use in emitting to the client
-          this.searxngUrl = searxngResult.searchUrl;
-
-          const documents = searxngResult.results.map(
-            (result) =>
-              new Document({
-                pageContent:
-                  result.content ||
-                  (this.config.activeEngines.includes('youtube')
-                    ? result.title
-                    : '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */,
-                metadata: {
-                  title: result.title,
-                  url: result.url,
-                  ...(result.img_src && { img_src: result.img_src }),
-                },
-              }),
-          );
-
-          return { query: question, docs: documents, searchQuery: question };
-        }
+                const document = new Document({
+                  pageContent: res.content as string,
+                  metadata: {
+                    title: doc.metadata.title,
+                    url: doc.metadata.url,
+                  },
+                });
+
+                docs.push(document);
+              }),
+            );
+
+            return { query: question, docs: docs };
+          } else {
+            if (this.config.additionalSearchCriteria) {
+              question = `${question} ${this.config.additionalSearchCriteria}`;
+            }
+            this.emitProgress(emitter, 20, `Searching the web: "${question}"`);
+
+            const searxngResult = await searchSearxng(question, {
+              language: 'en',
+              engines: this.config.activeEngines,
+            });
+
+            // Store the SearXNG URL for later use in emitting to the client
+            this.searxngUrl = searxngResult.searchUrl;
+
+            const documents = searxngResult.results.map(
+              (result) =>
+                new Document({
+                  pageContent:
+                    result.content ||
+                    (this.config.activeEngines.includes('youtube')
+                      ? result.title
+                      : '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */,
+                  metadata: {
+                    title: result.title,
+                    url: result.url,
+                    ...(result.img_src && { img_src: result.img_src }),
+                  },
+                }),
+            );
+
+            return { query: question, docs: documents, searchQuery: question };
+          }
+        } catch (error) {
+          console.error('Error in search retriever chain:', error);
+          emitter.emit('error', JSON.stringify({ data: error }));
+          throw error;
+        }
       }),
     ]);
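The new catch block logs the failure, forwards it over the event emitter, and rethrows. A sketch of what a consumer of that contract might look like (the listener is hypothetical; only the `emitter.emit('error', JSON.stringify({ data: error }))` call is in the diff):

```ts
import { EventEmitter } from 'events';

// Hypothetical consumer of the 'error' events emitted by the retriever chain.
// The payload is a JSON string of shape { data: <error> }, per the hunk above.
function attachErrorLogging(emitter: EventEmitter) {
  emitter.on('error', (payload: string) => {
    const { data } = JSON.parse(payload);
    console.error('Search retriever reported an error:', data);
  });
}
```

One caveat worth noting: `JSON.stringify` on a bare `Error` serializes to `{}` because `Error` properties are non-enumerable, so listeners may see an empty `data` field unless the thrown value is a plain object or the message is extracted first.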
@@ -360,6 +370,103 @@ class MetaSearchAgent implements MetaSearchAgentType {
     });
   }
 
+  private async checkIfEnoughInformation(
+    docs: Document[],
+    query: string,
+    llm: BaseChatModel,
+    emitter: eventEmitter,
+  ): Promise<boolean> {
+    const formattedDocs = this.processDocs(docs);
+
+    const response =
+      await llm.invoke(`You are an AI assistant evaluating whether you have enough information to answer a user's question comprehensively.
+
+Based on the following sources, determine if you have sufficient information to provide a detailed, accurate answer to the query: "${query}"
+
+Sources:
+${formattedDocs}
+
+Look for:
+1. Key facts and details directly relevant to the query
+2. Multiple perspectives or sources if the topic is complex
+3. Up-to-date information if the query requires current data
+4. Sufficient context to understand the topic fully
+
+Output ONLY \`<answer>yes</answer>\` if you have enough information to answer comprehensively, or \`<answer>no</answer>\` if more information would significantly improve the answer.`);
+
+    const answerParser = new LineOutputParser({
+      key: 'answer',
+    });
+    const responseText = await answerParser.parse(
+      (response.content as string).trim().toLowerCase(),
+    );
+    if (responseText !== 'yes') {
+      console.log(
+        `LLM response for checking if we have enough information: "${response.content}"`,
+      );
+    } else {
+      console.log(
+        'LLM response indicates we have enough information to answer the query.',
+      );
+    }
+    return responseText === 'yes';
+  }
+
+  private async processSource(
+    doc: Document,
+    query: string,
+    llm: BaseChatModel,
+    summaryParser: LineOutputParser,
+  ): Promise<Document | null> {
+    try {
+      const url = doc.metadata.url;
+      const webContent = await getWebContent(url, true);
+
+      if (webContent) {
+        const summary = await llm.invoke(`
+You are a web content summarizer, tasked with creating a detailed, accurate summary of content from a webpage
+Your summary should:
+- Be thorough and comprehensive, capturing all key points
+- Format the content using markdown, including headings, lists, and tables
+- Include specific details, numbers, and quotes when relevant
+- Be concise and to the point, avoiding unnecessary fluff
+- Answer the user's query, which is: ${query}
+- Output your answer in an XML format, with the summary inside the \`summary\` XML tag
+- If the content is not relevant to the query, respond with "not_needed" to start the summary tag, followed by a one line description of why the source is not needed
+- E.g. "not_needed: There is relevant information in the source, but it doesn't contain specifics about X"
+- Make sure the reason the source is not needed is very specific and detailed
+- Include useful links to external resources, if applicable
+
+Here is the content to summarize:
+${webContent.metadata.html ? webContent.metadata.html : webContent.pageContent}
+`);
+
+        const summarizedContent = await summaryParser.parse(
+          summary.content as string,
+        );
+
+        if (summarizedContent.toLocaleLowerCase().startsWith('not_needed')) {
+          console.log(
+            `LLM response for URL "${url}" indicates it's not needed:`,
+            summarizedContent,
+          );
+          return null;
+        }
+
+        return new Document({
+          pageContent: summarizedContent,
+          metadata: {
+            ...webContent.metadata,
+            url: url,
+          },
+        });
+      }
+    } catch (error) {
+      console.error(`Error processing URL ${doc.metadata.url}:`, error);
+    }
+    return null;
+  }
+
   private async rerankDocs(
     query: string,
     docs: Document[],
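`checkIfEnoughInformation` relies on the model emitting exactly `<answer>yes</answer>` or `<answer>no</answer>`, then extracts the value with a `LineOutputParser`. A small sketch of that contract in isolation (the parser behavior is taken from how this diff uses it; the sample string is illustrative):

```ts
import LineOutputParser from '../outputParsers/lineOutputParser';

// Assumed round trip for the sufficiency check: the LLM answers with an
// <answer> tag, and the parser pulls out just its contents.
const answerParser = new LineOutputParser({ key: 'answer' });

const modelOutput = '<answer>yes</answer>'; // ideal-case response
const verdict = await answerParser.parse(modelOutput.trim().toLowerCase());

const hasEnough = verdict === 'yes'; // anything else means keep gathering
```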
@@ -477,7 +584,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
         ...sortedDocs,
         ...docsWithContent.slice(0, 15 - sortedDocs.length),
       ];
 
       this.emitProgress(emitter, 60, `Enriching sources`);
       sortedDocs = await Promise.all(
         sortedDocs.map(async (doc) => {
@@ -510,84 +617,63 @@ class MetaSearchAgent implements MetaSearchAgentType {
       return sortedDocs;
     } else if (optimizationMode === 'quality') {
-      this.emitProgress(emitter, 30, 'Ranking sources...');
-
       const summaryParser = new LineOutputParser({
         key: 'summary',
       });
 
-      // Get full content and generate detailed summaries for top results sequentially
       const enhancedDocs: Document[] = [];
       const maxEnhancedDocs = 5;
-      for (let i = 0; i < docsWithContent.length; i++) {
+      // Process sources one by one until we have enough information or hit the max
+      for (
+        let i = 0;
+        i < docsWithContent.length && enhancedDocs.length < maxEnhancedDocs;
+        i++
+      ) {
         if (signal.aborted) {
           return [];
         }
-        if (enhancedDocs.length >= maxEnhancedDocs) {
-          break; // Limit to 5 documents
-        }
-        const result = docsWithContent[i];
+        const currentProgress = enhancedDocs.length * 10 + 40;
 
         this.emitProgress(
           emitter,
-          enhancedDocs.length * 10 + 40,
-          `Deep analyzing sources: ${enhancedDocs.length + 1}/${maxEnhancedDocs}`,
+          currentProgress,
+          `Deep analyzing: ${enhancedDocs.length} relevant sources found so far`,
         );
 
-        try {
-          const url = result.metadata.url;
-          const webContent = await getWebContent(url, true);
-
-          if (webContent) {
-            // Generate a detailed summary using the LLM
-            const summary = await llm.invoke(`
-You are a web content summarizer, tasked with creating a detailed, accurate summary of content from a webpage
-Your summary should:
-- Be thorough and comprehensive, capturing all key points
-- Format the content using markdown, including headings, lists, and tables
-- Include specific details, numbers, and quotes when relevant
-- Be concise and to the point, avoiding unnecessary fluff
-- Answer the user's query, which is: ${query}
-- Output your answer in an XML format, with the summary inside the \`summary\` XML tag
-- If the content is not relevant to the query, respond with "not_needed" to start the summary tag, followed by a one line description of why the source is not needed
-- E.g. "not_needed: There is relevant information in the source, but it doesn't contain specifics about X"
-- Make sure the reason the source is not needed is very specific and detailed
-- Include useful links to external resources, if applicable
-
-Here is the content to summarize:
-${webContent.metadata.html ? webContent.metadata.html : webContent.pageContent}
-`);
-
-            const summarizedContent = await summaryParser.parse(
-              summary.content as string,
-            );
-
-            if (
-              summarizedContent.toLocaleLowerCase().startsWith('not_needed')
-            ) {
-              console.log(
-                `LLM response for URL "${url}" indicates it's not needed:`,
-                summarizedContent,
-              );
-              continue; // Skip this document if not needed
-            }
-
-            //console.log(`LLM response for URL "${url}":`, summarizedContent);
-            enhancedDocs.push(
-              new Document({
-                pageContent: summarizedContent,
-                metadata: {
-                  ...webContent.metadata,
-                  url: url,
-                },
-              }),
-            );
-          }
-        } catch (error) {
-          console.error(`Error processing URL ${result.metadata.url}:`, error);
-        }
+        const result = docsWithContent[i];
+        const processedDoc = await this.processSource(
+          result,
+          query,
+          llm,
+          summaryParser,
+        );
+
+        if (processedDoc) {
+          enhancedDocs.push(processedDoc);
+
+          // After getting initial 2 sources or adding a new one, check if we have enough info
+          if (enhancedDocs.length >= 2) {
+            this.emitProgress(
+              emitter,
+              currentProgress,
+              `Checking if we have enough information to answer the query`,
+            );
+            const hasEnoughInfo = await this.checkIfEnoughInformation(
+              enhancedDocs,
+              query,
+              llm,
+              emitter,
+            );
+            if (hasEnoughInfo) {
+              break;
+            }
+          }
+        }
       }
 
+      this.emitProgress(emitter, 95, `Ranking attached files`);
       // Add relevant file documents
       const fileDocs = await getRankedDocs(queryEmbedding, true, false, 8);
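Taken together with the new helpers, the rewritten quality branch processes sources one at a time and can stop early once the model judges the gathered material sufficient, instead of always enriching five sources. Reduced to its skeleton (all names in this sketch are illustrative, not from the diff):

```ts
// Skeleton of the early-exit gathering loop introduced above. With
// maxDocs = 5 and checks starting once 2 sources are kept, the worst case
// is 5 fetch-and-summarize calls plus 4 sufficiency checks.
async function gather<T>(
  candidates: T[],
  maxDocs: number,
  process: (c: T) => Promise<T | null>, // returns null for irrelevant sources
  isEnough: (kept: T[]) => Promise<boolean>,
): Promise<T[]> {
  const kept: T[] = [];
  for (let i = 0; i < candidates.length && kept.length < maxDocs; i++) {
    const doc = await process(candidates[i]);
    if (!doc) continue; // source judged not_needed; try the next one
    kept.push(doc);
    if (kept.length >= 2 && (await isEnough(kept))) break; // stop early
  }
  return kept;
}
```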
@@ -121,10 +121,11 @@ export const getWebContent = async (
   getHtml: boolean = false,
 ): Promise<Document | null> => {
   let crawledContent: CrawledContent | null = null;
-  const crawler = new PlaywrightCrawler({
-    async requestHandler({ page }) {
-      // Wait for the content to load
-      await page.waitForLoadState('networkidle', {timeout: 10000});
+  const crawler = new PlaywrightCrawler(
+    {
+      async requestHandler({ page }) {
+        // Wait for the content to load
+        await page.waitForLoadState('networkidle', { timeout: 10000 });
 
-      // Allow some time for dynamic content to load
-      await page.waitForTimeout(3000);
+        // Allow some time for dynamic content to load
+        await page.waitForTimeout(3000);
@@ -138,24 +139,32 @@ export const getWebContent = async (
       // Use Readability to parse the page content
       const content = await page.content();
       const dom = new JSDOM(content, { url });
-      const reader = new Readability(dom.window.document, { charThreshold: 25 }).parse();
+      const reader = new Readability(dom.window.document, {
+        charThreshold: 25,
+      }).parse();
       const crawleeContent: CrawledContent = {
         text: reader?.textContent || '',
         title,
-        html: getHtml ? reader?.content || await page.content() : undefined,
+        html: getHtml
+          ? reader?.content || (await page.content())
+          : undefined,
       };
 
       crawledContent = crawleeContent;
     } catch (error) {
-      console.error(`Failed to parse content with Readability for URL: ${url}`, error);
+      console.error(
+        `Failed to parse content with Readability for URL: ${url}`,
+        error,
+      );
     }
   },
   maxRequestsPerCrawl: 1,
   maxRequestRetries: 2,
   retryOnBlocked: true,
   maxSessionRotations: 3,
-}, new Configuration({ persistStorage: false }));
+  },
+  new Configuration({ persistStorage: false }),
+);
 
 try {
   await crawler.run([url]);
@@ -168,11 +177,12 @@ export const getWebContent = async (
     const content = crawledContent as CrawledContent;
 
     // Normalize the text content
-    const normalizedText = content?.text
-      ?.split('\n')
-      .map((line: string) => line.trim())
-      .filter((line: string) => line.length > 0)
-      .join('\n') || '';
+    const normalizedText =
+      content?.text
+        ?.split('\n')
+        .map((line: string) => line.trim())
+        .filter((line: string) => line.length > 0)
+        .join('\n') || '';
 
     // Create a Document with the parsed content
     const returnDoc = new Document({
@@ -184,10 +194,10 @@ export const getWebContent = async (
       },
     });
 
-    console.log(`Got content with Crawlee and Readability, URL: ${url}, Text Length: ${returnDoc.pageContent.length}, html Length: ${returnDoc.metadata.html?.length || 0}`);
+    console.log(
+      `Got content with Crawlee and Readability, URL: ${url}, Text Length: ${returnDoc.pageContent.length}, html Length: ${returnDoc.metadata.html?.length || 0}`,
+    );
     return returnDoc;
 
   } catch (error) {
     console.error(`Error fetching/parsing URL ${url}:`, error);
     return null;
@@ -209,7 +219,7 @@ export const getWebContentLite = async (
   getHtml: boolean = false,
 ): Promise<Document | null> => {
   try {
-    const response = await fetch(url, {timeout: 5000});
+    const response = await fetch(url, { timeout: 5000 });
     const html = await response.text();
 
     // Create a DOM from the fetched HTML
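One caveat on the line reformatted above: the WHATWG `fetch` built into modern Node has no `timeout` option in its init object, so `{ timeout: 5000 }` is silently ignored unless a custom fetch wrapper supplies it. If a hard bound is wanted, `AbortSignal.timeout` is the portable route; a minimal sketch (not part of this diff):

```ts
// Portable request timeout with standard fetch: AbortSignal.timeout aborts
// the request after the given milliseconds, whereas a non-standard
// `timeout` init option is silently ignored.
async function fetchHtmlWithTimeout(url: string, ms = 5000): Promise<string> {
  const response = await fetch(url, { signal: AbortSignal.timeout(ms) });
  return response.text();
}
```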
@@ -247,4 +257,4 @@ export const getWebContentLite = async (
     console.error(`Error fetching/parsing URL ${url}:`); //, error);
     return null;
   }
 };