feat(agent): Implement URL summarization agent
This commit is contained in:
parent
d66300e78e
commit
f88c650704
6 changed files with 347 additions and 2 deletions
|
|
@ -66,4 +66,12 @@ export const AgentState = Annotation.Root({
|
||||||
reducer: (x, y) => y ?? x,
|
reducer: (x, y) => y ?? x,
|
||||||
default: () => 'webSearch',
|
default: () => 'webSearch',
|
||||||
}),
|
}),
|
||||||
|
urlsToSummarize: Annotation<string[]>({
|
||||||
|
reducer: (x, y) => y ?? x,
|
||||||
|
default: () => [],
|
||||||
|
}),
|
||||||
|
summarizationIntent: Annotation<string>({
|
||||||
|
reducer: (x, y) => y ?? x,
|
||||||
|
default: () => '',
|
||||||
|
}),
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -80,6 +80,38 @@ export class AnalyzerAgent {
|
||||||
state.originalQuery = state.query;
|
state.originalQuery = state.query;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check for URLs first - if found and not yet processed, route to URL summarization
|
||||||
|
if (!state.urlsToSummarize || state.urlsToSummarize.length === 0) {
|
||||||
|
const urlRegex = /https?:\/\/[^\s]+/gi;
|
||||||
|
const urls = [...new Set(state.query.match(urlRegex) || [])];
|
||||||
|
|
||||||
|
if (urls.length > 0) {
|
||||||
|
console.log('URLs detected in initial query, routing to URL summarization');
|
||||||
|
console.log(`URLs found: ${urls.join(', ')}`);
|
||||||
|
|
||||||
|
// Emit URL detection event
|
||||||
|
this.emitter.emit('agent_action', {
|
||||||
|
type: 'agent_action',
|
||||||
|
data: {
|
||||||
|
action: 'URLS_DETECTED_ROUTING',
|
||||||
|
message: `Detected ${urls.length} URL(s) in query - processing content first`,
|
||||||
|
details: {
|
||||||
|
query: state.query,
|
||||||
|
urls: urls,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
return new Command({
|
||||||
|
goto: 'url_summarization',
|
||||||
|
update: {
|
||||||
|
urlsToSummarize: urls,
|
||||||
|
summarizationIntent: `Process the content from the provided URLs to help answer: ${state.query}`,
|
||||||
|
},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Skip full analysis if this is the first run.
|
// Skip full analysis if this is the first run.
|
||||||
//if (state.fullAnalysisAttempts > 0) {
|
//if (state.fullAnalysisAttempts > 0) {
|
||||||
// Emit initial analysis event
|
// Emit initial analysis event
|
||||||
|
|
@ -108,7 +140,7 @@ export class AnalyzerAgent {
|
||||||
context: state.relevantDocuments
|
context: state.relevantDocuments
|
||||||
.map(
|
.map(
|
||||||
(doc, index) =>
|
(doc, index) =>
|
||||||
`<source${index + 1}>${doc?.metadata?.title ? `<title>${doc?.metadata?.title}</title>` : ''}<content>${doc.pageContent}</content></source${index + 1}>`,
|
`<source${index + 1}>${doc?.metadata?.title ? `<title>${doc?.metadata?.title}</title>` : ''}${doc?.metadata.url ? `<url>${doc?.metadata?.url}</url>` : ''}<content>${doc.pageContent}</content></source${index + 1}>`,
|
||||||
)
|
)
|
||||||
.join('\n\n'),
|
.join('\n\n'),
|
||||||
date: formatDateForLLM(new Date()),
|
date: formatDateForLLM(new Date()),
|
||||||
|
|
|
||||||
|
|
@ -5,3 +5,4 @@ export { SynthesizerAgent } from './synthesizerAgent';
|
||||||
export { TaskManagerAgent } from './taskManagerAgent';
|
export { TaskManagerAgent } from './taskManagerAgent';
|
||||||
export { FileSearchAgent } from './fileSearchAgent';
|
export { FileSearchAgent } from './fileSearchAgent';
|
||||||
export { ContentRouterAgent } from './contentRouterAgent';
|
export { ContentRouterAgent } from './contentRouterAgent';
|
||||||
|
export { URLSummarizationAgent } from './urlSummarizationAgent';
|
||||||
|
|
|
||||||
288
src/lib/agents/urlSummarizationAgent.ts
Normal file
288
src/lib/agents/urlSummarizationAgent.ts
Normal file
|
|
@ -0,0 +1,288 @@
|
||||||
|
import { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||||
|
import { AIMessage } from '@langchain/core/messages';
|
||||||
|
import { Command, END } from '@langchain/langgraph';
|
||||||
|
import { EventEmitter } from 'events';
|
||||||
|
import { Document } from 'langchain/document';
|
||||||
|
import { AgentState } from './agentState';
|
||||||
|
import { getWebContent } from '../utils/documents';
|
||||||
|
import { removeThinkingBlocks } from '../utils/contentUtils';
|
||||||
|
import { setTemperature } from '../utils/modelUtils';
|
||||||
|
|
||||||
|
/**
 * Agent node that retrieves the pages listed in `state.urlsToSummarize` and
 * converts each one into a `Document` placed in `relevantDocuments`.
 *
 * Pages shorter than {@link URLSummarizationAgent.DIRECT_CONTENT_MAX_CHARS}
 * characters are used verbatim; longer pages are summarized by the LLM with
 * respect to the user's query. Progress is reported through `agent_action`
 * events on the supplied emitter.
 */
export class URLSummarizationAgent {
  /** Pages with fewer characters than this are used directly, without an LLM summary. */
  private static readonly DIRECT_CONTENT_MAX_CHARS = 4000;

  private llm: BaseChatModel;
  private emitter: EventEmitter;
  private systemInstructions: string;
  private signal: AbortSignal;

  /**
   * @param llm Chat model used to summarize long pages.
   * @param emitter Receives `agent_action` progress events.
   * @param systemInstructions Optional text prepended to the summarization prompt (skipped when empty).
   * @param signal Checked before each URL and forwarded to the LLM call.
   */
  constructor(
    llm: BaseChatModel,
    emitter: EventEmitter,
    systemInstructions: string,
    signal: AbortSignal,
  ) {
    this.llm = llm;
    this.emitter = emitter;
    this.systemInstructions = systemInstructions;
    this.signal = signal;
  }

  /**
   * URL processing agent node.
   *
   * @param state Current graph state; reads `urlsToSummarize`, `summarizationIntent` and `query`.
   * @returns A `Command` routing to `analyzer` once the URLs have been handled
   *   (with the extracted documents when at least one URL succeeded), to
   *   `content_router` when there are no URLs, or to `END` on an unexpected error.
   */
  async execute(state: typeof AgentState.State): Promise<Command> {
    try {
      setTemperature(this.llm, 0); // Set temperature to 0 for deterministic output

      // URLs are read from graph state. In this change set they are populated
      // by AnalyzerAgent when it detects URLs in the user's query.
      const urlsToProcess = state.urlsToSummarize || [];
      // `||` is deliberate: the state default is '', which must also fall back.
      const summarizationIntent =
        state.summarizationIntent ||
        'process content to help answer the user query';

      if (urlsToProcess.length === 0) {
        console.log(
          'No URLs found for processing, routing back to content router',
        );
        // NOTE(review): the workflow registers this node with
        // ends: ['task_manager', 'analyzer']; 'content_router' is not listed
        // there - confirm this route is intended.
        return new Command({
          goto: 'content_router',
          update: {
            messages: [
              new AIMessage(
                'No URLs found for processing, routing to content router',
              ),
            ],
          },
        });
      }

      console.log(`URL processing detected. URLs: ${urlsToProcess.join(', ')}`);
      console.log(`Processing intent: ${summarizationIntent}`);

      this.emitAction(
        'URL_PROCESSING_DETECTED',
        `Processing ${urlsToProcess.length} URL(s) to extract content for analysis`,
        {
          query: state.query,
          urls: urlsToProcess,
          intent: summarizationIntent,
        },
      );

      const documents: Document[] = [];

      // URLs are processed one at a time; a failure on one URL is reported
      // and does not prevent the remaining URLs from being processed.
      for (const url of urlsToProcess) {
        if (this.signal.aborted) {
          console.warn('URL summarization operation aborted by signal');
          break;
        }

        try {
          this.emitAction(
            'PROCESSING_URL',
            `Retrieving and processing content from: ${url}`,
            {
              query: state.query,
              sourceUrl: url,
              intent: summarizationIntent,
            },
          );

          // Fetch full content using the enhanced web content retrieval
          const webContent = await getWebContent(url, true);

          if (!webContent || !webContent.pageContent) {
            console.warn(`No content retrieved from URL: ${url}`);
            this.emitAction(
              'URL_PROCESSING_FAILED',
              `Failed to retrieve content from: ${url}`,
              {
                query: state.query,
                sourceUrl: url,
                reason: 'No content retrieved',
              },
            );
            continue;
          }

          const contentLength = webContent.pageContent.length;
          let finalContent: string;
          let processingType: string;

          if (contentLength < URLSummarizationAgent.DIRECT_CONTENT_MAX_CHARS) {
            // Short page: keep the raw text and skip the LLM round trip.
            finalContent = webContent.pageContent;
            processingType = 'url-direct-content';

            console.log(
              `Content is short (${contentLength} chars), using directly without summarization`,
            );

            this.emitAction(
              'URL_DIRECT_CONTENT',
              `Content is short (${contentLength} chars), using directly from: ${url}`,
              {
                query: state.query,
                sourceUrl: url,
                sourceTitle: webContent.metadata.title || 'Web Page',
                contentLength: contentLength,
                intent: summarizationIntent,
              },
            );
          } else {
            // Long page: ask the LLM for a query-focused summary.
            console.log(
              `Content is long (${contentLength} chars), generating summary`,
            );

            finalContent = await this.summarizeContent(
              state.query,
              url,
              webContent.metadata.title || 'Web Page',
              webContent.pageContent,
            );
            processingType = 'url-content-extraction';
          }

          if (finalContent && finalContent.trim().length > 0) {
            documents.push(
              new Document({
                pageContent: finalContent,
                metadata: {
                  title: webContent.metadata.title || 'URL Content',
                  url: url,
                  source: url,
                  processingType: processingType,
                  processingIntent: summarizationIntent,
                  originalContentLength: contentLength,
                },
              }),
            );

            this.emitAction(
              'URL_CONTENT_EXTRACTED',
              `Successfully processed content from: ${url}`,
              {
                query: state.query,
                sourceUrl: url,
                sourceTitle: webContent.metadata.title || 'Web Page',
                contentLength: finalContent.length,
                originalContentLength: contentLength,
                processingType: processingType,
                intent: summarizationIntent,
              },
            );

            console.log(
              `Successfully processed content from ${url} (${finalContent.length} characters, ${processingType})`,
            );
          } else {
            console.warn(`No valid content generated for URL: ${url}`);
          }
        } catch (error) {
          console.error(`Error processing URL ${url}:`, error);

          this.emitAction(
            'URL_PROCESSING_ERROR',
            `Error processing URL: ${url}`,
            {
              query: state.query,
              sourceUrl: url,
              error: error instanceof Error ? error.message : 'Unknown error',
            },
          );
        }
      }

      if (documents.length === 0) {
        const errorMessage = `No content could be retrieved or summarized from the provided URL(s): ${urlsToProcess.join(', ')}`;
        console.error(errorMessage);

        // `urlsToSummarize` is left untouched in state on this path.
        return new Command({
          goto: 'analyzer',
          update: {
            messages: [new AIMessage(errorMessage)],
          },
        });
      }

      this.emitAction(
        'URL_PROCESSING_COMPLETED',
        `Successfully processed ${documents.length} URL(s) and extracted content`,
        {
          query: state.query,
          processedUrls: urlsToProcess.length,
          successfulExtractions: documents.length,
          intent: summarizationIntent,
        },
      );

      const responseMessage = `URL processing completed. Successfully processed ${documents.length} out of ${urlsToProcess.length} URLs.`;
      console.log(responseMessage);

      return new Command({
        goto: 'analyzer', // Route to analyzer to continue with normal workflow after URL processing
        update: {
          messages: [new AIMessage(responseMessage)],
          relevantDocuments: documents,
        },
      });
    } catch (error) {
      console.error('URL summarization error:', error);
      const errorMessage = new AIMessage(
        `URL summarization failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
      );

      // NOTE(review): END is likewise not in the `ends` list this node is
      // registered with - confirm.
      return new Command({
        goto: END,
        update: {
          messages: [errorMessage],
        },
      });
    } finally {
      setTemperature(this.llm, undefined); // Reset temperature to default
    }
  }

  /** Emits one `agent_action` event with the payload shape used throughout this agent. */
  private emitAction(
    action: string,
    message: string,
    details: Record<string, unknown>,
  ): void {
    this.emitter.emit('agent_action', {
      type: 'agent_action',
      data: { action, message, details },
    });
  }

  /**
   * Builds the summarization prompt for one page, invokes the LLM with the
   * abort signal, and returns the reply passed through `removeThinkingBlocks`.
   */
  private async summarizeContent(
    query: string,
    url: string,
    title: string,
    pageContent: string,
  ): Promise<string> {
    const systemPrompt = this.systemInstructions
      ? `${this.systemInstructions}\n\n`
      : '';

    const summarizationPrompt = `${systemPrompt}You are a web content processor. Extract and summarize ONLY the information from the provided web page content that is relevant to the user's query.

# Critical Instructions
- Output ONLY a summary of the web page content provided below
- Focus on information that relates to or helps answer the user's query
- Do NOT add pleasantries, greetings, or conversational elements
- Do NOT mention missing URLs, other pages, or content not provided
- Do NOT ask follow-up questions or suggest additional actions
- Do NOT add commentary about the user's request or query
- Present the information in a clear, well-structured format with key facts and details
- Include all relevant details that could help answer the user's question

# User's Query: ${query}

# Content Title: ${title}
# Content URL: ${url}

# Web Page Content to Summarize:
${pageContent}

Provide a comprehensive summary of the above web page content, focusing on information relevant to the user's query:`;

    const result = await this.llm.invoke(summarizationPrompt, {
      signal: this.signal,
    });

    return removeThinkingBlocks(
      URLSummarizationAgent.contentToText(result.content),
    );
  }

  /**
   * Converts message content to plain text without an unchecked cast.
   * Strings pass through unchanged; for an array, string entries and the
   * string `text` field of object entries are concatenated and everything
   * else is ignored. Any other value yields ''.
   */
  private static contentToText(content: unknown): string {
    if (typeof content === 'string') {
      return content;
    }
    if (!Array.isArray(content)) {
      return '';
    }
    return content
      .map((part: unknown) => {
        if (typeof part === 'string') {
          return part;
        }
        if (part !== null && typeof part === 'object' && 'text' in part) {
          const text = (part as { text?: unknown }).text;
          return typeof text === 'string' ? text : '';
        }
        return '';
      })
      .join('');
  }
}
|
||||||
|
|
@ -28,6 +28,7 @@ Consider the following when evaluating sufficiency:
|
||||||
- If the user is requesting to avoid web searches → respond with \`good_content\`
|
- If the user is requesting to avoid web searches → respond with \`good_content\`
|
||||||
- If the user is asking you to be creative, such as writing a story, poem, or creative content → respond with \`good_content\` unless the context is clearly insufficient
|
- If the user is asking you to be creative, such as writing a story, poem, or creative content → respond with \`good_content\` unless the context is clearly insufficient
|
||||||
- If file documents contain complete information for file-specific queries → respond with \`good_content\`
|
- If file documents contain complete information for file-specific queries → respond with \`good_content\`
|
||||||
|
- If the user is requesting specific web content and there is a source that corresponds to that request in the context, it can be considered sufficient even if the content is not exhaustive or looks like errors → respond with \`good_content\`
|
||||||
|
|
||||||
## Step 2: If content is insufficient, determine the type of missing information
|
## Step 2: If content is insufficient, determine the type of missing information
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,7 @@ import {
|
||||||
TaskManagerAgent,
|
TaskManagerAgent,
|
||||||
FileSearchAgent,
|
FileSearchAgent,
|
||||||
ContentRouterAgent,
|
ContentRouterAgent,
|
||||||
|
URLSummarizationAgent,
|
||||||
} from '../agents';
|
} from '../agents';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -37,6 +38,7 @@ export class AgentSearch {
|
||||||
private synthesizerAgent: SynthesizerAgent;
|
private synthesizerAgent: SynthesizerAgent;
|
||||||
private fileSearchAgent: FileSearchAgent;
|
private fileSearchAgent: FileSearchAgent;
|
||||||
private contentRouterAgent: ContentRouterAgent;
|
private contentRouterAgent: ContentRouterAgent;
|
||||||
|
private urlSummarizationAgent: URLSummarizationAgent;
|
||||||
private emitter: EventEmitter;
|
private emitter: EventEmitter;
|
||||||
private focusMode: string;
|
private focusMode: string;
|
||||||
|
|
||||||
|
|
@ -95,6 +97,12 @@ export class AgentSearch {
|
||||||
systemInstructions,
|
systemInstructions,
|
||||||
signal,
|
signal,
|
||||||
);
|
);
|
||||||
|
this.urlSummarizationAgent = new URLSummarizationAgent(
|
||||||
|
llm,
|
||||||
|
emitter,
|
||||||
|
systemInstructions,
|
||||||
|
signal,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -102,6 +110,13 @@ export class AgentSearch {
|
||||||
*/
|
*/
|
||||||
private createWorkflow() {
|
private createWorkflow() {
|
||||||
const workflow = new StateGraph(AgentState)
|
const workflow = new StateGraph(AgentState)
|
||||||
|
.addNode(
|
||||||
|
'url_summarization',
|
||||||
|
this.urlSummarizationAgent.execute.bind(this.urlSummarizationAgent),
|
||||||
|
{
|
||||||
|
ends: ['task_manager', 'analyzer'],
|
||||||
|
},
|
||||||
|
)
|
||||||
.addNode(
|
.addNode(
|
||||||
'task_manager',
|
'task_manager',
|
||||||
this.taskManagerAgent.execute.bind(this.taskManagerAgent),
|
this.taskManagerAgent.execute.bind(this.taskManagerAgent),
|
||||||
|
|
@ -134,7 +149,7 @@ export class AgentSearch {
|
||||||
'analyzer',
|
'analyzer',
|
||||||
this.analyzerAgent.execute.bind(this.analyzerAgent),
|
this.analyzerAgent.execute.bind(this.analyzerAgent),
|
||||||
{
|
{
|
||||||
ends: ['task_manager', 'synthesizer'],
|
ends: ['url_summarization', 'task_manager', 'synthesizer'],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
.addNode(
|
.addNode(
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue