This commit is contained in:
WaelAbouceo 2025-02-02 12:14:15 +02:00
parent 0737701de0
commit 7844ca9343
24 changed files with 624 additions and 172 deletions

View file

@ -25,6 +25,7 @@ import formatChatHistoryAsString from '../utils/formatHistory';
import eventEmitter from 'events';
import { StreamEvent } from '@langchain/core/tracers/log_stream';
import { IterableReadableStream } from '@langchain/core/utils/stream';
import logger from '../utils/logger'; // Winston logger
export interface MetaSearchAgentType {
searchAndAnswer: (
@ -58,20 +59,24 @@ class MetaSearchAgent implements MetaSearchAgentType {
/**
 * Builds a MetaSearchAgent around the supplied configuration.
 *
 * @param config - Agent configuration; stored on the instance and logged
 *   once, serialized as JSON, at construction time.
 */
constructor(config: Config) {
  this.config = config;

  // Record the full configuration in the log when the agent is created.
  const serializedConfig = JSON.stringify(config);
  logger.info(`MetaSearchAgent created with config: ${serializedConfig}`);
}
private async createSearchRetrieverChain(llm: BaseChatModel) {
(llm as unknown as ChatOpenAI).temperature = 0;
logger.info('createSearchRetrieverChain: LLM temperature set to 0');
return RunnableSequence.from([
PromptTemplate.fromTemplate(this.config.queryGeneratorPrompt),
llm,
this.strParser,
RunnableLambda.from(async (input: string) => {
logger.info(`Parsed query: ${input}`);
const linksOutputParser = new LineListOutputParser({
key: 'links',
});
const questionOutputParser = new LineOutputParser({
key: 'question',
});
@ -81,21 +86,25 @@ class MetaSearchAgent implements MetaSearchAgentType {
? await questionOutputParser.parse(input)
: input;
logger.info(`Links found: ${JSON.stringify(links, null, 2)}`);
logger.info(`Question parsed: ${question}`);
if (question === 'not_needed') {
logger.info('No question needed ("not_needed"), returning empty docs.');
return { query: '', docs: [] };
}
if (links.length > 0) {
logger.info('Handling user-provided links...');
if (question.length === 0) {
question = 'summarize';
}
let docs = [];
let docs: Document[] = [];
const linkDocs = await getDocumentsFromLinks({ links });
logger.info(`Fetched ${linkDocs.length} documents from user links.`);
const docGroups: Document[] = [];
linkDocs.map((doc) => {
const URLDocExists = docGroups.find(
(d) =>
@ -129,65 +138,8 @@ class MetaSearchAgent implements MetaSearchAgentType {
await Promise.all(
docGroups.map(async (doc) => {
const res = await llm.invoke(`
You are a web search summarizer, tasked with summarizing a piece of text retrieved from a web search. Your job is to summarize the
text into a detailed, 2-4 paragraph explanation that captures the main ideas and provides a comprehensive answer to the query.
If the query is \"summarize\", you should provide a detailed summary of the text. If the query is a specific question, you should answer it in the summary.
- **Journalistic tone**: The summary should sound professional and journalistic, not too casual or vague.
- **Thorough and detailed**: Ensure that every key point from the text is captured and that the summary directly answers the query.
- **Not too lengthy, but detailed**: The summary should be informative but not excessively long. Focus on providing detailed information in a concise format.
The text will be shared inside the \`text\` XML tag, and the query inside the \`query\` XML tag.
<example>
1. \`<text>
Docker is a set of platform-as-a-service products that use OS-level virtualization to deliver software in packages called containers.
It was first released in 2013 and is developed by Docker, Inc. Docker is designed to make it easier to create, deploy, and run applications
by using containers.
</text>
<query>
What is Docker and how does it work?
</query>
Response:
Docker is a revolutionary platform-as-a-service product developed by Docker, Inc., that uses container technology to make application
deployment more efficient. It allows developers to package their software with all necessary dependencies, making it easier to run in
any environment. Released in 2013, Docker has transformed the way applications are built, deployed, and managed.
\`
2. \`<text>
The theory of relativity, or simply relativity, encompasses two interrelated theories of Albert Einstein: special relativity and general
relativity. However, the word "relativity" is sometimes used in reference to Galilean invariance. The term "theory of relativity" was based
on the expression "relative theory" used by Max Planck in 1906. The theory of relativity usually encompasses two interrelated theories by
Albert Einstein: special relativity and general relativity. Special relativity applies to all physical phenomena in the absence of gravity.
General relativity explains the law of gravitation and its relation to other forces of nature. It applies to the cosmological and astrophysical
realm, including astronomy.
</text>
<query>
summarize
</query>
Response:
The theory of relativity, developed by Albert Einstein, encompasses two main theories: special relativity and general relativity. Special
relativity applies to all physical phenomena in the absence of gravity, while general relativity explains the law of gravitation and its
relation to other forces of nature. The theory of relativity is based on the concept of "relative theory," as introduced by Max Planck in
1906. It is a fundamental theory in physics that has revolutionized our understanding of the universe.
\`
</example>
Everything below is the actual data you will be working with. Good luck!
<query>
${question}
</query>
<text>
${doc.pageContent}
</text>
Make sure to answer the query in the summary.
`);
... // Summarizer prompt ...
`);
const document = new Document({
pageContent: res.content as string,
@ -200,13 +152,16 @@ class MetaSearchAgent implements MetaSearchAgentType {
docs.push(document);
}),
);
logger.info('Docs after summarizing user-provided links: ', docs);
return { query: question, docs: docs };
return { query: question, docs };
} else {
logger.info(`No links specified, searching via Searxng on query: "${question}"`);
const res = await searchSearxng(question, {
language: 'en',
engines: this.config.activeEngines,
});
logger.info(`Searxng returned ${res.results.length} results.`);
const documents = res.results.map(
(result) =>
@ -215,7 +170,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
result.content ||
(this.config.activeEngines.includes('youtube')
? result.title
: '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */,
: ''),
metadata: {
title: result.title,
url: result.url,
@ -236,15 +191,15 @@ class MetaSearchAgent implements MetaSearchAgentType {
embeddings: Embeddings,
optimizationMode: 'speed' | 'balanced' | 'quality',
) {
logger.info(`Creating answering chain. Optimization mode: ${optimizationMode}`);
return RunnableSequence.from([
RunnableMap.from({
query: (input: BasicChainInput) => input.query,
chat_history: (input: BasicChainInput) => input.chat_history,
date: () => new Date().toISOString(),
context: RunnableLambda.from(async (input: BasicChainInput) => {
const processedHistory = formatChatHistoryAsString(
input.chat_history,
);
logger.info('Retrieving final source documents...');
const processedHistory = formatChatHistoryAsString(input.chat_history);
let docs: Document[] | null = null;
let query = input.query;
@ -260,6 +215,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
query = searchRetrieverResult.query;
docs = searchRetrieverResult.docs;
logger.info(`Got ${docs.length} docs from searchRetriever.`);
}
const sortedDocs = await this.rerankDocs(
@ -269,6 +225,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
embeddings,
optimizationMode,
);
logger.info(`Sorted docs length: ${sortedDocs?.length ?? 0}`);
return sortedDocs;
})
@ -296,7 +253,9 @@ class MetaSearchAgent implements MetaSearchAgentType {
embeddings: Embeddings,
optimizationMode: 'speed' | 'balanced' | 'quality',
) {
logger.info(`Reranking. Query="${query}", initial docs=${docs.length}, fileIds=${fileIds.length}`);
if (docs.length === 0 && fileIds.length === 0) {
logger.info('No docs or fileIds to rerank. Returning empty.');
return docs;
}
@ -307,32 +266,34 @@ class MetaSearchAgent implements MetaSearchAgentType {
const contentPath = filePath + '-extracted.json';
const embeddingsPath = filePath + '-embeddings.json';
logger.info(`Reading content from ${contentPath}`);
logger.info(`Reading embeddings from ${embeddingsPath}`);
const content = JSON.parse(fs.readFileSync(contentPath, 'utf8'));
const embeddings = JSON.parse(fs.readFileSync(embeddingsPath, 'utf8'));
const fileEmbeddings = JSON.parse(fs.readFileSync(embeddingsPath, 'utf8'));
const fileSimilaritySearchObject = content.contents.map(
(c: string, i) => {
return {
fileName: content.title,
content: c,
embeddings: embeddings.embeddings[i],
};
},
(c: string, i: number) => ({
fileName: content.title,
content: c,
embeddings: fileEmbeddings.embeddings[i],
}),
);
return fileSimilaritySearchObject;
})
.flat();
// If only summarizing, just return top docs
if (query.toLocaleLowerCase() === 'summarize') {
logger.info(`Query is "summarize". Returning top 15 docs from web sources.`);
return docs.slice(0, 15);
}
const docsWithContent = docs.filter(
(doc) => doc.pageContent && doc.pageContent.length > 0,
);
const docsWithContent = docs.filter((doc) => doc.pageContent && doc.pageContent.length > 0);
if (optimizationMode === 'speed' || this.config.rerank === false) {
logger.info(`Reranking in 'speed' mode or no rerank. Docs with content: ${docsWithContent.length}`);
if (filesData.length > 0) {
const [queryEmbedding] = await Promise.all([
embeddings.embedQuery(query),
@ -343,14 +304,13 @@ class MetaSearchAgent implements MetaSearchAgentType {
pageContent: fileData.content,
metadata: {
title: fileData.fileName,
url: `File`,
url: 'File',
},
});
});
const similarity = filesData.map((fileData, i) => {
const sim = computeSimilarity(queryEmbedding, fileData.embeddings);
return {
index: i,
similarity: sim,
@ -358,28 +318,23 @@ class MetaSearchAgent implements MetaSearchAgentType {
});
let sortedDocs = similarity
.filter(
(sim) => sim.similarity > (this.config.rerankThreshold ?? 0.3),
)
.filter((sim) => sim.similarity > (this.config.rerankThreshold ?? 0.3))
.sort((a, b) => b.similarity - a.similarity)
.slice(0, 15)
.map((sim) => fileDocs[sim.index]);
sortedDocs =
docsWithContent.length > 0 ? sortedDocs.slice(0, 8) : sortedDocs;
return [
...sortedDocs,
...docsWithContent.slice(0, 15 - sortedDocs.length),
];
sortedDocs = docsWithContent.length > 0 ? sortedDocs.slice(0, 8) : sortedDocs;
logger.info(`Final sorted docs in 'speed' mode: ${sortedDocs.length}`);
return [...sortedDocs, ...docsWithContent.slice(0, 15 - sortedDocs.length)];
} else {
logger.info('No file data, returning top 15 from docsWithContent.');
return docsWithContent.slice(0, 15);
}
} else if (optimizationMode === 'balanced') {
logger.info('Reranking in balanced mode.');
const [docEmbeddings, queryEmbedding] = await Promise.all([
embeddings.embedDocuments(
docsWithContent.map((doc) => doc.pageContent),
),
embeddings.embedDocuments(docsWithContent.map((doc) => doc.pageContent)),
embeddings.embedQuery(query),
]);
@ -389,7 +344,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
pageContent: fileData.content,
metadata: {
title: fileData.fileName,
url: `File`,
url: 'File',
},
});
}),
@ -399,7 +354,6 @@ class MetaSearchAgent implements MetaSearchAgentType {
const similarity = docEmbeddings.map((docEmbedding, i) => {
const sim = computeSimilarity(queryEmbedding, docEmbedding);
return {
index: i,
similarity: sim,
@ -412,8 +366,13 @@ class MetaSearchAgent implements MetaSearchAgentType {
.slice(0, 15)
.map((sim) => docsWithContent[sim.index]);
logger.info(`Final sorted docs in 'balanced' mode: ${sortedDocs.length}`);
return sortedDocs;
}
// Fallback for any mode not handled above (e.g. "quality"): log a warning and return the first 15 docs with content, without reranking.
logger.warn(`Optimization mode "${optimizationMode}" not fully implemented. Returning docs as-is.`);
return docsWithContent.slice(0, 15);
}
private processDocs(docs: Document[]) {
@ -429,12 +388,16 @@ class MetaSearchAgent implements MetaSearchAgentType {
stream: IterableReadableStream<StreamEvent>,
emitter: eventEmitter,
) {
logger.info('Starting to stream chain events...');
for await (const event of stream) {
// You can add debug logs here to see each event
// logger.info(`Event: ${JSON.stringify(event, null, 2)}`);
if (
event.event === 'on_chain_end' &&
event.name === 'FinalSourceRetriever'
) {
``;
logger.info('FinalSourceRetriever ended, sending docs to front-end...');
emitter.emit(
'data',
JSON.stringify({ type: 'sources', data: event.data.output }),
@ -444,6 +407,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
event.event === 'on_chain_stream' &&
event.name === 'FinalResponseGenerator'
) {
logger.info('Response chunk received, streaming to client...');
emitter.emit(
'data',
JSON.stringify({ type: 'response', data: event.data.chunk }),
@ -453,9 +417,11 @@ class MetaSearchAgent implements MetaSearchAgentType {
event.event === 'on_chain_end' &&
event.name === 'FinalResponseGenerator'
) {
logger.info('FinalResponseGenerator ended, signaling end of stream.');
emitter.emit('end');
}
}
logger.info('Finished streaming chain events.');
}
async searchAndAnswer(
@ -468,6 +434,11 @@ class MetaSearchAgent implements MetaSearchAgentType {
) {
const emitter = new eventEmitter();
logger.info(`Received query: "${message}"`);
logger.info(`History length: ${history.length}`);
logger.info(`Optimization mode: ${optimizationMode}`);
logger.info(`File IDs: ${fileIds.join(', ') || 'None'}`);
const answeringChain = await this.createAnsweringChain(
llm,
fileIds,
@ -475,17 +446,17 @@ class MetaSearchAgent implements MetaSearchAgentType {
optimizationMode,
);
const stream = answeringChain.streamEvents(
{
chat_history: history,
query: message,
},
{
version: 'v1',
},
);
this.handleStream(stream, emitter);
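// NOTE: this try/catch only covers synchronous throws from .streamEvents(...) and the handleStream call itself; handleStream is not awaited, so errors raised while iterating the stream are NOT caught here.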
try {
const stream = answeringChain.streamEvents(
{ chat_history: history, query: message },
{ version: 'v1' },
);
this.handleStream(stream, emitter);
} catch (error: any) {
logger.error(`Error in searchAndAnswer streaming: ${error.message}`);
emitter.emit('error', error);
}
return emitter;
}