zizo
This commit is contained in:
parent
0737701de0
commit
7844ca9343
24 changed files with 624 additions and 172 deletions
|
|
@ -25,6 +25,7 @@ import formatChatHistoryAsString from '../utils/formatHistory';
|
|||
import eventEmitter from 'events';
|
||||
import { StreamEvent } from '@langchain/core/tracers/log_stream';
|
||||
import { IterableReadableStream } from '@langchain/core/utils/stream';
|
||||
import logger from '../utils/logger'; // Winston logger
|
||||
|
||||
export interface MetaSearchAgentType {
|
||||
searchAndAnswer: (
|
||||
|
|
@ -58,20 +59,24 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
|
||||
/**
 * Creates the agent: stores the supplied configuration on the instance and
 * writes one info-level log line containing that configuration.
 *
 * @param config - Agent configuration, kept as `this.config`.
 */
constructor(config: Config) {
|
||||
this.config = config;
|
||||
// Optional: log the configuration at instantiation
// NOTE(review): the whole config object is serialized into the log line via JSON.stringify — confirm it never carries secrets or very large values (e.g. prompt templates).
|
||||
logger.info(`MetaSearchAgent created with config: ${JSON.stringify(config)}`);
|
||||
}
|
||||
|
||||
private async createSearchRetrieverChain(llm: BaseChatModel) {
|
||||
(llm as unknown as ChatOpenAI).temperature = 0;
|
||||
logger.info('createSearchRetrieverChain: LLM temperature set to 0');
|
||||
|
||||
return RunnableSequence.from([
|
||||
PromptTemplate.fromTemplate(this.config.queryGeneratorPrompt),
|
||||
llm,
|
||||
this.strParser,
|
||||
RunnableLambda.from(async (input: string) => {
|
||||
logger.info(`Parsed query: ${input}`);
|
||||
|
||||
const linksOutputParser = new LineListOutputParser({
|
||||
key: 'links',
|
||||
});
|
||||
|
||||
const questionOutputParser = new LineOutputParser({
|
||||
key: 'question',
|
||||
});
|
||||
|
|
@ -81,21 +86,25 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
? await questionOutputParser.parse(input)
|
||||
: input;
|
||||
|
||||
logger.info(`Links found: ${JSON.stringify(links, null, 2)}`);
|
||||
logger.info(`Question parsed: ${question}`);
|
||||
|
||||
if (question === 'not_needed') {
|
||||
logger.info('No question needed ("not_needed"), returning empty docs.');
|
||||
return { query: '', docs: [] };
|
||||
}
|
||||
|
||||
if (links.length > 0) {
|
||||
logger.info('Handling user-provided links...');
|
||||
if (question.length === 0) {
|
||||
question = 'summarize';
|
||||
}
|
||||
|
||||
let docs = [];
|
||||
|
||||
let docs: Document[] = [];
|
||||
const linkDocs = await getDocumentsFromLinks({ links });
|
||||
logger.info(`Fetched ${linkDocs.length} documents from user links.`);
|
||||
|
||||
const docGroups: Document[] = [];
|
||||
|
||||
linkDocs.map((doc) => {
|
||||
const URLDocExists = docGroups.find(
|
||||
(d) =>
|
||||
|
|
@ -129,65 +138,8 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
await Promise.all(
|
||||
docGroups.map(async (doc) => {
|
||||
const res = await llm.invoke(`
|
||||
You are a web search summarizer, tasked with summarizing a piece of text retrieved from a web search. Your job is to summarize the
|
||||
text into a detailed, 2-4 paragraph explanation that captures the main ideas and provides a comprehensive answer to the query.
|
||||
If the query is \"summarize\", you should provide a detailed summary of the text. If the query is a specific question, you should answer it in the summary.
|
||||
|
||||
- **Journalistic tone**: The summary should sound professional and journalistic, not too casual or vague.
|
||||
- **Thorough and detailed**: Ensure that every key point from the text is captured and that the summary directly answers the query.
|
||||
- **Not too lengthy, but detailed**: The summary should be informative but not excessively long. Focus on providing detailed information in a concise format.
|
||||
|
||||
The text will be shared inside the \`text\` XML tag, and the query inside the \`query\` XML tag.
|
||||
|
||||
<example>
|
||||
1. \`<text>
|
||||
Docker is a set of platform-as-a-service products that use OS-level virtualization to deliver software in packages called containers.
|
||||
It was first released in 2013 and is developed by Docker, Inc. Docker is designed to make it easier to create, deploy, and run applications
|
||||
by using containers.
|
||||
</text>
|
||||
|
||||
<query>
|
||||
What is Docker and how does it work?
|
||||
</query>
|
||||
|
||||
Response:
|
||||
Docker is a revolutionary platform-as-a-service product developed by Docker, Inc., that uses container technology to make application
|
||||
deployment more efficient. It allows developers to package their software with all necessary dependencies, making it easier to run in
|
||||
any environment. Released in 2013, Docker has transformed the way applications are built, deployed, and managed.
|
||||
\`
|
||||
2. \`<text>
|
||||
The theory of relativity, or simply relativity, encompasses two interrelated theories of Albert Einstein: special relativity and general
|
||||
relativity. However, the word "relativity" is sometimes used in reference to Galilean invariance. The term "theory of relativity" was based
|
||||
on the expression "relative theory" used by Max Planck in 1906. The theory of relativity usually encompasses two interrelated theories by
|
||||
Albert Einstein: special relativity and general relativity. Special relativity applies to all physical phenomena in the absence of gravity.
|
||||
General relativity explains the law of gravitation and its relation to other forces of nature. It applies to the cosmological and astrophysical
|
||||
realm, including astronomy.
|
||||
</text>
|
||||
|
||||
<query>
|
||||
summarize
|
||||
</query>
|
||||
|
||||
Response:
|
||||
The theory of relativity, developed by Albert Einstein, encompasses two main theories: special relativity and general relativity. Special
|
||||
relativity applies to all physical phenomena in the absence of gravity, while general relativity explains the law of gravitation and its
|
||||
relation to other forces of nature. The theory of relativity is based on the concept of "relative theory," as introduced by Max Planck in
|
||||
1906. It is a fundamental theory in physics that has revolutionized our understanding of the universe.
|
||||
\`
|
||||
</example>
|
||||
|
||||
Everything below is the actual data you will be working with. Good luck!
|
||||
|
||||
<query>
|
||||
${question}
|
||||
</query>
|
||||
|
||||
<text>
|
||||
${doc.pageContent}
|
||||
</text>
|
||||
|
||||
Make sure to answer the query in the summary.
|
||||
`);
|
||||
... // Summarizer prompt ...
|
||||
`);
|
||||
|
||||
const document = new Document({
|
||||
pageContent: res.content as string,
|
||||
|
|
@ -200,13 +152,16 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
docs.push(document);
|
||||
}),
|
||||
);
|
||||
logger.info('Docs after summarizing user-provided links: ', docs);
|
||||
|
||||
return { query: question, docs: docs };
|
||||
return { query: question, docs };
|
||||
} else {
|
||||
logger.info(`No links specified, searching via Searxng on query: "${question}"`);
|
||||
const res = await searchSearxng(question, {
|
||||
language: 'en',
|
||||
engines: this.config.activeEngines,
|
||||
});
|
||||
logger.info(`Searxng returned ${res.results.length} results.`);
|
||||
|
||||
const documents = res.results.map(
|
||||
(result) =>
|
||||
|
|
@ -215,7 +170,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
result.content ||
|
||||
(this.config.activeEngines.includes('youtube')
|
||||
? result.title
|
||||
: '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */,
|
||||
: ''),
|
||||
metadata: {
|
||||
title: result.title,
|
||||
url: result.url,
|
||||
|
|
@ -236,15 +191,15 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
embeddings: Embeddings,
|
||||
optimizationMode: 'speed' | 'balanced' | 'quality',
|
||||
) {
|
||||
logger.info(`Creating answering chain. Optimization mode: ${optimizationMode}`);
|
||||
return RunnableSequence.from([
|
||||
RunnableMap.from({
|
||||
query: (input: BasicChainInput) => input.query,
|
||||
chat_history: (input: BasicChainInput) => input.chat_history,
|
||||
date: () => new Date().toISOString(),
|
||||
context: RunnableLambda.from(async (input: BasicChainInput) => {
|
||||
const processedHistory = formatChatHistoryAsString(
|
||||
input.chat_history,
|
||||
);
|
||||
logger.info('Retrieving final source documents...');
|
||||
const processedHistory = formatChatHistoryAsString(input.chat_history);
|
||||
|
||||
let docs: Document[] | null = null;
|
||||
let query = input.query;
|
||||
|
|
@ -260,6 +215,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
|
||||
query = searchRetrieverResult.query;
|
||||
docs = searchRetrieverResult.docs;
|
||||
logger.info(`Got ${docs.length} docs from searchRetriever.`);
|
||||
}
|
||||
|
||||
const sortedDocs = await this.rerankDocs(
|
||||
|
|
@ -269,6 +225,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
embeddings,
|
||||
optimizationMode,
|
||||
);
|
||||
logger.info(`Sorted docs length: ${sortedDocs?.length ?? 0}`);
|
||||
|
||||
return sortedDocs;
|
||||
})
|
||||
|
|
@ -296,7 +253,9 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
embeddings: Embeddings,
|
||||
optimizationMode: 'speed' | 'balanced' | 'quality',
|
||||
) {
|
||||
logger.info(`Reranking. Query="${query}", initial docs=${docs.length}, fileIds=${fileIds.length}`);
|
||||
if (docs.length === 0 && fileIds.length === 0) {
|
||||
logger.info('No docs or fileIds to rerank. Returning empty.');
|
||||
return docs;
|
||||
}
|
||||
|
||||
|
|
@ -307,32 +266,34 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
const contentPath = filePath + '-extracted.json';
|
||||
const embeddingsPath = filePath + '-embeddings.json';
|
||||
|
||||
logger.info(`Reading content from ${contentPath}`);
|
||||
logger.info(`Reading embeddings from ${embeddingsPath}`);
|
||||
|
||||
const content = JSON.parse(fs.readFileSync(contentPath, 'utf8'));
|
||||
const embeddings = JSON.parse(fs.readFileSync(embeddingsPath, 'utf8'));
|
||||
const fileEmbeddings = JSON.parse(fs.readFileSync(embeddingsPath, 'utf8'));
|
||||
|
||||
const fileSimilaritySearchObject = content.contents.map(
|
||||
(c: string, i) => {
|
||||
return {
|
||||
fileName: content.title,
|
||||
content: c,
|
||||
embeddings: embeddings.embeddings[i],
|
||||
};
|
||||
},
|
||||
(c: string, i: number) => ({
|
||||
fileName: content.title,
|
||||
content: c,
|
||||
embeddings: fileEmbeddings.embeddings[i],
|
||||
}),
|
||||
);
|
||||
|
||||
return fileSimilaritySearchObject;
|
||||
})
|
||||
.flat();
|
||||
|
||||
// When the query is exactly "summarize" (case-insensitive), skip reranking and return the first 15 docs as-is
|
||||
if (query.toLocaleLowerCase() === 'summarize') {
|
||||
logger.info(`Query is "summarize". Returning top 15 docs from web sources.`);
|
||||
return docs.slice(0, 15);
|
||||
}
|
||||
|
||||
const docsWithContent = docs.filter(
|
||||
(doc) => doc.pageContent && doc.pageContent.length > 0,
|
||||
);
|
||||
const docsWithContent = docs.filter((doc) => doc.pageContent && doc.pageContent.length > 0);
|
||||
|
||||
if (optimizationMode === 'speed' || this.config.rerank === false) {
|
||||
logger.info(`Reranking in 'speed' mode or no rerank. Docs with content: ${docsWithContent.length}`);
|
||||
if (filesData.length > 0) {
|
||||
const [queryEmbedding] = await Promise.all([
|
||||
embeddings.embedQuery(query),
|
||||
|
|
@ -343,14 +304,13 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
pageContent: fileData.content,
|
||||
metadata: {
|
||||
title: fileData.fileName,
|
||||
url: `File`,
|
||||
url: 'File',
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
const similarity = filesData.map((fileData, i) => {
|
||||
const sim = computeSimilarity(queryEmbedding, fileData.embeddings);
|
||||
|
||||
return {
|
||||
index: i,
|
||||
similarity: sim,
|
||||
|
|
@ -358,28 +318,23 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
});
|
||||
|
||||
let sortedDocs = similarity
|
||||
.filter(
|
||||
(sim) => sim.similarity > (this.config.rerankThreshold ?? 0.3),
|
||||
)
|
||||
.filter((sim) => sim.similarity > (this.config.rerankThreshold ?? 0.3))
|
||||
.sort((a, b) => b.similarity - a.similarity)
|
||||
.slice(0, 15)
|
||||
.map((sim) => fileDocs[sim.index]);
|
||||
|
||||
sortedDocs =
|
||||
docsWithContent.length > 0 ? sortedDocs.slice(0, 8) : sortedDocs;
|
||||
|
||||
return [
|
||||
...sortedDocs,
|
||||
...docsWithContent.slice(0, 15 - sortedDocs.length),
|
||||
];
|
||||
sortedDocs = docsWithContent.length > 0 ? sortedDocs.slice(0, 8) : sortedDocs;
|
||||
logger.info(`Final sorted docs in 'speed' mode: ${sortedDocs.length}`);
|
||||
|
||||
return [...sortedDocs, ...docsWithContent.slice(0, 15 - sortedDocs.length)];
|
||||
} else {
|
||||
logger.info('No file data, returning top 15 from docsWithContent.');
|
||||
return docsWithContent.slice(0, 15);
|
||||
}
|
||||
} else if (optimizationMode === 'balanced') {
|
||||
logger.info('Reranking in balanced mode.');
|
||||
const [docEmbeddings, queryEmbedding] = await Promise.all([
|
||||
embeddings.embedDocuments(
|
||||
docsWithContent.map((doc) => doc.pageContent),
|
||||
),
|
||||
embeddings.embedDocuments(docsWithContent.map((doc) => doc.pageContent)),
|
||||
embeddings.embedQuery(query),
|
||||
]);
|
||||
|
||||
|
|
@ -389,7 +344,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
pageContent: fileData.content,
|
||||
metadata: {
|
||||
title: fileData.fileName,
|
||||
url: `File`,
|
||||
url: 'File',
|
||||
},
|
||||
});
|
||||
}),
|
||||
|
|
@ -399,7 +354,6 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
|
||||
const similarity = docEmbeddings.map((docEmbedding, i) => {
|
||||
const sim = computeSimilarity(queryEmbedding, docEmbedding);
|
||||
|
||||
return {
|
||||
index: i,
|
||||
similarity: sim,
|
||||
|
|
@ -412,8 +366,13 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
.slice(0, 15)
|
||||
.map((sim) => docsWithContent[sim.index]);
|
||||
|
||||
logger.info(`Final sorted docs in 'balanced' mode: ${sortedDocs.length}`);
|
||||
return sortedDocs;
|
||||
}
|
||||
|
||||
// Any mode not handled by the branches above (e.g. "quality") has no dedicated reranking path here: log a warning and fall back to the first 15 docs that have content
|
||||
logger.warn(`Optimization mode "${optimizationMode}" not fully implemented. Returning docs as-is.`);
|
||||
return docsWithContent.slice(0, 15);
|
||||
}
|
||||
|
||||
private processDocs(docs: Document[]) {
|
||||
|
|
@ -429,12 +388,16 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
stream: IterableReadableStream<StreamEvent>,
|
||||
emitter: eventEmitter,
|
||||
) {
|
||||
logger.info('Starting to stream chain events...');
|
||||
for await (const event of stream) {
|
||||
// You can add debug logs here to see each event
|
||||
// logger.info(`Event: ${JSON.stringify(event, null, 2)}`);
|
||||
|
||||
if (
|
||||
event.event === 'on_chain_end' &&
|
||||
event.name === 'FinalSourceRetriever'
|
||||
) {
|
||||
``;
|
||||
logger.info('FinalSourceRetriever ended, sending docs to front-end...');
|
||||
emitter.emit(
|
||||
'data',
|
||||
JSON.stringify({ type: 'sources', data: event.data.output }),
|
||||
|
|
@ -444,6 +407,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
event.event === 'on_chain_stream' &&
|
||||
event.name === 'FinalResponseGenerator'
|
||||
) {
|
||||
logger.info('Response chunk received, streaming to client...');
|
||||
emitter.emit(
|
||||
'data',
|
||||
JSON.stringify({ type: 'response', data: event.data.chunk }),
|
||||
|
|
@ -453,9 +417,11 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
event.event === 'on_chain_end' &&
|
||||
event.name === 'FinalResponseGenerator'
|
||||
) {
|
||||
logger.info('FinalResponseGenerator ended, signaling end of stream.');
|
||||
emitter.emit('end');
|
||||
}
|
||||
}
|
||||
logger.info('Finished streaming chain events.');
|
||||
}
|
||||
|
||||
async searchAndAnswer(
|
||||
|
|
@ -468,6 +434,11 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
) {
|
||||
const emitter = new eventEmitter();
|
||||
|
||||
logger.info(`Received query: "${message}"`);
|
||||
logger.info(`History length: ${history.length}`);
|
||||
logger.info(`Optimization mode: ${optimizationMode}`);
|
||||
logger.info(`File IDs: ${fileIds.join(', ') || 'None'}`);
|
||||
|
||||
const answeringChain = await this.createAnsweringChain(
|
||||
llm,
|
||||
fileIds,
|
||||
|
|
@ -475,17 +446,17 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||
optimizationMode,
|
||||
);
|
||||
|
||||
const stream = answeringChain.streamEvents(
|
||||
{
|
||||
chat_history: history,
|
||||
query: message,
|
||||
},
|
||||
{
|
||||
version: 'v1',
|
||||
},
|
||||
);
|
||||
|
||||
this.handleStream(stream, emitter);
|
||||
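// .streamEvents(...) can throw synchronously, and this try/catch logs and forwards that case; handleStream is called without await, so errors raised while the stream is being consumed do not reach this catch block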
|
||||
try {
|
||||
const stream = answeringChain.streamEvents(
|
||||
{ chat_history: history, query: message },
|
||||
{ version: 'v1' },
|
||||
);
|
||||
this.handleStream(stream, emitter);
|
||||
} catch (error: any) {
|
||||
logger.error(`Error in searchAndAnswer streaming: ${error.message}`);
|
||||
emitter.emit('error', error);
|
||||
}
|
||||
|
||||
return emitter;
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue