zizo

2025-02-02 12:14:15 +02:00 · 2025-02-02 12:14:15 +02:00 · 7844ca9343
commit 7844ca9343
parent 0737701de0
24 changed files with 624 additions and 172 deletions
--- a/src/search/metaSearchAgent.ts
+++ b/src/search/metaSearchAgent.ts
@ -25,6 +25,7 @@ import formatChatHistoryAsString from '../utils/formatHistory';
 import eventEmitter from 'events';
 import { StreamEvent } from '@langchain/core/tracers/log_stream';
 import { IterableReadableStream } from '@langchain/core/utils/stream';
+import logger from '../utils/logger'; // Winston logger

 export interface MetaSearchAgentType {
  searchAndAnswer: (
@ -58,20 +59,24 @@ class MetaSearchAgent implements MetaSearchAgentType {

  constructor(config: Config) {
    this.config = config;
+    // Log the full configuration, serialized as JSON, when the agent is instantiated
+    logger.info(`MetaSearchAgent created with config: ${JSON.stringify(config)}`);
  }

  private async createSearchRetrieverChain(llm: BaseChatModel) {
    (llm as unknown as ChatOpenAI).temperature = 0;
+    logger.info('createSearchRetrieverChain: LLM temperature set to 0');

    return RunnableSequence.from([
      PromptTemplate.fromTemplate(this.config.queryGeneratorPrompt),
      llm,
      this.strParser,
      RunnableLambda.from(async (input: string) => {
+        logger.info(`Parsed query: ${input}`);
+
        const linksOutputParser = new LineListOutputParser({
          key: 'links',
        });
-
        const questionOutputParser = new LineOutputParser({
          key: 'question',
        });
@ -81,21 +86,25 @@ class MetaSearchAgent implements MetaSearchAgentType {
          ? await questionOutputParser.parse(input)
          : input;

+        logger.info(`Links found: ${JSON.stringify(links, null, 2)}`);
+        logger.info(`Question parsed: ${question}`);
+
        if (question === 'not_needed') {
+          logger.info('No question needed ("not_needed"), returning empty docs.');
          return { query: '', docs: [] };
        }

        if (links.length > 0) {
+          logger.info('Handling user-provided links...');
          if (question.length === 0) {
            question = 'summarize';
          }

-          let docs = [];
-
+          let docs: Document[] = [];
          const linkDocs = await getDocumentsFromLinks({ links });
+          logger.info(`Fetched ${linkDocs.length} documents from user links.`);

          const docGroups: Document[] = [];
-
          linkDocs.map((doc) => {
            const URLDocExists = docGroups.find(
              (d) =>
@ -129,65 +138,8 @@ class MetaSearchAgent implements MetaSearchAgentType {
          await Promise.all(
            docGroups.map(async (doc) => {
              const res = await llm.invoke(`
-            You are a web search summarizer, tasked with summarizing a piece of text retrieved from a web search. Your job is to summarize the 
-            text into a detailed, 2-4 paragraph explanation that captures the main ideas and provides a comprehensive answer to the query.
-            If the query is \"summarize\", you should provide a detailed summary of the text. If the query is a specific question, you should answer it in the summary.
-            
-            - **Journalistic tone**: The summary should sound professional and journalistic, not too casual or vague.
-            - **Thorough and detailed**: Ensure that every key point from the text is captured and that the summary directly answers the query.
-            - **Not too lengthy, but detailed**: The summary should be informative but not excessively long. Focus on providing detailed information in a concise format.
-
-            The text will be shared inside the \`text\` XML tag, and the query inside the \`query\` XML tag.
-
-            <example>
-            1. \`<text>
-            Docker is a set of platform-as-a-service products that use OS-level virtualization to deliver software in packages called containers. 
-            It was first released in 2013 and is developed by Docker, Inc. Docker is designed to make it easier to create, deploy, and run applications 
-            by using containers.
-            </text>
-
-            <query>
-            What is Docker and how does it work?
-            </query>
-
-            Response:
-            Docker is a revolutionary platform-as-a-service product developed by Docker, Inc., that uses container technology to make application 
-            deployment more efficient. It allows developers to package their software with all necessary dependencies, making it easier to run in 
-            any environment. Released in 2013, Docker has transformed the way applications are built, deployed, and managed.
-            \`
-            2. \`<text>
-            The theory of relativity, or simply relativity, encompasses two interrelated theories of Albert Einstein: special relativity and general
-            relativity. However, the word "relativity" is sometimes used in reference to Galilean invariance. The term "theory of relativity" was based
-            on the expression "relative theory" used by Max Planck in 1906. The theory of relativity usually encompasses two interrelated theories by
-            Albert Einstein: special relativity and general relativity. Special relativity applies to all physical phenomena in the absence of gravity.
-            General relativity explains the law of gravitation and its relation to other forces of nature. It applies to the cosmological and astrophysical
-            realm, including astronomy.
-            </text>
-
-            <query>
-            summarize
-            </query>
-
-            Response:
-            The theory of relativity, developed by Albert Einstein, encompasses two main theories: special relativity and general relativity. Special
-            relativity applies to all physical phenomena in the absence of gravity, while general relativity explains the law of gravitation and its
-            relation to other forces of nature. The theory of relativity is based on the concept of "relative theory," as introduced by Max Planck in
-            1906. It is a fundamental theory in physics that has revolutionized our understanding of the universe.
-            \`
-            </example>
-
-            Everything below is the actual data you will be working with. Good luck!
-
-            <query>
-            ${question}
-            </query>
-
-            <text>
-            ${doc.pageContent}
-            </text>
-
-            Make sure to answer the query in the summary.
-          `);
+                ... // Summarizer prompt ...
+              `);

              const document = new Document({
                pageContent: res.content as string,
@ -200,13 +152,16 @@ class MetaSearchAgent implements MetaSearchAgentType {
              docs.push(document);
            }),
          );
+          logger.info('Docs after summarizing user-provided links: ', docs);

-          return { query: question, docs: docs };
+          return { query: question, docs };
        } else {
+          logger.info(`No links specified, searching via Searxng on query: "${question}"`);
          const res = await searchSearxng(question, {
            language: 'en',
            engines: this.config.activeEngines,
          });
+          logger.info(`Searxng returned ${res.results.length} results.`);

          const documents = res.results.map(
            (result) =>
@ -215,7 +170,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
                  result.content ||
                  (this.config.activeEngines.includes('youtube')
                    ? result.title
-                    : '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */,
+                    : ''),
                metadata: {
                  title: result.title,
                  url: result.url,
@ -236,15 +191,15 @@ class MetaSearchAgent implements MetaSearchAgentType {
    embeddings: Embeddings,
    optimizationMode: 'speed' | 'balanced' | 'quality',
  ) {
+    logger.info(`Creating answering chain. Optimization mode: ${optimizationMode}`);
    return RunnableSequence.from([
      RunnableMap.from({
        query: (input: BasicChainInput) => input.query,
        chat_history: (input: BasicChainInput) => input.chat_history,
        date: () => new Date().toISOString(),
        context: RunnableLambda.from(async (input: BasicChainInput) => {
-          const processedHistory = formatChatHistoryAsString(
-            input.chat_history,
-          );
+          logger.info('Retrieving final source documents...');
+          const processedHistory = formatChatHistoryAsString(input.chat_history);

          let docs: Document[] | null = null;
          let query = input.query;
@ -260,6 +215,7 @@ class MetaSearchAgent implements MetaSearchAgentType {

            query = searchRetrieverResult.query;
            docs = searchRetrieverResult.docs;
+            logger.info(`Got ${docs.length} docs from searchRetriever.`);
          }

          const sortedDocs = await this.rerankDocs(
@ -269,6 +225,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
            embeddings,
            optimizationMode,
          );
+          logger.info(`Sorted docs length: ${sortedDocs?.length ?? 0}`);

          return sortedDocs;
        })
@ -296,7 +253,9 @@ class MetaSearchAgent implements MetaSearchAgentType {
    embeddings: Embeddings,
    optimizationMode: 'speed' | 'balanced' | 'quality',
  ) {
+    logger.info(`Reranking. Query="${query}", initial docs=${docs.length}, fileIds=${fileIds.length}`);
    if (docs.length === 0 && fileIds.length === 0) {
+      logger.info('No docs or fileIds to rerank. Returning empty.');
      return docs;
    }

@ -307,32 +266,34 @@ class MetaSearchAgent implements MetaSearchAgentType {
        const contentPath = filePath + '-extracted.json';
        const embeddingsPath = filePath + '-embeddings.json';

+        logger.info(`Reading content from ${contentPath}`);
+        logger.info(`Reading embeddings from ${embeddingsPath}`);
+
        const content = JSON.parse(fs.readFileSync(contentPath, 'utf8'));
-        const embeddings = JSON.parse(fs.readFileSync(embeddingsPath, 'utf8'));
+        const fileEmbeddings = JSON.parse(fs.readFileSync(embeddingsPath, 'utf8'));

        const fileSimilaritySearchObject = content.contents.map(
-          (c: string, i) => {
-            return {
-              fileName: content.title,
-              content: c,
-              embeddings: embeddings.embeddings[i],
-            };
-          },
+          (c: string, i: number) => ({
+            fileName: content.title,
+            content: c,
+            embeddings: fileEmbeddings.embeddings[i],
+          }),
        );

        return fileSimilaritySearchObject;
      })
      .flat();

+    // A "summarize" query skips reranking: return the first 15 docs unchanged
    if (query.toLocaleLowerCase() === 'summarize') {
+      logger.info(`Query is "summarize". Returning top 15 docs from web sources.`);
      return docs.slice(0, 15);
    }

-    const docsWithContent = docs.filter(
-      (doc) => doc.pageContent && doc.pageContent.length > 0,
-    );
+    const docsWithContent = docs.filter((doc) => doc.pageContent && doc.pageContent.length > 0);

    if (optimizationMode === 'speed' || this.config.rerank === false) {
+      logger.info(`Reranking in 'speed' mode or no rerank. Docs with content: ${docsWithContent.length}`);
      if (filesData.length > 0) {
        const [queryEmbedding] = await Promise.all([
          embeddings.embedQuery(query),
@ -343,14 +304,13 @@ class MetaSearchAgent implements MetaSearchAgentType {
            pageContent: fileData.content,
            metadata: {
              title: fileData.fileName,
-              url: `File`,
+              url: 'File',
            },
          });
        });

        const similarity = filesData.map((fileData, i) => {
          const sim = computeSimilarity(queryEmbedding, fileData.embeddings);
-
          return {
            index: i,
            similarity: sim,
@ -358,28 +318,23 @@ class MetaSearchAgent implements MetaSearchAgentType {
        });

        let sortedDocs = similarity
-          .filter(
-            (sim) => sim.similarity > (this.config.rerankThreshold ?? 0.3),
-          )
+          .filter((sim) => sim.similarity > (this.config.rerankThreshold ?? 0.3))
          .sort((a, b) => b.similarity - a.similarity)
          .slice(0, 15)
          .map((sim) => fileDocs[sim.index]);

-        sortedDocs =
-          docsWithContent.length > 0 ? sortedDocs.slice(0, 8) : sortedDocs;
-
-        return [
-          ...sortedDocs,
-          ...docsWithContent.slice(0, 15 - sortedDocs.length),
-        ];
+        sortedDocs = docsWithContent.length > 0 ? sortedDocs.slice(0, 8) : sortedDocs;
+        logger.info(`Final sorted docs in 'speed' mode: ${sortedDocs.length}`);
+        
+        return [...sortedDocs, ...docsWithContent.slice(0, 15 - sortedDocs.length)];
      } else {
+        logger.info('No file data, returning top 15 from docsWithContent.');
        return docsWithContent.slice(0, 15);
      }
    } else if (optimizationMode === 'balanced') {
+      logger.info('Reranking in balanced mode.');
      const [docEmbeddings, queryEmbedding] = await Promise.all([
-        embeddings.embedDocuments(
-          docsWithContent.map((doc) => doc.pageContent),
-        ),
+        embeddings.embedDocuments(docsWithContent.map((doc) => doc.pageContent)),
        embeddings.embedQuery(query),
      ]);

@ -389,7 +344,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
            pageContent: fileData.content,
            metadata: {
              title: fileData.fileName,
-              url: `File`,
+              url: 'File',
            },
          });
        }),
@ -399,7 +354,6 @@ class MetaSearchAgent implements MetaSearchAgentType {

      const similarity = docEmbeddings.map((docEmbedding, i) => {
        const sim = computeSimilarity(queryEmbedding, docEmbedding);
-
        return {
          index: i,
          similarity: sim,
@ -412,8 +366,13 @@ class MetaSearchAgent implements MetaSearchAgentType {
        .slice(0, 15)
        .map((sim) => docsWithContent[sim.index]);

+      logger.info(`Final sorted docs in 'balanced' mode: ${sortedDocs.length}`);
      return sortedDocs;
    }
+
+    // Fallback for modes not handled above (e.g. "quality"): log a warning and return the first 15 docs that have content
+    logger.warn(`Optimization mode "${optimizationMode}" not fully implemented. Returning docs as-is.`);
+    return docsWithContent.slice(0, 15);
  }

  private processDocs(docs: Document[]) {
@ -429,12 +388,16 @@ class MetaSearchAgent implements MetaSearchAgentType {
    stream: IterableReadableStream<StreamEvent>,
    emitter: eventEmitter,
  ) {
+    logger.info('Starting to stream chain events...');
    for await (const event of stream) {
+      // You can add debug logs here to see each event
+      // logger.info(`Event: ${JSON.stringify(event, null, 2)}`);
+
      if (
        event.event === 'on_chain_end' &&
        event.name === 'FinalSourceRetriever'
      ) {
-        ``;
+        logger.info('FinalSourceRetriever ended, sending docs to front-end...');
        emitter.emit(
          'data',
          JSON.stringify({ type: 'sources', data: event.data.output }),
@ -444,6 +407,7 @@ class MetaSearchAgent implements MetaSearchAgentType {
        event.event === 'on_chain_stream' &&
        event.name === 'FinalResponseGenerator'
      ) {
+        logger.info('Response chunk received, streaming to client...');
        emitter.emit(
          'data',
          JSON.stringify({ type: 'response', data: event.data.chunk }),
@ -453,9 +417,11 @@ class MetaSearchAgent implements MetaSearchAgentType {
        event.event === 'on_chain_end' &&
        event.name === 'FinalResponseGenerator'
      ) {
+        logger.info('FinalResponseGenerator ended, signaling end of stream.');
        emitter.emit('end');
      }
    }
+    logger.info('Finished streaming chain events.');
  }

  async searchAndAnswer(
@ -468,6 +434,11 @@ class MetaSearchAgent implements MetaSearchAgentType {
  ) {
    const emitter = new eventEmitter();

+    logger.info(`Received query: "${message}"`);
+    logger.info(`History length: ${history.length}`);
+    logger.info(`Optimization mode: ${optimizationMode}`);
+    logger.info(`File IDs: ${fileIds.join(', ') || 'None'}`);
+
    const answeringChain = await this.createAnsweringChain(
      llm,
      fileIds,
@ -475,17 +446,17 @@ class MetaSearchAgent implements MetaSearchAgentType {
      optimizationMode,
    );

-    const stream = answeringChain.streamEvents(
-      {
-        chat_history: history,
-        query: message,
-      },
-      {
-        version: 'v1',
-      },
-    );
-
-    this.handleStream(stream, emitter);
+    // Catches only synchronous throws from streamEvents(...) / handleStream(...); handleStream is not awaited, so a rejection from it is not caught here
+    try {
+      const stream = answeringChain.streamEvents(
+        { chat_history: history, query: message },
+        { version: 'v1' },
+      );
+      this.handleStream(stream, emitter);
+    } catch (error: any) {
+      logger.error(`Error in searchAndAnswer streaming: ${error.message}`);
+      emitter.emit('error', error);
+    }

    return emitter;
  }