Perplexica/src/lib/utils/documents.ts

import { CheerioWebBaseLoader } from '@langchain/community/document_loaders/web/cheerio';
import { PlaywrightWebBaseLoader } from '@langchain/community/document_loaders/web/playwright';
import { Document } from '@langchain/core/documents';
import { Readability } from '@mozilla/readability';
import axios from 'axios';
import { JSDOM } from 'jsdom';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import fetch from 'node-fetch';
import pdfParse from 'pdf-parse';
import type { Browser, Page } from 'playwright';

export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
  const splitter = new RecursiveCharacterTextSplitter();

  let docs: Document[] = [];

  await Promise.all(
    links.map(async (link) => {
      link =
        link.startsWith('http://') || link.startsWith('https://')
          ? link
          : `https://${link}`;

      try {
        // First, check if it's a PDF
        const headRes = await axios.head(link);
        const isPdf = headRes.headers['content-type'] === 'application/pdf';

        if (isPdf) {
          // Handle PDF files
          const res = await axios.get(link, {
            responseType: 'arraybuffer',
          });

          const pdfText = await pdfParse(res.data);
          const parsedText = pdfText.text
            .replace(/(\r\n|\n|\r)/gm, ' ')
            .replace(/\s+/g, ' ')
            .trim();

          const splittedText = await splitter.splitText(parsedText);
          const title = 'PDF Document';

          const linkDocs = splittedText.map((text) => {
            return new Document({
              pageContent: text,
              metadata: {
                title: title,
                url: link,
              },
            });
          });

          docs.push(...linkDocs);
          return;
        }

        // Handle web pages using CheerioWebBaseLoader
        const loader = new CheerioWebBaseLoader(link, {
          selector: 'body',
        });

        const webDocs = await loader.load();

        if (webDocs && webDocs.length > 0) {
          const webDoc = webDocs[0];
          const splittedText = await splitter.splitText(webDoc.pageContent);

          const linkDocs = splittedText.map((text) => {
            return new Document({
              pageContent: text,
              metadata: {
                title: webDoc.metadata.title || link,
                url: link,
              },
            });
          });

          docs.push(...linkDocs);
        }
      } catch (err) {
        console.error(
          'An error occurred while getting documents from links: ',
          err,
        );
        docs.push(
          new Document({
            pageContent: `Failed to retrieve content from the link: ${err}`,
            metadata: {
              title: 'Failed to retrieve content',
              url: link,
            },
          }),
        );
      }
    }),
  );

  return docs;
};

/**
 * Fetches web content from a given URL using LangChain's PlaywrightWebBaseLoader.
 * Parses it using Readability for better content extraction.
 * Returns a Document object containing the parsed text and metadata.
 *
 * @param url - The URL to fetch content from.
 * @param getHtml - Whether to include the HTML content in the metadata.
 * @returns A Promise that resolves to a Document object or null if parsing fails.
 */
export const getWebContent = async (
  url: string,
  getHtml: boolean = false,
): Promise<Document | null> => {
  try {
    console.log(`Fetching content from URL: ${url}`);

    const loader = new PlaywrightWebBaseLoader(url, {
      launchOptions: {
        headless: true,
        timeout: 30000,
      },
      gotoOptions: {
        waitUntil: 'domcontentloaded',
        timeout: 10000,
      },
      async evaluate(page: Page, browser: Browser) {
        // Wait for the content to load properly
        await page.waitForLoadState('networkidle', { timeout: 10000 });

        // Allow some time for dynamic content to load
        await page.waitForTimeout(3000);

        return await page.content();
      },
    });

    const docs = await loader.load();

    if (!docs || docs.length === 0) {
      console.warn(`Failed to load content for URL: ${url}`);
      return null;
    }

    const doc = docs[0];

    const dom = new JSDOM(doc.pageContent, { url });
    const reader = new Readability(dom.window.document, { charThreshold: 25 });
    const article = reader.parse();

    // Normalize the text content
    const normalizedText =
      article?.textContent
        ?.split('\n')
        .map((line: string) => line.trim())
        .filter((line: string) => line.length > 0)
        .join('\n') || '';

    const returnDoc = new Document({
      pageContent: normalizedText,
      metadata: {
        title: article?.title || doc.metadata.title || '',
        url: url,
        html: getHtml ? article?.content : undefined,
      },
    });

    console.log(
      `Got content with LangChain Playwright, URL: ${url}, Text Length: ${returnDoc.pageContent.length}`,
    );

    return returnDoc;
  } catch (error) {
    console.error(`Error fetching/parsing URL ${url}:`, error);

    // Fallback to CheerioWebBaseLoader for simpler content extraction
    try {
      console.log(`Fallback to Cheerio for URL: ${url}`);
      const cheerioLoader = new CheerioWebBaseLoader(url);
      const docs = await cheerioLoader.load();

      if (docs && docs.length > 0) {
        return docs[0];
      }
    } catch (fallbackError) {
      console.error(
        `Cheerio fallback also failed for URL ${url}:`,
        fallbackError,
      );
    }

    return null;
  }
};

/**
 * Fetches web content from a given URL using CheerioWebBaseLoader for faster, lighter extraction.
 * Returns a Document object containing the parsed text and metadata.
 *
 * @param {string} url - The URL to fetch content from.
 * @param {boolean} getHtml - Whether to include the HTML content in the metadata.
 * @returns {Promise<Document | null>} A Promise that resolves to a Document object or null if parsing fails.
 */
export const getWebContentLite = async (
  url: string,
  getHtml: boolean = false,
): Promise<Document | null> => {
  try {
    console.log(`Fetching content (lite) from URL: ${url}`);

    const loader = new CheerioWebBaseLoader(url);

    const docs = await loader.load();

    if (!docs || docs.length === 0) {
      console.warn(`Failed to load content for URL: ${url}`);
      return null;
    }

    const doc = docs[0];

    // Try to use Readability for better content extraction if possible
    if (getHtml) {
      try {
        const response = await fetch(url, { timeout: 5000 });
        const html = await response.text();
        const dom = new JSDOM(html, { url });
        const originalTitle = dom.window.document.title;
        const reader = new Readability(dom.window.document, {
          charThreshold: 25,
        });
        const article = reader.parse();

        if (article) {
          const normalizedText =
            article.textContent
              ?.split('\n')
              .map((line) => line.trim())
              .filter((line) => line.length > 0)
              .join('\n') || '';

          return new Document({
            pageContent: normalizedText,
            metadata: {
              html: article.content,
              title: article.title || originalTitle,
              url: url,
            },
          });
        }
      } catch (readabilityError) {
        console.warn(
          `Readability parsing failed for ${url}, using Cheerio fallback`,
        );
      }
    }

    // Normalize the text content from Cheerio
    const normalizedText = doc.pageContent
      .split('\n')
      .map((line: string) => line.trim())
      .filter((line: string) => line.length > 0)
      .join('\n');

    return new Document({
      pageContent: normalizedText,
      metadata: {
        title: doc.metadata.title || 'Web Page',
        url: url,
        html: getHtml ? doc.pageContent : undefined,
      },
    });
  } catch (error) {
    console.error(`Error fetching/parsing URL ${url}:`, error);
    return null;
  }
};