feat(app): Introduce quality mode. Improve balanced mode by using Readability to get page content and pull relevant excerpts
feat(UI): Show progress during inference
feat(security): Don't show API keys in the UI any more
feat(models): Support Claude 4 Anthropic models
parent 288120dc1d
commit c47a630372
17 changed files with 2142 additions and 818 deletions
@@ -3,6 +3,9 @@ import { htmlToText } from 'html-to-text';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { Document } from '@langchain/core/documents';
import pdfParse from 'pdf-parse';
import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';
import fetch from 'node-fetch';

export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
  const splitter = new RecursiveCharacterTextSplitter();
@@ -97,3 +100,55 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
  return docs;
};

export const getWebContent = async (
  url: string,
  getHtml: boolean = false,
): Promise<Document | null> => {
  try {
    const response = await fetch(url, { timeout: 5000 });
    const html = await response.text();

    // Create a DOM from the fetched HTML
    const dom = new JSDOM(html, { url });

    // Get the title before we modify the DOM
    const originalTitle = dom.window.document.title;

    // Use Readability to parse the article content
    const reader = new Readability(dom.window.document, { charThreshold: 25 });
    const article = reader.parse();

    if (!article) {
      console.warn(`Failed to parse article content for URL: ${url}`);
      return null;
    }

    // Normalize the text content: trim each line and drop lines that are
    // empty or contain only whitespace.
    const normalizedText =
      article?.textContent
        ?.split('\n')
        .map((line) => line.trim())
        .filter((line) => line.length > 0)
        .join('\n') || '';

    // Create a Document with the parsed content
    return new Document({
      pageContent: normalizedText || '',
      metadata: {
        html: getHtml ? article.content : undefined,
        title: article.title || originalTitle,
        url: url,
        excerpt: article.excerpt || undefined,
        byline: article.byline || undefined,
        siteName: article.siteName || undefined,
        readingTime: article.length
          ? Math.ceil(article.length / 1000)
          : undefined,
      },
    });
  } catch (error) {
    console.error(`Error fetching/parsing URL ${url}:`); //, error);
    return null;
  }
};
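For context, here is a minimal, hypothetical usage sketch (not part of this commit) of how a caller such as the balanced-mode search flow might use the new getWebContent helper to collect titles and excerpts from a list of result URLs. The collectExcerpts name and the relative import path are assumptions for illustration.

import { Document } from '@langchain/core/documents';
import { getWebContent } from './documents'; // import path is an assumption

// Hypothetical caller: fetch each result URL, keep only pages Readability
// could parse, and collect title/URL/excerpt tuples for prompt context.
export const collectExcerpts = async (links: string[]) => {
  const docs = await Promise.all(links.map((url) => getWebContent(url)));

  return docs
    .filter((doc): doc is Document => doc !== null)
    .map((doc) => ({
      title: doc.metadata.title,
      url: doc.metadata.url,
      // Fall back to the start of the normalized text if Readability
      // did not produce an excerpt for this page.
      excerpt: doc.metadata.excerpt ?? doc.pageContent.slice(0, 300),
    }));
};

Because getWebContent returns null on fetch or parse failures, a caller can filter failed pages out in one pass rather than handling errors per URL.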