feat(app): Introduce quality mode. Improve balanced mode by using Readability to get page content and pull relevant excerpts
feat(UI): Show progress during inference
feat(security): Don't show API keys in the UI any more
feat(models): Support Claude 4 Anthropic models
parent 288120dc1d
commit c47a630372
17 changed files with 2142 additions and 818 deletions
@@ -3,6 +3,9 @@ import { htmlToText } from 'html-to-text';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { Document } from '@langchain/core/documents';
import pdfParse from 'pdf-parse';
import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';
import fetch from 'node-fetch';

export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
  const splitter = new RecursiveCharacterTextSplitter();
@@ -97,3 +100,55 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
  return docs;
};

export const getWebContent = async (
  url: string,
  getHtml: boolean = false,
): Promise<Document | null> => {
  try {
    const response = await fetch(url, { timeout: 5000 });
    const html = await response.text();

    // Create a DOM from the fetched HTML
    const dom = new JSDOM(html, { url });

    // Get the title before we modify the DOM
    const originalTitle = dom.window.document.title;

    // Use Readability to parse the article content
    const reader = new Readability(dom.window.document, { charThreshold: 25 });
    const article = reader.parse();

    if (!article) {
      console.warn(`Failed to parse article content for URL: ${url}`);
      return null;
    }

    // Normalize the text content: trim each line and drop lines that are
    // empty or contain only whitespace.
    const normalizedText =
      article?.textContent
        ?.split('\n')
        .map((line) => line.trim())
        .filter((line) => line.length > 0)
        .join('\n') || '';

    // Create a Document with the parsed content
    return new Document({
      pageContent: normalizedText || '',
      metadata: {
        html: getHtml ? article.content : undefined,
        title: article.title || originalTitle,
        url: url,
        excerpt: article.excerpt || undefined,
        byline: article.byline || undefined,
        siteName: article.siteName || undefined,
        readingTime: article.length
          ? Math.ceil(article.length / 1000)
          : undefined,
      },
    });
  } catch (error) {
    console.error(`Error fetching/parsing URL ${url}:`); //, error);
    return null;
  }
};
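For context, here is a minimal, hypothetical usage sketch (not part of this commit) of how a caller such as the balanced-mode search flow might use the new getWebContent helper to collect titles and excerpts from a list of result URLs. The collectExcerpts name and the relative import path are assumptions for illustration.

import { Document } from '@langchain/core/documents';
import { getWebContent } from './documents'; // import path is an assumption

// Hypothetical caller: fetch each result URL, keep only pages Readability
// could parse, and collect title/URL/excerpt tuples for prompt context.
export const collectExcerpts = async (links: string[]) => {
  const docs = await Promise.all(links.map((url) => getWebContent(url)));

  return docs
    .filter((doc): doc is Document => doc !== null)
    .map((doc) => ({
      title: doc.metadata.title,
      url: doc.metadata.url,
      // Fall back to the start of the normalized text if Readability
      // did not produce an excerpt for this page.
      excerpt: doc.metadata.excerpt ?? doc.pageContent.slice(0, 300),
    }));
};

Because getWebContent returns null on fetch or parse failures, a caller can filter failed pages out in one pass rather than handling errors per URL.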