import { BaseMessage } from '@langchain/core/messages';
import {
PromptTemplate,
ChatPromptTemplate,
MessagesPlaceholder,
} from '@langchain/core/prompts';
import {
RunnableSequence,
RunnableMap,
RunnableLambda,
} from '@langchain/core/runnables';
import { StringOutputParser } from '@langchain/core/output_parsers';
import { Document } from '@langchain/core/documents';
import { searchSearxng } from '../lib/searxng';
import type { StreamEvent } from '@langchain/core/tracers/log_stream';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { Embeddings } from '@langchain/core/embeddings';
import formatChatHistoryAsString from '../utils/formatHistory';
import eventEmitter from 'events';
import computeSimilarity from '../utils/computeSimilarity';
import logger from '../utils/logger';
import { IterableReadableStream } from '@langchain/core/utils/stream';
const basicYoutubeSearchRetrieverPrompt = `
You will be given a conversation below and a follow-up question. Rephrase the follow-up question if needed so it can be used by the LLM to search YouTube for relevant videos.
If the question is a writing task or a simple greeting (like "hi" or "hello"), return \`not_needed\`.
Example:
1. Follow-up question: How does an A.C work?
Rephrased: A.C working
2. Follow-up question: Linear algebra explanation video
Rephrased: What is linear algebra?
3. Follow-up question: What is the theory of relativity?
Rephrased: What is the theory of relativity?
Conversation:
{chat_history}
Follow-up question: {query}
Rephrased question:
`;
const basicYoutubeSearchResponsePrompt = `
You are Perplexica, an AI model expert in searching YouTube and summarizing video content. You are set to 'YouTube' mode, meaning you will search for videos and use their transcripts to generate informative answers.
Your response should be based on the context provided (descriptions of video content), and you must provide relevant and accurate information. You should use an unbiased, journalistic tone, without repeating text from the search results. Do not tell the user to open links or visit websites; the answer should be directly in your response. If the user asks for links, you may provide them.
Your response should be medium to long in length, offering valuable and detailed insights. Use bullet points if necessary and be sure to structure the response clearly. Be informative, ensuring the answer addresses the query fully.
Cite the content by using [number] notation at the end of sentences. Each relevant context should be cited to indicate where the information comes from. If multiple search results support the same sentence, use different citation numbers for each relevant source.
Anything inside the \`context\` HTML block is for your knowledge and not shared with the user. Answer based on this information and cite accordingly.
{context}
If no relevant information is found in the search results, say: 'Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?'
Today's date is ${new Date().toISOString()}
`;
const strParser = new StringOutputParser();
const handleStream = async (
stream: IterableReadableStream,
emitter: eventEmitter,
) => {
for await (const event of stream) {
if (
event.event === 'on_chain_end' &&
event.name === 'FinalSourceRetriever'
) {
emitter.emit(
'data',
JSON.stringify({ type: 'sources', data: event.data.output }),
);
}
if (
event.event === 'on_chain_stream' &&
event.name === 'FinalResponseGenerator'
) {
emitter.emit(
'data',
JSON.stringify({ type: 'response', data: event.data.chunk }),
);
}
if (
event.event === 'on_chain_end' &&
event.name === 'FinalResponseGenerator'
) {
emitter.emit('end');
}
}
};
type BasicChainInput = {
chat_history: BaseMessage[];
query: string;
};
const createBasicYoutubeSearchRetrieverChain = (llm: BaseChatModel) => {
return RunnableSequence.from([
PromptTemplate.fromTemplate(basicYoutubeSearchRetrieverPrompt),
llm,
strParser,
RunnableLambda.from(async (input: string) => {
if (input === 'not_needed') {
return { query: '', docs: [] };
}
const res = await searchSearxng(input, {
language: 'en',
engines: ['youtube'],
});
const documents = res.results.map(
(result) =>
new Document({
pageContent: result.content ? result.content : result.title,
metadata: {
title: result.title,
url: result.url,
...(result.img_src && { img_src: result.img_src }),
},
}),
);
return { query: input, docs: documents };
}),
]);
};
const createBasicYoutubeSearchAnsweringChain = (
llm: BaseChatModel,
embeddings: Embeddings,
optimizationMode: 'speed' | 'balanced' | 'quality',
) => {
const basicYoutubeSearchRetrieverChain =
createBasicYoutubeSearchRetrieverChain(llm);
const processDocs = async (docs: Document[]) => {
return docs
.map((_, index) => `${index + 1}. ${docs[index].pageContent}`)
.join('\n');
};
const rerankDocs = async ({
query,
docs,
}: {
query: string;
docs: Document[];
}) => {
if (docs.length === 0) {
return docs;
}
const docsWithContent = docs.filter(
(doc) => doc.pageContent && doc.pageContent.length > 0,
);
if (optimizationMode === 'speed') {
return docsWithContent.slice(0, 15);
} else {
const [docEmbeddings, queryEmbedding] = await Promise.all([
embeddings.embedDocuments(
docsWithContent.map((doc) => doc.pageContent),
),
embeddings.embedQuery(query),
]);
const similarity = docEmbeddings.map((docEmbedding, i) => {
const sim = computeSimilarity(queryEmbedding, docEmbedding);
return {
index: i,
similarity: sim,
};
});
const sortedDocs = similarity
.filter((sim) => sim.similarity > 0.3)
.sort((a, b) => b.similarity - a.similarity)
.slice(0, 15)
.map((sim) => docsWithContent[sim.index]);
return sortedDocs;
}
};
return RunnableSequence.from([
RunnableMap.from({
query: (input: BasicChainInput) => input.query,
chat_history: (input: BasicChainInput) => input.chat_history,
context: RunnableSequence.from([
(input) => ({
query: input.query,
chat_history: formatChatHistoryAsString(input.chat_history),
}),
basicYoutubeSearchRetrieverChain
.pipe(rerankDocs)
.withConfig({
runName: 'FinalSourceRetriever',
})
.pipe(processDocs),
]),
}),
ChatPromptTemplate.fromMessages([
['system', basicYoutubeSearchResponsePrompt],
new MessagesPlaceholder('chat_history'),
['user', '{query}'],
]),
llm,
strParser,
]).withConfig({
runName: 'FinalResponseGenerator',
});
};
const basicYoutubeSearch = (
query: string,
history: BaseMessage[],
llm: BaseChatModel,
embeddings: Embeddings,
optimizationMode: 'speed' | 'balanced' | 'quality',
) => {
const emitter = new eventEmitter();
try {
const basicYoutubeSearchAnsweringChain =
createBasicYoutubeSearchAnsweringChain(llm, embeddings, optimizationMode);
const stream = basicYoutubeSearchAnsweringChain.streamEvents(
{
chat_history: history,
query: query,
},
{
version: 'v1',
},
);
handleStream(stream, emitter);
} catch (err) {
emitter.emit(
'error',
JSON.stringify({ data: 'An error has occurred please try again later' }),
);
logger.error(`Error in youtube search: ${err}`);
}
return emitter;
};
const handleYoutubeSearch = (
message: string,
history: BaseMessage[],
llm: BaseChatModel,
embeddings: Embeddings,
optimizationMode: 'speed' | 'balanced' | 'quality',
) => {
const emitter = basicYoutubeSearch(
message,
history,
llm,
embeddings,
optimizationMode,
);
return emitter;
};
export default handleYoutubeSearch;