diff --git a/src/pages/api/mediaHandler.ts b/src/pages/api/mediaHandler.ts
index b1fbf3f..5e9069d 100644
--- a/src/pages/api/mediaHandler.ts
+++ b/src/pages/api/mediaHandler.ts
@@ -1,10 +1,152 @@
-import { NextApiRequest, NextApiResponse } from 'next';
+import { openaiWhisper } from '@/features/openaiWhisper/openaiWhisper';
+import { whispercpp } from '@/features/whispercpp/whispercpp';
+import { askVisionLLM } from '@/utils/askLlm';
+import { TimestampedPrompt } from '@/features/amicaLife/eventHandler';
+import { config as configs } from '@/utils/config';
+import { randomBytes } from 'crypto';
+import sharp from 'sharp';
+import { NextApiResponse } from 'next';
+import { handleConfig } from '@/features/externalAPI/externalAPI';
+import { NextRequest } from 'next/server';
 
-export default function handler(req: NextApiRequest, res: NextApiResponse) {
-  if (req.method === 'GET') {
-    res.status(200).json({ message: 'Media Handler' });
+interface ApiResponse {
+  sessionId?: string;
+  outputType?: string;
+  response?: string | TimestampedPrompt[];
+  error?: string;
+}
+
+// Configure body parsing: disable only for multipart/form-data
+export const config = {
+  api: {
+    bodyParser: false,
+  },
+};
+
+const generateSessionId = (sessionId?: string) => sessionId || randomBytes(8).toString('hex');
+
+// Helper for setting error responses
+const sendError = (res: NextApiResponse, sessionId: string, message: string, status = 400) =>
+  res.status(status).json({ sessionId, error: message });
+
+// Main API handler
+export default async function handler(req: NextRequest, res: NextApiResponse) {
+  if (configs('external_api_enabled') !== 'true') {
+    return sendError(res, '', 'API is currently disabled.', 503);
+  }
+
+  const currentSessionId = generateSessionId();
+  const timestamp = new Date().toISOString();
+
+  const contentType = req.headers.get('content-type');
+
+  if (contentType?.includes('multipart/form-data')) {
+    // Handle form-data using NextRequest.formData() method
+    const formData = await req.formData();
+
+    const fields: any = {};
+    const files: any = {};
+
+    formData.forEach((value, key) => {
+      if (value instanceof File) {
+        files[key] = value;
+      } else {
+        fields[key] = value;
+      }
+    });
+
+    handleRequest(currentSessionId, timestamp, fields, files, res);
   } else {
-    res.setHeader('Allow', ['GET']);
-    res.status(405).end(`Method ${req.method} Not Allowed`);
+    return sendError(res, currentSessionId, 'Incorrect type');
+  }
+}
+
+async function handleRequest(
+  sessionId: string,
+  timestamp: string,
+  fields: any,
+  files: any,
+  res: NextApiResponse
+) {
+  let response: string | undefined | TimestampedPrompt[];
+  let outputType: string | undefined;
+
+  const inputType = fields.inputType;
+  const payload = files?.payload;
+
+  // Syncing config to be accessible from server side
+  await handleConfig('fetch');
+
+  try {
+    switch (inputType) {
+      case 'Voice':
+        if (payload) {
+          const audioFile = new File([payload], 'input.wav', { type: 'audio/wav' });
+          response = await transcribeVoice(audioFile);
+          outputType = 'Text';
+        } else {
+          throw new Error('Voice input file missing.');
+        }
+        break;
+
+      case 'Image':
+        if (payload) {
+          const imageBuffer = Buffer.from(await payload.arrayBuffer());
+          response = await processImage(imageBuffer);
+          outputType = 'Text';
+        } else {
+          throw new Error('Image input file missing.');
+        }
+        break;
+
+      default:
+        return sendError(res, sessionId, 'Unknown input type.');
+    }
+
+    res.status(200).json({ sessionId, outputType, response });
+  } catch (error) {
+    console.error('Handler error:', error);
+    return sendError(res, sessionId, 'An error occurred while processing the request.', 500);
+  }
+}
+
+// Transcribe voice input to text
+async function transcribeVoice(audio: File): Promise<string | undefined> {
+  try {
+    switch (configs('stt_backend')) {
+      case 'whisper_openai': {
+        const result = await openaiWhisper(audio);
+        return result?.text; // Assuming the transcription is in result.text
+      }
+      case 'whispercpp': {
+        const result = await whispercpp(audio);
+        return result?.text; // Assuming the transcription is in result.text
+      }
+      default:
+        throw new Error('Invalid STT backend configuration.');
+    }
+  } catch (error) {
+    console.error('Transcription error:', error);
+    throw new Error('Failed to transcribe audio.');
+  }
+}
+
+// Process image using Vision LLM
+async function processImage(payload: Buffer): Promise<string> {
+  const jpegImg = await convertToJpeg(payload);
+  if (!jpegImg) {
+    throw new Error('Failed to process image');
+  }
+  return await askVisionLLM(jpegImg);
+}
+
+// Convert image to JPEG and return as base64
+async function convertToJpeg(payload: Buffer): Promise<string | null> {
+  try {
+    const jpegBuffer = await sharp(payload).jpeg().toBuffer();
+    return jpegBuffer.toString('base64');
+  } catch (error) {
+    console.error('Error converting image to .jpeg:', error);
+    return null;
   }
 }
diff --git a/src/utils/askLlm.ts b/src/utils/askLlm.ts
index e2bb1da..d86782c 100644
--- a/src/utils/askLlm.ts
+++ b/src/utils/askLlm.ts
@@ -3,11 +3,10 @@ import { Chat } from "@/features/chat/chat";
 
 import { getEchoChatResponseStream } from "@/features/chat/echoChat";
 import { getOpenAiChatResponseStream } from "@/features/chat/openAiChat";
-import { getLlamaCppChatResponseStream } from "@/features/chat/llamaCppChat";
+import { getLlamaCppChatResponseStream, getLlavaCppChatResponse } from "@/features/chat/llamaCppChat";
 import { getWindowAiChatResponseStream } from "@/features/chat/windowAiChat";
-import { getOllamaChatResponseStream } from "@/features/chat/ollamaChat";
+import { getOllamaChatResponseStream, getOllamaVisionChatResponse } from "@/features/chat/ollamaChat";
 import { getKoboldAiChatResponseStream } from "@/features/chat/koboldAiChat";
-import { getOpenRouterChatResponseStream } from "@/features/chat/openRouterChat";
 
 import { config } from "@/utils/config";
 import { processResponse } from "@/utils/processResponse";
@@ -51,8 +50,6 @@ export async function askLLM(
         return getOllamaChatResponseStream(messages);
       case "koboldai":
         return getKoboldAiChatResponseStream(messages);
-      case "openrouter":
-        return getOpenRouterChatResponseStream(messages);
       default:
         return getEchoChatResponseStream(messages);
     }
@@ -63,15 +60,15 @@
   } catch (e: any) {
     const errMsg = `Error: ${e.toString()}`;
     console.error(errMsg);
-    alert.error("Failed to get subconcious subroutine response", errMsg);
+    alert.error("Failed to get LLM response", errMsg);
     return errMsg;
   }
 
   const stream = streams[streams.length - 1];
   if (!stream) {
-    const errMsg = "Error: Null subconcious subroutine stream encountered.";
+    const errMsg = "Error: Null LLM stream encountered.";
     console.error(errMsg);
-    alert.error("Null subconcious subroutine stream encountered", errMsg);
+    alert.error("Null LLM stream encountered", errMsg);
     return errMsg;
   }
 
@@ -84,7 +81,7 @@
   chat !== null ?
     currentStreamIdx = chat.currentStreamIdx : null;
   setChatProcessing(true);
-  console.time("Subconcious subroutine stream processing");
+  console.time("LLM stream processing");
   const reader = stream.getReader();
   readers.push(reader);
   let receivedMessage = "";
@@ -114,7 +111,7 @@
 
       receivedMessage += value;
       receivedMessage = receivedMessage.trimStart();
-      
+
       if (chat !== null) {
 
         const proc = processResponse({
@@ -135,16 +132,16 @@
               screenplay: aiTalks[0],
               streamIdx: currentStreamIdx,
             });
-            
+
             if (! firstSentenceEncountered) {
               console.timeEnd('performance_time_to_first_sentence');
               firstSentenceEncountered = true;
             }
-            
+
             return false; // normal processing
           }
         });
-        
+
         sentences = proc.sentences;
         aiTextLog = proc.aiTextLog;
         receivedMessage = proc.receivedMessage;
@@ -152,9 +149,9 @@
         rolePlay = proc.rolePlay;
         if (proc.shouldBreak) {
           break;
-        } 
+        }
       }
-      
+
     }
   } catch (e: any) {
     const errMsg = e.toString();
@@ -163,7 +160,7 @@
     if (!reader.closed) {
       reader.releaseLock();
     }
-    console.timeEnd("Subconcious subroutine stream processing");
+    console.timeEnd("LLM stream processing");
     if (currentStreamIdx === currentStreamIdx) {
       setChatProcessing(false);
     }
@@ -172,4 +169,41 @@
   return result;
 }
 
+export async function askVisionLLM(
+  imageData: string,
+): Promise<string> {
+  try {
+    const visionBackend = config("vision_backend");
+
+    console.debug("vision_backend", visionBackend);
+
+    const messages: Message[] = [
+      { role: "system", content: config("vision_system_prompt") },
+      {
+        role: 'user',
+        content: "Describe the image as accurately as possible"
+      },
+    ];
+
+    let res = "";
+    if (visionBackend === "vision_llamacpp") {
+      res = await getLlavaCppChatResponse(messages, imageData);
+    } else if (visionBackend === "vision_ollama") {
+      res = await getOllamaVisionChatResponse(messages, imageData);
+    } else {
+      console.warn("vision_backend not supported", visionBackend);
+      return "vision_backend not supported";
+    }
+
+    let content = `This is a picture I just took from my webcam (described between [[ and ]] ): [[${res}]] Please respond accordingly and as if it were just sent and as though you can see it.`;
+    const result = await askLLM(config("system_prompt"), content, null);
+
+    return result;
+  } catch (e: any) {
+    console.error("getVisionResponse", e.toString());
+    // alert?.error("Failed to get vision response", e.toString());
+    return "Failed to get vision response";
+  }
+}
+
 export default askLLM;
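
Not part of the patch: a minimal client-side sketch of how the new endpoint could be exercised once this change lands. The route path, the inputType/payload field names, and the JSON response shape are taken from the handler above; the POST method, the fetch wrapper, and the uploaded File are illustrative assumptions, and external_api_enabled must be 'true' for the handler to accept the request.

// Illustrative usage sketch only (not part of the diff above).
async function describeImage(file: File): Promise<string> {
  const form = new FormData();
  form.append('inputType', 'Image'); // 'Voice' is the other branch handleRequest() accepts
  form.append('payload', file);      // read as files.payload by the handler

  const res = await fetch('/api/mediaHandler', { method: 'POST', body: form });
  const data = await res.json();     // { sessionId, outputType, response } or { sessionId, error }

  if (!res.ok || data.error) {
    throw new Error(data.error ?? `Request failed with status ${res.status}`);
  }
  return data.response as string;    // text produced by the vision backend
}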