
added mediaHandler
flukexp committed Jan 18, 2025
1 parent 45b9ee7 commit d96c12e
Showing 2 changed files with 198 additions and 22 deletions.
154 changes: 148 additions & 6 deletions src/pages/api/mediaHandler.ts
@@ -1,10 +1,152 @@
import { NextApiRequest, NextApiResponse } from 'next';
import { openaiWhisper } from '@/features/openaiWhisper/openaiWhisper';
import { whispercpp } from '@/features/whispercpp/whispercpp';
import { askVisionLLM } from '@/utils/askLlm';
import { TimestampedPrompt } from '@/features/amicaLife/eventHandler';
import { config as configs } from '@/utils/config';
import { randomBytes } from 'crypto';
import sharp from 'sharp';
import { NextApiResponse } from 'next';
import { handleConfig } from '@/features/externalAPI/externalAPI';
import { NextRequest } from 'next/server';

export default function handler(req: NextApiRequest, res: NextApiResponse) {
if (req.method === 'GET') {
res.status(200).json({ message: 'Media Handler' });
interface ApiResponse {
sessionId?: string;
outputType?: string;
response?: string | TimestampedPrompt[];
error?: string;
}

// Configure body parsing: disable only for multipart/form-data
export const config = {
api: {
bodyParser: false,
},
};

const generateSessionId = (sessionId?: string) => sessionId || randomBytes(8).toString('hex');

// Helper for setting error responses
const sendError = (res: NextApiResponse<ApiResponse>, sessionId: string, message: string, status = 400) =>
res.status(status).json({ sessionId, error: message });

// Main API handler
export default async function handler(req: NextRequest, res: NextApiResponse<ApiResponse>) {
if (configs('external_api_enabled') !== 'true') {
return sendError(res, '', 'API is currently disabled.', 503);
}

const currentSessionId = generateSessionId();
const timestamp = new Date().toISOString();

const contentType = req.headers.get('content-type');

if (contentType?.includes('multipart/form-data')) {
// Handle form-data using NextRequest.formData() method
const formData = await req.formData();

const fields: any = {};
const files: any = {};

formData.forEach((value, key) => {
if (value instanceof File) {
files[key] = value;
} else {
fields[key] = value;
}
});

handleRequest(currentSessionId, timestamp, fields, files, res);
} else {
res.setHeader('Allow', ['GET']);
res.status(405).end(`Method ${req.method} Not Allowed`);
return sendError(res, currentSessionId, 'Incorrect type');
}
}

async function handleRequest(
sessionId: string,
timestamp: string,
fields: any,
files: any,
res: NextApiResponse<ApiResponse>
) {
let response: string | undefined | TimestampedPrompt[];
let outputType: string | undefined;

const inputType = fields.inputType[0];
const payload = files?.payload;

// Syncing config to be accessible from server side
await handleConfig('fetch');

try {
switch (inputType) {
case 'Voice':
if (payload) {
const audioFile = new File([payload], 'input.wav', { type: 'audio/wav' });
response = await transcribeVoice(audioFile);
outputType = 'Text';
} else {
throw new Error('Voice input file missing.');
}
break;

case 'Image':
if (payload) {
const imageBuffer = Buffer.from(await payload.arrayBuffer());
response = await processImage(imageBuffer);
outputType = 'Text';
} else {
throw new Error('Image input file missing.');
}
break;

default:
return sendError(res, sessionId, 'Unknown input type.');
}

res.status(200).json({ sessionId, outputType, response });
} catch (error) {
console.error('Handler error:', error);
return sendError(res, sessionId, 'An error occurred while processing the request.', 500);
}
}

// Transcribe voice input to text
async function transcribeVoice(audio: File): Promise<string> {
try {
switch (configs('stt_backend')) {
case 'whisper_openai': {
const result = await openaiWhisper(audio);
return result?.text; // Assuming the transcription is in result.text
}
case 'whispercpp': {
const result = await whispercpp(audio);
return result?.text; // Assuming the transcription is in result.text
}
default:
throw new Error('Invalid STT backend configuration.');
}
} catch (error) {
console.error('Transcription error:', error);
throw new Error('Failed to transcribe audio.');
}
}

// Process image using Vision LLM
async function processImage(payload: Buffer): Promise<string> {
const jpegImg = await convertToJpeg(payload);
if (!jpegImg) {
throw new Error('Failed to process image');
}
return await askVisionLLM(jpegImg);
}

// Convert image to JPEG and return as base64
async function convertToJpeg(payload: Buffer): Promise<string | null> {
try {
const jpegBuffer = await sharp(payload).jpeg().toBuffer();
return jpegBuffer.toString('base64');
} catch (error) {
console.error('Error converting image to .jpeg:', error);
return null;
}
}
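
The new endpoint accepts multipart/form-data with an inputType field ('Voice' or 'Image') plus a payload file, and replies with JSON matching the ApiResponse shape above. A minimal client-side sketch, assuming the handler is reachable at /api/mediaHandler on a running instance with external_api_enabled set to 'true'; the describeWebcamFrame helper is hypothetical:

async function describeWebcamFrame(frame: Blob): Promise<string> {
  const form = new FormData();
  form.append('inputType', 'Image');          // or 'Voice' with a WAV payload
  form.append('payload', frame, 'frame.png');

  const res = await fetch('/api/mediaHandler', { method: 'POST', body: form });
  // Success: { sessionId, outputType, response }; failure: { sessionId, error }
  const data = await res.json();
  if (data.error) throw new Error(data.error);
  return data.response as string;
}
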
66 changes: 50 additions & 16 deletions src/utils/askLlm.ts
@@ -3,11 +3,10 @@ import { Chat } from "@/features/chat/chat";

import { getEchoChatResponseStream } from "@/features/chat/echoChat";
import { getOpenAiChatResponseStream } from "@/features/chat/openAiChat";
import { getLlamaCppChatResponseStream } from "@/features/chat/llamaCppChat";
import { getLlamaCppChatResponseStream, getLlavaCppChatResponse } from "@/features/chat/llamaCppChat";
import { getWindowAiChatResponseStream } from "@/features/chat/windowAiChat";
import { getOllamaChatResponseStream } from "@/features/chat/ollamaChat";
import { getOllamaChatResponseStream, getOllamaVisionChatResponse } from "@/features/chat/ollamaChat";
import { getKoboldAiChatResponseStream } from "@/features/chat/koboldAiChat";
import { getOpenRouterChatResponseStream } from "@/features/chat/openRouterChat";

import { config } from "@/utils/config";
import { processResponse } from "@/utils/processResponse";
@@ -51,8 +50,6 @@ export async function askLLM(
return getOllamaChatResponseStream(messages);
case "koboldai":
return getKoboldAiChatResponseStream(messages);
case "openrouter":
return getOpenRouterChatResponseStream(messages);
default:
return getEchoChatResponseStream(messages);
}
@@ -63,15 +60,15 @@ export async function askLLM(
} catch (e: any) {
const errMsg = `Error: ${e.toString()}`;
console.error(errMsg);
alert.error("Failed to get subconcious subroutine response", errMsg);
alert.error("Failed to get LLM response", errMsg);
return errMsg;
}

const stream = streams[streams.length - 1];
if (!stream) {
const errMsg = "Error: Null subconcious subroutine stream encountered.";
const errMsg = "Error: Null LLM stream encountered.";
console.error(errMsg);
alert.error("Null subconcious subroutine stream encountered", errMsg);
alert.error("Null LLM stream encountered", errMsg);
return errMsg;
}

@@ -84,7 +81,7 @@ export async function askLLM(
chat !== null ? currentStreamIdx = chat.currentStreamIdx : null;
setChatProcessing(true);

console.time("Subconcious subroutine stream processing");
console.time("LLM stream processing");
const reader = stream.getReader();
readers.push(reader);
let receivedMessage = "";
@@ -114,7 +111,7 @@ export async function askLLM(

receivedMessage += value;
receivedMessage = receivedMessage.trimStart();


if (chat !== null) {
const proc = processResponse({
@@ -135,26 +132,26 @@ export async function askLLM(
screenplay: aiTalks[0],
streamIdx: currentStreamIdx,
});

if (! firstSentenceEncountered) {
console.timeEnd('performance_time_to_first_sentence');
firstSentenceEncountered = true;
}

return false; // normal processing
}
});

sentences = proc.sentences;
aiTextLog = proc.aiTextLog;
receivedMessage = proc.receivedMessage;
tag = proc.tag;
rolePlay = proc.rolePlay;
if (proc.shouldBreak) {
break;
}
}
}

}
} catch (e: any) {
const errMsg = e.toString();
@@ -163,7 +160,7 @@ export async function askLLM(
if (!reader.closed) {
reader.releaseLock();
}
console.timeEnd("Subconcious subroutine stream processing");
console.timeEnd("LLM stream processing");
if (currentStreamIdx === currentStreamIdx) {
setChatProcessing(false);
}
@@ -172,4 +169,41 @@ export async function askLLM(
return result;
}

export async function askVisionLLM(
imageData: string,
): Promise<string> {
try {
const visionBackend = config("vision_backend");

console.debug("vision_backend", visionBackend);

const messages: Message[] = [
{ role: "system", content: config("vision_system_prompt") },
{
role: 'user',
content: "Describe the image as accurately as possible"
},
];

let res = "";
if (visionBackend === "vision_llamacpp") {
res = await getLlavaCppChatResponse(messages, imageData);
} else if (visionBackend === "vision_ollama") {
res = await getOllamaVisionChatResponse(messages, imageData);
} else {
console.warn("vision_backend not supported", visionBackend);
return "vision_backend not supported";
}

let content = `This is a picture I just took from my webcam (described between [[ and ]] ): [[${res}]] Please respond accordingly and as if it were just sent and as though you can see it.`;
const result = await askLLM(config("system_prompt"), content, null);

return result;
} catch (e: any) {
console.error("getVisionResponse", e.toString());
// alert?.error("Failed to get vision response", e.toString());
return "Failed to get vision response";
}
}

export default askLLM;
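
A minimal sketch of calling the new askVisionLLM export directly, mirroring how processImage() in mediaHandler.ts prepares its input. The describeLocalImage helper and the local-file read are hypothetical, and it assumes vision_backend is configured for either llama.cpp or Ollama:

import { readFile } from 'fs/promises';
import sharp from 'sharp';
import { askVisionLLM } from '@/utils/askLlm';

async function describeLocalImage(path: string): Promise<string> {
  // askVisionLLM expects a base64-encoded JPEG, as produced by convertToJpeg() in mediaHandler.ts
  const jpeg = await sharp(await readFile(path)).jpeg().toBuffer();
  return askVisionLLM(jpeg.toString('base64'));
}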
