✨ feat: Enhance OpenAI speech-to-text functionality and modify API endpoints
This commit enhances the OpenAI speech-to-text functionality. It adds a new hook for recording audio, reworks the existing OpenAI STT hook around a pluggable fetcher, and updates the API endpoints and proxy URLs for Microsoft Speech, Azure Speech, and OpenAI. It also deletes the "mimeType.ts" file in favor of a getRecordMineType utility, updates the affected imports and function names, and revises the error handling and stop behavior.

Changes:
- Enhancements to the OpenAI speech-to-text functionality
- A new useOpenaiSTTWithRecord hook for recording audio
- A rework of the existing useOpenaiSTT hook into a thin SWR wrapper with a pluggable fetcher
- API endpoints and proxy URLs for Microsoft Speech, Azure Speech, and OpenAI now honor NEXT_PUBLIC_ environment variables
- Deletion of the "mimeType.ts" file in favor of the getRecordMineType utility
- Updated import statements and function names across the affected modules
- Revised error handling and stop behavior in the recording hooks
canisminor1990 committed Nov 13, 2023
1 parent ef43fda commit 61126aa
Showing 13 changed files with 213 additions and 117 deletions.
29 changes: 22 additions & 7 deletions src/const/api.ts
@@ -3,16 +3,31 @@ import urlJoin from 'url-join';
 export const MICROSOFT_SPPECH_URL =
   'https://southeastasia.api.speech.microsoft.com/accfreetrial/texttospeech/acc/v3.0-beta1/vcg/speak';
 export const MICROSOFT_SPEECH_PROXY_URL =
-  process.env.MICROSOFT_SPEECH_PROXY_URL || '/api/microsoft-speech';
-export const AZURE_SPEECH_PROXY_URL = process.env.AZURE_SPEECH_PROXY_URL || '/api/azure-speech';
-export const AZURE_SPEECH_KEY = process.env.AZURE_SPEECH_KEY || '';
-export const AZURE_SPEECH_REGION = process.env.AZURE_SPEECH_REGION || '';
-export const OPENAI_API_KEY = process.env.OPENAI_API_KEY || '';
-export const OPENAI_PROXY_URL = process.env.OPENAI_PROXY_URL || 'https://api.openai.com/v1';
+  process.env.MICROSOFT_SPEECH_PROXY_URL ||
+  process.env.NEXT_PUBLIC_MICROSOFT_SPEECH_PROXY_URL ||
+  '/api/microsoft-speech';
+export const AZURE_SPEECH_PROXY_URL =
+  process.env.AZURE_SPEECH_PROXY_URL ||
+  process.env.NEXT_PUBLIC_AZURE_SPEECH_PROXY_URL ||
+  '/api/azure-speech';
+export const AZURE_SPEECH_KEY =
+  process.env.AZURE_SPEECH_KEY || process.env.NEXT_PUBLIC_AZURE_SPEECH_KEY || '';
+export const AZURE_SPEECH_REGION =
+  process.env.AZURE_SPEECH_REGION || process.env.NEXT_PUBLIC_AZURE_SPEECH_REGION || '';
+export const OPENAI_API_KEY =
+  process.env.OPENAI_API_KEY || process.env.NEXT_PUBLIC_OPENAI_API_KEY || '';
+export const OPENAI_PROXY_URL =
+  process.env.OPENAI_PROXY_URL ||
+  process.env.NEXT_PUBLIC_OPENAI_PROXY_URL ||
+  'https://api.openai.com/v1';
 export const OPENAI_TTS_URL = (api?: string) => urlJoin(api || OPENAI_PROXY_URL, 'audio/speech');
 export const OPENAI_STT_URL = (api?: string) =>
   urlJoin(api || OPENAI_PROXY_URL, 'audio/transcriptions');
 export const EDDGE_PROXY_URL =
   process.env.EDDGE_PROXY_URL ||
+  process.env.NEXT_PUBLIC_EDDGE_PROXY_UR ||
   'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1';
-export const EDDGE_API_TOKEN = process.env.EDDGE_API_TOKEN || '6A5AA1D4EAFF4E9FB37E23D68491D6F4';
+export const EDDGE_API_TOKEN =
+  process.env.EDDGE_API_TOKEN ||
+  process.env.NEXT_PUBLIC_EDDGE_API_TOKEN ||
+  '6A5AA1D4EAFF4E9FB37E23D68491D6F4';
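
Worth noting: every constant above now resolves in the same order, the server-side variable first, then its NEXT_PUBLIC_ counterpart (which Next.js inlines into browser bundles), then a hard-coded default. A minimal sketch of what that means for a consumer, with illustrative values:

```ts
// Illustrative sketch, not part of the commit. Assume these were set at build time:
//   OPENAI_PROXY_URL=https://llm-proxy.internal/v1
//   NEXT_PUBLIC_OPENAI_PROXY_URL=https://llm-proxy.public/v1
import { OPENAI_PROXY_URL, OPENAI_STT_URL } from '@/const/api';

// Server-side code sees the private variable; client bundles, where only
// NEXT_PUBLIC_* variables are inlined, fall back to the public one.
console.log(OPENAI_PROXY_URL); // https://llm-proxy.internal/v1 (on the server)
console.log(OPENAI_STT_URL()); // .../v1/audio/transcriptions
```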
11 changes: 0 additions & 11 deletions src/const/mimeType.ts

This file was deleted.

3 changes: 3 additions & 0 deletions src/index.ts
@@ -20,8 +20,10 @@ export { useEdgeSpeech } from './useEdgeSpeech';
 export { useMicrosoftSpeech } from './useMicrosoftSpeech';
 export {
   type OpenaiSpeechRecognitionOptions,
+  type OpenaiSTTFetcher,
   useOpenaiSTT,
   useOpenaiSTTWithPSR,
+  useOpenaiSTTWithRecord,
   useOpenaiSTTWithSR,
 } from './useOpenaiSTT';
 export { useOpenaiTTS } from './useOpenaiTTS';
@@ -31,6 +33,7 @@ export {
   useSpeechRecognition,
 } from './useSpeechRecognition/useSpeechRecognition';
 export { useSpeechSynthes } from './useSpeechSynthes';
+export { getRecordMineType, type RecordMineType } from './utils/getRecordMineType';
 export {
   genLevaOptions,
   getAzureVoiceOptions,
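
With these re-exports in place, consumers can pull everything from the package root. A quick usage sketch, assuming the published @lobehub/tts package:

```ts
import {
  getRecordMineType,
  useOpenaiSTTWithRecord,
  type OpenaiSTTFetcher,
  type RecordMineType,
} from '@lobehub/tts';

// Inspect which container/extension the recorder will use in this browser.
const mime: RecordMineType = getRecordMineType();
console.log(mime.mineType, mime.extension); // e.g. 'audio/webm', 'webm'
```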
11 changes: 7 additions & 4 deletions src/services/fetchOpenaiSTT.ts
@@ -1,20 +1,21 @@
 import { v4 as uuidv4 } from 'uuid';
 
 import { OPENAI_API_KEY, OPENAI_STT_URL } from '@/const/api';
-import { RECORD_MIME_TYPE } from '@/const/mimeType';
+import { RecordMineType, getRecordMineType } from '@/utils/getRecordMineType';
 
 export interface OpenaiSttOptions {
   api?: {
     key?: string;
     proxy?: string;
   };
+  mineType?: RecordMineType;
   model?: 'whisper-1';
 }
 
 // Generate speech from plain text
 export const fetchOpenaiSTT = async (
   speech: Blob,
-  { api = {}, model = 'whisper-1' }: OpenaiSttOptions,
+  { api = {}, model = 'whisper-1', mineType }: OpenaiSttOptions,
 ): Promise<string> => {
   const key = api?.key || OPENAI_API_KEY;
   const url = OPENAI_STT_URL(api?.proxy);
@@ -23,8 +24,10 @@ export const fetchOpenaiSTT = async (
     Authorization: `Bearer ${key}`,
   });
 
-  const filename = `${uuidv4()}.${RECORD_MIME_TYPE().extension}`;
-  const file = new File([speech], filename, { type: RECORD_MIME_TYPE().mineType });
+  const filename = `${uuidv4()}.${mineType?.extension || getRecordMineType().extension}`;
+  const file = new File([speech], filename, {
+    type: mineType?.mineType || getRecordMineType().mineType,
+  });
 
   const body = new FormData();
   body.append('file', file);
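
For reference, a hedged sketch of calling the updated fetcher directly; the key and proxy values are placeholders, and the Blob would normally come from useAudioRecorder:

```ts
import { fetchOpenaiSTT } from '@/services/fetchOpenaiSTT';
import { getRecordMineType } from '@/utils/getRecordMineType';

// `speech` is an audio Blob, e.g. emitted by MediaRecorder when recording stops.
const transcribe = (speech: Blob): Promise<string> =>
  fetchOpenaiSTT(speech, {
    api: { key: 'sk-placeholder', proxy: 'https://api.openai.com/v1' }, // placeholders
    mineType: getRecordMineType(), // optional; the fetcher falls back to this anyway
    model: 'whisper-1',
  });
```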
4 changes: 2 additions & 2 deletions src/useAudioRecorder/index.ts
@@ -1,6 +1,6 @@
 import { useCallback, useState } from 'react';
 
-import { RECORD_MIME_TYPE } from '@/const/mimeType';
+import { getRecordMineType } from '@/utils/getRecordMineType';
 import { secondsToMinutesAndSeconds } from '@/utils/secondsToMinutesAndSeconds';
 
 export const useAudioRecorder = (onBolbAvailable?: (blob: Blob) => void) => {
@@ -39,7 +39,7 @@ export const useAudioRecorder = (onBolbAvailable?: (blob: Blob) => void) => {
       .then((stream) => {
         setIsRecording(true);
         const recorder: MediaRecorder = new MediaRecorder(stream, {
-          mimeType: RECORD_MIME_TYPE().mineType,
+          mimeType: getRecordMineType().mineType,
         });
         setMediaRecorder(recorder);
         recorder.start();
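
getRecordMineType itself is not shown in this diff. Judging from its call sites (it must return an object with mineType and extension fields), a plausible implementation probes MediaRecorder support; treat the following as an assumption rather than the actual file contents:

```ts
export interface RecordMineType {
  extension: 'webm' | 'mp4';
  mineType: 'audio/webm' | 'audio/mp4';
}

// Assumed sketch: prefer audio/webm, fall back to audio/mp4 (e.g. Safari),
// and guard against environments without MediaRecorder (e.g. SSR).
export const getRecordMineType = (): RecordMineType => {
  try {
    if (MediaRecorder.isTypeSupported('audio/webm')) {
      return { extension: 'webm', mineType: 'audio/webm' };
    }
  } catch {
    // MediaRecorder is undefined outside the browser; use the fallback below.
  }
  return { extension: 'mp4', mineType: 'audio/mp4' };
};
```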
6 changes: 4 additions & 2 deletions src/useOpenaiSTT/demos/index.tsx
@@ -1,4 +1,4 @@
-import { useOpenaiSTT } from '@lobehub/tts';
+import { useOpenaiSTTWithRecord } from '@lobehub/tts';
 import { Icon, StoryBook, useControls, useCreateStore } from '@lobehub/ui';
 import { Button, Input } from 'antd';
 import { Mic, StopCircle } from 'lucide-react';
@@ -20,7 +20,9 @@ export default () => {
     { store },
   );
 
-  const { text, start, stop, isLoading, isRecording, url, formattedTime } = useOpenaiSTT({ api });
+  const { text, start, stop, isLoading, isRecording, url, formattedTime } = useOpenaiSTTWithRecord({
+    api,
+  });
   return (
     <StoryBook levaStore={store}>
       <Flexbox gap={8}>
6 changes: 5 additions & 1 deletion src/useOpenaiSTT/index.ts
@@ -1,3 +1,7 @@
-export { type OpenaiSpeechRecognitionOptions, useOpenaiSTT } from './useOpenaiSTT';
+export { type OpenaiSTTFetcher, useOpenaiSTT } from './useOpenaiSTT';
 export { useOpenaiSTTWithPSR } from './useOpenaiSTTWithPSR';
+export {
+  type OpenaiSpeechRecognitionOptions,
+  useOpenaiSTTWithRecord,
+} from './useOpenaiSTTWithRecord';
 export { useOpenaiSTTWithSR } from './useOpenaiSTTWithSR';
62 changes: 16 additions & 46 deletions src/useOpenaiSTT/useOpenaiSTT.ts
@@ -1,55 +1,25 @@
-import { useCallback, useState } from 'react';
-import useSWR from 'swr';
+import useSWR, { type SWRConfiguration } from 'swr';
 
 import { OpenaiSttOptions, fetchOpenaiSTT } from '@/services/fetchOpenaiSTT';
-import { useAudioRecorder } from '@/useAudioRecorder';
-import { SpeechRecognitionOptions } from '@/useSpeechRecognition/useSpeechRecognition';
+import { getRecordMineType } from '@/utils/getRecordMineType';
 
-export type OpenaiSpeechRecognitionOptions = SpeechRecognitionOptions & OpenaiSttOptions;
-
-export const useOpenaiSTT = ({
-  onBolbAvailable,
-  onTextChange,
-  ...options
-}: OpenaiSpeechRecognitionOptions) => {
-  const [isGlobalLoading, setIsGlobalLoading] = useState<boolean>(false);
-  const [shouldFetch, setShouldFetch] = useState<boolean>(false);
-  const [text, setText] = useState<string>();
-  const { start, stop, blob, url, isRecording, time, formattedTime } = useAudioRecorder(
-    (blobData) => {
-      setShouldFetch(true);
-      onBolbAvailable?.(blobData);
-    },
-  );
-
+export type OpenaiSTTFetcher = (blob: Blob, sttOptions: OpenaiSttOptions) => Promise<string>;
+export const useOpenaiSTT = (
+  shouldFetch?: boolean,
+  blob?: Blob,
+  options?: OpenaiSttOptions,
+  config?: SWRConfiguration,
+  fetcher?: OpenaiSTTFetcher,
+) => {
   const key = new Date().getDate().toString();
 
-  const { isLoading } = useSWR(
+  const optionsWithMineType: OpenaiSttOptions = { ...options, mineType: getRecordMineType() };
+
+  const openaiSTTFetcher = fetcher ?? fetchOpenaiSTT;
+
+  return useSWR(
     shouldFetch && blob ? key : null,
-    async () => await fetchOpenaiSTT(blob as any, options),
-    {
-      onSuccess: (value) => {
-        onTextChange?.(value);
-        setIsGlobalLoading(false);
-      },
-    },
+    async () => await openaiSTTFetcher(blob as Blob, optionsWithMineType),
+    config,
   );
-
-  const handleStart = useCallback(() => {
-    setIsGlobalLoading(true);
-    start();
-    setText('');
-  }, [start]);
-
-  return {
-    blob,
-    formattedTime,
-    isLoading: isGlobalLoading || isLoading || isRecording,
-    isRecording,
-    start: handleStart,
-    stop,
-    text,
-    time,
-    url,
-  };
 };
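
The hook is now a thin SWR wrapper: recording state has moved out (see useOpenaiSTTWithRecord below), and the optional fetcher argument makes the transport swappable. A sketch of supplying a custom OpenaiSTTFetcher that targets a hypothetical backend route:

```ts
import { useOpenaiSTT, type OpenaiSTTFetcher } from '@lobehub/tts';

// Hypothetical fetcher: send the audio to your own backend instead of OpenAI.
const myFetcher: OpenaiSTTFetcher = async (blob, sttOptions) => {
  const body = new FormData();
  body.append('file', new File([blob], `speech.${sttOptions.mineType?.extension || 'webm'}`));
  const res = await fetch('/api/my-stt', { method: 'POST', body }); // made-up endpoint
  return await res.text();
};

// Positional arguments: shouldFetch, blob, STT options, SWR config, fetcher.
export const useMySTT = (shouldFetch: boolean, blob?: Blob) =>
  useOpenaiSTT(shouldFetch, blob, {}, { revalidateOnFocus: false }, myFetcher);
```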
41 changes: 25 additions & 16 deletions src/useOpenaiSTT/useOpenaiSTTWithPSR.ts
@@ -1,14 +1,14 @@
 import { useCallback, useState } from 'react';
-import useSWR from 'swr';
 
-import { fetchOpenaiSTT } from '@/services/fetchOpenaiSTT';
+import { OpenaiSTTFetcher, useOpenaiSTT } from '@/useOpenaiSTT/useOpenaiSTT';
 import { usePersistedSpeechRecognition } from '@/useSpeechRecognition';
 
-import { OpenaiSpeechRecognitionOptions } from './useOpenaiSTT';
+import { OpenaiSpeechRecognitionOptions } from './useOpenaiSTTWithRecord';
 
 export const useOpenaiSTTWithPSR = (
   locale: string,
   { onBolbAvailable, onTextChange, ...options }: OpenaiSpeechRecognitionOptions,
+  fetcher?: OpenaiSTTFetcher,
 ) => {
   const [isGlobalLoading, setIsGlobalLoading] = useState<boolean>(false);
   const [shouldFetch, setShouldFetch] = useState<boolean>(false);
@@ -32,34 +32,43 @@ export const useOpenaiSTTWithPSR = (
     },
   });
 
-  const key = new Date().getDate().toString();
+  const handleStart = useCallback(() => {
+    setIsGlobalLoading(true);
+    start();
+    setText('');
+  }, [start]);
+
+  const handleStop = useCallback(() => {
+    stop();
+    setShouldFetch(false);
+    setIsGlobalLoading(false);
+  }, [stop]);
 
-  const { isLoading } = useSWR(
-    shouldFetch && blob ? key : null,
-    async () => await fetchOpenaiSTT(blob as any, options),
+  const { isLoading } = useOpenaiSTT(
+    shouldFetch,
+    blob,
+    options,
     {
+      onError: (err) => {
+        console.error(err);
+        handleStop();
+      },
      onSuccess: (data) => {
-        setShouldFetch(false);
         setText(data);
         onTextChange?.(data);
-        setIsGlobalLoading(false);
+        handleStop();
       },
     },
+    fetcher,
   );
 
-  const handleStart = useCallback(() => {
-    setIsGlobalLoading(true);
-    start();
-    setText('');
-  }, [start]);
-
   return {
     blob,
     formattedTime,
     isLoading: isGlobalLoading || isLoading || isRecording,
     isRecording,
     start: handleStart,
-    stop,
+    stop: handleStop,
     text,
     time,
     url,
65 changes: 65 additions & 0 deletions src/useOpenaiSTT/useOpenaiSTTWithRecord.ts
@@ -0,0 +1,65 @@
+import { useCallback, useState } from 'react';
+
+import { OpenaiSttOptions } from '@/services/fetchOpenaiSTT';
+import { useAudioRecorder } from '@/useAudioRecorder';
+import { OpenaiSTTFetcher, useOpenaiSTT } from '@/useOpenaiSTT/useOpenaiSTT';
+import { SpeechRecognitionOptions } from '@/useSpeechRecognition/useSpeechRecognition';
+
+export type OpenaiSpeechRecognitionOptions = SpeechRecognitionOptions & OpenaiSttOptions;
+
+export const useOpenaiSTTWithRecord = (
+  { onBolbAvailable, onTextChange, ...options }: OpenaiSpeechRecognitionOptions,
+  fetcher?: OpenaiSTTFetcher,
+) => {
+  const [isGlobalLoading, setIsGlobalLoading] = useState<boolean>(false);
+  const [shouldFetch, setShouldFetch] = useState<boolean>(false);
+  const [text, setText] = useState<string>();
+  const { start, stop, blob, url, isRecording, time, formattedTime } = useAudioRecorder(
+    (blobData) => {
+      setShouldFetch(true);
+      onBolbAvailable?.(blobData);
+    },
+  );
+
+  const handleStart = useCallback(() => {
+    setIsGlobalLoading(true);
+    start();
+    setText('');
+  }, [start]);
+
+  const handleStop = useCallback(() => {
+    stop();
+    setShouldFetch(false);
+    setIsGlobalLoading(false);
+  }, [stop]);
+
+  const { isLoading } = useOpenaiSTT(
+    shouldFetch,
+    blob,
+    options,
+    {
+      onError: (err) => {
+        console.error(err);
+        handleStop();
+      },
+      onSuccess: (data, value) => {
+        setText(data);
+        onTextChange?.(value);
+        handleStop();
+      },
+    },
+    fetcher,
+  );
+
+  return {
+    blob,
+    formattedTime,
+    isLoading: isGlobalLoading || isLoading || isRecording,
+    isRecording,
+    start: handleStart,
+    stop: handleStop,
+    text,
+    time,
+    url,
+  };
+};
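
Putting it together, the new hook bundles recording, upload, and loading state. A minimal component sketch mirroring the updated demo; the API key is a placeholder:

```tsx
import { useOpenaiSTTWithRecord } from '@lobehub/tts';

export const SpeechToText = () => {
  const { text, start, stop, isLoading, isRecording, formattedTime, url } =
    useOpenaiSTTWithRecord({ api: { key: 'sk-placeholder' } });

  return (
    <div>
      <button onClick={isRecording ? stop : start}>
        {isRecording ? `Stop (${formattedTime})` : 'Record'}
      </button>
      {isLoading && !isRecording && <span>Transcribing…</span>}
      {text && <p>{text}</p>}
      {url && <audio controls src={url} />}
    </div>
  );
};
```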