✨ feat: Enhance OpenAI speech-to-text functionality and modify API endpoints
This commit enhances the OpenAI speech-to-text functionality. It adds a new hook for recording audio, reworks the existing OpenAI STT hook around a pluggable fetcher, and updates the API endpoints and proxy URLs for Microsoft Speech, Azure Speech, and OpenAI. It also deletes the "mimeType.ts" file in favor of a getRecordMineType utility, updates the affected imports and function names, and revises the error handling and stop behavior.

Changes:
- Enhancements to the OpenAI speech-to-text functionality
- A new useOpenaiSTTWithRecord hook for recording audio
- A rework of the existing useOpenaiSTT hook into a thin SWR wrapper with a pluggable fetcher
- API endpoints and proxy URLs for Microsoft Speech, Azure Speech, and OpenAI now honor NEXT_PUBLIC_ environment variables
- Deletion of the "mimeType.ts" file in favor of the getRecordMineType utility
- Updated import statements and function names across the affected modules
- Revised error handling and stop behavior in the recording hooks
canisminor1990 committed Nov 13, 2023
1 parent ef43fda commit 61126aa
Showing 13 changed files with 213 additions and 117 deletions.
29 changes: 22 additions & 7 deletions src/const/api.ts
@@ -3,16 +3,31 @@ import urlJoin from 'url-join';
 export const MICROSOFT_SPPECH_URL =
   'https://southeastasia.api.speech.microsoft.com/accfreetrial/texttospeech/acc/v3.0-beta1/vcg/speak';
 export const MICROSOFT_SPEECH_PROXY_URL =
-  process.env.MICROSOFT_SPEECH_PROXY_URL || '/api/microsoft-speech';
-export const AZURE_SPEECH_PROXY_URL = process.env.AZURE_SPEECH_PROXY_URL || '/api/azure-speech';
-export const AZURE_SPEECH_KEY = process.env.AZURE_SPEECH_KEY || '';
-export const AZURE_SPEECH_REGION = process.env.AZURE_SPEECH_REGION || '';
-export const OPENAI_API_KEY = process.env.OPENAI_API_KEY || '';
-export const OPENAI_PROXY_URL = process.env.OPENAI_PROXY_URL || 'https://api.openai.com/v1';
+  process.env.MICROSOFT_SPEECH_PROXY_URL ||
+  process.env.NEXT_PUBLIC_MICROSOFT_SPEECH_PROXY_URL ||
+  '/api/microsoft-speech';
+export const AZURE_SPEECH_PROXY_URL =
+  process.env.AZURE_SPEECH_PROXY_URL ||
+  process.env.NEXT_PUBLIC_AZURE_SPEECH_PROXY_URL ||
+  '/api/azure-speech';
+export const AZURE_SPEECH_KEY =
+  process.env.AZURE_SPEECH_KEY || process.env.NEXT_PUBLIC_AZURE_SPEECH_KEY || '';
+export const AZURE_SPEECH_REGION =
+  process.env.AZURE_SPEECH_REGION || process.env.NEXT_PUBLIC_AZURE_SPEECH_REGION || '';
+export const OPENAI_API_KEY =
+  process.env.OPENAI_API_KEY || process.env.NEXT_PUBLIC_OPENAI_API_KEY || '';
+export const OPENAI_PROXY_URL =
+  process.env.OPENAI_PROXY_URL ||
+  process.env.NEXT_PUBLIC_OPENAI_PROXY_URL ||
+  'https://api.openai.com/v1';
 export const OPENAI_TTS_URL = (api?: string) => urlJoin(api || OPENAI_PROXY_URL, 'audio/speech');
 export const OPENAI_STT_URL = (api?: string) =>
   urlJoin(api || OPENAI_PROXY_URL, 'audio/transcriptions');
 export const EDDGE_PROXY_URL =
   process.env.EDDGE_PROXY_URL ||
+  process.env.NEXT_PUBLIC_EDDGE_PROXY_UR ||
   'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1';
-export const EDDGE_API_TOKEN = process.env.EDDGE_API_TOKEN || '6A5AA1D4EAFF4E9FB37E23D68491D6F4';
+export const EDDGE_API_TOKEN =
+  process.env.EDDGE_API_TOKEN ||
+  process.env.NEXT_PUBLIC_EDDGE_API_TOKEN ||
+  '6A5AA1D4EAFF4E9FB37E23D68491D6F4';
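
Worth noting: every constant above now resolves in the same order, the server-side variable first, then its NEXT_PUBLIC_ counterpart (which Next.js inlines into browser bundles), then a hard-coded default. A minimal sketch of what that means for a consumer, with illustrative values:

```ts
// Illustrative sketch, not part of the commit. Assume these were set at build time:
//   OPENAI_PROXY_URL=https://llm-proxy.internal/v1
//   NEXT_PUBLIC_OPENAI_PROXY_URL=https://llm-proxy.public/v1
import { OPENAI_PROXY_URL, OPENAI_STT_URL } from '@/const/api';

// Server-side code sees the private variable; client bundles, where only
// NEXT_PUBLIC_* variables are inlined, fall back to the public one.
console.log(OPENAI_PROXY_URL); // https://llm-proxy.internal/v1 (on the server)
console.log(OPENAI_STT_URL()); // .../v1/audio/transcriptions
```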
11 changes: 0 additions & 11 deletions src/const/mimeType.ts

This file was deleted.

3 changes: 3 additions & 0 deletions src/index.ts
@@ -20,8 +20,10 @@ export { useEdgeSpeech } from './useEdgeSpeech';
 export { useMicrosoftSpeech } from './useMicrosoftSpeech';
 export {
   type OpenaiSpeechRecognitionOptions,
+  type OpenaiSTTFetcher,
   useOpenaiSTT,
   useOpenaiSTTWithPSR,
+  useOpenaiSTTWithRecord,
   useOpenaiSTTWithSR,
 } from './useOpenaiSTT';
 export { useOpenaiTTS } from './useOpenaiTTS';
@@ -31,6 +33,7 @@ export {
   useSpeechRecognition,
 } from './useSpeechRecognition/useSpeechRecognition';
 export { useSpeechSynthes } from './useSpeechSynthes';
+export { getRecordMineType, type RecordMineType } from './utils/getRecordMineType';
 export {
   genLevaOptions,
   getAzureVoiceOptions,
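
With these re-exports in place, consumers can pull everything from the package root. A quick usage sketch, assuming the published @lobehub/tts package:

```ts
import {
  getRecordMineType,
  useOpenaiSTTWithRecord,
  type OpenaiSTTFetcher,
  type RecordMineType,
} from '@lobehub/tts';

// Inspect which container/extension the recorder will use in this browser.
const mime: RecordMineType = getRecordMineType();
console.log(mime.mineType, mime.extension); // e.g. 'audio/webm', 'webm'
```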
11 changes: 7 additions & 4 deletions src/services/fetchOpenaiSTT.ts
@@ -1,20 +1,21 @@
 import { v4 as uuidv4 } from 'uuid';
 
 import { OPENAI_API_KEY, OPENAI_STT_URL } from '@/const/api';
-import { RECORD_MIME_TYPE } from '@/const/mimeType';
+import { RecordMineType, getRecordMineType } from '@/utils/getRecordMineType';
 
 export interface OpenaiSttOptions {
   api?: {
     key?: string;
     proxy?: string;
   };
+  mineType?: RecordMineType;
   model?: 'whisper-1';
 }
 
 // Generate speech from plain text
 export const fetchOpenaiSTT = async (
   speech: Blob,
-  { api = {}, model = 'whisper-1' }: OpenaiSttOptions,
+  { api = {}, model = 'whisper-1', mineType }: OpenaiSttOptions,
 ): Promise<string> => {
   const key = api?.key || OPENAI_API_KEY;
   const url = OPENAI_STT_URL(api?.proxy);
@@ -23,8 +24,10 @@ export const fetchOpenaiSTT = async (
     Authorization: `Bearer ${key}`,
   });
 
-  const filename = `${uuidv4()}.${RECORD_MIME_TYPE().extension}`;
-  const file = new File([speech], filename, { type: RECORD_MIME_TYPE().mineType });
+  const filename = `${uuidv4()}.${mineType?.extension || getRecordMineType().extension}`;
+  const file = new File([speech], filename, {
+    type: mineType?.mineType || getRecordMineType().mineType,
+  });
 
   const body = new FormData();
   body.append('file', file);
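
For reference, a hedged sketch of calling the updated fetcher directly; the key and proxy values are placeholders, and the Blob would normally come from useAudioRecorder:

```ts
import { fetchOpenaiSTT } from '@/services/fetchOpenaiSTT';
import { getRecordMineType } from '@/utils/getRecordMineType';

// `speech` is an audio Blob, e.g. emitted by MediaRecorder when recording stops.
const transcribe = (speech: Blob): Promise<string> =>
  fetchOpenaiSTT(speech, {
    api: { key: 'sk-placeholder', proxy: 'https://api.openai.com/v1' }, // placeholders
    mineType: getRecordMineType(), // optional; the fetcher falls back to this anyway
    model: 'whisper-1',
  });
```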
4 changes: 2 additions & 2 deletions src/useAudioRecorder/index.ts
@@ -1,6 +1,6 @@
 import { useCallback, useState } from 'react';
 
-import { RECORD_MIME_TYPE } from '@/const/mimeType';
+import { getRecordMineType } from '@/utils/getRecordMineType';
 import { secondsToMinutesAndSeconds } from '@/utils/secondsToMinutesAndSeconds';
 
 export const useAudioRecorder = (onBolbAvailable?: (blob: Blob) => void) => {
@@ -39,7 +39,7 @@ export const useAudioRecorder = (onBolbAvailable?: (blob: Blob) => void) => {
       .then((stream) => {
         setIsRecording(true);
         const recorder: MediaRecorder = new MediaRecorder(stream, {
-          mimeType: RECORD_MIME_TYPE().mineType,
+          mimeType: getRecordMineType().mineType,
         });
         setMediaRecorder(recorder);
         recorder.start();
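
getRecordMineType itself is not shown in this diff. Judging from its call sites (it must return an object with mineType and extension fields), a plausible implementation probes MediaRecorder support; treat the following as an assumption rather than the actual file contents:

```ts
export interface RecordMineType {
  extension: 'webm' | 'mp4';
  mineType: 'audio/webm' | 'audio/mp4';
}

// Assumed sketch: prefer audio/webm, fall back to audio/mp4 (e.g. Safari),
// and guard against environments without MediaRecorder (e.g. SSR).
export const getRecordMineType = (): RecordMineType => {
  try {
    if (MediaRecorder.isTypeSupported('audio/webm')) {
      return { extension: 'webm', mineType: 'audio/webm' };
    }
  } catch {
    // MediaRecorder is undefined outside the browser; use the fallback below.
  }
  return { extension: 'mp4', mineType: 'audio/mp4' };
};
```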
6 changes: 4 additions & 2 deletions src/useOpenaiSTT/demos/index.tsx
@@ -1,4 +1,4 @@
-import { useOpenaiSTT } from '@lobehub/tts';
+import { useOpenaiSTTWithRecord } from '@lobehub/tts';
 import { Icon, StoryBook, useControls, useCreateStore } from '@lobehub/ui';
 import { Button, Input } from 'antd';
 import { Mic, StopCircle } from 'lucide-react';
@@ -20,7 +20,9 @@ export default () => {
     { store },
   );
 
-  const { text, start, stop, isLoading, isRecording, url, formattedTime } = useOpenaiSTT({ api });
+  const { text, start, stop, isLoading, isRecording, url, formattedTime } = useOpenaiSTTWithRecord({
+    api,
+  });
   return (
     <StoryBook levaStore={store}>
       <Flexbox gap={8}>
6 changes: 5 additions & 1 deletion src/useOpenaiSTT/index.ts
@@ -1,3 +1,7 @@
-export { type OpenaiSpeechRecognitionOptions, useOpenaiSTT } from './useOpenaiSTT';
+export { type OpenaiSTTFetcher, useOpenaiSTT } from './useOpenaiSTT';
 export { useOpenaiSTTWithPSR } from './useOpenaiSTTWithPSR';
+export {
+  type OpenaiSpeechRecognitionOptions,
+  useOpenaiSTTWithRecord,
+} from './useOpenaiSTTWithRecord';
 export { useOpenaiSTTWithSR } from './useOpenaiSTTWithSR';
62 changes: 16 additions & 46 deletions src/useOpenaiSTT/useOpenaiSTT.ts
@@ -1,55 +1,25 @@
-import { useCallback, useState } from 'react';
-import useSWR from 'swr';
+import useSWR, { type SWRConfiguration } from 'swr';
 
 import { OpenaiSttOptions, fetchOpenaiSTT } from '@/services/fetchOpenaiSTT';
-import { useAudioRecorder } from '@/useAudioRecorder';
-import { SpeechRecognitionOptions } from '@/useSpeechRecognition/useSpeechRecognition';
+import { getRecordMineType } from '@/utils/getRecordMineType';
 
-export type OpenaiSpeechRecognitionOptions = SpeechRecognitionOptions & OpenaiSttOptions;
-
-export const useOpenaiSTT = ({
-  onBolbAvailable,
-  onTextChange,
-  ...options
-}: OpenaiSpeechRecognitionOptions) => {
-  const [isGlobalLoading, setIsGlobalLoading] = useState<boolean>(false);
-  const [shouldFetch, setShouldFetch] = useState<boolean>(false);
-  const [text, setText] = useState<string>();
-  const { start, stop, blob, url, isRecording, time, formattedTime } = useAudioRecorder(
-    (blobData) => {
-      setShouldFetch(true);
-      onBolbAvailable?.(blobData);
-    },
-  );
-
+export type OpenaiSTTFetcher = (blob: Blob, sttOptions: OpenaiSttOptions) => Promise<string>;
+export const useOpenaiSTT = (
+  shouldFetch?: boolean,
+  blob?: Blob,
+  options?: OpenaiSttOptions,
+  config?: SWRConfiguration,
+  fetcher?: OpenaiSTTFetcher,
+) => {
   const key = new Date().getDate().toString();
 
-  const { isLoading } = useSWR(
+  const optionsWithMineType: OpenaiSttOptions = { ...options, mineType: getRecordMineType() };
+
+  const openaiSTTFetcher = fetcher ?? fetchOpenaiSTT;
+
+  return useSWR(
     shouldFetch && blob ? key : null,
-    async () => await fetchOpenaiSTT(blob as any, options),
-    {
-      onSuccess: (value) => {
-        onTextChange?.(value);
-        setIsGlobalLoading(false);
-      },
-    },
+    async () => await openaiSTTFetcher(blob as Blob, optionsWithMineType),
+    config,
   );
-
-  const handleStart = useCallback(() => {
-    setIsGlobalLoading(true);
-    start();
-    setText('');
-  }, [start]);
-
-  return {
-    blob,
-    formattedTime,
-    isLoading: isGlobalLoading || isLoading || isRecording,
-    isRecording,
-    start: handleStart,
-    stop,
-    text,
-    time,
-    url,
-  };
 };
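
The hook is now a thin SWR wrapper: recording state has moved out (see useOpenaiSTTWithRecord below), and the optional fetcher argument makes the transport swappable. A sketch of supplying a custom OpenaiSTTFetcher that targets a hypothetical backend route:

```ts
import { useOpenaiSTT, type OpenaiSTTFetcher } from '@lobehub/tts';

// Hypothetical fetcher: send the audio to your own backend instead of OpenAI.
const myFetcher: OpenaiSTTFetcher = async (blob, sttOptions) => {
  const body = new FormData();
  body.append('file', new File([blob], `speech.${sttOptions.mineType?.extension || 'webm'}`));
  const res = await fetch('/api/my-stt', { method: 'POST', body }); // made-up endpoint
  return await res.text();
};

// Positional arguments: shouldFetch, blob, STT options, SWR config, fetcher.
export const useMySTT = (shouldFetch: boolean, blob?: Blob) =>
  useOpenaiSTT(shouldFetch, blob, {}, { revalidateOnFocus: false }, myFetcher);
```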
41 changes: 25 additions & 16 deletions src/useOpenaiSTT/useOpenaiSTTWithPSR.ts
@@ -1,14 +1,14 @@
 import { useCallback, useState } from 'react';
-import useSWR from 'swr';
 
-import { fetchOpenaiSTT } from '@/services/fetchOpenaiSTT';
+import { OpenaiSTTFetcher, useOpenaiSTT } from '@/useOpenaiSTT/useOpenaiSTT';
 import { usePersistedSpeechRecognition } from '@/useSpeechRecognition';
 
-import { OpenaiSpeechRecognitionOptions } from './useOpenaiSTT';
+import { OpenaiSpeechRecognitionOptions } from './useOpenaiSTTWithRecord';
 
 export const useOpenaiSTTWithPSR = (
   locale: string,
   { onBolbAvailable, onTextChange, ...options }: OpenaiSpeechRecognitionOptions,
+  fetcher?: OpenaiSTTFetcher,
 ) => {
   const [isGlobalLoading, setIsGlobalLoading] = useState<boolean>(false);
   const [shouldFetch, setShouldFetch] = useState<boolean>(false);
@@ -32,34 +32,43 @@ export const useOpenaiSTTWithPSR = (
     },
   });
 
-  const key = new Date().getDate().toString();
+  const handleStart = useCallback(() => {
+    setIsGlobalLoading(true);
+    start();
+    setText('');
+  }, [start]);
+
+  const handleStop = useCallback(() => {
+    stop();
+    setShouldFetch(false);
+    setIsGlobalLoading(false);
+  }, [stop]);
 
-  const { isLoading } = useSWR(
-    shouldFetch && blob ? key : null,
-    async () => await fetchOpenaiSTT(blob as any, options),
+  const { isLoading } = useOpenaiSTT(
+    shouldFetch,
+    blob,
+    options,
     {
+      onError: (err) => {
+        console.error(err);
+        handleStop();
+      },
      onSuccess: (data) => {
-        setShouldFetch(false);
         setText(data);
         onTextChange?.(data);
-        setIsGlobalLoading(false);
+        handleStop();
       },
     },
+    fetcher,
   );
 
-  const handleStart = useCallback(() => {
-    setIsGlobalLoading(true);
-    start();
-    setText('');
-  }, [start]);
-
   return {
     blob,
     formattedTime,
     isLoading: isGlobalLoading || isLoading || isRecording,
     isRecording,
     start: handleStart,
-    stop,
+    stop: handleStop,
     text,
     time,
     url,
65 changes: 65 additions & 0 deletions src/useOpenaiSTT/useOpenaiSTTWithRecord.ts
@@ -0,0 +1,65 @@
+import { useCallback, useState } from 'react';
+
+import { OpenaiSttOptions } from '@/services/fetchOpenaiSTT';
+import { useAudioRecorder } from '@/useAudioRecorder';
+import { OpenaiSTTFetcher, useOpenaiSTT } from '@/useOpenaiSTT/useOpenaiSTT';
+import { SpeechRecognitionOptions } from '@/useSpeechRecognition/useSpeechRecognition';
+
+export type OpenaiSpeechRecognitionOptions = SpeechRecognitionOptions & OpenaiSttOptions;
+
+export const useOpenaiSTTWithRecord = (
+  { onBolbAvailable, onTextChange, ...options }: OpenaiSpeechRecognitionOptions,
+  fetcher?: OpenaiSTTFetcher,
+) => {
+  const [isGlobalLoading, setIsGlobalLoading] = useState<boolean>(false);
+  const [shouldFetch, setShouldFetch] = useState<boolean>(false);
+  const [text, setText] = useState<string>();
+  const { start, stop, blob, url, isRecording, time, formattedTime } = useAudioRecorder(
+    (blobData) => {
+      setShouldFetch(true);
+      onBolbAvailable?.(blobData);
+    },
+  );
+
+  const handleStart = useCallback(() => {
+    setIsGlobalLoading(true);
+    start();
+    setText('');
+  }, [start]);
+
+  const handleStop = useCallback(() => {
+    stop();
+    setShouldFetch(false);
+    setIsGlobalLoading(false);
+  }, [stop]);
+
+  const { isLoading } = useOpenaiSTT(
+    shouldFetch,
+    blob,
+    options,
+    {
+      onError: (err) => {
+        console.error(err);
+        handleStop();
+      },
+      onSuccess: (data, value) => {
+        setText(data);
+        onTextChange?.(value);
+        handleStop();
+      },
+    },
+    fetcher,
+  );
+
+  return {
+    blob,
+    formattedTime,
+    isLoading: isGlobalLoading || isLoading || isRecording,
+    isRecording,
+    start: handleStart,
+    stop: handleStop,
+    text,
+    time,
+    url,
+  };
+};
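
Putting it together, the new hook bundles recording, upload, and loading state. A minimal component sketch mirroring the updated demo; the API key is a placeholder:

```tsx
import { useOpenaiSTTWithRecord } from '@lobehub/tts';

export const SpeechToText = () => {
  const { text, start, stop, isLoading, isRecording, formattedTime, url } =
    useOpenaiSTTWithRecord({ api: { key: 'sk-placeholder' } });

  return (
    <div>
      <button onClick={isRecording ? stop : start}>
        {isRecording ? `Stop (${formattedTime})` : 'Record'}
      </button>
      {isLoading && !isRecording && <span>Transcribing…</span>}
      {text && <p>{text}</p>}
      {url && <audio controls src={url} />}
    </div>
  );
};
```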