From 136ef1d687bd33cbb66ca2d1ca03394033e69685 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Sun, 5 Nov 2023 06:27:34 +0700 Subject: [PATCH 01/11] feat TTS synthAduio from API --- app.js | 4 +- lib/routes/api/accounts.js | 211 +++++++++++++++++++++++++++++++++++++ package-lock.json | 60 ++++++----- package.json | 4 +- 4 files changed, 252 insertions(+), 27 deletions(-) diff --git a/app.js b/app.js index 43ce96a5..2686673b 100644 --- a/app.js +++ b/app.js @@ -52,7 +52,8 @@ const { const { getTtsVoices, getTtsSize, - purgeTtsCache + purgeTtsCache, + synthAudio } = require('@jambonz/speech-utils')(JAMBONES_REDIS_SENTINELS || { host: process.env.JAMBONES_REDIS_HOST, port: process.env.JAMBONES_REDIS_PORT || 6379 @@ -98,6 +99,7 @@ app.locals = { getTtsVoices, getTtsSize, purgeTtsCache, + synthAudio, lookupAppBySid, lookupAccountBySid, lookupAccountByPhoneNumber, diff --git a/lib/routes/api/accounts.js b/lib/routes/api/accounts.js index c5abb80b..d546d5e5 100644 --- a/lib/routes/api/accounts.js +++ b/lib/routes/api/accounts.js @@ -26,6 +26,9 @@ const VoipCarrier = require('../../models/voip-carrier'); const { encrypt } = require('../../utils/encrypt-decrypt'); const { testS3Storage, testGoogleStorage, testAzureStorage } = require('../../utils/storage-utils'); const translator = short(); +const { validate } = require('@jambonz/verb-specifications'); +const SpeechCredential = require('../../models/speech-credential'); +const fs = require('fs'); let idx = 0; @@ -1038,4 +1041,212 @@ router.get('/:sid/Queues', async(req, res) => { } }); +// Tts +const validateTtsRequestBody = async(body, logger) => { + let copiedBody = {...body}; + if (typeof copiedBody !== 'object') { + throw new DbErrorBadRequest('Invalid tts request body, it should be say verb object'); + } + copiedBody.verb = 'say'; + copiedBody = [copiedBody]; + try { + validate(logger, copiedBody); + } catch (err) { + throw new DbErrorBadRequest(err); + } +}; + +const getSpeechCredential = (credentials) => { + for (const credential of credentials) { + if (credential.use_for_tts && credential.tts_tested_ok) { + const { vendor } = credential; + if ('google' === vendor) { + const cred = JSON.parse(credential.service_key.replace(/\n/g, '\\n')); + return { + speech_credential_sid: credential.speech_credential_sid, + credentials: cred + }; + } + else if (['aws', 'polly'].includes(vendor)) { + return { + speech_credential_sid: credential.speech_credential_sid, + accessKeyId: credential.access_key_id, + secretAccessKey: credential.secret_access_key, + region: credential.aws_region + }; + } + else if ('microsoft' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key, + region: credential.region, + use_custom_stt: credential.use_custom_stt, + custom_stt_endpoint: credential.custom_stt_endpoint, + custom_stt_endpoint_url: credential.custom_stt_endpoint_url, + use_custom_tts: credential.use_custom_tts, + custom_tts_endpoint: credential.custom_tts_endpoint, + custom_tts_endpoint_url: credential.custom_tts_endpoint_url + }; + } + else if ('wellsaid' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key + }; + } + else if ('nuance' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + client_id: credential.client_id, + secret: credential.secret, + nuance_tts_uri: credential.nuance_tts_uri, + nuance_stt_uri: credential.nuance_stt_uri + }; + } + else if ('deepgram' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key + }; + } + else if ('soniox' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key + }; + } + else if ('ibm' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + tts_api_key: credential.tts_api_key, + tts_region: credential.tts_region, + stt_api_key: credential.stt_api_key, + stt_region: credential.stt_region + }; + } + else if ('nvidia' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + riva_server_uri: credential.riva_server_uri + }; + } + else if ('cobalt' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + cobalt_server_uri: credential.cobalt_server_uri + }; + } else if ('elevenlabs' === vendor) { + return { + api_key: credential.api_key, + model_id: credential.model_id + }; + } else if ('assemblyai' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key + }; + } else if (vendor.startsWith('custom:')) { + return { + speech_credential_sid: credential.speech_credential_sid, + auth_token: credential.auth_token, + custom_stt_url: credential.custom_stt_url, + custom_tts_url: credential.custom_tts_url + }; + } + } + } +}; + +router.post('/:sid/Tts', async(req, res) => { + const {logger, synthAudio} = req.app.locals; + try { + const accountSid = parseAccountSid(req); + const body = req.body; + await validateRequest(req, accountSid); + await validateTtsRequestBody(body, logger); + const { text, synthesizer } = body; + const { vendor, label, language } = synthesizer; + const engine = synthesizer.engine || 'standard'; + const options = synthesizer.options || {}; + const salt = uuidv4(); + /* parse Nuance voices into name and model */ + let voice = synthesizer.voice; + let model; + if (vendor === 'nuance' && voice) { + const arr = /([A-Za-z-]*)\s+-\s+(enhanced|standard)/.exec(voice); + if (arr) { + voice = arr[1]; + model = arr[2]; + } + } + + const result = await Account.retrieve(accountSid); + if (!result || result.length === 0) { + throw new DbErrorBadRequest(`Account not found for sid ${accountSid}`); + } + if (result[0].is_active) { + throw new DbErrorBadRequest(`Account not active for sid ${accountSid}`); + } + + const speechCreds = await SpeechCredential.getSpeechCredentialsByVendorAndLabel(null, accountSid, vendor, label); + if (!speechCreds || speechCreds.length === 0) { + throw new + DbErrorBadRequest(`There is no available speech credential for ${vendor}${label ? ` and ${label}` : ''}`); + } + + let credentials = getSpeechCredential(speechCreds); + if (!credentials) { + throw new + DbErrorBadRequest(`There is no available speech credential for ${vendor}${label ? ` and ${label}` : ''}`); + } + /* allow for microsoft custom region voice and api_key to be specified as an override */ + if (vendor === 'microsoft' && options.deploymentId) { + credentials = credentials || {}; + credentials.use_custom_tts = true; + credentials.custom_tts_endpoint = options.deploymentId; + credentials.api_key = options.apiKey || credentials.apiKey; + credentials.region = options.region || credentials.region; + voice = options.voice || voice; + } + const stats = { + histogram: () => {}, + increment: () => {}, + }; + const { filePath } = await synthAudio(stats, { + account_sid: accountSid, + text, + vendor, + language, + voice, + engine, + model, + salt, + credentials, + disableTtsCache: false + }); + + const stat = fs.statSync(filePath); + res.writeHead(200, { + 'Content-Type': 'audio/mpeg', + 'Content-Length': stat.size, + }); + + const readStream = fs.createReadStream(filePath); + // We replaced all the event handlers with a simple call to readStream.pipe() + readStream.pipe(res); + + readStream.on('end', () => { + // Delete the file after it's been read + fs.unlink(filePath, (err) => { + if (err) throw err; + logger.error(`${filePath} was deleted`); + }); + }); + + } catch (err) { + sysError(logger, res, err); + } +}); + + module.exports = router; diff --git a/package-lock.json b/package-lock.json index 97adc99e..a050cef0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -19,9 +19,9 @@ "@jambonz/lamejs": "^1.2.2", "@jambonz/mw-registrar": "^0.2.5", "@jambonz/realtimedb-helpers": "^0.8.7", - "@jambonz/speech-utils": "^0.0.15", + "@jambonz/speech-utils": "^0.0.24", "@jambonz/time-series": "^0.2.8", - "@jambonz/verb-specifications": "^0.0.29", + "@jambonz/verb-specifications": "^0.0.45", "@soniox/soniox-node": "^1.1.1", "argon2": "^0.30.3", "assemblyai": "^3.0.1", @@ -1968,11 +1968,11 @@ } }, "node_modules/@jambonz/speech-utils": { - "version": "0.0.15", - "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.15.tgz", - "integrity": "sha512-di8jVnSdCXzkhKrX4vS5MyALw2KS3D1kodGiral3E9NrcUXDZhiLs7vZOW4EdAS1TTykl+Rb/GwNQ8xqkufxKQ==", + "version": "0.0.24", + "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.24.tgz", + "integrity": "sha512-FpywxkjohC7wBBS9Xz9pRYXyO9D/tt2xgHFseWSoe0kO/6DbFudoLAI5uGB7AyUbTeGeKBiVZDJSNadsM4Iv4A==", "dependencies": { - "@aws-sdk/client-polly": "^3.303.0", + "@aws-sdk/client-polly": "^3.359.0", "@google-cloud/text-to-speech": "^4.2.1", "@grpc/grpc-js": "^1.8.13", "bent": "^7.3.12", @@ -1981,7 +1981,7 @@ "google-protobuf": "^3.21.2", "ibm-watson": "^8.0.0", "ioredis": "^5.3.2", - "microsoft-cognitiveservices-speech-sdk": "^1.26.0", + "microsoft-cognitiveservices-speech-sdk": "^1.31.0", "undici": "^5.21.0" } }, @@ -2096,9 +2096,9 @@ } }, "node_modules/@jambonz/verb-specifications": { - "version": "0.0.29", - "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.29.tgz", - "integrity": "sha512-jeYI+GN7Y5nXhdFG3SXvXaBlhCjIC+l5AcBywDDGxxyuuKRTukPS0MSvCtWPZP6H3wYYGqfJ4DR/vgtBF3pvyQ==", + "version": "0.0.45", + "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.45.tgz", + "integrity": "sha512-0cC7cfyXuOlqjfrtA9GC7A84efInj4z+ZSsibONqHMw3FVJE5IvcvabRojarDHooIn9Uw6AEX/zZ7BZqfgVmJw==", "dependencies": { "debug": "^4.3.4", "pino": "^8.8.0" @@ -3070,6 +3070,11 @@ "@types/node": "*" } }, + "node_modules/@types/webrtc": { + "version": "0.0.37", + "resolved": "https://registry.npmjs.org/@types/webrtc/-/webrtc-0.0.37.tgz", + "integrity": "sha512-JGAJC/ZZDhcrrmepU4sPLQLIOIAgs5oIK+Ieq90K8fdaNMhfdfqmYatJdgif1NDQtvrSlTOGJDUYHIDunuufOg==" + }, "node_modules/@types/websocket": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/@types/websocket/-/websocket-1.0.5.tgz", @@ -6872,10 +6877,11 @@ } }, "node_modules/microsoft-cognitiveservices-speech-sdk": { - "version": "1.28.0", - "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.28.0.tgz", - "integrity": "sha512-d+hCqTSeVCGtog5BgUKdIVTNifuigap9VSJbtDUP4kW1uG/yp7zRnqsjYx9nV9sRfuiYwZCyFzGG+VXGa37QDw==", + "version": "1.33.0", + "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.33.0.tgz", + "integrity": "sha512-1I9tDcUaR5PJ3V9m8ecfKaPhPbYpadBEpOHi7E8VS6b5VQYoCllkAQ34WCBs05EI2Zo+8VuYXq7ClMS/IVXY/A==", "dependencies": { + "@types/webrtc": "^0.0.37", "agent-base": "^6.0.1", "bent": "^7.3.12", "https-proxy-agent": "^4.0.0", @@ -11295,11 +11301,11 @@ } }, "@jambonz/speech-utils": { - "version": "0.0.15", - "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.15.tgz", - "integrity": "sha512-di8jVnSdCXzkhKrX4vS5MyALw2KS3D1kodGiral3E9NrcUXDZhiLs7vZOW4EdAS1TTykl+Rb/GwNQ8xqkufxKQ==", + "version": "0.0.24", + "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.24.tgz", + "integrity": "sha512-FpywxkjohC7wBBS9Xz9pRYXyO9D/tt2xgHFseWSoe0kO/6DbFudoLAI5uGB7AyUbTeGeKBiVZDJSNadsM4Iv4A==", "requires": { - "@aws-sdk/client-polly": "^3.303.0", + "@aws-sdk/client-polly": "^3.359.0", "@google-cloud/text-to-speech": "^4.2.1", "@grpc/grpc-js": "^1.8.13", "bent": "^7.3.12", @@ -11308,7 +11314,7 @@ "google-protobuf": "^3.21.2", "ibm-watson": "^8.0.0", "ioredis": "^5.3.2", - "microsoft-cognitiveservices-speech-sdk": "^1.26.0", + "microsoft-cognitiveservices-speech-sdk": "^1.31.0", "undici": "^5.21.0" }, "dependencies": { @@ -11414,9 +11420,9 @@ } }, "@jambonz/verb-specifications": { - "version": "0.0.29", - "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.29.tgz", - "integrity": "sha512-jeYI+GN7Y5nXhdFG3SXvXaBlhCjIC+l5AcBywDDGxxyuuKRTukPS0MSvCtWPZP6H3wYYGqfJ4DR/vgtBF3pvyQ==", + "version": "0.0.45", + "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.45.tgz", + "integrity": "sha512-0cC7cfyXuOlqjfrtA9GC7A84efInj4z+ZSsibONqHMw3FVJE5IvcvabRojarDHooIn9Uw6AEX/zZ7BZqfgVmJw==", "requires": { "debug": "^4.3.4", "pino": "^8.8.0" @@ -12235,6 +12241,11 @@ "@types/node": "*" } }, + "@types/webrtc": { + "version": "0.0.37", + "resolved": "https://registry.npmjs.org/@types/webrtc/-/webrtc-0.0.37.tgz", + "integrity": "sha512-JGAJC/ZZDhcrrmepU4sPLQLIOIAgs5oIK+Ieq90K8fdaNMhfdfqmYatJdgif1NDQtvrSlTOGJDUYHIDunuufOg==" + }, "@types/websocket": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/@types/websocket/-/websocket-1.0.5.tgz", @@ -15124,10 +15135,11 @@ } }, "microsoft-cognitiveservices-speech-sdk": { - "version": "1.28.0", - "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.28.0.tgz", - "integrity": "sha512-d+hCqTSeVCGtog5BgUKdIVTNifuigap9VSJbtDUP4kW1uG/yp7zRnqsjYx9nV9sRfuiYwZCyFzGG+VXGa37QDw==", + "version": "1.33.0", + "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.33.0.tgz", + "integrity": "sha512-1I9tDcUaR5PJ3V9m8ecfKaPhPbYpadBEpOHi7E8VS6b5VQYoCllkAQ34WCBs05EI2Zo+8VuYXq7ClMS/IVXY/A==", "requires": { + "@types/webrtc": "^0.0.37", "agent-base": "^6.0.1", "bent": "^7.3.12", "https-proxy-agent": "^4.0.0", diff --git a/package.json b/package.json index cf12e5a1..6e93039b 100644 --- a/package.json +++ b/package.json @@ -29,9 +29,9 @@ "@jambonz/lamejs": "^1.2.2", "@jambonz/mw-registrar": "^0.2.5", "@jambonz/realtimedb-helpers": "^0.8.7", - "@jambonz/speech-utils": "^0.0.15", + "@jambonz/speech-utils": "^0.0.24", "@jambonz/time-series": "^0.2.8", - "@jambonz/verb-specifications": "^0.0.29", + "@jambonz/verb-specifications": "^0.0.45", "@soniox/soniox-node": "^1.1.1", "argon2": "^0.30.3", "assemblyai": "^3.0.1", From f2ab0939423d83c651f39393c495f4b3621de932 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Sun, 5 Nov 2023 21:00:21 +0700 Subject: [PATCH 02/11] fix failing testcase --- lib/routes/api/accounts.js | 210 ----------------------------------- lib/routes/api/tts-cache.js | 215 ++++++++++++++++++++++++++++++++++++ package-lock.json | 26 ++--- package.json | 2 +- 4 files changed, 223 insertions(+), 230 deletions(-) diff --git a/lib/routes/api/accounts.js b/lib/routes/api/accounts.js index d546d5e5..5e962094 100644 --- a/lib/routes/api/accounts.js +++ b/lib/routes/api/accounts.js @@ -26,9 +26,6 @@ const VoipCarrier = require('../../models/voip-carrier'); const { encrypt } = require('../../utils/encrypt-decrypt'); const { testS3Storage, testGoogleStorage, testAzureStorage } = require('../../utils/storage-utils'); const translator = short(); -const { validate } = require('@jambonz/verb-specifications'); -const SpeechCredential = require('../../models/speech-credential'); -const fs = require('fs'); let idx = 0; @@ -1041,212 +1038,5 @@ router.get('/:sid/Queues', async(req, res) => { } }); -// Tts -const validateTtsRequestBody = async(body, logger) => { - let copiedBody = {...body}; - if (typeof copiedBody !== 'object') { - throw new DbErrorBadRequest('Invalid tts request body, it should be say verb object'); - } - copiedBody.verb = 'say'; - copiedBody = [copiedBody]; - try { - validate(logger, copiedBody); - } catch (err) { - throw new DbErrorBadRequest(err); - } -}; - -const getSpeechCredential = (credentials) => { - for (const credential of credentials) { - if (credential.use_for_tts && credential.tts_tested_ok) { - const { vendor } = credential; - if ('google' === vendor) { - const cred = JSON.parse(credential.service_key.replace(/\n/g, '\\n')); - return { - speech_credential_sid: credential.speech_credential_sid, - credentials: cred - }; - } - else if (['aws', 'polly'].includes(vendor)) { - return { - speech_credential_sid: credential.speech_credential_sid, - accessKeyId: credential.access_key_id, - secretAccessKey: credential.secret_access_key, - region: credential.aws_region - }; - } - else if ('microsoft' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key, - region: credential.region, - use_custom_stt: credential.use_custom_stt, - custom_stt_endpoint: credential.custom_stt_endpoint, - custom_stt_endpoint_url: credential.custom_stt_endpoint_url, - use_custom_tts: credential.use_custom_tts, - custom_tts_endpoint: credential.custom_tts_endpoint, - custom_tts_endpoint_url: credential.custom_tts_endpoint_url - }; - } - else if ('wellsaid' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key - }; - } - else if ('nuance' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - client_id: credential.client_id, - secret: credential.secret, - nuance_tts_uri: credential.nuance_tts_uri, - nuance_stt_uri: credential.nuance_stt_uri - }; - } - else if ('deepgram' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key - }; - } - else if ('soniox' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key - }; - } - else if ('ibm' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - tts_api_key: credential.tts_api_key, - tts_region: credential.tts_region, - stt_api_key: credential.stt_api_key, - stt_region: credential.stt_region - }; - } - else if ('nvidia' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - riva_server_uri: credential.riva_server_uri - }; - } - else if ('cobalt' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - cobalt_server_uri: credential.cobalt_server_uri - }; - } else if ('elevenlabs' === vendor) { - return { - api_key: credential.api_key, - model_id: credential.model_id - }; - } else if ('assemblyai' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key - }; - } else if (vendor.startsWith('custom:')) { - return { - speech_credential_sid: credential.speech_credential_sid, - auth_token: credential.auth_token, - custom_stt_url: credential.custom_stt_url, - custom_tts_url: credential.custom_tts_url - }; - } - } - } -}; - -router.post('/:sid/Tts', async(req, res) => { - const {logger, synthAudio} = req.app.locals; - try { - const accountSid = parseAccountSid(req); - const body = req.body; - await validateRequest(req, accountSid); - await validateTtsRequestBody(body, logger); - const { text, synthesizer } = body; - const { vendor, label, language } = synthesizer; - const engine = synthesizer.engine || 'standard'; - const options = synthesizer.options || {}; - const salt = uuidv4(); - /* parse Nuance voices into name and model */ - let voice = synthesizer.voice; - let model; - if (vendor === 'nuance' && voice) { - const arr = /([A-Za-z-]*)\s+-\s+(enhanced|standard)/.exec(voice); - if (arr) { - voice = arr[1]; - model = arr[2]; - } - } - - const result = await Account.retrieve(accountSid); - if (!result || result.length === 0) { - throw new DbErrorBadRequest(`Account not found for sid ${accountSid}`); - } - if (result[0].is_active) { - throw new DbErrorBadRequest(`Account not active for sid ${accountSid}`); - } - - const speechCreds = await SpeechCredential.getSpeechCredentialsByVendorAndLabel(null, accountSid, vendor, label); - if (!speechCreds || speechCreds.length === 0) { - throw new - DbErrorBadRequest(`There is no available speech credential for ${vendor}${label ? ` and ${label}` : ''}`); - } - - let credentials = getSpeechCredential(speechCreds); - if (!credentials) { - throw new - DbErrorBadRequest(`There is no available speech credential for ${vendor}${label ? ` and ${label}` : ''}`); - } - /* allow for microsoft custom region voice and api_key to be specified as an override */ - if (vendor === 'microsoft' && options.deploymentId) { - credentials = credentials || {}; - credentials.use_custom_tts = true; - credentials.custom_tts_endpoint = options.deploymentId; - credentials.api_key = options.apiKey || credentials.apiKey; - credentials.region = options.region || credentials.region; - voice = options.voice || voice; - } - const stats = { - histogram: () => {}, - increment: () => {}, - }; - const { filePath } = await synthAudio(stats, { - account_sid: accountSid, - text, - vendor, - language, - voice, - engine, - model, - salt, - credentials, - disableTtsCache: false - }); - - const stat = fs.statSync(filePath); - res.writeHead(200, { - 'Content-Type': 'audio/mpeg', - 'Content-Length': stat.size, - }); - - const readStream = fs.createReadStream(filePath); - // We replaced all the event handlers with a simple call to readStream.pipe() - readStream.pipe(res); - - readStream.on('end', () => { - // Delete the file after it's been read - fs.unlink(filePath, (err) => { - if (err) throw err; - logger.error(`${filePath} was deleted`); - }); - }); - - } catch (err) { - sysError(logger, res, err); - } -}); - module.exports = router; diff --git a/lib/routes/api/tts-cache.js b/lib/routes/api/tts-cache.js index 7bc5c371..398994ec 100644 --- a/lib/routes/api/tts-cache.js +++ b/lib/routes/api/tts-cache.js @@ -2,6 +2,13 @@ const router = require('express').Router(); const { parseAccountSid } = require('./utils'); +const { validate } = require('@jambonz/verb-specifications'); +const SpeechCredential = require('../../models/speech-credential'); +const fs = require('fs'); +const { v4: uuidv4 } = require('uuid'); +const {DbErrorBadRequest} = require('../../utils/errors'); +const Account = require('../../models/account'); +const sysError = require('../error'); router.delete('/', async(req, res) => { const {purgeTtsCache} = req.app.locals; @@ -26,4 +33,212 @@ router.get('/', async(req, res) => { res.status(200).json({size}); }); +// Tts +const validateTtsRequestBody = async(body, logger) => { + let copiedBody = {...body}; + if (typeof copiedBody !== 'object') { + throw new DbErrorBadRequest('Invalid tts request body, it should be say verb object'); + } + copiedBody.verb = 'say'; + copiedBody = [copiedBody]; + try { + validate(logger, copiedBody); + } catch (err) { + throw new DbErrorBadRequest(err); + } +}; + + +const getSpeechCredential = (credentials) => { + for (const credential of credentials) { + if (credential.use_for_tts && credential.tts_tested_ok) { + const { vendor } = credential; + if ('google' === vendor) { + const cred = JSON.parse(credential.service_key.replace(/\n/g, '\\n')); + return { + speech_credential_sid: credential.speech_credential_sid, + credentials: cred + }; + } + else if (['aws', 'polly'].includes(vendor)) { + return { + speech_credential_sid: credential.speech_credential_sid, + accessKeyId: credential.access_key_id, + secretAccessKey: credential.secret_access_key, + region: credential.aws_region + }; + } + else if ('microsoft' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key, + region: credential.region, + use_custom_stt: credential.use_custom_stt, + custom_stt_endpoint: credential.custom_stt_endpoint, + custom_stt_endpoint_url: credential.custom_stt_endpoint_url, + use_custom_tts: credential.use_custom_tts, + custom_tts_endpoint: credential.custom_tts_endpoint, + custom_tts_endpoint_url: credential.custom_tts_endpoint_url + }; + } + else if ('wellsaid' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key + }; + } + else if ('nuance' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + client_id: credential.client_id, + secret: credential.secret, + nuance_tts_uri: credential.nuance_tts_uri, + nuance_stt_uri: credential.nuance_stt_uri + }; + } + else if ('deepgram' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key + }; + } + else if ('soniox' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key + }; + } + else if ('ibm' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + tts_api_key: credential.tts_api_key, + tts_region: credential.tts_region, + stt_api_key: credential.stt_api_key, + stt_region: credential.stt_region + }; + } + else if ('nvidia' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + riva_server_uri: credential.riva_server_uri + }; + } + else if ('cobalt' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + cobalt_server_uri: credential.cobalt_server_uri + }; + } else if ('elevenlabs' === vendor) { + return { + api_key: credential.api_key, + model_id: credential.model_id + }; + } else if ('assemblyai' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key + }; + } else if (vendor.startsWith('custom:')) { + return { + speech_credential_sid: credential.speech_credential_sid, + auth_token: credential.auth_token, + custom_stt_url: credential.custom_stt_url, + custom_tts_url: credential.custom_tts_url + }; + } + } + } +}; + + +router.post('/Synthesize', async(req, res) => { + const {logger, synthAudio} = req.app.locals; + try { + const accountSid = parseAccountSid(req); + const body = req.body; + await validateTtsRequestBody(body, logger); + const { text, synthesizer } = body; + const { vendor, label, language } = synthesizer; + const engine = synthesizer.engine || 'standard'; + const options = synthesizer.options || {}; + const salt = uuidv4(); + /* parse Nuance voices into name and model */ + let voice = synthesizer.voice; + let model; + if (vendor === 'nuance' && voice) { + const arr = /([A-Za-z-]*)\s+-\s+(enhanced|standard)/.exec(voice); + if (arr) { + voice = arr[1]; + model = arr[2]; + } + } + + const result = await Account.retrieve(accountSid); + if (!result || result.length === 0) { + throw new DbErrorBadRequest(`Account not found for sid ${accountSid}`); + } + if (result[0].is_active) { + throw new DbErrorBadRequest(`Account not active for sid ${accountSid}`); + } + + const speechCreds = await SpeechCredential.getSpeechCredentialsByVendorAndLabel(null, accountSid, vendor, label); + if (!speechCreds || speechCreds.length === 0) { + throw new + DbErrorBadRequest(`There is no available speech credential for ${vendor}${label ? ` and ${label}` : ''}`); + } + + let credentials = getSpeechCredential(speechCreds); + if (!credentials) { + throw new + DbErrorBadRequest(`There is no available speech credential for ${vendor}${label ? ` and ${label}` : ''}`); + } + /* allow for microsoft custom region voice and api_key to be specified as an override */ + if (vendor === 'microsoft' && options.deploymentId) { + credentials = credentials || {}; + credentials.use_custom_tts = true; + credentials.custom_tts_endpoint = options.deploymentId; + credentials.api_key = options.apiKey || credentials.apiKey; + credentials.region = options.region || credentials.region; + voice = options.voice || voice; + } + const stats = { + histogram: () => {}, + increment: () => {}, + }; + const { filePath } = await synthAudio(stats, { + account_sid: accountSid, + text, + vendor, + language, + voice, + engine, + model, + salt, + credentials, + disableTtsCache: false + }); + + const stat = fs.statSync(filePath); + res.writeHead(200, { + 'Content-Type': 'audio/mpeg', + 'Content-Length': stat.size, + }); + + const readStream = fs.createReadStream(filePath); + // We replaced all the event handlers with a simple call to readStream.pipe() + readStream.pipe(res); + + readStream.on('end', () => { + // Delete the file after it's been read + fs.unlink(filePath, (err) => { + if (err) throw err; + logger.error(`${filePath} was deleted`); + }); + }); + + } catch (err) { + sysError(logger, res, err); + } +}); + module.exports = router; diff --git a/package-lock.json b/package-lock.json index a050cef0..a4d57d29 100644 --- a/package-lock.json +++ b/package-lock.json @@ -35,7 +35,7 @@ "ibm-watson": "^7.1.2", "jsonwebtoken": "^9.0.0", "mailgun.js": "^9.1.2", - "microsoft-cognitiveservices-speech-sdk": "^1.24.1", + "microsoft-cognitiveservices-speech-sdk": "1.31.0", "mysql2": "^2.3.3", "nocache": "3.0.4", "passport": "^0.6.0", @@ -3070,11 +3070,6 @@ "@types/node": "*" } }, - "node_modules/@types/webrtc": { - "version": "0.0.37", - "resolved": "https://registry.npmjs.org/@types/webrtc/-/webrtc-0.0.37.tgz", - "integrity": "sha512-JGAJC/ZZDhcrrmepU4sPLQLIOIAgs5oIK+Ieq90K8fdaNMhfdfqmYatJdgif1NDQtvrSlTOGJDUYHIDunuufOg==" - }, "node_modules/@types/websocket": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/@types/websocket/-/websocket-1.0.5.tgz", @@ -6877,11 +6872,10 @@ } }, "node_modules/microsoft-cognitiveservices-speech-sdk": { - "version": "1.33.0", - "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.33.0.tgz", - "integrity": "sha512-1I9tDcUaR5PJ3V9m8ecfKaPhPbYpadBEpOHi7E8VS6b5VQYoCllkAQ34WCBs05EI2Zo+8VuYXq7ClMS/IVXY/A==", + "version": "1.31.0", + "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.31.0.tgz", + "integrity": "sha512-wmNi0XoGtQwRoI2To6QSrGHVW0d8WfhJwXtE2nk48l4YkBiDqdPV2tdSXFHRrdv3uwr/+THip45H91Fllpm8qA==", "dependencies": { - "@types/webrtc": "^0.0.37", "agent-base": "^6.0.1", "bent": "^7.3.12", "https-proxy-agent": "^4.0.0", @@ -12241,11 +12235,6 @@ "@types/node": "*" } }, - "@types/webrtc": { - "version": "0.0.37", - "resolved": "https://registry.npmjs.org/@types/webrtc/-/webrtc-0.0.37.tgz", - "integrity": "sha512-JGAJC/ZZDhcrrmepU4sPLQLIOIAgs5oIK+Ieq90K8fdaNMhfdfqmYatJdgif1NDQtvrSlTOGJDUYHIDunuufOg==" - }, "@types/websocket": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/@types/websocket/-/websocket-1.0.5.tgz", @@ -15135,11 +15124,10 @@ } }, "microsoft-cognitiveservices-speech-sdk": { - "version": "1.33.0", - "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.33.0.tgz", - "integrity": "sha512-1I9tDcUaR5PJ3V9m8ecfKaPhPbYpadBEpOHi7E8VS6b5VQYoCllkAQ34WCBs05EI2Zo+8VuYXq7ClMS/IVXY/A==", + "version": "1.31.0", + "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.31.0.tgz", + "integrity": "sha512-wmNi0XoGtQwRoI2To6QSrGHVW0d8WfhJwXtE2nk48l4YkBiDqdPV2tdSXFHRrdv3uwr/+THip45H91Fllpm8qA==", "requires": { - "@types/webrtc": "^0.0.37", "agent-base": "^6.0.1", "bent": "^7.3.12", "https-proxy-agent": "^4.0.0", diff --git a/package.json b/package.json index 6e93039b..749a1854 100644 --- a/package.json +++ b/package.json @@ -45,7 +45,7 @@ "ibm-watson": "^7.1.2", "jsonwebtoken": "^9.0.0", "mailgun.js": "^9.1.2", - "microsoft-cognitiveservices-speech-sdk": "^1.24.1", + "microsoft-cognitiveservices-speech-sdk": "1.31.0", "mysql2": "^2.3.3", "nocache": "3.0.4", "passport": "^0.6.0", From 6f11ed9476003859dea7569490785f5dc93f11d8 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Sun, 5 Nov 2023 21:41:07 +0700 Subject: [PATCH 03/11] wip --- lib/models/speech-credential.js | 10 ++++++++++ lib/routes/api/tts-cache.js | 9 +++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/lib/models/speech-credential.js b/lib/models/speech-credential.js index 4e124354..7371d697 100644 --- a/lib/models/speech-credential.js +++ b/lib/models/speech-credential.js @@ -32,6 +32,16 @@ class SpeechCredential extends Model { return rows; } + static async getSPeechCredentialsForAccount(account_sid) { + const sql = `SELECT * +FROM speech_credentials +WHERE account_sid = ? OR (account_sid is NULL AND service_provider_sid = +(SELECT service_provider_sid from accounts where account_sid = ?))`; + + const [rows] = await promisePool.query(sql, [account_sid, account_sid]); + return rows; + } + static async disableStt(account_sid) { await promisePool.execute('UPDATE speech_credentials SET use_for_stt = 0 WHERE account_sid = ?', [account_sid]); } diff --git a/lib/routes/api/tts-cache.js b/lib/routes/api/tts-cache.js index 398994ec..9fc25aa9 100644 --- a/lib/routes/api/tts-cache.js +++ b/lib/routes/api/tts-cache.js @@ -49,9 +49,10 @@ const validateTtsRequestBody = async(body, logger) => { }; -const getSpeechCredential = (credentials) => { +const getSpeechCredential = (credentials, vendor, label) => { for (const credential of credentials) { - if (credential.use_for_tts && credential.tts_tested_ok) { + if (credential.use_for_tts && credential.tts_tested_ok && + credential.vendor === vendor && credential.label === label) { const { vendor } = credential; if ('google' === vendor) { const cred = JSON.parse(credential.service_key.replace(/\n/g, '\\n')); @@ -177,11 +178,11 @@ router.post('/Synthesize', async(req, res) => { if (!result || result.length === 0) { throw new DbErrorBadRequest(`Account not found for sid ${accountSid}`); } - if (result[0].is_active) { + if (!result[0].is_active) { throw new DbErrorBadRequest(`Account not active for sid ${accountSid}`); } - const speechCreds = await SpeechCredential.getSpeechCredentialsByVendorAndLabel(null, accountSid, vendor, label); + const speechCreds = await SpeechCredential.getSPeechCredentialsForAccount(accountSid); if (!speechCreds || speechCreds.length === 0) { throw new DbErrorBadRequest(`There is no available speech credential for ${vendor}${label ? ` and ${label}` : ''}`); From 27d2f6e9204d97659edd29fc802364b0fdf23051 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Sun, 5 Nov 2023 21:46:52 +0700 Subject: [PATCH 04/11] wip --- lib/routes/api/tts-cache.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/routes/api/tts-cache.js b/lib/routes/api/tts-cache.js index 9fc25aa9..71707ca4 100644 --- a/lib/routes/api/tts-cache.js +++ b/lib/routes/api/tts-cache.js @@ -188,10 +188,11 @@ router.post('/Synthesize', async(req, res) => { DbErrorBadRequest(`There is no available speech credential for ${vendor}${label ? ` and ${label}` : ''}`); } - let credentials = getSpeechCredential(speechCreds); + let credentials = getSpeechCredential(speechCreds, vendor, label); if (!credentials) { throw new - DbErrorBadRequest(`There is no available speech credential for ${vendor}${label ? ` and ${label}` : ''}`); + DbErrorBadRequest(`There is no available speech credential + for ${vendor}${label ? ` and ${label}` : ''} or tested`); } /* allow for microsoft custom region voice and api_key to be specified as an override */ if (vendor === 'microsoft' && options.deploymentId) { From 368b0f89bbd0982c12409a4366da1796338f3217 Mon Sep 17 00:00:00 2001 From: Hoan HL Date: Mon, 6 Nov 2023 09:19:32 +0700 Subject: [PATCH 05/11] wip --- lib/routes/api/tts-cache.js | 181 +++++------------------------------- 1 file changed, 22 insertions(+), 159 deletions(-) diff --git a/lib/routes/api/tts-cache.js b/lib/routes/api/tts-cache.js index 71707ca4..8434f617 100644 --- a/lib/routes/api/tts-cache.js +++ b/lib/routes/api/tts-cache.js @@ -2,13 +2,13 @@ const router = require('express').Router(); const { parseAccountSid } = require('./utils'); -const { validate } = require('@jambonz/verb-specifications'); const SpeechCredential = require('../../models/speech-credential'); const fs = require('fs'); const { v4: uuidv4 } = require('uuid'); const {DbErrorBadRequest} = require('../../utils/errors'); const Account = require('../../models/account'); const sysError = require('../error'); +const { decrypt } = require('../../utils/encrypt-decrypt'); router.delete('/', async(req, res) => { const {purgeTtsCache} = req.app.locals; @@ -33,175 +33,38 @@ router.get('/', async(req, res) => { res.status(200).json({size}); }); -// Tts -const validateTtsRequestBody = async(body, logger) => { - let copiedBody = {...body}; - if (typeof copiedBody !== 'object') { - throw new DbErrorBadRequest('Invalid tts request body, it should be say verb object'); - } - copiedBody.verb = 'say'; - copiedBody = [copiedBody]; - try { - validate(logger, copiedBody); - } catch (err) { - throw new DbErrorBadRequest(err); - } -}; - - -const getSpeechCredential = (credentials, vendor, label) => { - for (const credential of credentials) { - if (credential.use_for_tts && credential.tts_tested_ok && - credential.vendor === vendor && credential.label === label) { - const { vendor } = credential; - if ('google' === vendor) { - const cred = JSON.parse(credential.service_key.replace(/\n/g, '\\n')); - return { - speech_credential_sid: credential.speech_credential_sid, - credentials: cred - }; - } - else if (['aws', 'polly'].includes(vendor)) { - return { - speech_credential_sid: credential.speech_credential_sid, - accessKeyId: credential.access_key_id, - secretAccessKey: credential.secret_access_key, - region: credential.aws_region - }; - } - else if ('microsoft' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key, - region: credential.region, - use_custom_stt: credential.use_custom_stt, - custom_stt_endpoint: credential.custom_stt_endpoint, - custom_stt_endpoint_url: credential.custom_stt_endpoint_url, - use_custom_tts: credential.use_custom_tts, - custom_tts_endpoint: credential.custom_tts_endpoint, - custom_tts_endpoint_url: credential.custom_tts_endpoint_url - }; - } - else if ('wellsaid' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key - }; - } - else if ('nuance' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - client_id: credential.client_id, - secret: credential.secret, - nuance_tts_uri: credential.nuance_tts_uri, - nuance_stt_uri: credential.nuance_stt_uri - }; - } - else if ('deepgram' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key - }; - } - else if ('soniox' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key - }; - } - else if ('ibm' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - tts_api_key: credential.tts_api_key, - tts_region: credential.tts_region, - stt_api_key: credential.stt_api_key, - stt_region: credential.stt_region - }; - } - else if ('nvidia' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - riva_server_uri: credential.riva_server_uri - }; - } - else if ('cobalt' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - cobalt_server_uri: credential.cobalt_server_uri - }; - } else if ('elevenlabs' === vendor) { - return { - api_key: credential.api_key, - model_id: credential.model_id - }; - } else if ('assemblyai' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key - }; - } else if (vendor.startsWith('custom:')) { - return { - speech_credential_sid: credential.speech_credential_sid, - auth_token: credential.auth_token, - custom_stt_url: credential.custom_stt_url, - custom_tts_url: credential.custom_tts_url - }; - } - } - } -}; - - router.post('/Synthesize', async(req, res) => { const {logger, synthAudio} = req.app.locals; try { const accountSid = parseAccountSid(req); const body = req.body; - await validateTtsRequestBody(body, logger); - const { text, synthesizer } = body; - const { vendor, label, language } = synthesizer; - const engine = synthesizer.engine || 'standard'; - const options = synthesizer.options || {}; - const salt = uuidv4(); - /* parse Nuance voices into name and model */ - let voice = synthesizer.voice; - let model; - if (vendor === 'nuance' && voice) { - const arr = /([A-Za-z-]*)\s+-\s+(enhanced|standard)/.exec(voice); - if (arr) { - voice = arr[1]; - model = arr[2]; - } + if (!body.speech_credential_sid || !body.text || !body.language || !body.voice) { + throw new DbErrorBadRequest('speech_credential_sid, text, language, voice are all required'); } const result = await Account.retrieve(accountSid); - if (!result || result.length === 0) { + if (!result || result.length === 0 || !result[0].is_active) { throw new DbErrorBadRequest(`Account not found for sid ${accountSid}`); } - if (!result[0].is_active) { - throw new DbErrorBadRequest(`Account not active for sid ${accountSid}`); - } - - const speechCreds = await SpeechCredential.getSPeechCredentialsForAccount(accountSid); - if (!speechCreds || speechCreds.length === 0) { + const credentials = await SpeechCredential.retrieve(body.speech_credential_sid); + if (!credentials || credentials.length === 0) { throw new - DbErrorBadRequest(`There is no available speech credential for ${vendor}${label ? ` and ${label}` : ''}`); + DbErrorBadRequest(`There is no available speech credential for ${body.speech_credential_sid}`); } + const speechCredential = credentials[0]; + speechCredential.credential = JSON.parse(decrypt(speechCredential.credential)); - let credentials = getSpeechCredential(speechCreds, vendor, label); - if (!credentials) { - throw new - DbErrorBadRequest(`There is no available speech credential - for ${vendor}${label ? ` and ${label}` : ''} or tested`); - } - /* allow for microsoft custom region voice and api_key to be specified as an override */ - if (vendor === 'microsoft' && options.deploymentId) { - credentials = credentials || {}; - credentials.use_custom_tts = true; - credentials.custom_tts_endpoint = options.deploymentId; - credentials.api_key = options.apiKey || credentials.apiKey; - credentials.region = options.region || credentials.region; - voice = options.voice || voice; + const { text, language, engine = 'standard' } = body; + const salt = uuidv4(); + /* parse Nuance voices into name and model */ + let voice = body.voice; + let model; + if (speechCredential.vendor === 'nuance' && voice) { + const arr = /([A-Za-z-]*)\s+-\s+(enhanced|standard)/.exec(voice); + if (arr) { + voice = arr[1]; + model = arr[2]; + } } const stats = { histogram: () => {}, @@ -210,13 +73,13 @@ router.post('/Synthesize', async(req, res) => { const { filePath } = await synthAudio(stats, { account_sid: accountSid, text, - vendor, + vendor: speechCredential.vendor, language, voice, engine, model, salt, - credentials, + credentials: speechCredential, disableTtsCache: false }); From 7d0dd4ea2fba37cb069ab0ffb49a82340868764d Mon Sep 17 00:00:00 2001 From: Hoan HL Date: Mon, 6 Nov 2023 10:02:07 +0700 Subject: [PATCH 06/11] wip --- lib/routes/api/tts-cache.js | 6 +- lib/utils/speech-utils.js | 106 +++++++++++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 4 deletions(-) diff --git a/lib/routes/api/tts-cache.js b/lib/routes/api/tts-cache.js index 8434f617..431b886f 100644 --- a/lib/routes/api/tts-cache.js +++ b/lib/routes/api/tts-cache.js @@ -8,7 +8,7 @@ const { v4: uuidv4 } = require('uuid'); const {DbErrorBadRequest} = require('../../utils/errors'); const Account = require('../../models/account'); const sysError = require('../error'); -const { decrypt } = require('../../utils/encrypt-decrypt'); +const { getSpeechCredential } = require('../../utils/speech-utils'); router.delete('/', async(req, res) => { const {purgeTtsCache} = req.app.locals; @@ -52,7 +52,7 @@ router.post('/Synthesize', async(req, res) => { DbErrorBadRequest(`There is no available speech credential for ${body.speech_credential_sid}`); } const speechCredential = credentials[0]; - speechCredential.credential = JSON.parse(decrypt(speechCredential.credential)); + const cred = getSpeechCredential(speechCredential); const { text, language, engine = 'standard' } = body; const salt = uuidv4(); @@ -79,7 +79,7 @@ router.post('/Synthesize', async(req, res) => { engine, model, salt, - credentials: speechCredential, + credentials: cred, disableTtsCache: false }); diff --git a/lib/utils/speech-utils.js b/lib/utils/speech-utils.js index 35bc0972..06dcd299 100644 --- a/lib/utils/speech-utils.js +++ b/lib/utils/speech-utils.js @@ -287,6 +287,109 @@ const testAssemblyStt = async(logger, credentials) => { }); }; +const getSpeechCredential = (credential, logger) => { + const {vendor} = credential; + logger.info( + `Speech vendor: ${credential.vendor} ${credential.label ? `, label: ${credential.label}` : ''} selected`); + if ('google' === vendor) { + try { + const cred = JSON.parse(credential.service_key.replace(/\n/g, '\\n')); + return { + speech_credential_sid: credential.speech_credential_sid, + credentials: cred + }; + } catch (err) { + logger.info({err}, `malformed google service_key provisioned for account ${credential.speech_credential_sid}`); + } + } + else if (['aws', 'polly'].includes(vendor)) { + return { + speech_credential_sid: credential.speech_credential_sid, + accessKeyId: credential.access_key_id, + secretAccessKey: credential.secret_access_key, + region: credential.aws_region || 'us-east-1' + }; + } + else if ('microsoft' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key, + region: credential.region, + use_custom_stt: credential.use_custom_stt, + custom_stt_endpoint: credential.custom_stt_endpoint, + custom_stt_endpoint_url: credential.custom_stt_endpoint_url, + use_custom_tts: credential.use_custom_tts, + custom_tts_endpoint: credential.custom_tts_endpoint, + custom_tts_endpoint_url: credential.custom_tts_endpoint_url + }; + } + else if ('wellsaid' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key + }; + } + else if ('nuance' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + client_id: credential.client_id, + secret: credential.secret, + nuance_tts_uri: credential.nuance_tts_uri, + nuance_stt_uri: credential.nuance_stt_uri + }; + } + else if ('deepgram' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key + }; + } + else if ('soniox' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key + }; + } + else if ('ibm' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + tts_api_key: credential.tts_api_key, + tts_region: credential.tts_region, + stt_api_key: credential.stt_api_key, + stt_region: credential.stt_region + }; + } + else if ('nvidia' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + riva_server_uri: credential.riva_server_uri + }; + } + else if ('cobalt' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + cobalt_server_uri: credential.cobalt_server_uri + }; + } else if ('elevenlabs' === vendor) { + return { + api_key: credential.api_key, + model_id: credential.model_id + }; + } else if ('assemblyai' === vendor) { + return { + speech_credential_sid: credential.speech_credential_sid, + api_key: credential.api_key + }; + } else if (vendor.startsWith('custom:')) { + return { + speech_credential_sid: credential.speech_credential_sid, + auth_token: credential.auth_token, + custom_stt_url: credential.custom_stt_url, + custom_tts_url: credential.custom_tts_url + }; + } +}; + module.exports = { testGoogleTts, testGoogleStt, @@ -303,5 +406,6 @@ module.exports = { testIbmStt, testSonioxStt, testElevenlabs, - testAssemblyStt + testAssemblyStt, + getSpeechCredential }; From bd38d05b0facab4a124082b722c7c91bc2df9e1f Mon Sep 17 00:00:00 2001 From: Hoan HL Date: Mon, 6 Nov 2023 10:48:03 +0700 Subject: [PATCH 07/11] uip --- lib/routes/api/speech-credentials.js | 77 +------------- lib/routes/api/tts-cache.js | 12 ++- lib/utils/speech-utils.js | 148 +++++++++++++-------------- 3 files changed, 83 insertions(+), 154 deletions(-) diff --git a/lib/routes/api/speech-credentials.js b/lib/routes/api/speech-credentials.js index 3756b980..7c37ea9d 100644 --- a/lib/routes/api/speech-credentials.js +++ b/lib/routes/api/speech-credentials.js @@ -3,8 +3,8 @@ const assert = require('assert'); const Account = require('../../models/account'); const SpeechCredential = require('../../models/speech-credential'); const sysError = require('../error'); -const {decrypt, encrypt, obscureKey} = require('../../utils/encrypt-decrypt'); -const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = require('./utils'); +const {decrypt, encrypt} = require('../../utils/encrypt-decrypt'); +const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid, decryptCredential} = require('./utils'); const {DbErrorUnprocessableRequest, DbErrorForbidden} = require('../../utils/errors'); const { testGoogleTts, @@ -274,79 +274,6 @@ router.post('/', async(req, res) => { } }); -function decryptCredential(obj, credential, logger) { - if ('google' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - const key_header = '-----BEGIN PRIVATE KEY-----\n'; - const obscured = { - ...o, - private_key: `${key_header}${obscureKey(o.private_key.slice(key_header.length, o.private_key.length))}` - }; - obj.service_key = JSON.stringify(obscured); - } - else if ('aws' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.access_key_id = o.access_key_id; - obj.secret_access_key = obscureKey(o.secret_access_key); - obj.aws_region = o.aws_region; - logger.info({obj, o}, 'retrieving aws speech credential'); - } - else if ('microsoft' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.api_key = obscureKey(o.api_key); - obj.region = o.region; - obj.use_custom_tts = o.use_custom_tts; - obj.custom_tts_endpoint = o.custom_tts_endpoint; - obj.custom_tts_endpoint_url = o.custom_tts_endpoint_url; - obj.use_custom_stt = o.use_custom_stt; - obj.custom_stt_endpoint = o.custom_stt_endpoint; - obj.custom_stt_endpoint_url = o.custom_stt_endpoint_url; - logger.info({obj, o}, 'retrieving azure speech credential'); - } - else if ('wellsaid' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.api_key = obscureKey(o.api_key); - } - else if ('nuance' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.client_id = o.client_id; - obj.secret = o.secret ? obscureKey(o.secret) : null; - } - else if ('deepgram' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.api_key = obscureKey(o.api_key); - } - else if ('ibm' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.tts_api_key = obscureKey(o.tts_api_key); - obj.tts_region = o.tts_region; - obj.stt_api_key = obscureKey(o.stt_api_key); - obj.stt_region = o.stt_region; - obj.instance_id = o.instance_id; - } else if ('nvidia' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.riva_server_uri = o.riva_server_uri; - } else if ('cobalt' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.cobalt_server_uri = o.cobalt_server_uri; - } else if ('soniox' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.api_key = obscureKey(o.api_key); - } else if ('elevenlabs' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.api_key = obscureKey(o.api_key); - obj.model_id = o.model_id; - } else if (obj.vendor.startsWith('custom:')) { - const o = JSON.parse(decrypt(credential)); - obj.auth_token = obscureKey(o.auth_token); - obj.custom_stt_url = o.custom_stt_url; - obj.custom_tts_url = o.custom_tts_url; - } else if ('assemblyai' === obj.vendor) { - const o = JSON.parse(decrypt(credential)); - obj.api_key = obscureKey(o.api_key); - } -} - /** * retrieve all speech credentials for an account */ diff --git a/lib/routes/api/tts-cache.js b/lib/routes/api/tts-cache.js index 431b886f..70f9935d 100644 --- a/lib/routes/api/tts-cache.js +++ b/lib/routes/api/tts-cache.js @@ -8,7 +8,7 @@ const { v4: uuidv4 } = require('uuid'); const {DbErrorBadRequest} = require('../../utils/errors'); const Account = require('../../models/account'); const sysError = require('../error'); -const { getSpeechCredential } = require('../../utils/speech-utils'); +const { getSpeechCredential, decryptCredential } = require('../../utils/speech-utils'); router.delete('/', async(req, res) => { const {purgeTtsCache} = req.app.locals; @@ -51,15 +51,17 @@ router.post('/Synthesize', async(req, res) => { throw new DbErrorBadRequest(`There is no available speech credential for ${body.speech_credential_sid}`); } - const speechCredential = credentials[0]; - const cred = getSpeechCredential(speechCredential); + const {credential, ...obj} = credentials[0]; + + decryptCredential(obj, credential, logger, false); + const cred = getSpeechCredential(obj, logger); const { text, language, engine = 'standard' } = body; const salt = uuidv4(); /* parse Nuance voices into name and model */ let voice = body.voice; let model; - if (speechCredential.vendor === 'nuance' && voice) { + if (cred.vendor === 'nuance' && voice) { const arr = /([A-Za-z-]*)\s+-\s+(enhanced|standard)/.exec(voice); if (arr) { voice = arr[1]; @@ -73,7 +75,7 @@ router.post('/Synthesize', async(req, res) => { const { filePath } = await synthAudio(stats, { account_sid: accountSid, text, - vendor: speechCredential.vendor, + vendor: cred, language, voice, engine, diff --git a/lib/utils/speech-utils.js b/lib/utils/speech-utils.js index 06dcd299..dffa557e 100644 --- a/lib/utils/speech-utils.js +++ b/lib/utils/speech-utils.js @@ -6,6 +6,7 @@ const { SpeechClient } = require('@soniox/soniox-node'); const bent = require('bent'); const fs = require('fs'); const { AssemblyAI } = require('assemblyai'); +const {decrypt, obscureKey} = require('../../utils/encrypt-decrypt'); const testSonioxStt = async(logger, credentials) => { @@ -295,7 +296,7 @@ const getSpeechCredential = (credential, logger) => { try { const cred = JSON.parse(credential.service_key.replace(/\n/g, '\\n')); return { - speech_credential_sid: credential.speech_credential_sid, + ...credential, credentials: cred }; } catch (err) { @@ -304,91 +305,89 @@ const getSpeechCredential = (credential, logger) => { } else if (['aws', 'polly'].includes(vendor)) { return { - speech_credential_sid: credential.speech_credential_sid, + ...credential, accessKeyId: credential.access_key_id, secretAccessKey: credential.secret_access_key, region: credential.aws_region || 'us-east-1' }; } - else if ('microsoft' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key, - region: credential.region, - use_custom_stt: credential.use_custom_stt, - custom_stt_endpoint: credential.custom_stt_endpoint, - custom_stt_endpoint_url: credential.custom_stt_endpoint_url, - use_custom_tts: credential.use_custom_tts, - custom_tts_endpoint: credential.custom_tts_endpoint, - custom_tts_endpoint_url: credential.custom_tts_endpoint_url + return credential; +}; + +function decryptCredential(obj, credential, logger, isObscureKey = true) { + if ('google' === obj.vendor) { + const o = JSON.parse(decrypt(credential)); + const key_header = '-----BEGIN PRIVATE KEY-----\n'; + const obscured = { + ...o, + private_key: `${key_header}${isObscureKey ? + obscureKey(o.private_key.slice(key_header.length, o.private_key.length)) : + o.private_key.slice(key_header.length, o.private_key.length)}` }; + obj.service_key = JSON.stringify(obscured); } - else if ('wellsaid' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key - }; + else if ('aws' === obj.vendor) { + const o = JSON.parse(decrypt(credential)); + obj.access_key_id = o.access_key_id; + obj.secret_access_key = isObscureKey ? obscureKey(o.secret_access_key) : o.secret_access_key; + obj.aws_region = o.aws_region; + logger.info({obj, o}, 'retrieving aws speech credential'); } - else if ('nuance' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - client_id: credential.client_id, - secret: credential.secret, - nuance_tts_uri: credential.nuance_tts_uri, - nuance_stt_uri: credential.nuance_stt_uri - }; + else if ('microsoft' === obj.vendor) { + const o = JSON.parse(decrypt(credential)); + obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key; + obj.region = o.region; + obj.use_custom_tts = o.use_custom_tts; + obj.custom_tts_endpoint = o.custom_tts_endpoint; + obj.custom_tts_endpoint_url = o.custom_tts_endpoint_url; + obj.use_custom_stt = o.use_custom_stt; + obj.custom_stt_endpoint = o.custom_stt_endpoint; + obj.custom_stt_endpoint_url = o.custom_stt_endpoint_url; + logger.info({obj, o}, 'retrieving azure speech credential'); } - else if ('deepgram' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key - }; + else if ('wellsaid' === obj.vendor) { + const o = JSON.parse(decrypt(credential)); + obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key; } - else if ('soniox' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key - }; + else if ('nuance' === obj.vendor) { + const o = JSON.parse(decrypt(credential)); + obj.client_id = o.client_id; + obj.secret = o.secret ? (isObscureKey ? obscureKey(o.secret) : o.secret) : null; } - else if ('ibm' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - tts_api_key: credential.tts_api_key, - tts_region: credential.tts_region, - stt_api_key: credential.stt_api_key, - stt_region: credential.stt_region - }; + else if ('deepgram' === obj.vendor) { + const o = JSON.parse(decrypt(credential)); + obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key; } - else if ('nvidia' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - riva_server_uri: credential.riva_server_uri - }; - } - else if ('cobalt' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - cobalt_server_uri: credential.cobalt_server_uri - }; - } else if ('elevenlabs' === vendor) { - return { - api_key: credential.api_key, - model_id: credential.model_id - }; - } else if ('assemblyai' === vendor) { - return { - speech_credential_sid: credential.speech_credential_sid, - api_key: credential.api_key - }; - } else if (vendor.startsWith('custom:')) { - return { - speech_credential_sid: credential.speech_credential_sid, - auth_token: credential.auth_token, - custom_stt_url: credential.custom_stt_url, - custom_tts_url: credential.custom_tts_url - }; + else if ('ibm' === obj.vendor) { + const o = JSON.parse(decrypt(credential)); + obj.tts_api_key = isObscureKey ? obscureKey(o.tts_api_key) : o.tts_api_key; + obj.tts_region = o.tts_region; + obj.stt_api_key = isObscureKey ? obscureKey(o.stt_api_key) : o.stt_api_key; + obj.stt_region = o.stt_region; + obj.instance_id = o.instance_id; + } else if ('nvidia' === obj.vendor) { + const o = JSON.parse(decrypt(credential)); + obj.riva_server_uri = o.riva_server_uri; + } else if ('cobalt' === obj.vendor) { + const o = JSON.parse(decrypt(credential)); + obj.cobalt_server_uri = o.cobalt_server_uri; + } else if ('soniox' === obj.vendor) { + const o = JSON.parse(decrypt(credential)); + obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key; + } else if ('elevenlabs' === obj.vendor) { + const o = JSON.parse(decrypt(credential)); + obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key; + obj.model_id = o.model_id; + } else if (obj.vendor.startsWith('custom:')) { + const o = JSON.parse(decrypt(credential)); + obj.auth_token = isObscureKey ? obscureKey(o.auth_token) : o.auth_token; + obj.custom_stt_url = o.custom_stt_url; + obj.custom_tts_url = o.custom_tts_url; + } else if ('assemblyai' === obj.vendor) { + const o = JSON.parse(decrypt(credential)); + obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key; } -}; +} module.exports = { testGoogleTts, @@ -407,5 +406,6 @@ module.exports = { testSonioxStt, testElevenlabs, testAssemblyStt, - getSpeechCredential + getSpeechCredential, + decryptCredential }; From 8571f51bc368b819f5d25ec9a84a274d9690277d Mon Sep 17 00:00:00 2001 From: Hoan HL Date: Mon, 6 Nov 2023 10:50:16 +0700 Subject: [PATCH 08/11] wip --- lib/utils/speech-utils.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/utils/speech-utils.js b/lib/utils/speech-utils.js index dffa557e..215e2d43 100644 --- a/lib/utils/speech-utils.js +++ b/lib/utils/speech-utils.js @@ -6,7 +6,7 @@ const { SpeechClient } = require('@soniox/soniox-node'); const bent = require('bent'); const fs = require('fs'); const { AssemblyAI } = require('assemblyai'); -const {decrypt, obscureKey} = require('../../utils/encrypt-decrypt'); +const {decrypt, obscureKey} = require('./encrypt-decrypt'); const testSonioxStt = async(logger, credentials) => { From 29a88ddc16cfe6c3fa7dda289fd7c9e6e64ccec5 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Mon, 6 Nov 2023 13:16:04 +0700 Subject: [PATCH 09/11] fix --- lib/routes/api/speech-credentials.js | 3 ++- lib/routes/api/tts-cache.js | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/routes/api/speech-credentials.js b/lib/routes/api/speech-credentials.js index 7c37ea9d..41f56739 100644 --- a/lib/routes/api/speech-credentials.js +++ b/lib/routes/api/speech-credentials.js @@ -4,7 +4,8 @@ const Account = require('../../models/account'); const SpeechCredential = require('../../models/speech-credential'); const sysError = require('../error'); const {decrypt, encrypt} = require('../../utils/encrypt-decrypt'); -const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid, decryptCredential} = require('./utils'); +const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = require('./utils'); +const {decryptCredential} = require('../../utils/speech-utils'); const {DbErrorUnprocessableRequest, DbErrorForbidden} = require('../../utils/errors'); const { testGoogleTts, diff --git a/lib/routes/api/tts-cache.js b/lib/routes/api/tts-cache.js index 70f9935d..222e171b 100644 --- a/lib/routes/api/tts-cache.js +++ b/lib/routes/api/tts-cache.js @@ -75,7 +75,7 @@ router.post('/Synthesize', async(req, res) => { const { filePath } = await synthAudio(stats, { account_sid: accountSid, text, - vendor: cred, + vendor: cred.vendor, language, voice, engine, From eac3c0e7c1491cfc61de257705bcad30fe3434bc Mon Sep 17 00:00:00 2001 From: Quan HL Date: Mon, 6 Nov 2023 13:18:00 +0700 Subject: [PATCH 10/11] fix --- lib/models/speech-credential.js | 10 ---------- lib/routes/api/tts-cache.js | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/lib/models/speech-credential.js b/lib/models/speech-credential.js index 7371d697..4e124354 100644 --- a/lib/models/speech-credential.js +++ b/lib/models/speech-credential.js @@ -32,16 +32,6 @@ class SpeechCredential extends Model { return rows; } - static async getSPeechCredentialsForAccount(account_sid) { - const sql = `SELECT * -FROM speech_credentials -WHERE account_sid = ? OR (account_sid is NULL AND service_provider_sid = -(SELECT service_provider_sid from accounts where account_sid = ?))`; - - const [rows] = await promisePool.query(sql, [account_sid, account_sid]); - return rows; - } - static async disableStt(account_sid) { await promisePool.execute('UPDATE speech_credentials SET use_for_stt = 0 WHERE account_sid = ?', [account_sid]); } diff --git a/lib/routes/api/tts-cache.js b/lib/routes/api/tts-cache.js index 222e171b..9a0ff995 100644 --- a/lib/routes/api/tts-cache.js +++ b/lib/routes/api/tts-cache.js @@ -99,7 +99,7 @@ router.post('/Synthesize', async(req, res) => { // Delete the file after it's been read fs.unlink(filePath, (err) => { if (err) throw err; - logger.error(`${filePath} was deleted`); + logger.info(`${filePath} was deleted`); }); }); From 1be2b18a496989340bc81091bf473049ced69c25 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Mon, 6 Nov 2023 13:41:16 +0700 Subject: [PATCH 11/11] swagger --- lib/swagger/swagger.yaml | 59 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/lib/swagger/swagger.yaml b/lib/swagger/swagger.yaml index 11149f29..e6463fa6 100644 --- a/lib/swagger/swagger.yaml +++ b/lib/swagger/swagger.yaml @@ -4209,6 +4209,65 @@ paths: application/json: schema: $ref: '#/components/schemas/RegisteredClient' + /Accounts/{AccountSid}/TtsCache/Synthesize: + post: + tags: + - Accounts + summary: get TTS from provider + operationId: Synthesize + requestBody: + content: + application/json: + schema: + type: object + properties: + speech_credential_sid: + type: string + description: Speech credential Sid + example: 553b4b6b-8918-4394-a46d-1e3c5a3c717b + text: + type: string + description: the text to convert to audio + example: Hello How are you + language: + type: string + description: language is used in text + example: en-US + voice: + type: string + description: voice ID + example: en-US-Standard-C + required: + - speech_credential_sid + - text + - language + - voice + responses: + 200: + description: Audio is created + content: + audio/mpeg: + schema: + type: string + format: binary + 400: + description: bad request + content: + application/json: + schema: + $ref: '#/components/schemas/GeneralError' + 422: + description: unprocessable entity + content: + application/json: + schema: + $ref: '#/components/schemas/GeneralError' + 500: + description: system error + content: + application/json: + schema: + $ref: '#/components/schemas/GeneralError' /Lcrs: post: tags: