diff --git a/nvdaHelper/local/nvdaHelperLocal.def b/nvdaHelper/local/nvdaHelperLocal.def index ed63733f45..5f95d8cea8 100644 --- a/nvdaHelper/local/nvdaHelperLocal.def +++ b/nvdaHelper/local/nvdaHelperLocal.def @@ -82,6 +82,7 @@ EXPORTS wasPlay_pause wasPlay_resume wasPlay_setChannelVolume + wasPlay_startTrimmingLeadingSilence wasPlay_startup wasSilence_init wasSilence_playFor diff --git a/nvdaHelper/local/silenceDetect.h b/nvdaHelper/local/silenceDetect.h new file mode 100644 index 0000000000..ac1531ec3b --- /dev/null +++ b/nvdaHelper/local/silenceDetect.h @@ -0,0 +1,323 @@ +#ifndef SILENCEDETECT_H +#define SILENCEDETECT_H + +#include +#include +#include +#include +#include +#include + +namespace SilenceDetect { + +/** + * Compile-time wave format tag. + * Supports integer and floating-point formats. + * `SampleType` should be the smallest numeric type that can hold a sample, for example, 32-bit int for 24-bit format. + * Signedness of `SampleType` matters. For unsigned types, the zero point is at middle, e.g. 128 for 8-bit unsigned. + * `bytesPerSample` should be <= `sizeof(SampleType)` for integer formats, + * and == `sizeof(SampleType)` for floating-point formats. + * Assumes C++20 standard. + */ +template +struct WaveFormat { + static_assert(std::is_arithmetic_v, "SampleType should be an integer or floating-point type"); + static_assert(!(std::is_floating_point_v && bytesPerSample != sizeof(SampleType)), + "When SampleType is a floating-point type, bytesPerSample should be equal to sizeof(SampleType)"); + static_assert(!(std::is_integral_v && !(bytesPerSample <= sizeof(SampleType) && bytesPerSample > 0)), + "When SampleType is an integer type, bytesPerSample should be less than or equal to sizeof(SampleType) and greater than 0"); + + typedef SampleType SampleType; + static constexpr size_t bytesPerSample = bytesPerSample; + + static constexpr SampleType zeroPoint() { + // for unsigned types, zero point is at middle + // for signed types, zero is zero + if constexpr (std::is_unsigned_v) + return SampleType(1) << (bytesPerSample * 8 - 1); + else + return SampleType(); + } + + static constexpr SampleType (max)() { + if constexpr (std::is_floating_point_v) + return SampleType(1); + else + return (std::numeric_limits::max)() >> ((sizeof(SampleType) - bytesPerSample) * 8); + } + + static constexpr SampleType (min)() { + if constexpr (std::is_floating_point_v) + return SampleType(-1); + else + return (std::numeric_limits::min)() >> ((sizeof(SampleType) - bytesPerSample) * 8); + } + + static constexpr SampleType defaultThreshold() { + // Default threshold: 1 / 2^10 or 0.0009765625 + if constexpr (std::is_floating_point_v) + return SampleType(1) / (1 << 10); + else if constexpr (bytesPerSample * 8 > 10) + return SampleType(1) << (bytesPerSample * 8 - 10); + else + return SampleType(); + } + + static constexpr auto toSigned(SampleType smp) { + if constexpr (std::is_integral_v) { + // In C++20, signed integer types must use two's complement, + // so the following conversion is well-defined. + using SignedType = std::make_signed_t; + return SignedType(smp - zeroPoint()); + } else { + return smp; + } + } + + static constexpr SampleType fromSigned(SampleType smp) { + if constexpr (std::is_integral_v) { + // Signed overflow is undefined behavior, + // so convert to unsigned first. + using UnsignedType = std::make_unsigned_t; + return SampleType(UnsignedType(smp) + zeroPoint()); + } else { + return smp; + } + } + + static constexpr SampleType signExtend(SampleType smp) { + if constexpr (std::is_unsigned_v || bytesPerSample == sizeof(SampleType)) { + return smp; + } else { + constexpr auto shift = (sizeof(SampleType) - bytesPerSample) * 8; + // Convert to unsigned first to prevent left-shifting negative numbers + using UnsignedType = std::make_unsigned_t; + return SampleType(UnsignedType(smp) << shift) >> shift; + } + } + + template + static constexpr SampleType convertFrom(SrcFmt::SampleType smp) { + using SrcType = SrcFmt::SampleType; + if constexpr (std::is_floating_point_v && std::is_floating_point_v) { + // both floating points, convert directly + return SampleType(smp); + } else if constexpr (std::is_integral_v && std::is_integral_v) { + // both integers, do bit shifting + const auto dstsmp = SrcFmt::toSigned(smp); + if constexpr (bytesPerSample >= SrcFmt::bytesPerSample) { + constexpr auto shift = (bytesPerSample - SrcFmt::bytesPerSample) * 8; + // Convert to unsigned target type first to prevent overflows and left-shifting negative numbers + using UnsignedType = std::make_unsigned_t; + return fromSigned(UnsignedType(dstsmp) << shift); + } else { + constexpr auto shift = (SrcFmt::bytesPerSample - bytesPerSample) * 8; + return fromSigned(dstsmp >> shift); + } + } else if constexpr (std::is_floating_point_v && std::is_integral_v) { + // floating point to integer, e.g. [-1.0f, 1.0f] -> [-32767, 32767] + return fromSigned(smp * ((max)() - zeroPoint())); + } else { + // integer to floating point, e.g. [-32768, 32767] -> [-1.0f, 1.0f) + return SampleType(SrcFmt::toSigned(smp) / (SrcFmt::zeroPoint() - (SrcFmt::min)())); + } + } +}; + +inline WORD getFormatTag(const WAVEFORMATEX* wfx) { + if (wfx->wFormatTag == WAVE_FORMAT_EXTENSIBLE) { + auto wfext = reinterpret_cast(wfx); + if (IS_VALID_WAVEFORMATEX_GUID(&wfext->SubFormat)) + return EXTRACT_WAVEFORMATEX_ID(&wfext->SubFormat); + } + return wfx->wFormatTag; +} + +/** + * Return the leading silence wave data length, in bytes. + * Assumes the wave data to be of one channel (mono). + * Uses a `WaveFormat` type (`Fmt`) to determine the wave format. + */ +template +size_t getLeadingSilenceSizeMono( + const unsigned char* waveData, + size_t size, + typename Fmt::SampleType threshold +) { + using SampleType = Fmt::SampleType; + constexpr size_t bytesPerSample = Fmt::bytesPerSample; + + if (size < bytesPerSample) + return 0; + + constexpr SampleType zeroPoint = Fmt::zeroPoint(); + const SampleType minValue = zeroPoint - threshold, maxValue = zeroPoint + threshold; + + // Check each sample + const unsigned char* p = waveData; + const unsigned char* pEnd = waveData + (size - (size % bytesPerSample)); + for (; p != pEnd; p += bytesPerSample) { + SampleType smp; + memcpy(&smp, p, bytesPerSample); + smp = Fmt::signExtend(smp); + // this sample is out of range, so the leading silence starts at the previous sample + if (smp < minValue || smp > maxValue) + return p - waveData; + } + + // The whole data block is silence + return size; +} + +/** + * Return the trailing silence wave data length, in bytes. + * Assumes the wave data to be of one channel (mono). + * Uses a `WaveFormat` type (`Fmt`) to determine the wave format. + */ +template +size_t getTrailingSilenceSizeMono( + const unsigned char* waveData, + size_t size, + typename Fmt::SampleType threshold +) { + using SampleType = Fmt::SampleType; + constexpr size_t bytesPerSample = Fmt::bytesPerSample; + + if (size < bytesPerSample) + return 0; + + constexpr SampleType zeroPoint = Fmt::zeroPoint(); + const SampleType minValue = zeroPoint - threshold, maxValue = zeroPoint + threshold; + + // Check each sample in reverse order + const unsigned char* p = waveData + (size - (size % bytesPerSample)); + do { + p -= bytesPerSample; + SampleType smp; + memcpy(&smp, p, bytesPerSample); + smp = Fmt::signExtend(smp); + // this sample is out of range, so the trailing silence starts at the next sample + if (smp < minValue || smp > maxValue) + return size - (p - waveData) - bytesPerSample; + } while (p != waveData); + + // The whole data block is silence + return size; +} + +/** + * Invoke a functor with an argument of a WaveFormat type that corresponds to the specified WAVEFORMATEX. + * Return false if the WAVEFORMATEX is unknown. + */ +template +bool callByWaveFormat(const WAVEFORMATEX* wfx, Func&& func) { + switch (getFormatTag(wfx)) { + case WAVE_FORMAT_PCM: + switch (wfx->wBitsPerSample) { + case 8: // 8-bits are unsigned, others are signed + func(WaveFormat()); + break; + case 16: + func(WaveFormat()); + break; + case 24: + func(WaveFormat()); + break; + case 32: + func(WaveFormat()); + break; + default: + return false; + } + break; + case WAVE_FORMAT_IEEE_FLOAT: + switch (wfx->wBitsPerSample) { + case 32: + func(WaveFormat()); + break; + case 64: + func(WaveFormat()); + break; + default: + return false; + } + break; + default: + return false; + } + return true; +} + +/** + * Return the leading silence wave data length, in bytes. + * Uses a `WAVEFORMATEX` to determine the wave format. + */ +inline size_t getLeadingSilenceSize( + const WAVEFORMATEX* wfx, + const unsigned char* waveData, + size_t size +) { + size_t len; + if (!callByWaveFormat(wfx, [=, &len](auto fmtTag) { + using Fmt = decltype(fmtTag); + len = getLeadingSilenceSizeMono( + waveData, size, Fmt::defaultThreshold()); + })) + return 0; + + return len - len % wfx->nBlockAlign; // round down to block (channel) boundaries +} + +/** + * Return the trailing silence wave data length, in bytes. + * Uses a `WAVEFORMATEX` to determine the wave format. + */ +inline size_t getTrailingSilenceSize( + const WAVEFORMATEX* wfx, + const unsigned char* waveData, + size_t size +) { + size_t len; + if (!callByWaveFormat(wfx, [=, &len](auto fmtTag) { + using Fmt = decltype(fmtTag); + len = getTrailingSilenceSizeMono( + waveData, size, Fmt::defaultThreshold()); + })) + return 0; + + size_t align = wfx->nBlockAlign; + len += align - 1; + len -= len % align; // round up to block (channel) boundaries + return len; +} + +inline std::vector generateSilenceBytes(const WAVEFORMATEX* wfx, size_t size) { + std::vector wave; + size -= size % wfx->nBlockAlign; + callByWaveFormat(wfx, [=, &wave](auto fmtTag) { + using Fmt = decltype(fmtTag); + constexpr auto bytesPerSample = Fmt::bytesPerSample; + constexpr auto zeroPoint = Fmt::zeroPoint(); + if constexpr (zeroPoint == 0) { + wave.assign(size, 0); + } else if constexpr (bytesPerSample == 1) { + wave.assign(size, zeroPoint); + } else { + wave.assign(size, 0); + unsigned char *p = wave.data(); + unsigned char *pEnd = p + size; + for (; p != pEnd; p += bytesPerSample) { + memcpy(p, &zeroPoint, bytesPerSample); + } + } + }); + return wave; +} + +inline std::vector generateSilenceMs(const WAVEFORMATEX* wfx, unsigned int milliseconds) { + return generateSilenceBytes(wfx, (size_t)wfx->nAvgBytesPerSec * milliseconds / 1000); +} + +} // namespace SilenceDetect + +#endif // SILENCEDETECT_H diff --git a/nvdaHelper/local/wasapi.cpp b/nvdaHelper/local/wasapi.cpp index e942f6952d..540815c33b 100644 --- a/nvdaHelper/local/wasapi.cpp +++ b/nvdaHelper/local/wasapi.cpp @@ -24,6 +24,7 @@ This license can be found at: #include #include #include +#include "silenceDetect.h" /** * Support for audio playback using WASAPI. @@ -194,6 +195,8 @@ class WasapiPlayer { HRESULT resume(); HRESULT setChannelVolume(unsigned int channel, float level); + void startTrimmingLeadingSilence(bool start); + private: void maybeFireCallback(); @@ -245,6 +248,7 @@ class WasapiPlayer { unsigned int defaultDeviceChangeCount; unsigned int deviceStateChangeCount; bool isUsingPreferredDevice = false; + bool isTrimmingLeadingSilence = false; }; WasapiPlayer::WasapiPlayer(wchar_t* endpointId, WAVEFORMATEX format, @@ -342,6 +346,19 @@ HRESULT WasapiPlayer::feed(unsigned char* data, unsigned int size, return true; }; + if (isTrimmingLeadingSilence) { + size_t silenceSize = SilenceDetect::getLeadingSilenceSize(&format, data, size); + if (silenceSize >= size) { + // The whole chunk is silence. Continue checking for silence in the next chunk. + remainingFrames = 0; + } else { + // Silence ends in this chunk. Skip the silence and continue. + data += silenceSize; + remainingFrames = (size - silenceSize) / format.nBlockAlign; + isTrimmingLeadingSilence = false; // Stop checking for silence + } + } + while (remainingFrames > 0) { UINT32 paddingFrames; @@ -643,6 +660,10 @@ HRESULT WasapiPlayer::setChannelVolume(unsigned int channel, float level) { return volume->SetChannelVolume(channel, level); } +void WasapiPlayer::startTrimmingLeadingSilence(bool start) { + isTrimmingLeadingSilence = start; +} + HRESULT WasapiPlayer::disableCommunicationDucking(IMMDevice* device) { // Disable the default ducking experience used when a communication audio // session is active, as we never want NVDA's audio to be ducked. @@ -839,6 +860,10 @@ HRESULT wasPlay_setChannelVolume( return player->setChannelVolume(channel, level); } +void wasPlay_startTrimmingLeadingSilence(WasapiPlayer* player, bool start) { + player->startTrimmingLeadingSilence(start); +} + /** * This must be called once per session at startup before wasPlay_create is * called. diff --git a/projectDocs/dev/developerGuide/developerGuide.md b/projectDocs/dev/developerGuide/developerGuide.md index d0b8c24712..2a60cbe475 100644 --- a/projectDocs/dev/developerGuide/developerGuide.md +++ b/projectDocs/dev/developerGuide/developerGuide.md @@ -1393,6 +1393,7 @@ For examples of how to define and use new extension points, please see the code |`Action` |`synthIndexReached` |Notifies when a synthesizer reaches an index during speech.| |`Action` |`synthDoneSpeaking` |Notifies when a synthesizer finishes speaking.| |`Action` |`synthChanged` |Notifies of synthesizer changes.| +|`Action` |`pre_synthSpeak` |Notifies when the current synthesizer is about to speak something.| ### tones {#tonesExtPts} diff --git a/source/nvwave.py b/source/nvwave.py index 59ce04d6fd..99e59d9cb0 100644 --- a/source/nvwave.py +++ b/source/nvwave.py @@ -39,6 +39,9 @@ import core import globalVars from pycaw.utils import AudioUtilities +from speech import SpeechSequence +from speech.commands import BreakCommand +from synthDriverHandler import pre_synthSpeak from utils.mmdevice import _getOutputDevices @@ -206,6 +209,7 @@ def isInError() -> bool: wasPlay_callback = CFUNCTYPE(None, c_void_p, c_uint) +_isLeadingSilenceInserted: bool = False class WasapiWavePlayer(garbageHandler.TrackedObject): @@ -289,6 +293,10 @@ def __init__( if config.conf["audio"]["audioAwakeTime"] > 0: NVDAHelper.localLib.wasSilence_init(outputDevice) WasapiWavePlayer._silenceDevice = outputDevice + # Enable trimming by default for speech only + self.enableTrimmingLeadingSilence(purpose is AudioPurpose.SPEECH) + if self._enableTrimmingLeadingSilence: + self.startTrimmingLeadingSilence() @wasPlay_callback def _callback(cppPlayer, feedId): @@ -323,7 +331,7 @@ def open(self): NVDAHelper.localLib.wasPlay_open(self._player) except WindowsError: log.warning( - "Couldn't open specified or default audio device. " "There may be no audio devices.", + "Couldn't open specified or default audio device. There may be no audio devices.", ) WavePlayer.audioDeviceError_static = True raise @@ -351,12 +359,17 @@ def feed( @param onDone: Function to call when this chunk has finished playing. @raise WindowsError: If there was an error initially opening the device. """ + global _isLeadingSilenceInserted self.open() if self._audioDucker: self._audioDucker.enable() feedId = c_uint() if onDone else None # Never treat this instance as idle while we're feeding. self._lastActiveTime = None + # If a BreakCommand is used to insert leading silence in this utterance, + # turn off trimming temporarily. + if self._purpose is AudioPurpose.SPEECH and _isLeadingSilenceInserted: + self.startTrimmingLeadingSilence(False) try: NVDAHelper.localLib.wasPlay_feed( self._player, @@ -393,6 +406,8 @@ def sync(self): def idle(self): """Indicate that this player is now idle; i.e. the current continuous segment of audio is complete.""" self.sync() + if self._enableTrimmingLeadingSilence: + self.startTrimmingLeadingSilence() if self._audioDucker: self._audioDucker.disable() @@ -401,6 +416,8 @@ def stop(self): if self._audioDucker: self._audioDucker.disable() NVDAHelper.localLib.wasPlay_stop(self._player) + if self._enableTrimmingLeadingSilence: + self.startTrimmingLeadingSilence() self._lastActiveTime = None self._isPaused = False self._doneCallbacks = {} @@ -455,6 +472,17 @@ def setVolume( if not (all and e.winerror == E_INVALIDARG): raise + def enableTrimmingLeadingSilence(self, enable: bool) -> None: + """Enable or disable automatic leading silence removal. + This is by default enabled for speech audio, and disabled for non-speech audio.""" + self._enableTrimmingLeadingSilence = enable + if not enable: + self.startTrimmingLeadingSilence(False) + + def startTrimmingLeadingSilence(self, start: bool = True) -> None: + """Start or stop trimming the leading silence from the next audio chunk.""" + NVDAHelper.localLib.wasPlay_startTrimmingLeadingSilence(self._player, start) + def _setVolumeFromConfig(self): if self._purpose is not AudioPurpose.SOUNDS: return @@ -508,6 +536,7 @@ def _idleCheck(cls): if player._lastActiveTime <= threshold: try: NVDAHelper.localLib.wasPlay_idle(player._player) + player.startTrimmingLeadingSilence() except OSError: # #16125: IAudioClock::GetPosition sometimes fails with an access # violation on a device which has been invalidated. This shouldn't happen @@ -530,6 +559,18 @@ def _idleCheck(cls): fileWavePlayerThread: threading.Thread | None = None +def _onPreSpeak(speechSequence: SpeechSequence): + global _isLeadingSilenceInserted + _isLeadingSilenceInserted = False + # Check if leading silence of the current utterance is inserted by a BreakCommand. + for item in speechSequence: + if isinstance(item, BreakCommand): + _isLeadingSilenceInserted = True + break + elif isinstance(item, str): + break + + def initialize(): NVDAHelper.localLib.wasPlay_create.restype = c_void_p for func in ( @@ -547,12 +588,14 @@ def initialize(): func.restype = HRESULT NVDAHelper.localLib.wasPlay_startup() getOnErrorSoundRequested().register(playErrorSound) + pre_synthSpeak.register(_onPreSpeak) def terminate() -> None: if WasapiWavePlayer._silenceDevice is not None: NVDAHelper.localLib.wasSilence_terminate() getOnErrorSoundRequested().unregister(playErrorSound) + pre_synthSpeak.unregister(_onPreSpeak) def playErrorSound() -> None: diff --git a/source/speech/manager.py b/source/speech/manager.py index b642bfdf83..d5b374ccaf 100644 --- a/source/speech/manager.py +++ b/source/speech/manager.py @@ -22,7 +22,7 @@ from .priorities import Spri, SPEECH_PRIORITIES from logHandler import log -from synthDriverHandler import getSynth +from synthDriverHandler import getSynth, pre_synthSpeak from typing import ( Dict, Any, @@ -431,6 +431,7 @@ def _pushNextSpeech(self, doneSpeaking: bool): self._indexesSpeaking.append(item.index) self._cancelledLastSpeechWithSynth = False log._speechManagerUnitTest(f"Synth Gets: {seq}") + pre_synthSpeak.notify(speechSequence=seq) getSynth().speak(seq) def _getNextPriority(self): diff --git a/source/synthDriverHandler.py b/source/synthDriverHandler.py index 1e4bb9b02c..07a4a997cf 100644 --- a/source/synthDriverHandler.py +++ b/source/synthDriverHandler.py @@ -567,3 +567,11 @@ def isDebugForSynthDriver(): @param isFallback: Whether the synth is set as fallback synth due to another synth's failure @type isFallback: bool """ + +pre_synthSpeak = extensionPoints.Action() +""" +Notifies when speak() of the current synthesizer is about to be called. + +:param speechSequence: the speech sequence to pass to speak() +:type speechSequence: speech.SpeechSequence +""" diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index fe7c3b241d..a5414e4502 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -59,6 +59,7 @@ Prefix matching on command line flags, e.g. using `--di` for `--disable-addons` * Microsoft Speech API version 5 and Microsoft Speech Platform voices now use WASAPI for audio output, which may improve the responsiveness of those voices. (#13284, @gexgd0419) * The keyboard settings for "Speak typed characters" and "Speak typed words" now have three options: Off, Only in edit controls, and Always. (#17505, @Cary-rowen) * By default, "Speak typed characters" is now set to "Only in edit controls". +* The silence at the beginning of speech will now be trimmed when using OneCore voices, SAPI5 voices, and some of the third-party voice add-ons to improve their responsiveness. (#17614, @gexgd0419) ### Bug Fixes