diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index a18c454c77f..1ff473308f7 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -27,4 +27,4 @@ git commit -m "xxxxxx, test=doc"
1. 虽然跳过了 CI,但是还要先排队排到才能跳过,所以非自己方向看到 pending 不要着急 🤣
2. 在 `git commit --amend` 的时候才加 `test=xxx` 可能不太有效
3. 一个 pr 多次提交 commit 注意每次都要加 `test=xxx`,因为每个 commit 都会触发 CI
-4. 删除 python 环境中已经安装好的的 paddlespeech,否则可能会影响 import paddlespeech 的顺序
+4. 删除 python 环境中已经安装好的 paddlespeech,否则可能会影响 import paddlespeech 的顺序
diff --git a/audio/paddleaudio/backends/soundfile_backend.py b/audio/paddleaudio/backends/soundfile_backend.py
index ae7b5b52d49..9195ea0974a 100644
--- a/audio/paddleaudio/backends/soundfile_backend.py
+++ b/audio/paddleaudio/backends/soundfile_backend.py
@@ -191,7 +191,7 @@ def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
if sr <= 0:
raise ParameterError(
- f'Sample rate should be larger than 0, recieved sr = {sr}')
+ f'Sample rate should be larger than 0, received sr = {sr}')
if y.dtype not in ['int16', 'int8']:
warnings.warn(
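A minimal usage sketch for the `soundfile_save` signature shown in this hunk. The import path is inferred from the file location and the tone parameters are made-up example values, not requirements of the backend:

```python
import numpy as np
# import path assumed from audio/paddleaudio/backends/soundfile_backend.py
from paddleaudio.backends.soundfile_backend import soundfile_save

# Hypothetical example: one second of a 440 Hz tone, cast to int16 so the
# dtype warning in the function above is not triggered.
sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
y = (0.5 * np.sin(2 * np.pi * 440 * t) * 32767).astype('int16')
soundfile_save(y, sr, 'tone.wav')  # raises ParameterError if sr <= 0
```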
diff --git a/demos/TTSAndroid/README.md b/demos/TTSAndroid/README.md
index 36ff969fb03..36848cbe370 100644
--- a/demos/TTSAndroid/README.md
+++ b/demos/TTSAndroid/README.md
@@ -1,6 +1,6 @@
# 语音合成 Java API Demo 使用指南
-在 Android 上实现语音合成功能,此 Demo 有很好的的易用性和开放性,如在 Demo 中跑自己训练好的模型等。
+在 Android 上实现语音合成功能,此 Demo 有很好的易用性和开放性,如在 Demo 中跑自己训练好的模型等。
本文主要介绍语音合成 Demo 运行方法。
diff --git a/demos/TTSArmLinux/front.conf b/demos/TTSArmLinux/front.conf
index 04bd2d97f05..5960b32a92d 100644
--- a/demos/TTSArmLinux/front.conf
+++ b/demos/TTSArmLinux/front.conf
@@ -6,13 +6,13 @@
--jieba_stop_word_path=./dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
---seperate_tone=false
+--separate_tone=false
--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
-#--seperate_tone=true
+#--separate_tone=true
#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
diff --git a/demos/TTSCppFrontend/front_demo/front.conf b/demos/TTSCppFrontend/front_demo/front.conf
index e9ce1c94d73..abff444703c 100644
--- a/demos/TTSCppFrontend/front_demo/front.conf
+++ b/demos/TTSCppFrontend/front_demo/front.conf
@@ -6,13 +6,13 @@
--jieba_stop_word_path=./front_demo/dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
---seperate_tone=false
+--separate_tone=false
--word2phone_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
-#--seperate_tone=true
+#--separate_tone=true
#--word2phone_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
diff --git a/demos/TTSCppFrontend/front_demo/front_demo.cpp b/demos/TTSCppFrontend/front_demo/front_demo.cpp
index 19f16758bc6..77f3fc725d0 100644
--- a/demos/TTSCppFrontend/front_demo/front_demo.cpp
+++ b/demos/TTSCppFrontend/front_demo/front_demo.cpp
@@ -20,7 +20,7 @@
DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized");
DEFINE_string(front_conf, "./front_demo/front.conf", "Front conf file");
-// DEFINE_string(seperate_tone, "true", "If true, get phoneids and tonesid");
+// DEFINE_string(separate_tone, "true", "If true, get phoneids and tonesid");
int main(int argc, char** argv) {
diff --git a/demos/TTSCppFrontend/front_demo/gentools/word2phones.py b/demos/TTSCppFrontend/front_demo/gentools/word2phones.py
index 8726ee89cf6..d9baeea9c09 100644
--- a/demos/TTSCppFrontend/front_demo/gentools/word2phones.py
+++ b/demos/TTSCppFrontend/front_demo/gentools/word2phones.py
@@ -20,7 +20,7 @@
newdict = "./dict/word_phones.dict"
-def GenPhones(initials, finals, seperate=True):
+def GenPhones(initials, finals, separate=True):
phones = []
for c, v in zip(initials, finals):
@@ -30,9 +30,9 @@ def GenPhones(initials, finals, seperate=True):
elif c in ['zh', 'ch', 'sh', 'r']:
v = re.sub('i', 'iii', v)
if c:
- if seperate is True:
+ if separate is True:
phones.append(c + '0')
- elif seperate is False:
+ elif separate is False:
phones.append(c)
else:
print("Not sure whether phone and tone need to be separated")
diff --git a/demos/TTSCppFrontend/src/front/front_interface.cpp b/demos/TTSCppFrontend/src/front/front_interface.cpp
index 8bd466d28e9..e7b08c798f8 100644
--- a/demos/TTSCppFrontend/src/front/front_interface.cpp
+++ b/demos/TTSCppFrontend/src/front/front_interface.cpp
@@ -126,7 +126,7 @@ int FrontEngineInterface::init() {
}
// 生成音调字典(音调到音调id的映射)
- if (_seperate_tone == "true") {
+ if (_separate_tone == "true") {
if (0 != GenDict(_tone2id_path, &tone_id_map)) {
LOG(ERROR) << "Genarate tone2id dict failed";
return -1;
@@ -168,7 +168,7 @@ int FrontEngineInterface::ReadConfFile() {
_jieba_stop_word_path = conf_map["jieba_stop_word_path"];
// dict path
- _seperate_tone = conf_map["seperate_tone"];
+ _separate_tone = conf_map["separate_tone"];
_word2phone_path = conf_map["word2phone_path"];
_phone2id_path = conf_map["phone2id_path"];
_tone2id_path = conf_map["tone2id_path"];
@@ -295,7 +295,7 @@ int FrontEngineInterface::GetWordsIds(
}
}
} else { // 标点符号
- if (_seperate_tone == "true") {
+ if (_separate_tone == "true") {
phone = "sp0"; // speedyspeech
} else {
phone = "sp"; // fastspeech2
@@ -354,7 +354,7 @@ int FrontEngineInterface::Phone2Phoneid(const std::string &phone,
std::string temp_phone;
for (int i = 0; i < phone_vec.size(); i++) {
temp_phone = phone_vec[i];
- if (_seperate_tone == "true") {
+ if (_separate_tone == "true") {
phoneid->push_back(atoi(
(phone_id_map[temp_phone.substr(0, temp_phone.length() - 1)])
.c_str()));
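For readers following the `separate_tone` rename: the flag controls whether a toned phone label such as `ma1` is looked up whole (fastspeech2-style) or split into a base phone plus a tone id (speedyspeech-style), as in the `substr(0, length - 1)` lookup above. A small Python illustration of that convention; the dictionaries are made-up stand-ins for `phone_id_map` / `tone_id_map`:

```python
def phone_to_ids(phone, phone_id_map, tone_id_map, separate_tone):
    """Illustration only: mirror the phone/tone split shown in Phone2Phoneid."""
    if separate_tone:
        # e.g. "ma1" -> base phone "ma", tone "1"
        return phone_id_map[phone[:-1]], tone_id_map[phone[-1]]
    # fastspeech2-style: the toned phone is a single symbol
    return (phone_id_map[phone],)

phone_id_map = {"ma": 7, "ma1": 42, "sp": 1, "sp0": 2}  # made-up ids
tone_id_map = {"0": 0, "1": 1}

print(phone_to_ids("ma1", phone_id_map, tone_id_map, separate_tone=True))   # (7, 1)
print(phone_to_ids("ma1", phone_id_map, tone_id_map, separate_tone=False))  # (42,)
```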
diff --git a/demos/TTSCppFrontend/src/front/front_interface.h b/demos/TTSCppFrontend/src/front/front_interface.h
index fc33a4de6bc..8c16859cf46 100644
--- a/demos/TTSCppFrontend/src/front/front_interface.h
+++ b/demos/TTSCppFrontend/src/front/front_interface.h
@@ -182,7 +182,7 @@ class FrontEngineInterface : public TextNormalizer {
std::string _jieba_idf_path;
std::string _jieba_stop_word_path;
- std::string _seperate_tone;
+ std::string _separate_tone;
std::string _word2phone_path;
std::string _phone2id_path;
std::string _tone2id_path;
diff --git a/demos/speech_web/README.md b/demos/speech_web/README.md
index 572781ab682..fc1fe7105f4 100644
--- a/demos/speech_web/README.md
+++ b/demos/speech_web/README.md
@@ -23,7 +23,7 @@ Paddle Speech Demo 是一个以 PaddleSpeech 的语音交互功能为主体开
+ ERNIE-SAT:语言-语音跨模态大模型 ERNIE-SAT 可视化展示示例,支持个性化合成,跨语言语音合成(音频为中文则输入英文文本进行合成),语音编辑(修改音频文字中间的结果)功能。 ERNIE-SAT 更多实现细节,可以参考:
+ [【ERNIE-SAT with AISHELL-3 dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/ernie_sat)
- + [【ERNIE-SAT with with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+ + [【ERNIE-SAT with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+ [【ERNIE-SAT with VCTK dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/ernie_sat)
运行效果:
diff --git a/demos/speech_web/speech_server/main.py b/demos/speech_web/speech_server/main.py
index 03e7e5996c2..f4678628f15 100644
--- a/demos/speech_web/speech_server/main.py
+++ b/demos/speech_web/speech_server/main.py
@@ -260,7 +260,7 @@ async def websocket_endpoint_online(websocket: WebSocket):
# and we break the loop
if message['signal'] == 'start':
resp = {"status": "ok", "signal": "server_ready"}
- # do something at begining here
+ # do something at beginning here
# create the instance to process the audio
# connection_handler = chatbot.asr.connection_handler
connection_handler = PaddleASRConnectionHanddler(engine)
diff --git a/docs/tutorial/st/st_tutorial.ipynb b/docs/tutorial/st/st_tutorial.ipynb
index 2fb850535ed..e755bebad17 100644
--- a/docs/tutorial/st/st_tutorial.ipynb
+++ b/docs/tutorial/st/st_tutorial.ipynb
@@ -62,7 +62,7 @@
"collapsed": false
},
"source": [
- "# 使用Transformer进行端到端语音翻译的的基本流程\n",
+ "# 使用Transformer进行端到端语音翻译的基本流程\n",
"## 基础模型\n",
"由于 ASR 章节已经介绍了 Transformer 以及语音特征抽取,在此便不做过多介绍,感兴趣的同学可以去相关章节进行了解。\n",
"\n",
diff --git a/docs/tutorial/tts/tts_tutorial.ipynb b/docs/tutorial/tts/tts_tutorial.ipynb
index 583adb01470..0cecb680d61 100644
--- a/docs/tutorial/tts/tts_tutorial.ipynb
+++ b/docs/tutorial/tts/tts_tutorial.ipynb
@@ -464,7 +464,7 @@
"
FastSpeech2 网络结构图\n",
"\n",
"\n",
- "PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似),这样的合成结果可以更加**稳定**。\n",
+ "PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似),这样的合成结果可以更加**稳定**。\n",
"
\n",
"
FastPitch 网络结构图\n",
"\n",
diff --git a/examples/librispeech/asr2/README.md b/examples/librispeech/asr2/README.md
index 26978520da2..253c9b45950 100644
--- a/examples/librispeech/asr2/README.md
+++ b/examples/librispeech/asr2/README.md
@@ -153,7 +153,7 @@ After training the model, we need to get the final model for testing and inferen
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
- avg.sh lastest exp/${ckpt}/checkpoints ${avg_num}
+ avg.sh latest exp/${ckpt}/checkpoints ${avg_num}
fi
```
The `avg.sh` is in the `../../../utils/` which is define in the `path.sh`.
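`avg.sh latest` picks the newest `${avg_num}` checkpoints and averages their parameters. Conceptually it reduces to the sketch below; this is a rough illustration, not the actual `avg.sh` implementation, and the checkpoint paths are assumed:

```python
import paddle

def average_checkpoints(paths):
    """Average parameter tensors across several checkpoints (illustrative)."""
    avg = None
    for path in paths:
        state = paddle.load(path)
        if avg is None:
            avg = {k: v.astype('float32') for k, v in state.items()}
        else:
            for k in avg:
                avg[k] += state[k].astype('float32')
    return {k: v / len(paths) for k, v in avg.items()}

# e.g. the 5 latest checkpoints of an experiment (paths are hypothetical)
paths = [f"exp/transformer/checkpoints/{n}.pdparams" for n in range(16, 21)]
paddle.save(average_checkpoints(paths), "exp/transformer/checkpoints/avg_5.pdparams")
```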
diff --git a/examples/other/mfa/local/generate_lexicon.py b/examples/other/mfa/local/generate_lexicon.py
index 3deb2470189..e63b5eb27d0 100644
--- a/examples/other/mfa/local/generate_lexicon.py
+++ b/examples/other/mfa/local/generate_lexicon.py
@@ -48,7 +48,7 @@ def rule(C, V, R, T):
'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'.
- Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
+ Erhua is possibly applied to every final, except for finals that already end with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
to filter it out.
diff --git a/examples/tiny/asr1/README.md b/examples/tiny/asr1/README.md
index cfa26670451..489f5bc3e76 100644
--- a/examples/tiny/asr1/README.md
+++ b/examples/tiny/asr1/README.md
@@ -37,7 +37,7 @@ It will support the way of using `--variable value` in the shell scripts.
Some local variables are set in `run.sh`.
`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
`stage` denotes the number of stage you want the start from in the experiments.
-`stop stage` denotes the number of stage you want the stop at in the expriments.
+`stop_stage` denotes the number of the stage you want to stop at in the experiments.
`conf_path` denotes the config path of the model.
`avg_num`denotes the number K of top-K models you want to average to get the final model.
`ckpt` denotes the checkpoint prefix of the model, e.g. "transformerr"
diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py
index 6663bcf87be..37d99226204 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -267,7 +267,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'to'):
- logger.debug("register user to to paddle.Tensor, remove this when fixed!")
+ logger.debug("register user-defined to() to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'to', to)
setattr(paddle.static.Variable, 'to', to)
diff --git a/paddlespeech/s2t/frontend/augmentor/augmentation.py b/paddlespeech/s2t/frontend/augmentor/augmentation.py
index 4c5ca4fe630..744ea56dd79 100644
--- a/paddlespeech/s2t/frontend/augmentor/augmentation.py
+++ b/paddlespeech/s2t/frontend/augmentor/augmentation.py
@@ -45,7 +45,7 @@ class AugmentationPipeline():
samples to make the model invariant to certain types of perturbations in the
real world, improving model's generalization ability.
- The pipeline is built according the the augmentation configuration in json
+ The pipeline is built according to the augmentation configuration in json
string, e.g.
.. code-block::
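The JSON configuration this docstring refers to is a list of augmentor entries. A hedged Python sketch of building such a string; the `type`/`params`/`prob` layout follows the usual PaddleSpeech augmentation configs, but the concrete augmentors and values here are illustrative:

```python
import json

# Illustrative augmentation config: each entry names an augmentor, its
# parameters, and the probability of applying it to a training sample.
augmentation_config = json.dumps([
    {"type": "shift",
     "params": {"min_shift_ms": -5, "max_shift_ms": 5},
     "prob": 1.0},
    {"type": "speed",
     "params": {"min_speed_rate": 0.9, "max_speed_rate": 1.1, "num_rates": 3},
     "prob": 0.5},
])
# This JSON string is what the AugmentationPipeline parses to build the pipeline.
print(augmentation_config)
```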
diff --git a/paddlespeech/s2t/io/speechbrain/sampler.py b/paddlespeech/s2t/io/speechbrain/sampler.py
index ba13193eb6e..09a884c2b84 100755
--- a/paddlespeech/s2t/io/speechbrain/sampler.py
+++ b/paddlespeech/s2t/io/speechbrain/sampler.py
@@ -283,7 +283,7 @@ def _get_boundaries_through_warping(
num_quantiles, )
# get quantiles using lognormal distribution
quantiles = lognorm.ppf(latent_boundaries, 1)
- # scale up to to max_batch_length
+ # scale up to max_batch_length
bucket_boundaries = quantiles * max_batch_length / quantiles[-1]
# compute resulting bucket length multipliers
length_multipliers = [
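The comment fixed above describes how bucket boundaries are derived: lognormal quantiles are rescaled so that the largest boundary equals `max_batch_length`. A self-contained sketch of that computation; the quantile spacing and `max_batch_length` are example values, not the exact ones in `sampler.py`:

```python
import numpy as np
from scipy.stats import lognorm

num_quantiles = 4
max_batch_length = 60000  # example: max total length per batch

# latent boundaries in (0, 1); the exact spacing in sampler.py may differ
latent_boundaries = np.linspace(1 / num_quantiles,
                                (num_quantiles - 1) / num_quantiles,
                                num_quantiles - 1)
# map them through the lognormal quantile function (shape parameter 1)
quantiles = lognorm.ppf(latent_boundaries, 1)
# scale so the largest boundary equals max_batch_length
bucket_boundaries = quantiles * max_batch_length / quantiles[-1]
print(bucket_boundaries)  # increasing, last entry == max_batch_length
```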
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 6494b5304c4..f716fa3b57f 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -560,7 +560,7 @@ def attention_rescoring(self,
[len(hyp[0]) for hyp in hyps], place=device,
dtype=paddle.long) # (beam_size,)
hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id)
- hyps_lens = hyps_lens + 1 # Add at begining
+ hyps_lens = hyps_lens + 1 # Add at beginning
logger.debug(
f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}")
@@ -709,7 +709,7 @@ def forward_attention_decoder(self,
hypothesis from ctc prefix beam search and one encoder output
Args:
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
- pad sos at the begining, (B, T)
+ pad sos at the beginning, (B, T)
hyps_lens (paddle.Tensor): length of each hyp in hyps, (B)
encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D)
Returns:
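The docstring refers to hypotheses already padded with `sos`, which is why the code above bumps `hyps_lens` by one. A toy illustration of that bookkeeping; the helper below is a simplified stand-in for the real `add_sos_eos` utility, with made-up token ids:

```python
import paddle

sos, ignore_id = 5000, -1  # made-up ids for illustration

def prepend_sos_demo(hyps_pad, sos, ignore_id):
    """Simplified stand-in: prepend sos to each hypothesis, keep ignore_id padding."""
    out = []
    for hyp in hyps_pad.tolist():
        tokens = [t for t in hyp if t != ignore_id]
        out.append([sos] + tokens)
    max_len = max(len(h) for h in out)
    out = [h + [ignore_id] * (max_len - len(h)) for h in out]
    return paddle.to_tensor(out)

hyps_pad = paddle.to_tensor([[12, 7, 9], [3, 8, ignore_id]])
hyps_lens = paddle.to_tensor([3, 2])
hyps_pad = prepend_sos_demo(hyps_pad, sos, ignore_id)
hyps_lens = hyps_lens + 1  # each hypothesis now starts with an extra sos token
print(hyps_pad.numpy(), hyps_lens.numpy())
```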
diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py
index 31defbbaf1b..b4c8c255f01 100644
--- a/paddlespeech/s2t/models/u2_st/u2_st.py
+++ b/paddlespeech/s2t/models/u2_st/u2_st.py
@@ -455,7 +455,7 @@ def forward_attention_decoder(
hypothesis from ctc prefix beam search and one encoder output
Args:
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
- pad sos at the begining, (B, T)
+ pad sos at the beginning, (B, T)
hyps_lens (paddle.Tensor): length of each hyp in hyps, (B)
encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D)
Returns:
diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py
index 536ffe0a906..a702f0aa12d 100644
--- a/paddlespeech/server/engine/asr/online/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py
@@ -609,7 +609,7 @@ def rescoring(self):
dtype=paddle.long) # (beam_size,)
hyps_pad, _ = add_sos_eos(hyps_pad, self.model.sos, self.model.eos,
self.model.ignore_id)
- hyps_lens = hyps_lens + 1 # Add at begining
+ hyps_lens = hyps_lens + 1 # Add at beginning
# ctc score in ln domain
# (beam_size, max_hyps_len, vocab_size)
diff --git a/paddlespeech/server/ws/asr_api.py b/paddlespeech/server/ws/asr_api.py
index ae1c8831077..b3ad0b7c502 100644
--- a/paddlespeech/server/ws/asr_api.py
+++ b/paddlespeech/server/ws/asr_api.py
@@ -67,7 +67,7 @@ async def websocket_endpoint(websocket: WebSocket):
# and we break the loop
if message['signal'] == 'start':
resp = {"status": "ok", "signal": "server_ready"}
- # do something at begining here
+ # do something at beginning here
# create the instance to process the audio
#connection_handler = PaddleASRConnectionHanddler(asr_model)
connection_handler = asr_model.new_handler()
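The handler above implements a start/stop handshake: a text frame carrying `{"signal": "start"}` is answered with `{"status": "ok", "signal": "server_ready"}` before audio is streamed. A minimal client sketch, assuming the third-party `websockets` package and a locally served endpoint; the URL, port, and the closing `"end"` frame are assumptions, not part of this patch:

```python
import asyncio
import json

import websockets  # assumed client library, not part of the PaddleSpeech server code

async def handshake(url="ws://127.0.0.1:8090/paddlespeech/asr/streaming"):
    async with websockets.connect(url) as ws:
        await ws.send(json.dumps({"signal": "start"}))       # start signal
        resp = json.loads(await ws.recv())
        assert resp.get("signal") == "server_ready"
        # ... stream binary audio chunks here ...
        await ws.send(json.dumps({"signal": "end"}))          # assumed stop signal
        print(json.loads(await ws.recv()))

asyncio.run(handshake())
```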
diff --git a/paddlespeech/t2s/frontend/generate_lexicon.py b/paddlespeech/t2s/frontend/generate_lexicon.py
index 6b467d00e12..4fb748a69bb 100644
--- a/paddlespeech/t2s/frontend/generate_lexicon.py
+++ b/paddlespeech/t2s/frontend/generate_lexicon.py
@@ -45,7 +45,7 @@ def rule(C, V, R, T):
'u' in syllables when certain conditions are satisfied.
'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'.
- Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
+ Erhua is possibly applied to every final, except for finals that already end with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
to filter it out.
"""
diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py
index 8e2ce822fd2..b4818cab4cd 100644
--- a/paddlespeech/t2s/models/waveflow.py
+++ b/paddlespeech/t2s/models/waveflow.py
@@ -236,7 +236,7 @@ def add_input(self, x_row, condition_row):
Returns:
res (Tensor):
- A row of the the residual output. shape=(batch_size, channel, 1, width)
+ A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor):
A row of the skip output. shape=(batch_size, channel, 1, width)
@@ -343,7 +343,7 @@ def add_input(self, x_row, condition_row):
Returns:
res (Tensor):
- A row of the the residual output. shape=(batch_size, channel, 1, width)
+ A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor):
A row of the skip output. shape=(batch_size, channel, 1, width)
@@ -465,7 +465,7 @@ def _start_sequence(self):
self.resnet.start_sequence()
def inverse(self, z, condition):
- """Sampling from the the distrition p(X). It is done by sample form
+ """Sampling from the distrition p(X). It is done by sample form
p(Z) and transform the sample. It is a auto regressive transformation.
Args:
@@ -600,7 +600,7 @@ def forward(self, x, condition):
return z, log_det_jacobian
def inverse(self, z, condition):
- """Sampling from the the distrition p(X).
+ """Sampling from the distrition p(X).
It is done by sample a ``z`` form p(Z) and transform it into ``x``.
Each Flow transform .. math:: `z_{i-1}` to .. math:: `z_{i}` in an
diff --git a/paddlespeech/t2s/modules/transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py
index 22217d50f51..85336f4f3ca 100644
--- a/paddlespeech/t2s/modules/transformer/lightconv.py
+++ b/paddlespeech/t2s/modules/transformer/lightconv.py
@@ -110,7 +110,7 @@ def forward(self, query, key, value, mask):
(batch, time1, time2) mask
Return:
- Tensor: ouput. (batch, time1, d_model)
+ Tensor: output. (batch, time1, d_model)
"""
# linear -> GLU -> lightconv -> linear
diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py
index bf014045d0a..2dc7a7164c5 100644
--- a/paddlespeech/vector/exps/ecapa_tdnn/train.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py
@@ -51,7 +51,7 @@ def main(args, config):
# stage0: set the training device, cpu or gpu
paddle.set_device(args.device)
- # stage1: we must call the paddle.distributed.init_parallel_env() api at the begining
+ # stage1: we must call the paddle.distributed.init_parallel_env() api at the beginning
paddle.distributed.init_parallel_env()
nranks = paddle.distributed.get_world_size()
rank = paddle.distributed.get_rank()
@@ -146,7 +146,7 @@ def main(args, config):
timer.start()
for epoch in range(start_epoch + 1, config.epochs + 1):
- # at the begining, model must set to train mode
+ # at the beginning, model must set to train mode
model.train()
avg_loss = 0
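The two comments fixed above bracket the standard Paddle data-parallel setup: call `paddle.distributed.init_parallel_env()` once at the beginning, then switch the model back to train mode at the start of every epoch. A condensed sketch of that pattern; the model and the training loop body are placeholders:

```python
import paddle
import paddle.distributed as dist

paddle.set_device("gpu")          # or "cpu"
dist.init_parallel_env()          # must run before any collective communication
nranks = dist.get_world_size()
rank = dist.get_rank()

model = paddle.nn.Linear(80, 192)  # placeholder for the ECAPA-TDNN model
if nranks > 1:
    model = paddle.DataParallel(model)

for epoch in range(1, 3):
    model.train()                  # reset to train mode at the beginning of each epoch
    # ... iterate over the (placeholder) training loader and update parameters
```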
diff --git a/paddlespeech/vector/exps/ge2e/preprocess.py b/paddlespeech/vector/exps/ge2e/preprocess.py
index dabe0ce7694..ee59e62457a 100644
--- a/paddlespeech/vector/exps/ge2e/preprocess.py
+++ b/paddlespeech/vector/exps/ge2e/preprocess.py
@@ -42,7 +42,7 @@
parser.add_argument(
"--skip_existing",
action="store_true",
- help="Whether to skip ouput files with the same name. Useful if this script was interrupted."
+ help="Whether to skip output files with the same name. Useful if this script was interrupted."
)
parser.add_argument(
"--no_trim",
diff --git a/speechx/examples/ds2_ol/onnx/local/onnx_infer_shape.py b/speechx/examples/ds2_ol/onnx/local/onnx_infer_shape.py
index c53e9ec920d..65709fc2be1 100755
--- a/speechx/examples/ds2_ol/onnx/local/onnx_infer_shape.py
+++ b/speechx/examples/ds2_ol/onnx/local/onnx_infer_shape.py
@@ -2078,7 +2078,7 @@ def _infer_PythonOp(self, node):
output_tensor_ranks = get_attribute(node, 'output_tensor_ranks')
assert output_tensor_ranks
- # set the context output seperately.
+ # set the context output separately.
# The first output is autograd's context.
vi = self.known_vi_[node.output[0]]
vi.CopyFrom(
diff --git a/speechx/speechx/frontend/audio/db_norm.cc b/speechx/speechx/frontend/audio/db_norm.cc
index ad79fcc3a4a..7141fc8077a 100644
--- a/speechx/speechx/frontend/audio/db_norm.cc
+++ b/speechx/speechx/frontend/audio/db_norm.cc
@@ -76,7 +76,7 @@ bool DecibelNormalizer::Compute(VectorBase* waves) const {
if (gain > opts_.max_gain_db) {
LOG(ERROR)
<< "Unable to normalize segment to " << opts_.target_db << "dB,"
- << "because the the probable gain have exceeds opts_.max_gain_db"
+ << "because the probable gain has exceeded opts_.max_gain_db"
<< opts_.max_gain_db << "dB.";
return false;
}
diff --git a/speechx/speechx/kaldi/base/kaldi-types.h b/speechx/speechx/kaldi/base/kaldi-types.h
index c6a3e1aedb9..07381cf2af8 100644
--- a/speechx/speechx/kaldi/base/kaldi-types.h
+++ b/speechx/speechx/kaldi/base/kaldi-types.h
@@ -40,7 +40,7 @@ typedef float BaseFloat;
#include
// for discussion on what to do if you need compile kaldi
-// without OpenFST, see the bottom of this this file
+// without OpenFST, see the bottom of this file
#ifndef COMPILE_WITHOUT_OPENFST
diff --git a/speechx/speechx/kaldi/feat/pitch-functions.cc b/speechx/speechx/kaldi/feat/pitch-functions.cc
index 430e9bdb53a..d71169ec916 100644
--- a/speechx/speechx/kaldi/feat/pitch-functions.cc
+++ b/speechx/speechx/kaldi/feat/pitch-functions.cc
@@ -746,7 +746,7 @@ OnlinePitchFeatureImpl::OnlinePitchFeatureImpl(
Vector<BaseFloat> lags_offset(lags_);
// lags_offset equals lags_ (which are the log-spaced lag values we want to
// measure the NCCF at) with nccf_first_lag_ / opts.resample_freq subtracted
- // from each element, so we can treat the measured NCCF values as as starting
+ // from each element, so we can treat the measured NCCF values as starting
// from sample zero in a signal that starts at the point start /
// opts.resample_freq. This is necessary because the ArbitraryResample code
// assumes that the input signal starts from sample zero.
diff --git a/speechx/speechx/kaldi/lat/lattice-functions.h b/speechx/speechx/kaldi/lat/lattice-functions.h
index 6b1b6656c27..785d3f96ec9 100644
--- a/speechx/speechx/kaldi/lat/lattice-functions.h
+++ b/speechx/speechx/kaldi/lat/lattice-functions.h
@@ -355,12 +355,12 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
//
//
// /// This function returns the number of words in the longest sentence in a
-// /// CompactLattice (i.e. the the maximum of any path, of the count of
+// /// CompactLattice (i.e. the maximum of any path, of the count of
// /// olabels on that path).
// int32 LongestSentenceLength(const Lattice &lat);
//
// /// This function returns the number of words in the longest sentence in a
-// /// CompactLattice, i.e. the the maximum of any path, of the count of
+// /// CompactLattice, i.e. the maximum of any path, of the count of
// /// labels on that path... note, in CompactLattice, the ilabels and olabels
// /// are identical because it is an acceptor.
// int32 LongestSentenceLength(const CompactLattice &lat);
@@ -408,7 +408,7 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
//
// /// This function computes the mapping from the pair
// /// (frame-index, transition-id) to the pair
-// /// (sum-of-acoustic-scores, num-of-occurences) over all occurences of the
+// /// (sum-of-acoustic-scores, num-of-occurrences) over all occurrences of the
// /// transition-id in that frame.
// /// frame-index in the lattice.
// /// This function is useful for retaining the acoustic scores in a
@@ -422,13 +422,13 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
// /// @param [out] acoustic_scores
// /// Pointer to a map from the pair (frame-index,
// /// transition-id) to a pair (sum-of-acoustic-scores,
-// /// num-of-occurences).
+// /// num-of-occurrences).
// /// Usually the acoustic scores for a pdf-id (and hence
// /// transition-id) on a frame will be the same for all the
-// /// occurences of the pdf-id in that frame.
+// /// occurrences of the pdf-id in that frame.
// /// But if not, we will take the average of the acoustic
// /// scores. Hence, we store both the sum-of-acoustic-scores
-// /// and the num-of-occurences of the transition-id in that
+// /// and the num-of-occurrences of the transition-id in that
// /// frame.
// void ComputeAcousticScoresMap(
// const Lattice &lat,
@@ -440,8 +440,8 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
// ///
// /// @param [in] acoustic_scores
// /// A map from the pair (frame-index, transition-id) to a
-// /// pair (sum-of-acoustic-scores, num-of-occurences) of
-// /// the occurences of the transition-id in that frame.
+// /// pair (sum-of-acoustic-scores, num-of-occurrences) of
+// /// the occurrences of the transition-id in that frame.
// /// See the comments for ComputeAcousticScoresMap for
// /// details.
// /// @param [out] lat Pointer to the output lattice.
diff --git a/speechx/speechx/kaldi/matrix/kaldi-matrix.cc b/speechx/speechx/kaldi/matrix/kaldi-matrix.cc
index faf23cdf0c5..85e6fecc861 100644
--- a/speechx/speechx/kaldi/matrix/kaldi-matrix.cc
+++ b/speechx/speechx/kaldi/matrix/kaldi-matrix.cc
@@ -1646,7 +1646,7 @@ SubMatrix::SubMatrix(const MatrixBase &M,
static_cast<UnsignedMatrixIndexT>(M.num_rows_ - ro) &&
static_cast<UnsignedMatrixIndexT>(c) <=
static_cast<UnsignedMatrixIndexT>(M.num_cols_ - co));
- // point to the begining of window
+ // point to the beginning of window
MatrixBase::num_rows_ = r;
MatrixBase::num_cols_ = c;
MatrixBase::stride_ = M.Stride();
diff --git a/speechx/speechx/kaldi/matrix/sparse-matrix.cc b/speechx/speechx/kaldi/matrix/sparse-matrix.cc
index 68a61e17dc3..192d258457c 100644
--- a/speechx/speechx/kaldi/matrix/sparse-matrix.cc
+++ b/speechx/speechx/kaldi/matrix/sparse-matrix.cc
@@ -998,7 +998,7 @@ void FilterCompressedMatrixRows(const CompressedMatrix &in,
// iterating row-wise versus column-wise in compressed-matrix uncompression.
if (num_kept_rows > heuristic * in.NumRows()) {
- // if quite a few of the the rows are kept, it may be more efficient
+ // if quite a few of the rows are kept, it may be more efficient
// to uncompress the entire compressed matrix, since per-column operation
// is more efficient.
Matrix<BaseFloat> full_mat(in);
diff --git a/speechx/speechx/kaldi/util/kaldi-table-inl.h b/speechx/speechx/kaldi/util/kaldi-table-inl.h
index 6aca2f137e3..175e27049a0 100644
--- a/speechx/speechx/kaldi/util/kaldi-table-inl.h
+++ b/speechx/speechx/kaldi/util/kaldi-table-inl.h
@@ -1587,7 +1587,7 @@ template class RandomAccessTableReaderImplBase {
// this from a pipe. In principle we could read it on-demand as for the
// archives, but this would probably be overkill.
-// Note: the code for this this class is similar to TableWriterScriptImpl:
+// Note: the code for this class is similar to TableWriterScriptImpl:
// try to keep them in sync.
template<class Holder>
class RandomAccessTableReaderScriptImpl:
diff --git a/speechx/speechx/nnet/ds2_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc
index 22c7f61b82d..f30d7979cd2 100644
--- a/speechx/speechx/nnet/ds2_nnet.cc
+++ b/speechx/speechx/nnet/ds2_nnet.cc
@@ -105,7 +105,7 @@ paddle_infer::Predictor* PaddleNnet::GetPredictor() {
while (pred_id < pool_usages.size()) {
if (pool_usages[pred_id] == false) {
- predictor = pool->Retrive(pred_id);
+ predictor = pool->Retrieve(pred_id);
break;
}
++pred_id;
diff --git a/speechx/speechx/protocol/websocket/websocket_server.cc b/speechx/speechx/protocol/websocket/websocket_server.cc
index 14f2f6e9fb4..d1bed1ca11e 100644
--- a/speechx/speechx/protocol/websocket/websocket_server.cc
+++ b/speechx/speechx/protocol/websocket/websocket_server.cc
@@ -32,14 +32,14 @@ void ConnectionHandler::OnSpeechStart() {
decode_thread_ = std::make_shared<std::thread>(
&ConnectionHandler::DecodeThreadFunc, this);
got_start_tag_ = true;
- LOG(INFO) << "Server: Recieved speech start signal, start reading speech";
+ LOG(INFO) << "Server: Received speech start signal, start reading speech";
json::value rv = {{"status", "ok"}, {"type", "server_ready"}};
ws_.text(true);
ws_.write(asio::buffer(json::serialize(rv)));
}
void ConnectionHandler::OnSpeechEnd() {
- LOG(INFO) << "Server: Recieved speech end signal";
+ LOG(INFO) << "Server: Received speech end signal";
if (recognizer_ != nullptr) {
recognizer_->SetFinished();
}
@@ -70,8 +70,8 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
pcm_data(i) = static_cast<float>(*pdata);
pdata++;
}
- VLOG(2) << "Server: Recieved " << num_samples << " samples";
- LOG(INFO) << "Server: Recieved " << num_samples << " samples";
+ VLOG(2) << "Server: Received " << num_samples << " samples";
+ LOG(INFO) << "Server: Received " << num_samples << " samples";
CHECK(recognizer_ != nullptr);
recognizer_->Accept(pcm_data);
diff --git a/tools/extras/install_mkl.sh b/tools/extras/install_mkl.sh
index 8c1899bdf2f..01bce64fe27 100755
--- a/tools/extras/install_mkl.sh
+++ b/tools/extras/install_mkl.sh
@@ -166,7 +166,7 @@ variable, sudo might not allow it to propagate to the command that it invokes."
fi
# The install variants, each in a function to simplify error reporting.
-# Each one invokes a subshell with a 'set -x' to to show system-modifying
+# Each one invokes a subshell with a 'set -x' to show system-modifying
# commands it runs. The subshells simply limit the scope of this diagnostics
# and avoid creating noise (if we were using 'set +x', it would be printed).
Install_redhat () {
diff --git a/utils/fst/ctc_token_fst.py b/utils/fst/ctc_token_fst.py
index 2262912c8bf..f63e9cdacb5 100755
--- a/utils/fst/ctc_token_fst.py
+++ b/utils/fst/ctc_token_fst.py
@@ -6,7 +6,7 @@ def main(args):
"""Token Transducer"""
# entry
print('0 1 ')
- # skip begining and ending
+ # skip beginning and ending
print('1 1 ')
print('2 2 ')
# exit
diff --git a/utils/tokenizer.perl b/utils/tokenizer.perl
index ae97d6582bd..836fe19c612 100644
--- a/utils/tokenizer.perl
+++ b/utils/tokenizer.perl
@@ -296,7 +296,7 @@ sub tokenize
$text =~ s/DOTMULTI\./DOTDOTMULTI/g;
}
- # seperate out "," except if within numbers (5,300)
+ # separate out "," except if within numbers (5,300)
#$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
# separate out "," except if within numbers (5,300)
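For reference, the comma rule this comment describes — split a comma off with spaces unless both neighbours are digits — looks like the following as a hedged Python equivalent of the Perl substitution; Perl's `\p{IsN}` is approximated with `\d` here:

```python
import re

def separate_commas(text: str) -> str:
    # separate out "," except if within numbers (5,300)
    return re.sub(r"([^\d]),([^\d])", r"\1 , \2", text)

print(separate_commas("yes,no and 5,300"))  # -> "yes , no and 5,300"
```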