diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index a18c454c77f..1ff473308f7 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -27,4 +27,4 @@ git commit -m "xxxxxx, test=doc"
1. 虽然跳过了 CI,但是还要先排队排到才能跳过,所以非自己方向看到 pending 不要着急 🤣
2. 在 `git commit --amend` 的时候才加 `test=xxx` 可能不太有效
3. 一个 pr 多次提交 commit 注意每次都要加 `test=xxx`,因为每个 commit 都会触发 CI
-4. 删除 python 环境中已经安装好的的 paddlespeech,否则可能会影响 import paddlespeech 的顺序
+4. 删除 python 环境中已经安装好的 paddlespeech,否则可能会影响 import paddlespeech 的顺序
diff --git a/audio/paddleaudio/backends/soundfile_backend.py b/audio/paddleaudio/backends/soundfile_backend.py
index ae7b5b52d49..9195ea0974a 100644
--- a/audio/paddleaudio/backends/soundfile_backend.py
+++ b/audio/paddleaudio/backends/soundfile_backend.py
@@ -191,7 +191,7 @@ def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
if sr <= 0:
raise ParameterError(
- f'Sample rate should be larger than 0, recieved sr = {sr}')
+ f'Sample rate should be larger than 0, received sr = {sr}')
if y.dtype not in ['int16', 'int8']:
warnings.warn(
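A minimal usage sketch for the `soundfile_save` signature shown in this hunk. The import path is inferred from the file location and the tone parameters are made-up example values, not requirements of the backend:

```python
import numpy as np
# import path assumed from audio/paddleaudio/backends/soundfile_backend.py
from paddleaudio.backends.soundfile_backend import soundfile_save

# Hypothetical example: one second of a 440 Hz tone, cast to int16 so the
# dtype warning in the function above is not triggered.
sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
y = (0.5 * np.sin(2 * np.pi * 440 * t) * 32767).astype('int16')
soundfile_save(y, sr, 'tone.wav')  # raises ParameterError if sr <= 0
```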
diff --git a/demos/TTSAndroid/README.md b/demos/TTSAndroid/README.md
index 36ff969fb03..36848cbe370 100644
--- a/demos/TTSAndroid/README.md
+++ b/demos/TTSAndroid/README.md
@@ -1,6 +1,6 @@
# 语音合成 Java API Demo 使用指南
-在 Android 上实现语音合成功能,此 Demo 有很好的的易用性和开放性,如在 Demo 中跑自己训练好的模型等。
+在 Android 上实现语音合成功能,此 Demo 有很好的易用性和开放性,如在 Demo 中跑自己训练好的模型等。
本文主要介绍语音合成 Demo 运行方法。
diff --git a/demos/TTSArmLinux/front.conf b/demos/TTSArmLinux/front.conf
index 04bd2d97f05..5960b32a92d 100644
--- a/demos/TTSArmLinux/front.conf
+++ b/demos/TTSArmLinux/front.conf
@@ -6,13 +6,13 @@
--jieba_stop_word_path=./dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
---seperate_tone=false
+--separate_tone=false
--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
-#--seperate_tone=true
+#--separate_tone=true
#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
diff --git a/demos/TTSCppFrontend/front_demo/front.conf b/demos/TTSCppFrontend/front_demo/front.conf
index e9ce1c94d73..abff444703c 100644
--- a/demos/TTSCppFrontend/front_demo/front.conf
+++ b/demos/TTSCppFrontend/front_demo/front.conf
@@ -6,13 +6,13 @@
--jieba_stop_word_path=./front_demo/dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
---seperate_tone=false
+--separate_tone=false
--word2phone_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
-#--seperate_tone=true
+#--separate_tone=true
#--word2phone_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
diff --git a/demos/TTSCppFrontend/front_demo/front_demo.cpp b/demos/TTSCppFrontend/front_demo/front_demo.cpp
index 19f16758bc6..77f3fc725d0 100644
--- a/demos/TTSCppFrontend/front_demo/front_demo.cpp
+++ b/demos/TTSCppFrontend/front_demo/front_demo.cpp
@@ -20,7 +20,7 @@
DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized");
DEFINE_string(front_conf, "./front_demo/front.conf", "Front conf file");
-// DEFINE_string(seperate_tone, "true", "If true, get phoneids and tonesid");
+// DEFINE_string(separate_tone, "true", "If true, get phoneids and tonesid");
int main(int argc, char** argv) {
diff --git a/demos/TTSCppFrontend/front_demo/gentools/word2phones.py b/demos/TTSCppFrontend/front_demo/gentools/word2phones.py
index 8726ee89cf6..d9baeea9c09 100644
--- a/demos/TTSCppFrontend/front_demo/gentools/word2phones.py
+++ b/demos/TTSCppFrontend/front_demo/gentools/word2phones.py
@@ -20,7 +20,7 @@
newdict = "./dict/word_phones.dict"
-def GenPhones(initials, finals, seperate=True):
+def GenPhones(initials, finals, separate=True):
phones = []
for c, v in zip(initials, finals):
@@ -30,9 +30,9 @@ def GenPhones(initials, finals, seperate=True):
elif c in ['zh', 'ch', 'sh', 'r']:
v = re.sub('i', 'iii', v)
if c:
- if seperate is True:
+ if separate is True:
phones.append(c + '0')
- elif seperate is False:
+ elif separate is False:
phones.append(c)
else:
print("Not sure whether phone and tone need to be separated")
diff --git a/demos/TTSCppFrontend/src/front/front_interface.cpp b/demos/TTSCppFrontend/src/front/front_interface.cpp
index 8bd466d28e9..e7b08c798f8 100644
--- a/demos/TTSCppFrontend/src/front/front_interface.cpp
+++ b/demos/TTSCppFrontend/src/front/front_interface.cpp
@@ -126,7 +126,7 @@ int FrontEngineInterface::init() {
}
// 生成音调字典(音调到音调id的映射)
- if (_seperate_tone == "true") {
+ if (_separate_tone == "true") {
if (0 != GenDict(_tone2id_path, &tone_id_map)) {
LOG(ERROR) << "Genarate tone2id dict failed";
return -1;
@@ -168,7 +168,7 @@ int FrontEngineInterface::ReadConfFile() {
_jieba_stop_word_path = conf_map["jieba_stop_word_path"];
// dict path
- _seperate_tone = conf_map["seperate_tone"];
+ _separate_tone = conf_map["separate_tone"];
_word2phone_path = conf_map["word2phone_path"];
_phone2id_path = conf_map["phone2id_path"];
_tone2id_path = conf_map["tone2id_path"];
@@ -295,7 +295,7 @@ int FrontEngineInterface::GetWordsIds(
}
}
} else { // 标点符号
- if (_seperate_tone == "true") {
+ if (_separate_tone == "true") {
phone = "sp0"; // speedyspeech
} else {
phone = "sp"; // fastspeech2
@@ -354,7 +354,7 @@ int FrontEngineInterface::Phone2Phoneid(const std::string &phone,
std::string temp_phone;
for (int i = 0; i < phone_vec.size(); i++) {
temp_phone = phone_vec[i];
- if (_seperate_tone == "true") {
+ if (_separate_tone == "true") {
phoneid->push_back(atoi(
(phone_id_map[temp_phone.substr(0, temp_phone.length() - 1)])
.c_str()));
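For readers following the `separate_tone` rename: the flag controls whether a toned phone label such as `ma1` is looked up whole (fastspeech2-style) or split into a base phone plus a tone id (speedyspeech-style), as in the `substr(0, length - 1)` lookup above. A small Python illustration of that convention; the dictionaries are made-up stand-ins for `phone_id_map` / `tone_id_map`:

```python
def phone_to_ids(phone, phone_id_map, tone_id_map, separate_tone):
    """Illustration only: mirror the phone/tone split shown in Phone2Phoneid."""
    if separate_tone:
        # e.g. "ma1" -> base phone "ma", tone "1"
        return phone_id_map[phone[:-1]], tone_id_map[phone[-1]]
    # fastspeech2-style: the toned phone is a single symbol
    return (phone_id_map[phone],)

phone_id_map = {"ma": 7, "ma1": 42, "sp": 1, "sp0": 2}  # made-up ids
tone_id_map = {"0": 0, "1": 1}

print(phone_to_ids("ma1", phone_id_map, tone_id_map, separate_tone=True))   # (7, 1)
print(phone_to_ids("ma1", phone_id_map, tone_id_map, separate_tone=False))  # (42,)
```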
diff --git a/demos/TTSCppFrontend/src/front/front_interface.h b/demos/TTSCppFrontend/src/front/front_interface.h
index fc33a4de6bc..8c16859cf46 100644
--- a/demos/TTSCppFrontend/src/front/front_interface.h
+++ b/demos/TTSCppFrontend/src/front/front_interface.h
@@ -182,7 +182,7 @@ class FrontEngineInterface : public TextNormalizer {
std::string _jieba_idf_path;
std::string _jieba_stop_word_path;
- std::string _seperate_tone;
+ std::string _separate_tone;
std::string _word2phone_path;
std::string _phone2id_path;
std::string _tone2id_path;
diff --git a/demos/speech_web/README.md b/demos/speech_web/README.md
index 572781ab682..fc1fe7105f4 100644
--- a/demos/speech_web/README.md
+++ b/demos/speech_web/README.md
@@ -23,7 +23,7 @@ Paddle Speech Demo 是一个以 PaddleSpeech 的语音交互功能为主体开
+ ERNIE-SAT:语言-语音跨模态大模型 ERNIE-SAT 可视化展示示例,支持个性化合成,跨语言语音合成(音频为中文则输入英文文本进行合成),语音编辑(修改音频文字中间的结果)功能。 ERNIE-SAT 更多实现细节,可以参考:
+ [【ERNIE-SAT with AISHELL-3 dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/ernie_sat)
- + [【ERNIE-SAT with with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+ + [【ERNIE-SAT with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+ [【ERNIE-SAT with VCTK dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/ernie_sat)
运行效果:
diff --git a/demos/speech_web/speech_server/main.py b/demos/speech_web/speech_server/main.py
index 03e7e5996c2..f4678628f15 100644
--- a/demos/speech_web/speech_server/main.py
+++ b/demos/speech_web/speech_server/main.py
@@ -260,7 +260,7 @@ async def websocket_endpoint_online(websocket: WebSocket):
# and we break the loop
if message['signal'] == 'start':
resp = {"status": "ok", "signal": "server_ready"}
- # do something at begining here
+ # do something at beginning here
# create the instance to process the audio
# connection_handler = chatbot.asr.connection_handler
connection_handler = PaddleASRConnectionHanddler(engine)
diff --git a/docs/tutorial/st/st_tutorial.ipynb b/docs/tutorial/st/st_tutorial.ipynb
index 2fb850535ed..e755bebad17 100644
--- a/docs/tutorial/st/st_tutorial.ipynb
+++ b/docs/tutorial/st/st_tutorial.ipynb
@@ -62,7 +62,7 @@
"collapsed": false
},
"source": [
- "# 使用Transformer进行端到端语音翻译的的基本流程\n",
+ "# 使用Transformer进行端到端语音翻译的基本流程\n",
"## 基础模型\n",
"由于 ASR 章节已经介绍了 Transformer 以及语音特征抽取,在此便不做过多介绍,感兴趣的同学可以去相关章节进行了解。\n",
"\n",
diff --git a/docs/tutorial/tts/tts_tutorial.ipynb b/docs/tutorial/tts/tts_tutorial.ipynb
index 583adb01470..0cecb680d61 100644
--- a/docs/tutorial/tts/tts_tutorial.ipynb
+++ b/docs/tutorial/tts/tts_tutorial.ipynb
@@ -464,7 +464,7 @@
"
FastSpeech2 网络结构图\n",
"\n",
"\n",
- "PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似),这样的合成结果可以更加**稳定**。\n",
+ "PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似),这样的合成结果可以更加**稳定**。\n",
"
\n",
"
FastPitch 网络结构图\n",
"\n",
diff --git a/examples/librispeech/asr2/README.md b/examples/librispeech/asr2/README.md
index 26978520da2..253c9b45950 100644
--- a/examples/librispeech/asr2/README.md
+++ b/examples/librispeech/asr2/README.md
@@ -153,7 +153,7 @@ After training the model, we need to get the final model for testing and inferen
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
- avg.sh lastest exp/${ckpt}/checkpoints ${avg_num}
+ avg.sh latest exp/${ckpt}/checkpoints ${avg_num}
fi
```
The `avg.sh` is in the `../../../utils/` which is define in the `path.sh`.
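`avg.sh latest` picks the newest `${avg_num}` checkpoints and averages their parameters. Conceptually it reduces to the sketch below; this is a rough illustration, not the actual `avg.sh` implementation, and the checkpoint paths are assumed:

```python
import paddle

def average_checkpoints(paths):
    """Average parameter tensors across several checkpoints (illustrative)."""
    avg = None
    for path in paths:
        state = paddle.load(path)
        if avg is None:
            avg = {k: v.astype('float32') for k, v in state.items()}
        else:
            for k in avg:
                avg[k] += state[k].astype('float32')
    return {k: v / len(paths) for k, v in avg.items()}

# e.g. the 5 latest checkpoints of an experiment (paths are hypothetical)
paths = [f"exp/transformer/checkpoints/{n}.pdparams" for n in range(16, 21)]
paddle.save(average_checkpoints(paths), "exp/transformer/checkpoints/avg_5.pdparams")
```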
diff --git a/examples/other/mfa/local/generate_lexicon.py b/examples/other/mfa/local/generate_lexicon.py
index 3deb2470189..e63b5eb27d0 100644
--- a/examples/other/mfa/local/generate_lexicon.py
+++ b/examples/other/mfa/local/generate_lexicon.py
@@ -48,7 +48,7 @@ def rule(C, V, R, T):
'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'.
- Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
+ Erhua is possibly applied to every final, except for finals that already end with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
to filter it out.
diff --git a/examples/tiny/asr1/README.md b/examples/tiny/asr1/README.md
index cfa26670451..489f5bc3e76 100644
--- a/examples/tiny/asr1/README.md
+++ b/examples/tiny/asr1/README.md
@@ -37,7 +37,7 @@ It will support the way of using `--variable value` in the shell scripts.
Some local variables are set in `run.sh`.
`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
`stage` denotes the number of stage you want the start from in the experiments.
-`stop stage` denotes the number of stage you want the stop at in the expriments.
+`stop_stage` denotes the number of the stage you want to stop at in the experiments.
`conf_path` denotes the config path of the model.
`avg_num`denotes the number K of top-K models you want to average to get the final model.
`ckpt` denotes the checkpoint prefix of the model, e.g. "transformerr"
diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py
index 6663bcf87be..37d99226204 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -267,7 +267,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'to'):
- logger.debug("register user to to paddle.Tensor, remove this when fixed!")
+ logger.debug("register user-defined to() to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'to', to)
setattr(paddle.static.Variable, 'to', to)
diff --git a/paddlespeech/s2t/frontend/augmentor/augmentation.py b/paddlespeech/s2t/frontend/augmentor/augmentation.py
index 4c5ca4fe630..744ea56dd79 100644
--- a/paddlespeech/s2t/frontend/augmentor/augmentation.py
+++ b/paddlespeech/s2t/frontend/augmentor/augmentation.py
@@ -45,7 +45,7 @@ class AugmentationPipeline():
samples to make the model invariant to certain types of perturbations in the
real world, improving model's generalization ability.
- The pipeline is built according the the augmentation configuration in json
+ The pipeline is built according to the augmentation configuration in json
string, e.g.
.. code-block::
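The JSON configuration this docstring refers to is a list of augmentor entries. A hedged Python sketch of building such a string; the `type`/`params`/`prob` layout follows the usual PaddleSpeech augmentation configs, but the concrete augmentors and values here are illustrative:

```python
import json

# Illustrative augmentation config: each entry names an augmentor, its
# parameters, and the probability of applying it to a training sample.
augmentation_config = json.dumps([
    {"type": "shift",
     "params": {"min_shift_ms": -5, "max_shift_ms": 5},
     "prob": 1.0},
    {"type": "speed",
     "params": {"min_speed_rate": 0.9, "max_speed_rate": 1.1, "num_rates": 3},
     "prob": 0.5},
])
# This JSON string is what the AugmentationPipeline parses to build the pipeline.
print(augmentation_config)
```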
diff --git a/paddlespeech/s2t/io/speechbrain/sampler.py b/paddlespeech/s2t/io/speechbrain/sampler.py
index ba13193eb6e..09a884c2b84 100755
--- a/paddlespeech/s2t/io/speechbrain/sampler.py
+++ b/paddlespeech/s2t/io/speechbrain/sampler.py
@@ -283,7 +283,7 @@ def _get_boundaries_through_warping(
num_quantiles, )
# get quantiles using lognormal distribution
quantiles = lognorm.ppf(latent_boundaries, 1)
- # scale up to to max_batch_length
+ # scale up to max_batch_length
bucket_boundaries = quantiles * max_batch_length / quantiles[-1]
# compute resulting bucket length multipliers
length_multipliers = [
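The comment fixed above describes how bucket boundaries are derived: lognormal quantiles are rescaled so that the largest boundary equals `max_batch_length`. A self-contained sketch of that computation; the quantile spacing and `max_batch_length` are example values, not the exact ones in `sampler.py`:

```python
import numpy as np
from scipy.stats import lognorm

num_quantiles = 4
max_batch_length = 60000  # example: max total length per batch

# latent boundaries in (0, 1); the exact spacing in sampler.py may differ
latent_boundaries = np.linspace(1 / num_quantiles,
                                (num_quantiles - 1) / num_quantiles,
                                num_quantiles - 1)
# map them through the lognormal quantile function (shape parameter 1)
quantiles = lognorm.ppf(latent_boundaries, 1)
# scale so the largest boundary equals max_batch_length
bucket_boundaries = quantiles * max_batch_length / quantiles[-1]
print(bucket_boundaries)  # increasing, last entry == max_batch_length
```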
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 6494b5304c4..f716fa3b57f 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -560,7 +560,7 @@ def attention_rescoring(self,
[len(hyp[0]) for hyp in hyps], place=device,
dtype=paddle.long) # (beam_size,)
hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id)
- hyps_lens = hyps_lens + 1 # Add at begining
+ hyps_lens = hyps_lens + 1 # Add at beginning
logger.debug(
f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}")
@@ -709,7 +709,7 @@ def forward_attention_decoder(self,
hypothesis from ctc prefix beam search and one encoder output
Args:
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
- pad sos at the begining, (B, T)
+ pad sos at the beginning, (B, T)
hyps_lens (paddle.Tensor): length of each hyp in hyps, (B)
encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D)
Returns:
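The docstring refers to hypotheses already padded with `sos`, which is why the code above bumps `hyps_lens` by one. A toy illustration of that bookkeeping; the helper below is a simplified stand-in for the real `add_sos_eos` utility, with made-up token ids:

```python
import paddle

sos, ignore_id = 5000, -1  # made-up ids for illustration

def prepend_sos_demo(hyps_pad, sos, ignore_id):
    """Simplified stand-in: prepend sos to each hypothesis, keep ignore_id padding."""
    out = []
    for hyp in hyps_pad.tolist():
        tokens = [t for t in hyp if t != ignore_id]
        out.append([sos] + tokens)
    max_len = max(len(h) for h in out)
    out = [h + [ignore_id] * (max_len - len(h)) for h in out]
    return paddle.to_tensor(out)

hyps_pad = paddle.to_tensor([[12, 7, 9], [3, 8, ignore_id]])
hyps_lens = paddle.to_tensor([3, 2])
hyps_pad = prepend_sos_demo(hyps_pad, sos, ignore_id)
hyps_lens = hyps_lens + 1  # each hypothesis now starts with an extra sos token
print(hyps_pad.numpy(), hyps_lens.numpy())
```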
diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py
index 31defbbaf1b..b4c8c255f01 100644
--- a/paddlespeech/s2t/models/u2_st/u2_st.py
+++ b/paddlespeech/s2t/models/u2_st/u2_st.py
@@ -455,7 +455,7 @@ def forward_attention_decoder(
hypothesis from ctc prefix beam search and one encoder output
Args:
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
- pad sos at the begining, (B, T)
+ pad sos at the beginning, (B, T)
hyps_lens (paddle.Tensor): length of each hyp in hyps, (B)
encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D)
Returns:
diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py
index 536ffe0a906..a702f0aa12d 100644
--- a/paddlespeech/server/engine/asr/online/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py
@@ -609,7 +609,7 @@ def rescoring(self):
dtype=paddle.long) # (beam_size,)
hyps_pad, _ = add_sos_eos(hyps_pad, self.model.sos, self.model.eos,
self.model.ignore_id)
- hyps_lens = hyps_lens + 1 # Add at begining
+ hyps_lens = hyps_lens + 1 # Add at beginning
# ctc score in ln domain
# (beam_size, max_hyps_len, vocab_size)
diff --git a/paddlespeech/server/ws/asr_api.py b/paddlespeech/server/ws/asr_api.py
index ae1c8831077..b3ad0b7c502 100644
--- a/paddlespeech/server/ws/asr_api.py
+++ b/paddlespeech/server/ws/asr_api.py
@@ -67,7 +67,7 @@ async def websocket_endpoint(websocket: WebSocket):
# and we break the loop
if message['signal'] == 'start':
resp = {"status": "ok", "signal": "server_ready"}
- # do something at begining here
+ # do something at beginning here
# create the instance to process the audio
#connection_handler = PaddleASRConnectionHanddler(asr_model)
connection_handler = asr_model.new_handler()
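The handler above implements a start/stop handshake: a text frame carrying `{"signal": "start"}` is answered with `{"status": "ok", "signal": "server_ready"}` before audio is streamed. A minimal client sketch, assuming the third-party `websockets` package and a locally served endpoint; the URL, port, and the closing `"end"` frame are assumptions, not part of this patch:

```python
import asyncio
import json

import websockets  # assumed client library, not part of the PaddleSpeech server code

async def handshake(url="ws://127.0.0.1:8090/paddlespeech/asr/streaming"):
    async with websockets.connect(url) as ws:
        await ws.send(json.dumps({"signal": "start"}))       # start signal
        resp = json.loads(await ws.recv())
        assert resp.get("signal") == "server_ready"
        # ... stream binary audio chunks here ...
        await ws.send(json.dumps({"signal": "end"}))          # assumed stop signal
        print(json.loads(await ws.recv()))

asyncio.run(handshake())
```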
diff --git a/paddlespeech/t2s/frontend/generate_lexicon.py b/paddlespeech/t2s/frontend/generate_lexicon.py
index 6b467d00e12..4fb748a69bb 100644
--- a/paddlespeech/t2s/frontend/generate_lexicon.py
+++ b/paddlespeech/t2s/frontend/generate_lexicon.py
@@ -45,7 +45,7 @@ def rule(C, V, R, T):
'u' in syllables when certain conditions are satisfied.
'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'.
- Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
+ Erhua is possibly applied to every final, except for finals that already end with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
to filter it out.
"""
diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py
index 8e2ce822fd2..b4818cab4cd 100644
--- a/paddlespeech/t2s/models/waveflow.py
+++ b/paddlespeech/t2s/models/waveflow.py
@@ -236,7 +236,7 @@ def add_input(self, x_row, condition_row):
Returns:
res (Tensor):
- A row of the the residual output. shape=(batch_size, channel, 1, width)
+ A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor):
A row of the skip output. shape=(batch_size, channel, 1, width)
@@ -343,7 +343,7 @@ def add_input(self, x_row, condition_row):
Returns:
res (Tensor):
- A row of the the residual output. shape=(batch_size, channel, 1, width)
+ A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor):
A row of the skip output. shape=(batch_size, channel, 1, width)
@@ -465,7 +465,7 @@ def _start_sequence(self):
self.resnet.start_sequence()
def inverse(self, z, condition):
- """Sampling from the the distrition p(X). It is done by sample form
+ """Sampling from the distrition p(X). It is done by sample form
p(Z) and transform the sample. It is a auto regressive transformation.
Args:
@@ -600,7 +600,7 @@ def forward(self, x, condition):
return z, log_det_jacobian
def inverse(self, z, condition):
- """Sampling from the the distrition p(X).
+ """Sampling from the distrition p(X).
It is done by sample a ``z`` form p(Z) and transform it into ``x``.
Each Flow transform .. math:: `z_{i-1}` to .. math:: `z_{i}` in an
diff --git a/paddlespeech/t2s/modules/transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py
index 22217d50f51..85336f4f3ca 100644
--- a/paddlespeech/t2s/modules/transformer/lightconv.py
+++ b/paddlespeech/t2s/modules/transformer/lightconv.py
@@ -110,7 +110,7 @@ def forward(self, query, key, value, mask):
(batch, time1, time2) mask
Return:
- Tensor: ouput. (batch, time1, d_model)
+ Tensor: output. (batch, time1, d_model)
"""
# linear -> GLU -> lightconv -> linear
diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py
index bf014045d0a..2dc7a7164c5 100644
--- a/paddlespeech/vector/exps/ecapa_tdnn/train.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py
@@ -51,7 +51,7 @@ def main(args, config):
# stage0: set the training device, cpu or gpu
paddle.set_device(args.device)
- # stage1: we must call the paddle.distributed.init_parallel_env() api at the begining
+ # stage1: we must call the paddle.distributed.init_parallel_env() api at the beginning
paddle.distributed.init_parallel_env()
nranks = paddle.distributed.get_world_size()
rank = paddle.distributed.get_rank()
@@ -146,7 +146,7 @@ def main(args, config):
timer.start()
for epoch in range(start_epoch + 1, config.epochs + 1):
- # at the begining, model must set to train mode
+ # at the beginning, model must set to train mode
model.train()
avg_loss = 0
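The two comments fixed above bracket the standard Paddle data-parallel setup: call `paddle.distributed.init_parallel_env()` once at the beginning, then switch the model back to train mode at the start of every epoch. A condensed sketch of that pattern; the model and the training loop body are placeholders:

```python
import paddle
import paddle.distributed as dist

paddle.set_device("gpu")          # or "cpu"
dist.init_parallel_env()          # must run before any collective communication
nranks = dist.get_world_size()
rank = dist.get_rank()

model = paddle.nn.Linear(80, 192)  # placeholder for the ECAPA-TDNN model
if nranks > 1:
    model = paddle.DataParallel(model)

for epoch in range(1, 3):
    model.train()                  # reset to train mode at the beginning of each epoch
    # ... iterate over the (placeholder) training loader and update parameters
```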
diff --git a/paddlespeech/vector/exps/ge2e/preprocess.py b/paddlespeech/vector/exps/ge2e/preprocess.py
index dabe0ce7694..ee59e62457a 100644
--- a/paddlespeech/vector/exps/ge2e/preprocess.py
+++ b/paddlespeech/vector/exps/ge2e/preprocess.py
@@ -42,7 +42,7 @@
parser.add_argument(
"--skip_existing",
action="store_true",
- help="Whether to skip ouput files with the same name. Useful if this script was interrupted."
+ help="Whether to skip output files with the same name. Useful if this script was interrupted."
)
parser.add_argument(
"--no_trim",
diff --git a/speechx/examples/ds2_ol/onnx/local/onnx_infer_shape.py b/speechx/examples/ds2_ol/onnx/local/onnx_infer_shape.py
index c53e9ec920d..65709fc2be1 100755
--- a/speechx/examples/ds2_ol/onnx/local/onnx_infer_shape.py
+++ b/speechx/examples/ds2_ol/onnx/local/onnx_infer_shape.py
@@ -2078,7 +2078,7 @@ def _infer_PythonOp(self, node):
output_tensor_ranks = get_attribute(node, 'output_tensor_ranks')
assert output_tensor_ranks
- # set the context output seperately.
+ # set the context output separately.
# The first output is autograd's context.
vi = self.known_vi_[node.output[0]]
vi.CopyFrom(
diff --git a/speechx/speechx/frontend/audio/db_norm.cc b/speechx/speechx/frontend/audio/db_norm.cc
index ad79fcc3a4a..7141fc8077a 100644
--- a/speechx/speechx/frontend/audio/db_norm.cc
+++ b/speechx/speechx/frontend/audio/db_norm.cc
@@ -76,7 +76,7 @@ bool DecibelNormalizer::Compute(VectorBase* waves) const {
if (gain > opts_.max_gain_db) {
LOG(ERROR)
<< "Unable to normalize segment to " << opts_.target_db << "dB,"
- << "because the the probable gain have exceeds opts_.max_gain_db"
+ << "because the probable gain has exceeded opts_.max_gain_db"
<< opts_.max_gain_db << "dB.";
return false;
}
diff --git a/speechx/speechx/kaldi/base/kaldi-types.h b/speechx/speechx/kaldi/base/kaldi-types.h
index c6a3e1aedb9..07381cf2af8 100644
--- a/speechx/speechx/kaldi/base/kaldi-types.h
+++ b/speechx/speechx/kaldi/base/kaldi-types.h
@@ -40,7 +40,7 @@ typedef float BaseFloat;
#include
// for discussion on what to do if you need compile kaldi
-// without OpenFST, see the bottom of this this file
+// without OpenFST, see the bottom of this file
#ifndef COMPILE_WITHOUT_OPENFST
diff --git a/speechx/speechx/kaldi/feat/pitch-functions.cc b/speechx/speechx/kaldi/feat/pitch-functions.cc
index 430e9bdb53a..d71169ec916 100644
--- a/speechx/speechx/kaldi/feat/pitch-functions.cc
+++ b/speechx/speechx/kaldi/feat/pitch-functions.cc
@@ -746,7 +746,7 @@ OnlinePitchFeatureImpl::OnlinePitchFeatureImpl(
Vector<BaseFloat> lags_offset(lags_);
// lags_offset equals lags_ (which are the log-spaced lag values we want to
// measure the NCCF at) with nccf_first_lag_ / opts.resample_freq subtracted
- // from each element, so we can treat the measured NCCF values as as starting
+ // from each element, so we can treat the measured NCCF values as starting
// from sample zero in a signal that starts at the point start /
// opts.resample_freq. This is necessary because the ArbitraryResample code
// assumes that the input signal starts from sample zero.
diff --git a/speechx/speechx/kaldi/lat/lattice-functions.h b/speechx/speechx/kaldi/lat/lattice-functions.h
index 6b1b6656c27..785d3f96ec9 100644
--- a/speechx/speechx/kaldi/lat/lattice-functions.h
+++ b/speechx/speechx/kaldi/lat/lattice-functions.h
@@ -355,12 +355,12 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
//
//
// /// This function returns the number of words in the longest sentence in a
-// /// CompactLattice (i.e. the the maximum of any path, of the count of
+// /// CompactLattice (i.e. the maximum of any path, of the count of
// /// olabels on that path).
// int32 LongestSentenceLength(const Lattice &lat);
//
// /// This function returns the number of words in the longest sentence in a
-// /// CompactLattice, i.e. the the maximum of any path, of the count of
+// /// CompactLattice, i.e. the maximum of any path, of the count of
// /// labels on that path... note, in CompactLattice, the ilabels and olabels
// /// are identical because it is an acceptor.
// int32 LongestSentenceLength(const CompactLattice &lat);
@@ -408,7 +408,7 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
//
// /// This function computes the mapping from the pair
// /// (frame-index, transition-id) to the pair
-// /// (sum-of-acoustic-scores, num-of-occurences) over all occurences of the
+// /// (sum-of-acoustic-scores, num-of-occurrences) over all occurrences of the
// /// transition-id in that frame.
// /// frame-index in the lattice.
// /// This function is useful for retaining the acoustic scores in a
@@ -422,13 +422,13 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
// /// @param [out] acoustic_scores
// /// Pointer to a map from the pair (frame-index,
// /// transition-id) to a pair (sum-of-acoustic-scores,
-// /// num-of-occurences).
+// /// num-of-occurrences).
// /// Usually the acoustic scores for a pdf-id (and hence
// /// transition-id) on a frame will be the same for all the
-// /// occurences of the pdf-id in that frame.
+// /// occurrences of the pdf-id in that frame.
// /// But if not, we will take the average of the acoustic
// /// scores. Hence, we store both the sum-of-acoustic-scores
-// /// and the num-of-occurences of the transition-id in that
+// /// and the num-of-occurrences of the transition-id in that
// /// frame.
// void ComputeAcousticScoresMap(
// const Lattice &lat,
@@ -440,8 +440,8 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
// ///
// /// @param [in] acoustic_scores
// /// A map from the pair (frame-index, transition-id) to a
-// /// pair (sum-of-acoustic-scores, num-of-occurences) of
-// /// the occurences of the transition-id in that frame.
+// /// pair (sum-of-acoustic-scores, num-of-occurrences) of
+// /// the occurrences of the transition-id in that frame.
// /// See the comments for ComputeAcousticScoresMap for
// /// details.
// /// @param [out] lat Pointer to the output lattice.
diff --git a/speechx/speechx/kaldi/matrix/kaldi-matrix.cc b/speechx/speechx/kaldi/matrix/kaldi-matrix.cc
index faf23cdf0c5..85e6fecc861 100644
--- a/speechx/speechx/kaldi/matrix/kaldi-matrix.cc
+++ b/speechx/speechx/kaldi/matrix/kaldi-matrix.cc
@@ -1646,7 +1646,7 @@ SubMatrix::SubMatrix(const MatrixBase &M,
static_cast<UnsignedMatrixIndexT>(M.num_rows_ - ro) &&
static_cast<UnsignedMatrixIndexT>(c) <=
static_cast<UnsignedMatrixIndexT>(M.num_cols_ - co));
- // point to the begining of window
+ // point to the beginning of window
MatrixBase::num_rows_ = r;
MatrixBase::num_cols_ = c;
MatrixBase::stride_ = M.Stride();
diff --git a/speechx/speechx/kaldi/matrix/sparse-matrix.cc b/speechx/speechx/kaldi/matrix/sparse-matrix.cc
index 68a61e17dc3..192d258457c 100644
--- a/speechx/speechx/kaldi/matrix/sparse-matrix.cc
+++ b/speechx/speechx/kaldi/matrix/sparse-matrix.cc
@@ -998,7 +998,7 @@ void FilterCompressedMatrixRows(const CompressedMatrix &in,
// iterating row-wise versus column-wise in compressed-matrix uncompression.
if (num_kept_rows > heuristic * in.NumRows()) {
- // if quite a few of the the rows are kept, it may be more efficient
+ // if quite a few of the rows are kept, it may be more efficient
// to uncompress the entire compressed matrix, since per-column operation
// is more efficient.
Matrix<BaseFloat> full_mat(in);
diff --git a/speechx/speechx/kaldi/util/kaldi-table-inl.h b/speechx/speechx/kaldi/util/kaldi-table-inl.h
index 6aca2f137e3..175e27049a0 100644
--- a/speechx/speechx/kaldi/util/kaldi-table-inl.h
+++ b/speechx/speechx/kaldi/util/kaldi-table-inl.h
@@ -1587,7 +1587,7 @@ template class RandomAccessTableReaderImplBase {
// this from a pipe. In principle we could read it on-demand as for the
// archives, but this would probably be overkill.
-// Note: the code for this this class is similar to TableWriterScriptImpl:
+// Note: the code for this class is similar to TableWriterScriptImpl:
// try to keep them in sync.
template<class Holder>
class RandomAccessTableReaderScriptImpl:
diff --git a/speechx/speechx/nnet/ds2_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc
index 22c7f61b82d..f30d7979cd2 100644
--- a/speechx/speechx/nnet/ds2_nnet.cc
+++ b/speechx/speechx/nnet/ds2_nnet.cc
@@ -105,7 +105,7 @@ paddle_infer::Predictor* PaddleNnet::GetPredictor() {
while (pred_id < pool_usages.size()) {
if (pool_usages[pred_id] == false) {
- predictor = pool->Retrive(pred_id);
+ predictor = pool->Retrieve(pred_id);
break;
}
++pred_id;
diff --git a/speechx/speechx/protocol/websocket/websocket_server.cc b/speechx/speechx/protocol/websocket/websocket_server.cc
index 14f2f6e9fb4..d1bed1ca11e 100644
--- a/speechx/speechx/protocol/websocket/websocket_server.cc
+++ b/speechx/speechx/protocol/websocket/websocket_server.cc
@@ -32,14 +32,14 @@ void ConnectionHandler::OnSpeechStart() {
decode_thread_ = std::make_shared<std::thread>(
&ConnectionHandler::DecodeThreadFunc, this);
got_start_tag_ = true;
- LOG(INFO) << "Server: Recieved speech start signal, start reading speech";
+ LOG(INFO) << "Server: Received speech start signal, start reading speech";
json::value rv = {{"status", "ok"}, {"type", "server_ready"}};
ws_.text(true);
ws_.write(asio::buffer(json::serialize(rv)));
}
void ConnectionHandler::OnSpeechEnd() {
- LOG(INFO) << "Server: Recieved speech end signal";
+ LOG(INFO) << "Server: Received speech end signal";
if (recognizer_ != nullptr) {
recognizer_->SetFinished();
}
@@ -70,8 +70,8 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
pcm_data(i) = static_cast<float>(*pdata);
pdata++;
}
- VLOG(2) << "Server: Recieved " << num_samples << " samples";
- LOG(INFO) << "Server: Recieved " << num_samples << " samples";
+ VLOG(2) << "Server: Received " << num_samples << " samples";
+ LOG(INFO) << "Server: Received " << num_samples << " samples";
CHECK(recognizer_ != nullptr);
recognizer_->Accept(pcm_data);
diff --git a/tools/extras/install_mkl.sh b/tools/extras/install_mkl.sh
index 8c1899bdf2f..01bce64fe27 100755
--- a/tools/extras/install_mkl.sh
+++ b/tools/extras/install_mkl.sh
@@ -166,7 +166,7 @@ variable, sudo might not allow it to propagate to the command that it invokes."
fi
# The install variants, each in a function to simplify error reporting.
-# Each one invokes a subshell with a 'set -x' to to show system-modifying
+# Each one invokes a subshell with a 'set -x' to show system-modifying
# commands it runs. The subshells simply limit the scope of this diagnostics
# and avoid creating noise (if we were using 'set +x', it would be printed).
Install_redhat () {
diff --git a/utils/fst/ctc_token_fst.py b/utils/fst/ctc_token_fst.py
index 2262912c8bf..f63e9cdacb5 100755
--- a/utils/fst/ctc_token_fst.py
+++ b/utils/fst/ctc_token_fst.py
@@ -6,7 +6,7 @@ def main(args):
"""Token Transducer"""
# entry
print('0 1 ')
- # skip begining and ending
+ # skip beginning and ending
print('1 1 ')
print('2 2 ')
# exit
diff --git a/utils/tokenizer.perl b/utils/tokenizer.perl
index ae97d6582bd..836fe19c612 100644
--- a/utils/tokenizer.perl
+++ b/utils/tokenizer.perl
@@ -296,7 +296,7 @@ sub tokenize
$text =~ s/DOTMULTI\./DOTDOTMULTI/g;
}
- # seperate out "," except if within numbers (5,300)
+ # separate out "," except if within numbers (5,300)
#$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
# separate out "," except if within numbers (5,300)
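For reference, the comma rule this comment describes — split a comma off with spaces unless both neighbours are digits — looks like the following as a hedged Python equivalent of the Perl substitution; Perl's `\p{IsN}` is approximated with `\d` here:

```python
import re

def separate_commas(text: str) -> str:
    # separate out "," except if within numbers (5,300)
    return re.sub(r"([^\d]),([^\d])", r"\1 , \2", text)

print(separate_commas("yes,no and 5,300"))  # -> "yes , no and 5,300"
```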