-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[examples] add paraformer finetune recipe (#2289)
* [examples] add paraformer finetune recipe * fix predictor type in yaml
- Loading branch information
Showing
8 changed files
with
439 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# Preliminary
1. Run the command below to convert the funasr-style checkpoint to a wenet-style checkpoint:
```sh
output_dir=exp/paraformer/large
mkdir -p ${output_dir}
. ./path.sh && python wenet/paraformer/convert_paraformer_to_wenet_config_and_ckpt.py \
  --output_dir ${output_dir}
```

# Performance Record

## Paraformer (original) Result

| decoding mode             | CER   |
|---------------------------|-------|
| paraformer greedy search  | 1.95  |

## Paraformer (full-parameter tuning) Result

* Training info: batch size 28, ctc_weight: 0.3, acc_grad 4, 8 * v100 gpu, 40 epochs
* Decoding info: ctc_weight 0.3, average_num 5
* Git hash: TBD

| decoding mode             | CER   |
|---------------------------|-------|
| ctc greedy search         | 4.00  |
| ctc prefix beam search    | 4.00  |
| paraformer greedy search  | 2.16  |

## Paraformer-dynamic training (full-parameter tuning) Result

* Training info: batch size 28, ctc_weight: 0.3, acc_grad 4, 8 * v100 gpu, 43 epochs
* Decoding info: ctc_weight 0.3, average_num 5
* Git hash: TBD

| decoding mode             | full   | 16   |
|---------------------------|--------|------|
| ctc greedy search         | 3.93   | 4.94 |
| ctc prefix beam search    | 3.93   | 4.94 |
| paraformer greedy search  | 2.08   | 2.41 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
# Paraformer (large) full-parameter fine-tuning config.
# Model topology and exp/paraformer/large/* paths must match the checkpoint
# produced by convert_paraformer_to_wenet_config_and_ckpt.py.
encoder: sanm_encoder
encoder_conf:
  attention_dropout_rate: 0.1
  attention_heads: 4
  dropout_rate: 0.1
  input_layer: paraformer_dummy
  kernel_size: 11
  linear_units: 2048
  normalize_before: true
  num_blocks: 50
  output_size: 512
  pos_enc_layer_type: abs_pos_paraformer
  positional_dropout_rate: 0.1
  sanm_shfit: 0  # NOTE(review): spelling follows the upstream config key — do not "fix"

decoder: sanm_decoder
decoder_conf:
  att_layer_num: 16
  attention_heads: 4
  dropout_rate: 0.1
  kernel_size: 11
  linear_units: 2048
  num_blocks: 16
  positional_dropout_rate: 0.1
  sanm_shfit: 0
  self_attention_dropout_rate: 0.1
  src_attention_dropout_rate: 0.1

tokenizer: paraformer
tokenizer_conf:
  seg_dict_path: exp/paraformer/large/seg_dict
  special_tokens:
    <blank>: 0
    <eos>: 2
    <sos>: 1
    <unk>: 8403
  symbol_table_path: exp/paraformer/large/units.txt

ctc: ctc
ctc_conf:
  ctc_blank_id: 0

cmvn: global_cmvn
cmvn_conf:
  cmvn_file: exp/paraformer/large/global_cmvn
  is_json_cmvn: true

model: paraformer
model_conf:
  ctc_weight: 0.3
  length_normalized_loss: false
  lsm_weight: 0.1
  predictor_bias: 1
  predictor_weight: 1.0
  sampling_ratio: 0.75

predictor: paraformer_predictor
predictor_conf:
  cnn_groups: 1
  idim: 512
  l_order: 1
  noise_threshold2: 0.01
  r_order: 1
  residual: false
  smooth_factor2: 0.25
  tail_threshold: 0.45
  threshold: 1.0
  upsample_times: 3
  upsample_type: cnn_blstm
  use_cif1_cnn: false

dataset: asr
dataset_conf:
  filter_conf:
    max_length: 40960
    min_length: 0
    token_max_length: 200
    token_min_length: 1
  resample_conf:
    resample_rate: 16000
  speed_perturb: true
  fbank_conf:
    num_mel_bins: 80
    frame_shift: 10
    frame_length: 25
    dither: 0.1
  spec_aug: true
  spec_aug_conf:
    num_t_mask: 2
    num_f_mask: 2
    max_t: 50
    max_f: 10
  shuffle: true
  shuffle_conf:
    shuffle_size: 1500
  sort: true
  sort_conf:
    sort_size: 500  # sort_size should be less than shuffle_size
  batch_conf:
    batch_type: 'static'  # static or dynamic
    batch_size: 28

grad_clip: 5
accum_grad: 1
max_epoch: 45
log_interval: 100

optim: adam
optim_conf:
  lr: 0.0005
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 25000
114 changes: 114 additions & 0 deletions
114
examples/aishell/paraformer/conf/train_paraformer_dynamic.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
# Paraformer (large) full-parameter fine-tuning config with dynamic-chunk
# training enabled (use_dynamic_chunk: true); otherwise identical to
# train_paraformer.yaml. Paths under exp/paraformer/large/* must match the
# checkpoint produced by convert_paraformer_to_wenet_config_and_ckpt.py.
encoder: sanm_encoder
encoder_conf:
  attention_dropout_rate: 0.1
  attention_heads: 4
  dropout_rate: 0.1
  input_layer: paraformer_dummy
  kernel_size: 11
  linear_units: 2048
  normalize_before: true
  num_blocks: 50
  output_size: 512
  pos_enc_layer_type: abs_pos_paraformer
  positional_dropout_rate: 0.1
  sanm_shfit: 0  # NOTE(review): spelling follows the upstream config key — do not "fix"
  use_dynamic_chunk: true

decoder: sanm_decoder
decoder_conf:
  att_layer_num: 16
  attention_heads: 4
  dropout_rate: 0.1
  kernel_size: 11
  linear_units: 2048
  num_blocks: 16
  positional_dropout_rate: 0.1
  sanm_shfit: 0
  self_attention_dropout_rate: 0.1
  src_attention_dropout_rate: 0.1

tokenizer: paraformer
tokenizer_conf:
  seg_dict_path: exp/paraformer/large/seg_dict
  special_tokens:
    <blank>: 0
    <eos>: 2
    <sos>: 1
    <unk>: 8403
  symbol_table_path: exp/paraformer/large/units.txt

ctc: ctc
ctc_conf:
  ctc_blank_id: 0

cmvn: global_cmvn
cmvn_conf:
  cmvn_file: exp/paraformer/large/global_cmvn
  is_json_cmvn: true

model: paraformer
model_conf:
  ctc_weight: 0.3
  length_normalized_loss: false
  lsm_weight: 0.1
  predictor_bias: 1
  predictor_weight: 1.0
  sampling_ratio: 0.75

predictor: paraformer_predictor
predictor_conf:
  cnn_groups: 1
  idim: 512
  l_order: 1
  noise_threshold2: 0.01
  r_order: 1
  residual: false
  smooth_factor2: 0.25
  tail_threshold: 0.45
  threshold: 1.0
  upsample_times: 3
  upsample_type: cnn_blstm
  use_cif1_cnn: false

dataset: asr
dataset_conf:
  filter_conf:
    max_length: 40960
    min_length: 0
    token_max_length: 200
    token_min_length: 1
  resample_conf:
    resample_rate: 16000
  speed_perturb: true
  fbank_conf:
    num_mel_bins: 80
    frame_shift: 10
    frame_length: 25
    dither: 0.1
  spec_aug: true
  spec_aug_conf:
    num_t_mask: 2
    num_f_mask: 2
    max_t: 50
    max_f: 10
  shuffle: true
  shuffle_conf:
    shuffle_size: 1500
  sort: true
  sort_conf:
    sort_size: 500  # sort_size should be less than shuffle_size
  batch_conf:
    batch_type: 'static'  # static or dynamic
    batch_size: 28

grad_clip: 5
accum_grad: 1
max_epoch: 45
log_interval: 100

optim: adam
optim_conf:
  lr: 0.0005
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 25000
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../whisper/local
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# path.sh — sourced by recipe scripts to put the wenet runtime binaries,
# kaldi tools and openfst on PATH, and the repo root on PYTHONPATH.
# Scrape artifacts (trailing "|  ||" diff-table junk) removed: they made
# every export line a shell syntax error.
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build
export OPENFST_BIN=${BUILD_DIR}/../fc_base/openfst-build/src
export PATH=$PWD:${BUILD_DIR}/bin:${BUILD_DIR}/kaldi:${OPENFST_BIN}/bin:$PATH

# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=../../../:$PYTHONPATH
Oops, something went wrong.