From b2b9845ab98f0d0387caccef673d71eef79adb9f Mon Sep 17 00:00:00 2001 From: Theo Lepage Date: Thu, 21 Oct 2021 01:20:33 +0200 Subject: [PATCH] refactor(all): remove useless models and code --- .gitignore | 2 +- README.md | 61 +- cache_features.py | 24 - configs/cpc-base-kaldi.json | 50 - configs/cpc-base-kaldi_boosted.json | 51 - configs/cpc-base.json | 33 + configs/debug.json | 40 - configs/moco-base-kaldi.json | 43 - configs/simclr-base.json | 29 + evaluate.ipynb | 880 ---------------- evaluate.py | 52 +- extract_embeddings.py | 57 - kaldi/.gitignore | 2 - kaldi/cmd.sh | 13 - kaldi/conf/mfcc.conf | 7 - kaldi/conf/vad.conf | 4 - kaldi/local/add_disambig.pl | 58 - kaldi/local/add_lex_disambig.pl | 195 ---- kaldi/local/analyze_segments.pl | 43 - kaldi/local/apply_map.pl | 85 -- kaldi/local/best_wer.sh | 32 - kaldi/local/build_const_arpa_lm.sh | 49 - kaldi/local/check_spk_emb_range.py | 9 - kaldi/local/combine_data.sh | 128 --- kaldi/local/compute_min_dcf.py | 146 --- kaldi/local/compute_vad_decision.sh | 72 -- kaldi/local/convert_ctm.pl | 96 -- kaldi/local/convert_slf.pl | 302 ------ kaldi/local/convert_slf_parallel.sh | 71 -- kaldi/local/copy_data_dir.sh | 142 --- kaldi/local/create_data_link.pl | 132 --- kaldi/local/create_split_dir.pl | 92 -- kaldi/local/dict_dir_add_pronprobs.sh | 252 ----- kaldi/local/eps2disambig.pl | 29 - kaldi/local/filt.py | 14 - kaldi/local/filter_scp.pl | 87 -- kaldi/local/filter_scps.pl | 170 --- kaldi/local/find_arpa_oovs.pl | 71 -- kaldi/local/fix_ctm.sh | 32 - kaldi/local/fix_data_dir.sh | 189 ---- kaldi/local/format_lm.sh | 85 -- kaldi/local/format_lm_sri.sh | 94 -- kaldi/local/gen_topo.pl | 79 -- kaldi/local/generate_vctk_wav.py | 22 - kaldi/local/get_spk_emb.py | 29 - kaldi/local/get_spk_emb_2.py | 45 - kaldi/local/get_utt2num_frames.sh | 47 - kaldi/local/int2sym.pl | 71 -- kaldi/local/kwslist_post_process.pl | 291 ----- kaldi/local/ln.pl | 58 - kaldi/local/make_absolute.sh | 21 - kaldi/local/make_fbank.sh | 156 --- kaldi/local/make_lexicon_fst.pl | 155 --- kaldi/local/make_lexicon_fst_silprob.pl | 147 --- kaldi/local/make_mfcc.sh | 166 --- kaldi/local/make_musan.py | 123 --- kaldi/local/make_musan.sh | 39 - kaldi/local/make_unigram_grammar.pl | 54 - kaldi/local/make_vctk.pl | 61 -- kaldi/local/make_vctk_wav.py | 4 - kaldi/local/make_vctk_wav.sh | 0 kaldi/local/make_voxceleb1.pl | 113 -- kaldi/local/make_voxceleb2.pl | 70 -- kaldi/local/map_arpa_lm.pl | 137 --- kaldi/local/mkgraph.sh | 171 --- .../nnet3/xvector/prepare_feats_for_egs.sh | 83 -- kaldi/local/nnet3/xvector/run_xvector.sh | 155 --- .../nnet3/xvector/tuning/run_xvector_1a.sh | 155 --- kaldi/local/parse_options.sh | 97 -- kaldi/local/pbs.pl | 587 ----------- kaldi/local/perturb_data_dir_speed.sh | 125 --- kaldi/local/pinyin_map.pl | 79 -- kaldi/local/prepare_extended_lang.sh | 165 --- kaldi/local/prepare_for_eer.py | 19 - kaldi/local/prepare_lang.sh | 522 --------- kaldi/local/prepare_online_nnet_dist_build.sh | 75 -- kaldi/local/queue.pl | 624 ----------- kaldi/local/remove_data_links.sh | 53 - kaldi/local/remove_oovs.pl | 43 - kaldi/local/retry.pl | 106 -- kaldi/local/reverse_arpa.py | 188 ---- kaldi/local/rnnlm_compute_scores.sh | 90 -- kaldi/local/run.pl | 282 ----- kaldi/local/s2eps.pl | 27 - kaldi/local/segmentation.pl | 402 ------- kaldi/local/show_lattice.sh | 43 - kaldi/local/shuffle_list.pl | 44 - kaldi/local/slurm.pl | 627 ----------- kaldi/local/spk2utt_to_utt2spk.pl | 27 - kaldi/local/split_data.sh | 160 --- kaldi/local/split_scp.pl | 225 ---- kaldi/local/ssh.pl | 219 ---- 
kaldi/local/subset_data_dir.sh | 194 ---- kaldi/local/subset_data_dir_tr_cv.sh | 64 -- kaldi/local/subset_scp.pl | 105 -- kaldi/local/summarize_logs.pl | 121 --- kaldi/local/summarize_warnings.pl | 46 - kaldi/local/sym2int.pl | 104 -- kaldi/local/utt2spk_to_spk2utt.pl | 38 - kaldi/local/validate_data_dir.sh | 363 ------- kaldi/local/validate_dict_dir.pl | 508 --------- kaldi/local/validate_lang.pl | 997 ------------------ kaldi/local/validate_text.pl | 132 --- kaldi/local/visualize_spk_emb.py | 66 -- kaldi/local/visualize_trait_emb.py | 99 -- kaldi/local/visualize_utt_emb.py | 69 -- kaldi/local/write_kwslist.pl | 345 ------ kaldi/path.sh | 8 - kaldi/run.sh | 102 -- kaldi/steps | 1 - kaldi/utils | 1 - kaldi_evaluate.py | 30 - requirements.txt | 4 - run.sh | 6 + setup.py | 5 +- .../dataset/AudioAugmentationGenerator.py | 57 - sslforslr/dataset/AudioDatasetGenerator.py | 68 -- sslforslr/dataset/AudioDatasetLoader.py | 271 ----- sslforslr/dataset/KaldiDatasetLoader.py | 74 +- sslforslr/models/multitask/MultiTask.py | 413 -------- sslforslr/models/multitask/__init__.py | 1 - sslforslr/models/vqwav2vec/VQWav2Vec.py | 266 ----- sslforslr/models/vqwav2vec/VQWav2VecConfig.py | 98 -- sslforslr/models/vqwav2vec/__init__.py | 2 - sslforslr/models/wav2vec2/Wav2Vec2.py | 254 ----- sslforslr/models/wav2vec2/Wav2Vec2Config.py | 94 -- sslforslr/models/wav2vec2/__init__.py | 2 - sslforslr/modules/TransformerEncoder.py | 94 -- sslforslr/utils/callbacks.py | 13 + .../utils/callbacks/TimeHistoryCallback.py | 16 - sslforslr/utils/callbacks/__init__.py | 1 - sslforslr/utils/evaluate.py | 52 + sslforslr/utils/helpers.py | 76 +- train.py | 40 +- train_evaluate.py | 107 -- 135 files changed, 227 insertions(+), 16484 deletions(-) delete mode 100644 cache_features.py delete mode 100644 configs/cpc-base-kaldi.json delete mode 100644 configs/cpc-base-kaldi_boosted.json create mode 100644 configs/cpc-base.json delete mode 100644 configs/debug.json delete mode 100644 configs/moco-base-kaldi.json create mode 100644 configs/simclr-base.json delete mode 100644 evaluate.ipynb delete mode 100644 extract_embeddings.py delete mode 100644 kaldi/.gitignore delete mode 100755 kaldi/cmd.sh delete mode 100644 kaldi/conf/mfcc.conf delete mode 100644 kaldi/conf/vad.conf delete mode 100755 kaldi/local/add_disambig.pl delete mode 100755 kaldi/local/add_lex_disambig.pl delete mode 100755 kaldi/local/analyze_segments.pl delete mode 100755 kaldi/local/apply_map.pl delete mode 100755 kaldi/local/best_wer.sh delete mode 100755 kaldi/local/build_const_arpa_lm.sh delete mode 100755 kaldi/local/check_spk_emb_range.py delete mode 100755 kaldi/local/combine_data.sh delete mode 100755 kaldi/local/compute_min_dcf.py delete mode 100755 kaldi/local/compute_vad_decision.sh delete mode 100755 kaldi/local/convert_ctm.pl delete mode 100755 kaldi/local/convert_slf.pl delete mode 100755 kaldi/local/convert_slf_parallel.sh delete mode 100755 kaldi/local/copy_data_dir.sh delete mode 100755 kaldi/local/create_data_link.pl delete mode 100755 kaldi/local/create_split_dir.pl delete mode 100755 kaldi/local/dict_dir_add_pronprobs.sh delete mode 100755 kaldi/local/eps2disambig.pl delete mode 100755 kaldi/local/filt.py delete mode 100755 kaldi/local/filter_scp.pl delete mode 100755 kaldi/local/filter_scps.pl delete mode 100755 kaldi/local/find_arpa_oovs.pl delete mode 100755 kaldi/local/fix_ctm.sh delete mode 100755 kaldi/local/fix_data_dir.sh delete mode 100755 kaldi/local/format_lm.sh delete mode 100755 kaldi/local/format_lm_sri.sh delete mode 100755 
kaldi/local/gen_topo.pl delete mode 100755 kaldi/local/generate_vctk_wav.py delete mode 100755 kaldi/local/get_spk_emb.py delete mode 100755 kaldi/local/get_spk_emb_2.py delete mode 100755 kaldi/local/get_utt2num_frames.sh delete mode 100755 kaldi/local/int2sym.pl delete mode 100755 kaldi/local/kwslist_post_process.pl delete mode 100755 kaldi/local/ln.pl delete mode 100755 kaldi/local/make_absolute.sh delete mode 100755 kaldi/local/make_fbank.sh delete mode 100755 kaldi/local/make_lexicon_fst.pl delete mode 100755 kaldi/local/make_lexicon_fst_silprob.pl delete mode 100755 kaldi/local/make_mfcc.sh delete mode 100755 kaldi/local/make_musan.py delete mode 100755 kaldi/local/make_musan.sh delete mode 100755 kaldi/local/make_unigram_grammar.pl delete mode 100755 kaldi/local/make_vctk.pl delete mode 100755 kaldi/local/make_vctk_wav.py delete mode 100755 kaldi/local/make_vctk_wav.sh delete mode 100755 kaldi/local/make_voxceleb1.pl delete mode 100755 kaldi/local/make_voxceleb2.pl delete mode 100755 kaldi/local/map_arpa_lm.pl delete mode 100755 kaldi/local/mkgraph.sh delete mode 100755 kaldi/local/nnet3/xvector/prepare_feats_for_egs.sh delete mode 100755 kaldi/local/nnet3/xvector/run_xvector.sh delete mode 100755 kaldi/local/nnet3/xvector/tuning/run_xvector_1a.sh delete mode 100755 kaldi/local/parse_options.sh delete mode 100755 kaldi/local/pbs.pl delete mode 100755 kaldi/local/perturb_data_dir_speed.sh delete mode 100755 kaldi/local/pinyin_map.pl delete mode 100755 kaldi/local/prepare_extended_lang.sh delete mode 100755 kaldi/local/prepare_for_eer.py delete mode 100755 kaldi/local/prepare_lang.sh delete mode 100755 kaldi/local/prepare_online_nnet_dist_build.sh delete mode 100755 kaldi/local/queue.pl delete mode 100755 kaldi/local/remove_data_links.sh delete mode 100755 kaldi/local/remove_oovs.pl delete mode 100755 kaldi/local/retry.pl delete mode 100755 kaldi/local/reverse_arpa.py delete mode 100755 kaldi/local/rnnlm_compute_scores.sh delete mode 100755 kaldi/local/run.pl delete mode 100755 kaldi/local/s2eps.pl delete mode 100755 kaldi/local/segmentation.pl delete mode 100755 kaldi/local/show_lattice.sh delete mode 100755 kaldi/local/shuffle_list.pl delete mode 100755 kaldi/local/slurm.pl delete mode 100755 kaldi/local/spk2utt_to_utt2spk.pl delete mode 100755 kaldi/local/split_data.sh delete mode 100755 kaldi/local/split_scp.pl delete mode 100755 kaldi/local/ssh.pl delete mode 100755 kaldi/local/subset_data_dir.sh delete mode 100755 kaldi/local/subset_data_dir_tr_cv.sh delete mode 100755 kaldi/local/subset_scp.pl delete mode 100755 kaldi/local/summarize_logs.pl delete mode 100755 kaldi/local/summarize_warnings.pl delete mode 100755 kaldi/local/sym2int.pl delete mode 100755 kaldi/local/utt2spk_to_spk2utt.pl delete mode 100755 kaldi/local/validate_data_dir.sh delete mode 100755 kaldi/local/validate_dict_dir.pl delete mode 100755 kaldi/local/validate_lang.pl delete mode 100755 kaldi/local/validate_text.pl delete mode 100755 kaldi/local/visualize_spk_emb.py delete mode 100755 kaldi/local/visualize_trait_emb.py delete mode 100755 kaldi/local/visualize_utt_emb.py delete mode 100755 kaldi/local/write_kwslist.pl delete mode 100755 kaldi/path.sh delete mode 100755 kaldi/run.sh delete mode 120000 kaldi/steps delete mode 120000 kaldi/utils delete mode 100644 kaldi_evaluate.py create mode 100644 run.sh delete mode 100644 sslforslr/dataset/AudioAugmentationGenerator.py delete mode 100644 sslforslr/dataset/AudioDatasetGenerator.py delete mode 100644 sslforslr/dataset/AudioDatasetLoader.py delete mode 100644 
sslforslr/models/multitask/MultiTask.py delete mode 100644 sslforslr/models/multitask/__init__.py delete mode 100644 sslforslr/models/vqwav2vec/VQWav2Vec.py delete mode 100644 sslforslr/models/vqwav2vec/VQWav2VecConfig.py delete mode 100644 sslforslr/models/vqwav2vec/__init__.py delete mode 100644 sslforslr/models/wav2vec2/Wav2Vec2.py delete mode 100644 sslforslr/models/wav2vec2/Wav2Vec2Config.py delete mode 100644 sslforslr/models/wav2vec2/__init__.py delete mode 100644 sslforslr/modules/TransformerEncoder.py create mode 100644 sslforslr/utils/callbacks.py delete mode 100644 sslforslr/utils/callbacks/TimeHistoryCallback.py delete mode 100644 sslforslr/utils/callbacks/__init__.py create mode 100644 sslforslr/utils/evaluate.py delete mode 100644 train_evaluate.py diff --git a/.gitignore b/.gitignore index e1ed109..076e022 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ checkpoints/ -datasets/ +data/ __pycache__ .ipynb_checkpoints/ build diff --git a/README.md b/README.md index 1978459..5019ba0 100644 --- a/README.md +++ b/README.md @@ -1,58 +1,39 @@ # ssl-for-slr -Framework to train a speech encoder in a **self-supervised** way for **speaker and language recognition** tasks. +Collection of **self-supervised** models for **speaker and language recognition** tasks. -The aim is to train a speech encoder by using multiple self-supervised modules as shown on figure below. +## Models -## Features - -- Configurable speech encoders (1D conv layers, GRU, skip connections, [SincNet](https://arxiv.org/abs/1808.00158)) -- Self-supervised models: - - [Contrastive Predictive Coding](https://arxiv.org/pdf/1807.03748.pdf) *(unidirectional or bidirectional)* - - [vq-wav2vec](https://arxiv.org/pdf/1910.05453.pdf) - - [Wav2Vec 2.0](https://arxiv.org/pdf/2006.11477.pdf) - - [Local Info Max (LIM)](https://arxiv.org/pdf/1812.00271.pdf) and Global Info Max (GIM) - - [PASE](https://arxiv.org/pdf/1904.03416.pdf) and [PASE+](https://arxiv.org/pdf/2001.09239.pdf) with the following workers: *Waveform*, *LPS*, *MFCC*, *CPC*, *LIM* and *GIM* -- Evaluation on speaker recognition, speaker verification, language recognition and data-efficiency -- Handle *LibriSpeech* and *VoxLingua107* datasets -- Speech augmentation module *(reverberation, noise, frequency and temporal masks, clipping, ...)* -- Modular configuration files +- **CPC**: [Representation Learning with Contrastive Predictive Coding](https://arxiv.org/pdf/1807.03748.pdf) +- **LIM/GIM**: [Learning Speaker Representations with Mutual Information](https://arxiv.org/pdf/1812.00271.pdf) +- **SimCLR**: [Contrastive Self-Supervised Learning for Text-Independent Speaker Verification](https://sci-hub.mksa.top/10.1109/icassp39728.2021.9413351) +- **MoCo**: [Self-supervised Text-independent Speaker Verification using Prototypical Momentum Contrastive Learning](https://arxiv.org/pdf/2012.07178.pdf) ## Usage -### Install dependencies (inside a virtual env) - -1. `virtualenv ~/ssl-for-slr-env && source ~/ssl-for-slr-env/bin/activate` -2. `pip install -r requirements.txt` - -*Type `deactivate` to exit the virtual env after use.* - -### Train model on pretext task - -``` -python train.py configs/cpc-v1.json -``` +Start self-supervised training with `python train.py configs/cpc-base.json`. -*Multiple config files are located in the `config/` folder.* - -### Evaluate model on downstream task *(speaker or language recognition)* - -1. Train a classifier on top of the previsouly trained encoder: `python train_evaluate.py configs/cpc-v1.json`. -2. 
Use notebook `evaluate.ipnyb` to evaluate metrics obtained on the downstream task. +Then, you can evaluate the model on speaker verification (EER, minDCF) with `python evaluate.py configs/cpc-base.json`. ## To-Do -- [ ] Create config for different models (5) -> train -> evaluate -> experiment -- [ ] Data augmentation / MFCC pipeline (cache features with create_features.py?) +- [ ] Refactor project + - [ ] Data: check similar (padding) [30min] + - [ ] Evaluate: check works [30min] + - [ ] Model: clamp W, init -5 10, check similar encoder, mfcc [1h] + - [ ] Start SimCLR training [30min] ---- +- [ ] Reproduce results of SimCLR + - [ ] If not working => use voxceleb_trainer implem + - [ ] Add data augmentation + - [ ] Evaluate: add minDCF +- [ ] Experiment with VICReg -- [ ] Dataset: cache useful? do not store audio cache in checkpoints/model/ -- [ ] Refactor evaluation (choose type of classifier: random, surpervised) -- [ ] Use dataclass and YAML for all configs +--- +- [ ] Explain data preparation / reproduction + cite articles in README +- [ ] Use dataclass and YAML for model configs - [ ] CPC/LIM: @tf.function warning when doing tensor[1, :] -- [ ] Fix error end training saving history.npy - [ ] Fix warning loading weights not used - [ ] Create custom training loop (https://stackoverflow.com/questions/57971007/tensorflow-2-0-display-progress-bar-in-custom-training-loop) - [ ] Allow restore optimizer \ No newline at end of file diff --git a/cache_features.py b/cache_features.py deleted file mode 100644 index 42eeb15..0000000 --- a/cache_features.py +++ /dev/null @@ -1,24 +0,0 @@ -import argparse -import numpy as np -import kaldiio -from tqdm import tqdm - -from shutil import copyfile - -def create_features(data_path): - feats = {} - wav_scp = kaldiio.load_scp(data_path + '/wav.scp') - for utterance_id in tqdm(wav_scp): - sr, data = wav_scp[utterance_id] - data = data.astype(np.float32) - feats[utterance_id] = data - - kaldiio.save_ark(data_path + '/feats.ark', feats, scp=data_path + '/feats.scp') - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('data_path', help='Path to data folder containing wav.scp file (Kaldi format).') - args = parser.parse_args() - - #create_features(args.data_path) - copyfile(args.data_path + '/wav.scp', args.data_path + '/feats.scp') diff --git a/configs/cpc-base-kaldi.json b/configs/cpc-base-kaldi.json deleted file mode 100644 index a6452c5..0000000 --- a/configs/cpc-base-kaldi.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "name": "cpc-base-kaldi", - "seed": 1717, - "encoder": { - "type": "CPC", - "encoded_dim": 512, - "weight_regularizer": 1e-4 - }, - "model": { - "type": "CPC", - "nb_timesteps_to_predict": 12, - "context_network": { - "type": "GRU", - "dim": 256, - "nb_layers": 1 - }, - "bidirectional": false, - "weight_regularizer": 1e-4 - }, - "training": { - "epochs": 50, - "batch_size": 64, - "learning_rate": 0.0001, - "dataset": { - "type": "Kaldi", - "sample_frequency": 16000, - "scp": "./data/train/feats.scp", - "utt2spk": "./data/train/utt2spk", - "frames": { - "length": 20480 - } - } - }, - "evaluate": { - "type": "speaker-id", - "train_encoder": false, - "epochs": 50, - "batch_size": 64, - "learning_rate": 0.001, - "dataset": { - "type": "Kaldi", - "sample_frequency": 16000, - "scp": "./data/voxceleb1_test/feats.scp", - "utt2spk": "./data/voxceleb1_test/utt2spk", - "frames": { - "length": 20480 - } - } - } -} \ No newline at end of file diff --git a/configs/cpc-base-kaldi_boosted.json b/configs/cpc-base-kaldi_boosted.json
deleted file mode 100644 index 7968524..0000000 --- a/configs/cpc-base-kaldi_boosted.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "name": "cpc-base-kaldi_boosted", - "seed": 1717, - "encoder": { - "type": "CPC", - "encoded_dim": 512, - "weight_regularizer": 1e-4 - }, - "model": { - "type": "CPC", - "nb_timesteps_to_predict": 12, - "context_network": { - "type": "LSTM", - "dim": 256, - "nb_layers": 2 - }, - "bidirectional": true, - "weight_regularizer": 1e-4 - }, - "training": { - "epochs": 50, - "batch_size": 128, - "learning_rate": 0.0001, - "tensorboard": true, - "dataset": { - "type": "Kaldi", - "sample_frequency": 16000, - "scp": "./data/train/feats.scp", - "utt2spk": "./data/train/utt2spk", - "frames": { - "length": 20480 - } - } - }, - "evaluate": { - "type": "speaker-id", - "train_encoder": false, - "epochs": 50, - "batch_size": 64, - "learning_rate": 0.001, - "dataset": { - "type": "Kaldi", - "sample_frequency": 16000, - "scp": "./data/voxceleb1_test/feats.scp", - "utt2spk": "./data/voxceleb1_test/utt2spk", - "frames": { - "length": 20480 - } - } - } -} \ No newline at end of file diff --git a/configs/cpc-base.json b/configs/cpc-base.json new file mode 100644 index 0000000..7fa73eb --- /dev/null +++ b/configs/cpc-base.json @@ -0,0 +1,33 @@ +{ + "name": "cpc-base", + "seed": 1717, + "encoder": { + "type": "CPC", + "encoded_dim": 512, + "weight_regularizer": 1e-4 + }, + "model": { + "type": "CPC", + "nb_timesteps_to_predict": 12, + "context_network": { + "type": "GRU", + "dim": 256, + "nb_layers": 1 + }, + "bidirectional": false, + "weight_regularizer": 1e-4 + }, + "training": { + "epochs": 50, + "batch_size": 64, + "learning_rate": 0.0001 + }, + "dataset": { + "sample_frequency": 16000, + "frame_length": 20480, + "max_samples": 1000, + "train": "./data/debug.scp", + "test": "./data/debug.scp", + "trials": "./data/voxceleb1_test/trials" + } +} \ No newline at end of file diff --git a/configs/debug.json b/configs/debug.json deleted file mode 100644 index 8dc974b..0000000 --- a/configs/debug.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "name": "debug", - "seed": 1717, - "encoder": { - "type": "CPC", - "encoded_dim": 512, - "weight_regularizer": 1e-4 - }, - "model": { - "type": "CPC", - "nb_timesteps_to_predict": 12, - "bidirectional": false, - "context_network": { - "type": "GRU", - "dim": 256, - "nb_layers": 1 - }, - "weight_regularizer": 1e-4 - }, - "training": { - "epochs": 50, - "batch_size": 64, - "learning_rate": 0.0001, - "dataset": { - "type": "LibriSpeech", - "sample_frequency": 16000, - "train_paths": [ - "./datasets/LibriSpeech/dev-clean/*" - ], - "val_ratio": 0.2, - "test_ratio": 0.1, - "frames": { - "pick": "sequence", - "length": 20480, - "stride": 20480, - "count": 1 - } - } - } -} \ No newline at end of file diff --git a/configs/moco-base-kaldi.json b/configs/moco-base-kaldi.json deleted file mode 100644 index 12cdc03..0000000 --- a/configs/moco-base-kaldi.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "name": "moco-base-kaldi", - "seed": 1717, - "encoder": { - "type": "XVector", - "encoded_dim": 3000, - "weight_regularizer": 1e-4 - }, - "model": { - "type": "MoCo", - "queue_size": 10000, - "info_nce_temp": 0.07, - "embedding_dim": 512, - "proto_nce_loss_factor": 0.25, - "nb_clusters": 5000, - "clustering_negs_count": 10000, - "epochs_before_proto_nce": 1111111111111, - "weight_regularizer": 1e-4 - }, - "training": { - "epochs": 150, - "optimizer": { - "type": "SGD", - "momentum": 0.9 - }, - "batch_size": 1024, - "learning_rate": { - "scheduler": "cosine", - "start": 0.1, - "end": 
0.0001 - }, - "dataset": { - "type": "Kaldi", - "sample_frequency": 16000, - "scp": "./data/train/feats.scp", - "utt2spk": "./data/train/utt2spk", - "frames": { - "length": 300, - "extract_mfcc": true - } - } - } -} \ No newline at end of file diff --git a/configs/simclr-base.json b/configs/simclr-base.json new file mode 100644 index 0000000..cc9bf57 --- /dev/null +++ b/configs/simclr-base.json @@ -0,0 +1,29 @@ +{ + "name": "simclr-base", + "seed": 1717, + "encoder": { + "type": "ThinResNet34", + "encoded_dim": 512, + "weight_regularizer": 1e-4 + }, + "model": { + "type": "SimCLR", + "channel_loss_factor": 0.1, + "weight_regularizer": 1e-4 + }, + "training": { + "epochs": 100, + "optimizer": { + "type": "Adam" + }, + "batch_size": 256, + "learning_rate": 0.001 + }, + "dataset": { + "sample_frequency": 16000, + "frame_length": 20480, + "train": "./data/debug.scp", + "test": "./data/debug.scp", + "trials": "./data/voxceleb1_test/trials" + } +} \ No newline at end of file diff --git a/evaluate.ipynb b/evaluate.ipynb deleted file mode 100644 index 604a9af..0000000 --- a/evaluate.ipynb +++ /dev/null @@ -1,880 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Evaluation" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "CONFIG_PATH = './configs/cpc-v1.json'" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:tensorflow:Enabling eager execution\n", - "INFO:tensorflow:Enabling v2 tensorshape\n", - "INFO:tensorflow:Enabling resource variables\n", - "INFO:tensorflow:Enabling tensor equality\n", - "INFO:tensorflow:Enabling control flow v2\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import soundfile as sf\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.metrics import classification_report\n", - "from sklearn.metrics import confusion_matrix\n", - "from sklearn.manifold import TSNE\n", - "from sklearn.decomposition import PCA\n", - "from mlxtend.plotting import plot_confusion_matrix\n", - "\n", - "from evaluate import load" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of training batches: 2\n", - "Number of val batches: 1\n", - "Number of test batches: 1\n", - "Model: \"model\"\n", - "_________________________________________________________________\n", - "Layer (type) Output Shape Param # \n", - "=================================================================\n", - "input_1 (InputLayer) [(None, 20480, 1)] 0 \n", - "_________________________________________________________________\n", - "cpc_encoder (CPCEncoder) (None, 128, 512) 5260800 \n", - "_________________________________________________________________\n", - "autoregressive (Autoregressi (None, 256) 591360 \n", - "=================================================================\n", - "Total params: 5,852,160\n", - "Trainable params: 5,847,040\n", - "Non-trainable params: 5,120\n", - "_________________________________________________________________\n", - "Model: \"model_1\"\n", - "_________________________________________________________________\n", - "Layer (type) Output Shape Param # \n", - 
"=================================================================\n", - "input_2 (InputLayer) [(None, 20480, 1)] 0 \n", - "_________________________________________________________________\n", - "cpc_model (CPCModel) (None, 256) 5852160 \n", - "_________________________________________________________________\n", - "classifier (Classifier) (None, 10) 68362 \n", - "=================================================================\n", - "Total params: 5,920,522\n", - "Trainable params: 5,915,402\n", - "Non-trainable params: 5,120\n", - "_________________________________________________________________\n" - ] - } - ], - "source": [ - "model, history, model_evaluate, history_evaluate, test_gen = load(CONFIG_PATH)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(11, 20480, 1)\n", - "(11,)\n", - "(11,)\n" - ] - } - ], - "source": [ - "def load_gen_in_memory(gen):\n", - " X_test = []\n", - " y_test = []\n", - " for i in range(len(gen)):\n", - " tmp = gen[i]\n", - " X_test.extend(tmp[0])\n", - " y_test.extend(tmp[1]) \n", - " return np.array(X_test), np.array(y_test)\n", - "\n", - "# Load test_gen in memory\n", - "X_test, y_test = load_gen_in_memory(test_gen)\n", - "y_test_pred = np.argmax(model_evaluate.predict(X_test), axis=-1)\n", - "\n", - "print(X_test.shape)\n", - "print(y_test.shape)\n", - "print(y_test_pred.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Encoder (self-supervised training)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Learning curves" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAY4AAAEWCAYAAABxMXBSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAA0j0lEQVR4nO3deXxV9bnv8c+TgSRMARLGDBBIQBlkioAKDgiWWhVncPaUI7VKVTrces/t7aEee6ttZRIUUXGqVlGL0jqGSVABCYgDUyYCSZiSQEICBDI894+1orsxwA5kZ2V43q/XfrmG3177+SW4v1nrtwZRVYwxxhh/BXldgDHGmKbFgsMYY0ydWHAYY4ypEwsOY4wxdWLBYYwxpk4sOIwxxtSJBYcxpyEiY0Rkh9d1NGUiki0i47yuw9QPCw7ToERklYgcEpEwr2vxl6quUdV+XtdhTGNhwWEajIj0AsYAClzTwJ8d0pCf11i01H6bwLLgMA3pTmAd8CJwl+8KEYkTkX+ISL6IFIrIPJ9194jINhEpEZGtIjLMXa4ikujT7kURedSdvlREckXktyKyD3hBRDqKyL/czzjkTsf6vL+TiLwgInvc9e/4bsunXQ8Redvdzk4RecBn3QgRSRWRwyKyX0RmnuyH4fYrQ0QOishSEenhLn9aRP5ao+27IvJLPz5/hoi8JSJ/E5HDwN21fG6YiPxVRHa7NS4QkYgaP7f/EpEC9xDTbT7vjRSRl93P3iUivxORIJ/1tf6uXENE5GsRKRaRN0Qk3H1PtPu7KHJ/Fmt8t2kaIVW1l70a5AVkAPcBw4FyoKu7PBj4CpgFtAHCgdHuupuAPOB8QIBEoKe7ToFEn+2/CDzqTl8KVACPA2FABBAF3AC0BtoBbwLv+Lz/PeANoCMQClzis61cdzoI2Aj8HmgF9AaygB+569cCd7jTbYFRJ/lZjAUKgGFufU8Cq911FwM5gLjzHYFjQA8/Pn+G+7O91m0bUctnzwKWAp3cn8M/gT/V+LnNdOu6BDgC9HPXvwy8676vF5AGTPHjd5UNfOH2oROwDbjXXfcnYIH7Mw/F2SsVr/+92usU/y97XYC9WsYLGO1+oUW789uB6e70BUA+EFLL+z4CHjzJNk8XHCeA8FPUNAQ45E53B6qAjrW08w2OkcDuGuv/N/CCO70a+EN1P0/x2c8Df/aZb+v+fHq5X7q7gYvddfcAK/z8/Bm4AXSSzxU3CPr4LLsA2OnT1wqgjc/6xcD/xQn4E0B/n3U/A1b58bvKBm73mf8zsMCdfgQnjBJPVre9GtfLdgdNQ7kL+FhVC9z51/j+cFUcsEtVK2p5XxyQeYafma+qZdUzItJaRJ5xD7EcxvmS7yAiwe7nHFTVQ6fZZk+gh3tYpUhEioD/Arq666cAfYHtIrJBRK46yXZ6ALuqZ1S1FCgEYtT5Nn0duMVdfSvwqp+fD87eysl0xtnj2ujz/g/d5dUOqeoRn/ldbr3ROHsEu2qsi3GnT/e72uczfRQnLAH+grM3+rGIZInIw6fYhmkEbODMBJx7/PxmINgdbwDnMEgHERmM80UXLyIhtYRHDtDnJJs+ivMlWK0bkOszX/PWz78C+gEjVXWfiAwBvsT5KzwH6CQiHVS16BTdycH56zyptpWqmg7c4h6jvx54S0SianwRA+zBCQEARKQNzqG0PHfR33G+SB/D2cu4zp/Pry7jFOsKcA57DVDVvJO06SgibXxqjge+dd9b7ta91Wdd9XZO9bs6ebGqJTi/m1+JyEBghYhsUNXldd2WaRi2x2EawrVAJdAf5/DQEOBcYA3OgPkXwF7gMRFpIyLhInKR+97ngF+LyHBxJIpI9RfuZuBWEQkWkQk4x+NPpR3Ol2aRiHQC/rt6haruBT4AnnIH0UNF5OJatvEFUCLOoHuE+9kDReR8ABG5XUQ6q2oVUOS+p6qW7fwd+A8RGSLOqcn/D1ivqtluPV/ifFE/B3zkE2an/PzTcet6FpglIl3cmmNE5Ec1mv5BRFqJyBjgKuBNVa3EOWz1RxFp5/4efgn8zX3PqX5XJyUiV7ltBSjG+bdS28/MNBIWHKYh3IVzDH63qu6rfgHzgNtw/uK/GmcwdTfOXsMkAFV9E/gjzqGtEuAdnMFVgAfd9xW523nnNHXMxhkkL8A5u+vDGuvvwPmLejtwAHio5gbcL8+rcMJvJ99/uUe6TSYAW0SkFJgDTFbVY7VsZxnOuMHbOKHZB5hco9lrwDj3v/5+vj9+i3NoaJ17yG4Zzp5YtX3AIZy9oldxBrG3u+t+gTNGkgV86ta2yK3tVL+rU0lyayjFObngKVVdWYf+mAZWfdaGMcYgIpcCf1PV2NM0NS2Y7XEYY4ypEwsOY4wxdRLQ4BCRCSKyQ5yrY39wip17Besb7vr14tySAhHpJSLHRGSz+1rg855V7jar13UJZB+MaUlUdZUdpjKnE7DTcd1z4+cD43EGOzeIyFJV3erTbArOOeOJIjIZ5yrfSe66TFUdcpLN36aqqQEq3RhjzCkE8jqOEUCGqmYBiMjrwES+P/8bd36GO/0WMM89Ja9eRUdHa69evep7s8YY06xt3LixQFU711weyOCI4d+vYM3FuZCp1jaqWiEixTgXQQEkiMiXwGHgd6q6xud9L4hIJc6pjI9qLaeGichUYCpAfHw8qam2g2KMMXUhIrtqW95YB8f3AvGqOhTnAqPXRKS9u+42VR2EcyO0MTjn3v+Aqi5U1WRVTe7c+QeBaYwx5gwFMjjycO5dUy2W729N8IM24jw3IBIoVNXjqloIoKobce5/09edz3P/W4JzodGIAPbBGGNMDYEMjg1AkogkiEgrnKtil9Zos5Tvb3R3I84dQFVEOruD64hIb5wrS7NEJEREot3loThX0H4bwD4YY4ypIWBjHO6YxTScWy0HA4tUdYuIPAKkqupSnFtLvyIiGcBBvr/lwsXAIyJSjnPPmntV9aB7I7iP3NAIxrlNwbOB6oMxxpgfahG3HElOTlYbHDfGmLoRkY2qmlxzeWMdHDfGGNNIWXAYY4ypEwsOY4xphr7KKeLPH24/fcMzYE8ANMaYZmTLnmJmpaSxbNsBOrYO5c4LetEtMrxeP8OCwxhjmoEd+0qYvSyND77dR/vwEH59RV/uviiBtmH1/zVvwWGMMU1YxoFS5ixP519f76FNqxAeuDyJKaMTiIwIDdhnWnAYY0wTlF1whLnL03lncx7hocH8/JI+TL24Nx1atwr4Z1twGGNME5Jz8CjzVmTw1qZcQoOF/xzTm59d3JuotmENVoMFhzHGNAF7io4xf2UGi1NzEBHuGNWT+y7rQ5d29Tvw7Q8LDmOMacQOHC7jqVWZvLZ+N4oy6fw47r8ske6REZ7VZMFhjDGNUEHpcRasyuSVdbuoqFJuGh7LtLGJxHZs7XVpFhzGGNOYHDpygoVrsnjp82zKyiu5bmgsD1yeSM+oNl6X9h0LDmOMaQSKj5Xz/JosFn2WzZETFVx9Xg8eHJdEn85tvS7tByw4jDHGQyVl5bzwWTbPrsmipKyCKwd146FxfenbtZ
3XpZ2UBYcxxnjgyPEKXlqbzcLVWRQdLWd8/65MH9eX/j3an/7NHrPgMMaYBlRWXsnf1u3i6VWZFB45wWX9OjN9fF/Oi+3gdWl+s+AwxpgGUFZeyetf7Gb+qkzyS44zJimah8b1ZXjPjl6XVmcWHMYYE0AnKqpYnJrD/JUZ7C0uY2RCJ+bdMpSRvaO8Lu2MBTQ4RGQCMAfn+eDPqepjNdaHAS8Dw4FCYJKqZotIL2AbsMNtuk5V73XfMxx4EYgA3gce1Jbw/FtjTJNSXlnFPzblMnd5BnlFxxgW34G/3jSYC/tEISJel3dWAhYcIhIMzAfGA7nABhFZqqpbfZpNAQ6paqKITAYeBya56zJVdUgtm34auAdYjxMcE4APAtMLY4ypm8oq5d3NecxZns6uwqMMjo3kj9cN5JK+nZt8YFQL5B7HCCBDVbMAROR1YCLgGxwTgRnu9FvAPDnFT1ZEugPtVXWdO/8ycC0WHMYYj1VVKf/6Zi+zl6WRlX+E/t3b89ydyVx+bpdmExjVAhkcMUCOz3wuMPJkbVS1QkSKgeoDfwki8iVwGPidqq5x2+fW2GZMAGo3xhi/VFUpH23Zx6xlaaTtL6Vf13YsuH0YV/TvRlBQ8wqMao11cHwvEK+qhe6YxjsiMqAuGxCRqcBUgPj4+ACUaIxpyVSV5dsOMDMlja17D9OncxuevGUoPxnUvdkGRrVABkceEOczH+suq61NroiEAJFAoTvYfRxAVTeKSCbQ120fe5pt4r5vIbAQIDk52QbPjTH1QlX5JC2fWSlpfJVbTM+o1sy8eTATh8QQ3MwDo1ogg2MDkCQiCThf7pOBW2u0WQrcBawFbgRWqKqKSGfgoKpWikhvIAnIUtWDInJYREbhDI7fCTwZwD4YYwzgBMbnmYXMTElj465DxHSI4M83nMf1w2IICQ7yurwGFbDgcMcspgEf4ZyOu0hVt4jII0Cqqi4FngdeEZEM4CBOuABcDDwiIuVAFXCvqh50193H96fjfoANjBtjAuyLnQd54uMdrN95kO6R4fzxuoHcNDyOViEtKzCqSUu4BCI5OVlTU1O9LsMY08Rs3HWIWSlpfJpRQJd2Ydx/WSKTzo8jPDTY69IahIhsVNXkmssb6+C4McZ45uvcImampLFqRz5RbVrxu5+cy+2jeraYwDgdCw5jjHFt3XOYWcvSSNm6nw6tQ/nthHO484KetAmzr0pf9tMwxrR4aftLmL0sjfe/2Uf78BB+Nb4vd1/Ui3bhoV6X1ihZcBhjWqzM/FLmLEvnn1/voU2rEB4Ym8iUMb2JjLDAOBULDmNMi7Or8Ahzl2ew5MtcwkKCufeSPkwd05uObVp5XVqTYMFhjGkxcg8dZd6KDN7cmEtIkDBldAI/u6QP0W3DvC6tSbHgMMY0e/uKy5i3Mp03NuQgCHeM6sl9l/ahS/twr0trkiw4jDHN1oGSMp5amclrX+xGVbk5OY5pYxPpHhnhdWlNmgWHMabZKSw9zjOrs3h5bTbllcqNw2KZNjaRuE6tvS6tWbDgMMY0G0VHT7BwdRYvfp5NWXkl1w6N4YGxSfSKbuN1ac2KBYcxpskrPlbO85/uZNGnOzlyooKrzuvBg5cnkdilrdelNUsWHMaYJqv0eAUvfLqTZ9dkcbisgh8P7MZD4/rSr1s7r0tr1iw4jDFNztETFby8dhfPfJLJoaPljDu3K9PHJzGgR6TXpbUIFhzGmCajrLySv63bxYJPMikoPcGl/TozfVxfBsd18Lq0FsWCwxjT6B2vqOT1L3KYvzKDAyXHGZ0YzfTxSQzv2cnr0lokCw5jTKN1oqKKNzfmMG9FBnuLyxiR0Im5twxlVO8or0tr0Sw4jDGNTkVlFf/4Mo+5y9PJPXSMofEd+MuNg7koMQqRlvFc78bMgsMY02hUVilLv8pjzrJ0sguPcl5sJP9z7UAu7dvZAqMRseAwxniuqkp575u9zF6WRmb+Ec7t3p5n70xm3LldLDAaoYAGh4hMAOYAwcBzqvpYjfVhwMvAcKAQmKSq2T7r44GtwAxV/au7LBsoASqBitqeh2uMaRpUlY+27GNWSjo79pfQt2tbnr5tGD8a0I2gIAuMxipgwSEiwcB8YDyQC2wQkaWqutWn2RTgkKomishk4HFgks/6mcAHtWz+MlUtCFDpxpgAU1VWbD/AzJQ0tuw5TO/oNsyZPISrzutBsAVGoxfIPY4RQIaqZgGIyOvARJw9iGoTgRnu9FvAPBERVVURuRbYCRwJYI3GmAakqqxOL2BmShpf5RQR36k1T9w0mIlDehASHOR1ecZPgQyOGCDHZz4XGHmyNqpaISLFQJSIlAG/xdlb+XWN9yjwsYgo8IyqLqztw0VkKjAVID4+/iy7Yow5W59nOIGRuusQMR0iePyGQVw/LJZQC4wmp7EOjs8AZqlqaS0DY6NVNU9EugApIrJdVVfXbOQGykKA5ORkDXTBxpjabcg+yBMf72Bd1kG6tQ/n0WsHcnNyHK1CLDCaqkAGRx4Q5zMf6y6rrU2uiIQAkTiD5COBG0Xkz0AHoEpEylR1nqrmAajqARFZgnNI7AfBYYzx1qbdh5iVksaa9AKi24bx31f355YR8YSHBntdmjlLgQyODUCSiCTgBMRk4NYabZYCdwFrgRuBFaqqwJjqBiIyAyhV1Xki0gYIUtUSd/oK4JEA9sEYU0ff5BYzM2UHK3fk06lNK/7Pledy+6ieRLSywGguAhYc7pjFNOAjnNNxF6nqFhF5BEhV1aXA88ArIpIBHMQJl1PpCixxD1+FAK+p6oeB6oMxxn/b9h5mVkoaH2/dT4fWofyvCf2464JetAlrrEfEzZkS5w/85i05OVlTU1O9LsOYZil9fwmzl6Xz3jd7aRcewn+O7s1PR/eiXXio16WZsyQiG2u7Vs7+FDDGnJGs/FLmLE9n6Vd7aB0azC/GJvKfo3sT2doCo7mz4DDG1MnuwqPMXZHOPzblEhYSzM8u7sPUi3vTqU0rr0szDcSCwxjjl9xDR5m/MoM3U3MJDhJ+elECP7ukD53bhXldmmlgFhzGmFPaV1zG/JUZvL5hN4Jw28h47rsska7tw70uzXjEgsMYU6sDJWU8vSqTV9fvpqpKufn8OKZdlkiPDhFel2Y8ZsFhjPk3haXHWbg6i5fWZlNeqdwwLIZfjE0irlNrr0szjYQFhzEGgKKjJ3h2TRYvfJZNWXkl1w6J4ReXJ5EQ3cbr0kwjY8FhTAt3uKyc59fsZNGnOyk9UcFPBnXnoXFJJHZp53VpppGy4DCmhSo9XsGLn+1k4eosDpdVMGFANx4an8Q53dp7XZpp5Cw4jGlhjp6o4JW1u1jwSSaHjpYz7twuPDSuLwNjIr0uzTQRFhzGtBBl5ZW8un43T6/KpKD0OJf07cz08X0ZEtfB69JME2PBYUwzd7yikjc25DB/ZQb7Dx/nwj5RLLh9GMm9OnldmmmiLDiMaabKK6t4MzWXeSvS2VNcxvm9OjJ70lAu6BPldWmmibPgMKaZqaisYsmXecxdkU7OwWMMievA4zeex+jEaGp5oqYxd
WbBYUwzUVml/POrPcxZns7OgiMMionkkbsHcmm/zhYYpl5ZcBjTxFVVKe9/u5fZy9LJOFDKOd3asfCO4Yzv39UCwwSEBYcxTZSq8vHW/cxKSWP7vhKSurTlqduGMWFAN4KCLDBM4FhwGNPEqCordxxgZkoa3+YdJiG6DXMmD+Gq83oQbIFhGkBQIDcuIhNEZIeIZIjIw7WsDxORN9z160WkV4318SJSKiK/9nebxjRXqsrqtHyue+pzfvpiKoePVfDXmwaTMv1iJg6JsdAwDSZgexwiEgzMB8YDucAGEVmqqlt9mk0BDqlqoohMBh4HJvmsnwl8UMdtGtPsfJ5ZwKyUNDZkHyKmQwSPXT+IG4bHEhoc0L/9jKlVIA9VjQAyVDULQEReByYCvl/yE4EZ7vRbwDwREVVVEbkW2AkcqeM2jWk2UrMP8sTHaazNKqRb+3D+59qB3JwcS1hIsNelmRYskMERA+T4zOcCI0/WRlUrRKQYiBKRMuC3OHsWv66t/Sm2CYCITAWmAsTHx595L4zxwOacIp74eAdr0guIbhvG76/qz60j4wkPtcAw3musg+MzgFmqWnqmpxOq6kJgIUBycrLWX2nGBM63ecXMSklj+fYDdGrTiv+68hzuGNWLiFYWGKbxCGRw5AFxPvOx7rLa2uSKSAgQCRTi7EXcKCJ/BjoAVe5eyEY/tmlMk7Nt72FmL0vjoy37iYwI5Tc/6sddF/aibVhj/dvOtGSB/Fe5AUgSkQScL/fJwK012iwF7gLWAjcCK1RVgTHVDURkBlCqqvPccDndNo1pMjIOlDBrWTrvfb2XdmEhPDQuiZ+OTqB9eKjXpRlzUgELDnfMYhrwERAMLFLVLSLyCJCqqkuB54FXRCQDOIgTBHXeZqD6YEyg7Cw4wtzl6by7OY+I0GCmXZbIPWN6E9naAsM0fuL8gd+8JScna2pqqtdlGEPOwaPMXZ7OP77Mo1VwEHde2JOfXdyHTm1aeV2aMT8gIhtVNbnm8tPucYjI1cB7qloVkMqMaQH2FB3jyRUZvJmaQ1CQcPeFvbj3kj50bhfmdWnG1Jk/h6omAbNF5G2cQ0PbA1yTMc3G/sNlzF+ZwetfOGeR3zoynvsvS6Rr+3CPKzPmzJ02OFT1dhFpD9wCvCgiCrwA/F1VSwJdoDFNUX7JcZ5elcmr63dRWaXclBzHtLGJxHSI8Lo0Y86aX4PjqnpYRN4CIoCHgOuA34jIXFV9MoD1GdOkHDxygmdWZ/Ly57s4UVnF9UNjeODyJOI6tfa6NGPqjT9jHNcA/wEkAi8DI1T1gIi0xrnVhwWHafGKj5bz7JosXvhsJ0fLK7l2iBMYCdFtvC7NmHrnzx7HDThXca/2XaiqR0VkSmDKMqZpOFxWzqJPd/L8mp2UHK/gJ+d1Z/q4JBK7tPO6NGMCxp/gmAHsrZ4RkQigq6pmq+ryQBVmTGN25HgFL36ezcLVWRQfK+dHA7oyfXxfzunW3uvSjAk4f4LjTeBCn/lKd9n5AanImEbs2IlKXlmXzYJPsjh45ASXn9OF6eP7MjAm0uvSjGkw/gRHiKqeqJ5R1RMiYlcrmRalrLyS19bv5qlVmRSUHufivp2ZPi6JofEdvS7NmAbnT3Dki8g17i1CEJGJQEFgyzKmcTheUcniDTnMW5nB/sPHuaB3FE/fPozze3XyujRjPONPcNwLvCoi8wDBeR7GnQGtyhiPlVdW8fbGXJ5ckUFe0TGSe3Zk1qQhXNgn2uvSjPGcPxcAZgKjRKStO18a8KqM8UhFZRXvbN7D3OXp7D54lCFxHfjT9YMYkxTNmT4bxpjmxq8LAEXkJ8AAILz6fx5VfSSAdRnToCqrlH99vYc5y9LJKjjCwJj2LLo7mcv6dbHAMKYGfy4AXAC0Bi4DnsN5bsYXAa7LmAZRVaV8uGUfs1LSSD9Qyjnd2vHMHcO5on9XCwxjTsKfPY4LVfU8EflaVf8gIk8AHwS6MGMCSVVJ2bqfWcvS2bb3MIld2jL/1mH8eGA3goIsMIw5FX+Co8z971ER6YHzaNfugSvJmMBRVVbtyGdmShrf5BWTEN2G2ZOGcPXgHgRbYBjjF3+C458i0gH4C7AJUODZQBZlTH1TVT7NKGBmShpf7i4irlMEf7nxPK4bGkNIcJDX5RnTpJwyOEQkCFiuqkXA2yLyLyBcVYsbojhj6sO6rEJmfpzGF9kH6REZzp+uH8SNw2MJtcAw5oycMjhUtUpE5gND3fnjwHF/Ny4iE4A5OM8Hf05VH6uxPgznjrvDcQ6BTVLVbBEZASysbgbMUNUl7nuygRKcW59U1PZYQ2MANu46yBMfp/F5ZiFd2oXxyMQBTDo/jrCQYK9LM6ZJ8+dQ1XIRuQH4h9bhAeUiEgzMB8YDucAGEVmqqlt9mk0BDqlqoohMBh7HeeLgt0CyqlaISHfgKxH5p6pWuO+7TFXt6nVTq805RcxMSWN1Wj7RbVvxf6/qz20j4wkPtcAwpj74Exw/A34JVIhIGc4egKrq6W4DOgLIUNUsABF5HZiI8wyPahNx7r4L8BYwT0REVY/6tAnHGVcx5pS+zStm9rI0lm07QMfWofzvH5/DHRf0pHUrvy5XMsb4yZ8rx8/0wQIxOLcnqZYLjDxZG3fvohiIAgpEZCSwCOgJ3OGzt6HAx+4jbJ9R1YXUQkSmAlMB4uPjz7ALpinYvu8ws1PS+XDLPtqHh/DrK/py90UJtA2zwDAmEPy5APDi2pbXfLBTfVPV9cAAETkXeElEPlDVMmC0quaJSBcgRUS211aLGygLAZKTk22PpRnKOFDK7GVpvPfNXtq2CuHBy5OYMiaB9uGhXpdmTLPmz59kv/GZDsc5BLURGHua9+UBcT7zse6y2trkikgIEIkzSP4dVd0mIqXAQCBVVfPc5QdEZIlbT0BDzDQu2QVHmLs8nXc25xEeGsx9l/bhnjG96dDa7vZvTEPw51DV1b7zIhIHzPZj2xuAJBFJwAmIycCtNdosBe4C1uLcymSFqqr7nhz38FVP4BwgW0TaAEGqWuJOXwHYPbNaiJyDR3lyRTpvb8ojNFi4Z0xvpl7cm6i2YV6XZkyLciYHgXOBc0/XyP3SnwZ8hHM67iJV3SIij+DsOSwFngdeEZEM4CBOuACMBh4WkXKgCrhPVQtEpDewxL2HUAjwmqp+eAZ9ME3InqJjzFuZweINOQQFCXde0JOfX9qHLu3CvS7NmBZJTneGrYg8yfdnNQUBQ4BsVb09sKXVn+TkZE1NTfW6DFNHBw6XMX9lBn//IgdFmXx+PPdflki3SAsMYxqCiGys7Vo5f/Y4fL9xK4C/q+pn9VaZMTUUlB7n6VWZ/G3dLiqrlJuSY5k2NomYDhFel2aMwb/geAsoU9VKcC7sE5HWNa61MOasHTpygmdWZ/HS59kcr6jk+mGxPDA2ifio1l6XZozx4deV48A4oPrJfxHAx8CFgSrKtCzFR8t57tMsFn26k6PllUwc3IMHLk+id+e2XpdmjKmFP8ER7vu4WFUtFRH7E9CctZKychZ9ms1zn2ZRUlbBTwZ156FxSSR1
PdNrTo0xDcGf4DgiIsNUdROAiAwHjgW2LNOcHTlewUtrs1m4Oouio+Vc0b8rD43rS/8ep7uLjTGmMfAnOB4C3hSRPTj3qeqGcyNCY+rk2IlK/rZuFws+yaTwyAnGntOF6eP6Mig20uvSjDF14M8FgBtE5Bygn7toh6qWB7Ys05yUlVfy9y9289SqTPJLjjMmKZrp4/syLL6j16UZY86AP/equh94VVW/dec7isgtqvpUwKszTdqJiireSM1h/ooM9h0uY1TvTsy/dRgjEjp5XZox5iz4c6jqHlWdXz2jqodE5B7AgsPUqryyin9symXu8gzyio4xvGdHZt48mAsTo70uzRhTD/wJjmD3GRkK3z2gye4mZ36gorKKdzfvYe6KdHYVHmVwbCT/7/pBXJwUjXubGGNMM+BPcHwIvCEiz7jzPwM+CFxJpqmpqlL++fUe5ixPJyv/CAN6tOf5u5IZe04XCwxjmiF/guO3OA9Euted/xrnzCrTwlVVKR9t2cesZWmk7S+lX9d2LLh9OD8a0NUCw5hmzJ+zqqpEZD3QB7gZiAbeDnRhpvFSVZZtO8DMlDS27T1Mn85tePKWofxkUHeCgiwwjGnuThocItIXuMV9FQBvAKjqZQ1TmmlsVJVVafnMSknj69xiekW1ZtakwVwzOIZgCwxjWoxT7XFsB9YAV6lqBoCITG+Qqkyjoqp8llHIzJQdbNpdRGzHCP5843lcPzSGkOAgr8szxjSwUwXH9TgPVlopIh8Cr+NcOW5akPVZhTyRksYXOw/SPTKcP143kJuGx9EqxALDmJbqpMGhqu8A77iPaJ2Ic+uRLiLyNLBEVT9ukAqNJzbuOsTMlB18llFIl3Zh/OGaAUweEUdYSLDXpRljPObP4PgR4DXgNRHpCNyEc6aVBUcz9FVOEbOWpbFqRz7RbVvxu5+cy+2jehIeaoFhjHHU6ZnjqnoIWOi+TktEJgBzcJ45/pyqPlZjfRjwMjAcKAQmqWq2iIzw+QwBZqjqEn+2ac7Mlj3FzEpJZ9m2/XRsHcrDPz6HOy/oSetWZ/JYemNMcxawbwX3CvP5wHggF9ggIktVdatPsynAIVVNFJHJwOM4d979FkhW1QoR6Q58JSL/xHn2+em2aeogbX8Js1LS+ODbfbQPD+FX4/ty90W9aBce6nVpxphGKpB/To4AMlQ1C0BEXscZK/H9kp8IzHCn3wLmubc38X0sbThOYPi7TeOHzPxSZi9L519f76FNqxAeuDyJKaMTiIywwDDGnFoggyMGyPGZzwVGnqyNu3dRDEQBBSIyElgE9ATucNf7s00ARGQqzhXvxMfHn31vmonsgiPMXZHOO1/mER4azM8v6cPUi3vTobXdfswY459GewBbVdcDA0TkXOAlEanT/bFU9buxmOTkZD1N82Yv99BRnlyewVubcgkNFv5zTG9+dnFvotqGeV2aMaaJCWRw5AFxPvOx7rLa2uSKSAgQiTNI/h1V3SYipcBAP7dpfOwtPsa8FRksTs1BRLhjVE/uu7QPXdqHe12aMaaJCmRwbACSRCQB58t9MnBrjTZLgbuAtcCNwApVVfc9Oe7hqZ7AOUA2UOTHNg1w4HAZT63K5LX1u1GUSefHcf9liXSPjPC6NGNMExew4HC/9KcBH+GcOrtIVbeIyCNAqqouBZ4HXhGRDOAgThAAjAYeFpFyoAq4T1ULAGrbZqD60BQVlB7nmU8yeXntLiqqlJuGxzJtbCKxHVt7XZoxppkQ9/lMzVpycrKmpqZ6XUZAHTpygoVrsnjp82zKyiu5bmgsD1yeSM+oNl6XZoxpokRko6om11zeaAfHjX+Kj5Xz/JosFn2WzZETFVx9Xg8eHJdEn85tvS7NGNNMWXA0USVl5bzwWTbPrsmipKyCKwd146FxfenbtZ3XpRljmjkLjibm6IkKXvp8F8+szqToaDnj+3floXFJDOgR6XVpxpgWwoKjiSgrr+Rv63bx9KpMCo+c4LJ+nZk+vi/nxXbwujRjTAtjwdHIHa+o5O/rdzN/VSb5JccZnRjN9PF9Gd6zo9elGWNaKAuORupERRVvbsxh3ooM9haXMTKhE/NuGcrI3lFel2aMaeEsOBqZ8soqlmzKY+6KdHIPHWNYfAf+etNgLuwThYg9gNEY4z0Ljkaiskp5d3Mec5ans6vwKOfFRvLotQO5pG9nCwxjTKNiweGxqirlvW/2MntZGpn5R+jfvT3P3ZnM5ed2scAwxjRKFhweqapSPt66j1kp6ezYX0K/ru1YcPswrujfjaAgCwxjTONlwdHAVJXl2w4wMyWNrXsP07tzG+beMpSrBnW3wDDGNAkWHA1EVfkkLZ9ZKWl8lVtMz6jWzLx5MNcM7kFIcJDX5RljjN8sOAJMVfk8s5CZKWls3HWImA4R/PmG87huWAyhFhjGmCbIgiOAvth5kCc+3sH6nQfp1j6cR68dyM3JcbQKscAwxjRdFhwBsGn3IWalpLEmvYDO7cKYcXV/Jo+IJzw02OvSjDHmrFlw1KOvc4uYlZLGyh35RLVpxe9+ci63j+ppgWGMaVYsOOrB1j2HmbUsjZSt++nQOpTfTjiHOy/oSZsw+/EaY5of+2Y7C2n7S5i9LI33v9lHu/AQfjm+L/9xUS/ahYd6XZoxxgRMQINDRCYAc3CeD/6cqj5WY30Y8DIwHCgEJqlqtoiMBx4DWgEngN+o6gr3PauA7sAxdzNXqOqBQPajpsz8UuYuT2fpV3to0yqEB8YmMmVMbyIjLDCMMc1fwIJDRIKB+cB4IBfYICJLVXWrT7MpwCFVTRSRycDjwCSgALhaVfeIyEDgIyDG5323qWqDP0R8V+ER5i7PYMmXuYSFBHPvJX2YOqY3Hdu0auhSjDHGM4Hc4xgBZKhqFoCIvA5MBHyDYyIww51+C5gnIqKqX/q02QJEiEiYqh4PYL0nlXvoKPNWZPDmxlxCgoSfXpTAvZf2IbptmBflGGOMpwIZHDFAjs98LjDyZG1UtUJEioEonD2OajcAm2qExgsiUgm8DTyqqlrzw0VkKjAVID4+/ow6sK+4jHkr03ljQw6CcMeontx3aR+6tA8/o+0ZY0xz0KgHx0VkAM7hqyt8Ft+mqnki0g4nOO7AGSf5N6q6EFgIkJyc/INgOZ3KKuWGpz/nQEkZNyfHMW1sIt0jI86oH8YY05wEMjjygDif+Vh3WW1tckUkBIjEGSRHRGKBJcCdqppZ/QZVzXP/WyIir+EcEvtBcJyt4CDhT9cPIiG6DXGdWtf35o0xpskK5L0vNgBJIpIgIq2AycDSGm2WAne50zcCK1RVRaQD8B7wsKp+Vt1YREJEJNqdDgWuAr4NVAcu7tvZQsMYY2oIWHCoagUwDeeMqG3AYlXdIiKPiMg1brPngSgRyQB+CTzsLp8GJAK/F5HN7qsLEAZ8JCJfA5tx9lieDVQfjDHG/JDUMq7c7CQnJ2tqaoOfvWuMMU2aiGxU1eSay+02rcYYY+rEgsMYY0ydWHAYY4ypEwsOY4wxdWLBYYwxpk4sOIwxxtSJBYcxxpg6seAwxhhTJxYcxhh
j6sSCwxhjTJ006tuqG2OMV8rLy8nNzaWsrMzrUgIuPDyc2NhYQkP9e/y1BYcxxtQiNzeXdu3a0atXL0TE63ICRlUpLCwkNzeXhIQEv95jh6qMMaYWZWVlREVFNevQABARoqKi6rRnZcFhjDEn0dxDo1pd+2nBYYwxpk4sOIwxphEqKiriqaeeqvP7rrzySoqKiuq/IB8WHMYY0widLDgqKipO+b7333+fDh06BKgqh51VZYwxp/GHf25h657D9brN/j3a899XDzjp+ocffpjMzEyGDBlCaGgo4eHhdOzYke3bt5OWlsa1115LTk4OZWVlPPjgg0ydOhWAXr16kZqaSmlpKT/+8Y8ZPXo0n3/+OTExMbz77rtEREScde0B3eMQkQkiskNEMkTk4VrWh4nIG+769SLSy10+XkQ2isg37n/H+rxnuLs8Q0TmSksZvTLGtCiPPfYYffr0YfPmzfzlL39h06ZNzJkzh7S0NAAWLVrExo0bSU1NZe7cuRQWFv5gG+np6dx///1s2bKFDh068Pbbb9dLbQHb4xCRYGA+MB7IBTaIyFJV3erTbApwSFUTRWQy8DgwCSgArlbVPSIyEPgIiHHf8zRwD7AeeB+YAHwQqH4YY8yp9gwayogRI/7tOou5c+eyZMkSAHJyckhPTycqKurf3pOQkMCQIUMAGD58ONnZ2fVSSyD3OEYAGaqapaongNeBiTXaTARecqffAi4XEVHVL1V1j7t8CxDh7p10B9qr6jpVVeBl4NoA9sEYYxqFNm3afDe9atUqli1bxtq1a/nqq68YOnRorddhhIWFfTcdHBx82vERfwUyOGKAHJ/5XL7fa/hBG1WtAIqBqBptbgA2qepxt33uabYJgIhMFZFUEUnNz88/404YY4wX2rVrR0lJSa3riouL6dixI61bt2b79u2sW7euQWtr1IPjIjIA5/DVFXV9r6ouBBYCJCcnaz2XZowxARUVFcVFF13EwIEDiYiIoGvXrt+tmzBhAgsWLODcc8+lX79+jBo1qkFrC2Rw5AFxPvOx7rLa2uSKSAgQCRQCiEgssAS4U1UzfdrHnmabxhjTLLz22mu1Lg8LC+ODD2of2q0ex4iOjubbb7/9bvmvf/3reqsrkIeqNgBJIpIgIq2AycDSGm2WAne50zcCK1RVRaQD8B7wsKp+Vt1YVfcCh0VklHs21Z3AuwHsgzHGmBoCFhzumMU0nDOitgGLVXWLiDwiIte4zZ4HokQkA/glUH3K7jQgEfi9iGx2X13cdfcBzwEZQCZ2RpUxxjSogI5xqOr7OKfM+i77vc90GXBTLe97FHj0JNtMBQbWb6XGGGP8ZbccMcYYUycWHMYYY+rEgsMYY0ydWHAYY0wz0LZt2wb7LAsOY4wxddKorxw3xphG4YOHYd839bvNboPgx4+ddPXDDz9MXFwc999/PwAzZswgJCSElStXcujQIcrLy3n00UeZOLHmLQADz/Y4jDGmEZo0aRKLFy/+bn7x4sXcddddLFmyhE2bNrFy5Up+9atf4dzvtWHZHocxxpzOKfYMAmXo0KEcOHCAPXv2kJ+fT8eOHenWrRvTp09n9erVBAUFkZeXx/79++nWrVuD1mbBYYwxjdRNN93EW2+9xb59+5g0aRKvvvoq+fn5bNy4kdDQUHr16lXr7dQDzYLDGGMaqUmTJnHPPfdQUFDAJ598wuLFi+nSpQuhoaGsXLmSXbt2eVKXBcepBGJAzBjTNAz8X1Dg7VfkgK6tKCkqJKZLJ7qHlnLbhFFcffuLDOrfj+TBAzknqTcc3Alty0EVCtL/fQOhERAZW/vGz4IFhzHGNGLfrP7Xd9PRUZ1Y+8HiWtuV7trcQBVZcJyaBwNixphGYts2iE7yuopGyU7HNcYYUycWHMYYcxJeXCPhhbr204LDGGNqER4eTmFhYbMPD1WlsLCQ8PBwv99jYxzGGFOL2NhYcnNzyc/P97qUgAsPDyc21v+zryw4jDGmFqGhoSQkJHhdRqNkh6qMMcbUiQWHMcaYOrHgMMYYUyfS3M8YABCRfOBMb+oSDRTUYzlNgfW5ZWhpfW5p/YWz73NPVe1cc2GLCI6zISKpqprsdR0NyfrcMrS0Pre0/kLg+myHqowxxtSJBYcxxpg6seA4vYVeF+AB63PL0NL63NL6CwHqs41xGGOMqRPb4zDGGFMnFhzGGGPqxILDJSITRGSHiGSIyMO1rA8TkTfc9etFpJcHZdYbP/r7SxHZKiJfi8hyEenpRZ316XR99ml3g4ioiDT5Uzf96bOI3Oz+rreIyGsNXWN98+PfdryIrBSRL91/31d6UWd9EZFFInJARL49yXoRkbnuz+NrERl21h+qqi3+BQQDmUBvoBXwFdC/Rpv7gAXu9GTgDa/rDnB/LwNau9M/b8r99bfPbrt2wGpgHZDsdd0N8HtOAr4EOrrzXbyuuwH6vBD4uTvdH8j2uu6z7PPFwDDg25OsvxL4ABBgFLD+bD/T9jgcI4AMVc1S1RPA68DEGm0mAi+5028Bl4uINGCN9em0/VXVlap61J1dB9T/E+8blj+/Y4D/AR4HyhqyuADxp8/3APNV9RCAqh5o4Brrmz99VqC9Ox0J7GnA+uqdqq4GDp6iyUTgZXWsAzqISPez+UwLDkcMkOMzn+suq7WNqlYAxUBUg1RX//zpr68pOH+xNGWn7bO7Cx+nqu81ZGEB5M/vuS/QV0Q+E5F1IjKhwaoLDH/6PAO4XURygfeBXzRMaZ6p6//vp2XP4zCnJCK3A8nAJV7XEkgiEgTMBO72uJSGFoJzuOpSnL3K1SIySFWLvCwqwG4BXlTVJ0TkAuAVERmoqlVeF9ZU2B6HIw+I85mPdZfV2kZEQnB2cQsbpLr6509/EZFxwP8BrlHV4w1UW6Ccrs/tgIHAKhHJxjkWvLSJD5D783vOBZaqarmq7gTScIKkqfKnz1OAxQCquhYIx7kZYHPl1//vdWHB4dgAJIlIgoi0whn8XlqjzVLgLnf6RmCFuiNPTdBp+ysiQ4FncEKjqR/3htP0WVWLVTVaVXupai+ccZ1rVDXVm3LrhT//rt/B2dtARKJxDl1lNWCN9c2fPu8GLgcQkXNxgqM5Px92KXCne3bVKKBYVfeezQbtUBXOmIWITAM+wjkrY5GqbhGRR4BUVV0KPI+zS5uBMxA12buKz46f/f0L0BZ40z0HYLeqXuNZ0WfJzz43K372+SPgChHZClQCv1HVpron7W+ffwU8KyLTcQbK727CfwQiIn/HCf9od9zmv4FQAFVdgDOOcyWQARwF/uOsP7MJ/7yMMcZ4wA5VGWOMqRMLDmOMMXViwWGMMaZOLDiMMcbUiQWHMcaYOrHgMKYeiEiliGz2eZ307rtnsO1eJ7vzqTFesOs4jKkfx1R1iNdFGNMQbI/DmAASkWwR+bOIfCMiX4hIoru8l4is8HneSby7vKuILBGRr9zXhe6mgkXkWfeZGR+LSIRnnTItngWHMfUjosahqkk+64pVdRAwD5jtLnsSeElVzwNeBea6y+cCn6jqYJxnLGxxlyfh3P58AFAE3B
DQ3hhzCnbluDH1QERKVbVtLcuzgbGqmiUiocA+VY0SkQKgu6qWu8v3qmq0iOQDsb43lRTnaZMpqprkzv8WCFXVRxuga8b8gO1xGBN4epLpuvC9O3ElNj5pPGTBYUzgTfL571p3+nO+v1HmbcAad3o5zqN6EZFgEYlsqCKN8Zf91WJM/YgQkc0+8x+qavUpuR1F5GucvYZb3GW/AF4Qkd/g3NK7+o6lDwILRWQKzp7Fz4GzugW2MfXNxjiMCSB3jCNZVQu8rsWY+mKHqowxxtSJ7XEYY4ypE9vjMMYYUycWHMYYY+rEgsMYY0ydWHAYY4ypEwsOY4wxdfL/AeIGP5P0rR50AAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(history['accuracy'])\n", - "plt.plot(history['val_accuracy'])\n", - "plt.title('Accuracies over epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.xlabel('Epoch')\n", - "plt.legend(['train', 'val'], loc='lower right')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAArAElEQVR4nO3deXwUdb7u8c83C4Swk7AT1iSMogKCiGyyq7igzijOqKOOIy4wbM549Jx7z/WcM3PPjB7ZBMVlcNzXI0dm1BGQsMiisomCkgUTEhAIgcSwBLL87h9deGMmhADpVDr9vF+vftFdVV31VCfk6arqrjLnHCIiEr4i/A4gIiL+UhGIiIQ5FYGISJhTEYiIhDkVgYhImFMRiIiEORWBSJgzs0fN7BW/c4h/VARSK8ws08xG+51DRP6RikDEJ2YW5XcGEVARiM/MrKGZzTazPd5ttpk19MbFm9nfzCzfzA6a2Wozi/DG/ZOZ7TazQjPbYWajvOERZvawmWWYWZ6ZvWVmrbxxMWb2ijc838w+N7O2p8h1npmt8KbbZmbXecMvNbO9ZhZZbtobzGxrNZbf1cycmd1tZruA5adY9jVmtsVb9lozu6jcuEwze8TMtpvZITN7wcxiyo2/x8zSvddrsZl1KDeul5kt9cbtM7N/LrfYBmb2kvd6bjOz/uWeV+lrLfWHikD89i/AQKAP0BsYAPwvb9yDQA7QGmgL/DPgzKwnMBm4xDnXFLgCyPSe8xvgeuByoANwCJjvjbsDaA4kAHHAfcCxioHMLBr4K7AEaOPN81Uz6+mc+xQ4Aows95RfAK9VY/knXQ6c5+WuuOy+wELgXi/jM8Dik+XoudV7bg8g+eTrZWYjgf8EbgbaA1nAG964psAy4O9erkTg43LzvM6btgWwGJjnPa+q11rqC+ecbroF/Ubgj8foSoZnAOPKPb4CyPTu/zvwHpBY4TmJwH5gNBBdYdzXwKhyj9sDxUAU8CtgLXDRabIOBfYCEeWGvQ486t3/PbDQu9+UQDF0qcbyuwIO6F7Fsp8G/qPCsB3A5eVex/vKjRsHZHj3/ww8Vm5cE2/ZXYGfA5tPscxHgWXlHp8PHDvda61b/blpi0D81oHAO9eTsrxhAI8D6cASM9tpZg8DOOfSgWkE/oDtN7M3yu0C6QIs8nar5BP4w1xKYIviZeAj4A1vN9Rj3rv/yjJlO+fKKuTq6N1/DbjRe5d+I7DJOXdyHapa/knZVbweXYAHTz7fm0dCudek4vPLv14/ei2dc4eBPC93AoHSPZW95e4fBWLMLOo0r7XUEyoC8dseAn/8TursDcM5V+ice9A5153ArosZJ/dPO+dec84N8Z7rgD95z88GrnLOtSh3i3HO7XbOFTvn/s05dz4wCLgG+OUpMiWcPB5RLtdub9nbCfzBvYof7xaqcvnlpqnqlL/ZwB8qPD/WOfd6uWkSKnu9qPBamlljAruXdnvz7V7Fck+pitda6gkVgdSmaO+A7clbFIFdLv/LzFqbWTzwr8Ar8MNB00QzM6CAwDvrMjPraWYjvXfkRQT28598974A+IOZdfHm0drMxnv3R5jZhd6B3u8J7DYp/67/pE8JvCt+yMyizWw4cC3e/nbPa8BUYBjwdrnhp1x+NT0H3OcdlDYza2xmV3v7+E+aZGadvIPQ/wK86Q1/HbjLzPp4r83/BT51zmUCfwPam9k0Cxygb2pml54uzGlea6kv/N43pVt43Ajs23YVbr8HYoC5wHfebS4Q4z1nuve8IwQOGv9vb/hFwGdAIXCQwB+5Dt64CGAGgf3qhQR2h/xfb9zPveFHgH3esqJOkbcXsJJAAW0HbqgwvjOBP4jvVxhe1fK7eutd6TLLzeNK4HMg33tN3gaalnsdH/Ey5QMvArHlnnuft8yTr0uncuMuIHCA+BCBXUEPe8MfBV4pN90POat6rXWrPzfzfvAiEgLMLBP4tXNumd9ZpP7QriERkTCnIhARCXPaNSQiEua0RSAiEuZC7qRX8fHxrmvXrn7HEBEJKRs3bjzgnGtd2biQK4KuXbuyYcMGv2OIiIQUM8s61TjtGhIRCXMqAhGRMKciEBEJcyoCEZEwpyIQEQlzKgIRkTCnIhARCXNhUwS5hcf5w/vbyS087ncUEZE6JahFYGYtzOwdM/vGzL42s8sqjB9uZgVmtsW7/WuwsqzbmcfCNZkMeyyF//zwaw4eORGsRYmIhJRgf7N4DvB359zPzKwBEFvJNKudc9cEOQfX9e7AhR2bM/fjNJ5dtZNX1mVx1+Bu3DO0O81jK7tsrYhIeAjaFoGZNSdwGb8/AzjnTjjn8oO1vOroFt+YWRP6sHT6MIb/pA3zUtIZ8thy5ixLo7Co2M9oIiK+CeauoW5ALvCCmW02s+e9i2lXdJmZfWFmH5pZryDm+UFim6bM/8XFfDh1KJd1j2PWslSGPpbCUyvSOXK8pDYiiIjUGUG7HoGZ9QfWA4Odc5+a2Rzge+fc/y43TTOgzDl32MzGAXOcc0mVzGsiMBGgc+fO/bKyTnnupLPyZU4BM5fuIGVHLnGNG3D/8B7cNrALMdGRNbocERG/mNlG51z/SscFsQjaAeudc129x0MJXCz76iqekwn0d84dONU0/fv3d8E6++jGrEPMXpbK6rQDtG7akEnDe/DzSzvTMEqFICKhraoiCNquIefcXiDbzHp6g0YB2ysEa2dm5t0f4OXJC1am0+nXpSUv330pb04cSPf4xjz61+0Mf3wFr36axYmSMr9iiYgEVVAvVWlmfYDngQbATuAuYAKAc26BmU0G7gdKgGPADOfc2qrmGcwtgvKcc6zNyOOJJTvYtCufTi0bMWVUEjf27UhUZNh8/UJE6glfdg0FS20VwUnOOVak5jJraSpbcwroGhfL1NFJXNe7I5ERVms5RETOhS+7huoLM2NEzza8N2kwz97ej5joSKa/+QVXzF7F37buoawstIpURKQiFUE1mRlje7XjgylDeerWizFg8mubGTd3NR9t20uobVmJiJykIjhDERHGuAvb8/dpw5hzSx+Ol5Rx78sbuW7eGlK+2a9C
EJGQo2ME56iktIz/2bKHOR+nkn3wGH07t2DGmGSGJMbjfSBKRMR3OlhcC4pLy3hnYw5PfpzGnoIiBnRtxYyxyQzsHud3NBERFUFtOl5SypufZzNveTr7C48zODGOGWN60q9LS7+jiUgYUxH4oKi4lFfWZ7FgZQYHDp9geM/WzBiTzEWdWvgdTUTCkIrAR0dPlPDSuiyeWZnBoaPFjD6vLTPGJHN+h2Z+RxORMKIiqAMKi4r5y5pMnlu9k++LShh3YTumjU4muW1Tv6OJSBhQEdQhBceK+fPqnSxck8mREyVc17sDU0cl0b11E7+jiUg9piKogw4dOcGzq3fylzWZHC8p5Ya+nZg6KonOcZVdxE1E5NyoCOqwA4ePs2BFBi+vz6K0zHFT/05MHplExxaN/I4mIvWIiiAE7Pu+iKdS0nn9s2wAbhmQwKQRibRtFuNzMhGpD1QEIWR3/jHmLU/n7Q3ZREYYtw3swn2X96B104Z+RxOREKYiCEHZB48y9+M03t28mwaREfxyUBfuHdaDVo0b+B1NREKQiiCEfXvgCHOWpfLeF3uIjY7kV0O68esh3WkeG+13NBEJISqCeiBtXyGzl6Xx/pff0TQmil8P6c6vhnSlaYwKQUROT0VQj2zf8z2zlqWydPs+WsRGM3FYd+4c1JXYBlF+RxOROkxFUA99mVPAzKU7SNmRS1zjBtw/vAe3DexCTHSk39FEpA5SEdRjG7MOMWtpKp+kH6BN04ZMGpHILQMSaBilQhCR/09FEAY+3ZnHE0tT+ezbg7RvHsPkkYnc1C+BBlG6CJ2IqAjChnOOtRl5PLFkB5t25dOpZSOmjErixr4diYpUIYiEMxVBmHHOsSI1l1lLU9maU0C3+MZMHZXEtb07EBmhy2eKhKOqikBvE+shM2NEzza8N2kwz97ej4ZREUx7cwtXzF7F+1u/o6wstMpfRIJLRVCPmRlje7XjgylDmf+LiwGY9Nomxs1dzUfb9hJqW4MiEhwqgjAQEWFcfVF7Ppo2jDm39OF4SRn3vryR6+atIeWb/SoEkTCnIggjkRHG+D4dWTp9GI//7CLyj53grr98zo1Pr+WTtAMqBJEwpYPFYay4tIy3N+Qwb3kaewqKGNCtFTPGJDOwe5zf0USkhulTQ1Kl4yWlvPl5NvOWp7O/8DiDE+OYMaYn/bq09DuaiNQQFYFUS1FxKa+sz2LBygwOHD7B8J6tmTEmmYs6tfA7moicIxWBnJGjJ0p4cW0Wz6zKIP9oMWPOb8v00cmc36GZ39FE5CypCOSsFBYV88KaTJ5bvZPCohLGXdiO6aOTSWrb1O9oInKGVARyTgqOFfPn1TtZuCaTIydKuK53B6aOSqJ76yZ+RxORavLtm8Vm1sLM3jGzb8zsazO7rMJ4M7O5ZpZuZlvN7OJg5pGz07xRNDPG9mT1QyO4d1gPlmzbx+iZK/nt21+wK++o3/FE5BwFdYvAzF4EVjvnnjezBkCscy6/3PhxwG+AccClwBzn3KVVzVNbBP7LLTzOMyszeHl9FqVljpv6d2LyyCQ6tmjkdzQROQVfdg2ZWXNgC9DdnWIhZvYMsMI597r3eAcw3Dn33anmqyKoO/Z9X8RTKem8/lk2ALcMSGDSiETaNovxOZmIVOTXrqFuQC7wgpltNrPnzaxxhWk6AtnlHud4w37EzCaa2QYz25Cbmxu8xHJG2jaL4d/GX0DK74bz036deO3TXQx7LIX/+Nt2Dhw+7nc8EammYBZBFHAx8LRzri9wBHj4bGbknHvWOdffOde/devWNZlRakDHFo34zxsvZPmDw7m2dwdeWPMtQ/+Uwh8//IZDR074HU9ETiOYRZAD5DjnPvUev0OgGMrbDSSUe9zJGyYhqHNcLP91U2+WzbicK3q15ZlVGQz503KeWLKDgmPFfscTkVMIWhE45/YC2WbW0xs0CtheYbLFwC+9Tw8NBAqqOj4goaF76ybMvqUvS6YNY3jPNjy5PJ0hf1rO3I/TKCxSIYjUNcH+1FAf4HmgAbATuAuYAOCcW2BmBswDrgSOAnc556o8EqyDxaFn+57vmbUslaXb99EiNpp7h/XgjkFdiG0Q5Xc0kbChL5RJnbA1J5+ZS1NZsSOXuMYNuH94D24b2IWY6Ei/o4nUeyoCqVM2Zh1i1tJUPkk/QJumDZk0IpFbBiTQMEqFIBIsKgKpk9bvzGPm0lQ++/YgHZrHMHlkEjf170R0pK6XJFLTVARSZznnWJOexxNLd7B5Vz4JrRoxZWQSN/TtSJQKQaTGqAikznPOsSI1l5lLUvlydwHd4hszdVQS1/buQGSE+R1PJOT5dtI5keoyM0b0bMPiyYN59vZ+NIyKYNqbW7hy9ire3/odZWWh9YZFJJSoCKROMTPG9mrHB1OGMv8XF+OASa9tYtzc1SzZtpdQ24IVCQUqAqmTIiKMqy9qz0fThjF7Qh+Ol5Qx8eWNjJ+/hpQd+1UIIjVIxwgkJJSUlrFo827mLk8j++AxLu7cghljejI4MY7A9xJFpCo6WCz1xomSMt7ZmMO85WnsKShiQLdWPDgmmUu7x/kdTaROUxFIvXO8pJQ3Pstmfko6+wuPMyQxnuljkunXpaXf0UTqJBWB1FtFxaW8sj6Lp1dkkHfkBMN7tmbGmGQu6tTC72gidYqKQOq9I8dLeGldFs+syiD/aDFjzm/LjDHJnNe+md/RROoEFYGEjcKiYl5Yk8lzq3dSWFTC1Re2Z9roJJLaNvU7moivVAQSdgqOFvP8JztZ+Mm3HC0uZXzvDkwZlUT31k38jibiCxWBhK2DR07w7KqdvLg2kxOlZdzQtyNTRyWR0CrW72gitUpFIGEvt/A4C1Zm8Mr6LErLHDf1T+A3IxPp0KKR39FEaoWKQMSz7/si5qek8/pnuzCMWwYkMGlEIm2bxfgdTSSoVAQiFezOP8a85Wm8vSGHyAjjtoFduH94D+KbNPQ7mkhQqAhETmFX3lHmLk/j3U05NIyK5I5BXbl3WHdaNm7gdzSRGqUiEDmNnbmHmfNxGou/2EPjBlH8anBX7h7aneaNov2OJlIjVAQi1ZS6r5DZy1L54Mu9NI2J4p6h3blrcFeaxqgQJLSpCETO0PY93zNrWSpLt++jRWw09w7rwR2DuhDbIMrvaCJnRUUgcpa25uQzc2kqK3bkEt+kAfdd3oPbBnYhJjrS72giZ0RFIHKONmYdZObSVNak59GmaUMmj0xkwiUJNIxSIUhoUBGI1JD1O/OYuSSVzzIP0qF5DJNHJnFT/05ER+pif1K3qQhEapBzjjXpeTyxdAebd+WT0KoRU0YmcUPfjkSpEKSOqqoI9FsrcobMjCFJ8bx7/yBeuPMSWjRqwO/e2crYWat4b8tuSstC682ViIpA5CyZGSN+0obFkwfzzO39aBAVwdQ3tnDl7FW8v/U7ylQIEiJUBCLnyMy4olc7PpgylPm/uBgHTHptE1c/+QlLtu0l1Ha/SvhREYjUkIgI4+qL2vPRtGHMntCHYydKmPjyRsbPX0PKjv0qBKmzdLBYJEhKSst4d/Nu5n6cRs6hY1z
cuQUPju3JoB5xmJnf8STM6FNDIj46UVLG2xuzmbc8ne8KihjQrRUPjknm0u5xfkeTMOJbEZhZJlAIlAIlFUOY2XDgPeBbb9C7zrl/r2qeKgIJVcdLSnnjs2zmp6Szv/A4QxLjmTE2mYs7t/Q7moSBqoqgNk6cMsI5d6CK8audc9fUQg4RX508zfWESxJ4ZX0WT6/I4Man1jKiZ2tmjOnJhZ2a+x1RwpQOFovUspjoSH49tDurHhrBQ1f2ZHN2PtfO+4R7XtrA199973c8CUPBLgIHLDGzjWY28RTTXGZmX5jZh2bWK8h5ROqMxg2jeGB4IqsfGsH00cms35nHVXNWM+nVTaTtK/Q7noSRYB8j6Oic221mbYClwG+cc6vKjW8GlDnnDpvZOGCOcy6pkvlMBCYCdO7cuV9WVlbQMov4peBoMc9/spOFn3zL0eJSxvfuwNTRyXSLb+x3NKkH6sSnhszsUeCwc+6/qpgmE+hf1TEFHSyW+u7gkRM8syqDl9ZmcaK0jBv7dmTKqCQSWsX6HU1CmC/nGjKzxmbW9OR9YCzwVYVp2pn3gWozG+DlyQtWJpFQ0KpxAx656jxWPTSCOwd15b0v9jDiv1bwyLtfsif/mN/xpB4K2haBmXUHFnkPo4DXnHN/MLP7AJxzC8xsMnA/UAIcA2Y459ZWNV9tEUi42fd9EfNT0nn9s10Yxs8HJDBpRCJtmsX4HU1CyDnvGvLe0R9zzpWZWTLwE+BD51xxzUY9PRWBhKvd+ceYtzyNtzfkEBlh3D6wC/cN70F8k4Z+R5MQUBNFsBEYCrQE1gCfAyecc7fWZNDqUBFIuNuVd5Q5H6exaHPOD99NuHdYd1o2buB3NKnDauIYgTnnjgI3Ak85524C9FFPER90jovliZt7s3TG5Yzt1ZZnVmUw9LEUZi7ZQcGxWt9Il3qg2kVgZpcBtwLve8N0sVYRH/Vo3YQ5t/Tlo2nDGJYcz9zl6Qz903Ke/DiNw8dL/I4nIaS6RTANeARY5Jzb5h0ITglaKhGptuS2TXnq1n68P2UIA7rF8cTSVIb+aTkLVmZw9IQKQU7vjD81ZGYRQBPnnC/fhdcxApGqfZGdz6xlqazYkUt8kwbcd3kPbhvYhZhobcSHs3M+RmBmr5lZM+/TQ18B283sdzUZUkRqRu+EFvzlrgH89/2X0bNdU37//tdc/ngKL63L5HhJqd/xpA6q7q6h870tgOuBD4FuwO3BCiUi565fl1a8+uuBvDFxIF1aNeZf39vGiMdX8PpnuyguLfM7ntQh1S2CaDOLJlAEi73vD4TWFW1EwtTA7nG8ee9AXr57AG2axfDIu18y6omVvLMxhxIVglD9IngGyAQaA6vMrAug8+WKhAgzY2hSaxY9MIiFd/anWaMofvv2F4ydtYr3tuymtEzv68LZWZ9iwsyinHO1/pEEHSwWOXfOOZZs38espal8s7eQpDZNmD4mmSt7tSMiQtdTro9q4mBxczObaWYbvNsTBLYORCQEmRlX9GrHB1OGMu8XfXHAA69u4uonP2HJtr2E2rXM5dxUd9fQQgLXHr7Zu30PvBCsUCJSOyIijGsu6sBH04Yxa0Jvjp0oYeLLGxk/fw0pO/arEMJEdc81tMU51+d0w2qDdg2JBE9JaRnvbtrN3OVp5Bw6xsWdW/Dg2J4M6hGHd8Z4CVE1ca6hY2Y2pNwMBxM4bbSI1CNRkRHcfEkCyx8czh9uuIDvCoq49flPueXZ9Xz27UG/40mQVHeLoDfwEtDcG3QIuMM5tzWI2SqlLQKR2lNUXMobn+1i/ooMcguPMzQpnuljkrm4c0u/o8kZqrFLVXrXGMY5972ZTXPOza6ZiNWnIhCpfcdOlPLqp1k8vSKDvCMnGNGzNTPG9OTCTs1P/2SpE4JyzWIz2+Wc63xOyc6CikDEP0eOl/DiukyeXbWT/KPFjD2/LdPHJHNe+2Z+R5PTCFYRZDvnEs4p2VlQEYj4r7ComIWfZPL86p0UHi/h6gvbM210Ekltm/odTU5BWwQiEhQFR4t5bvVOXljzLUeLSxnfuwNTRyfTLV5fM6przroIzKyQys8pZEAj51xUzUSsPhWBSN1z8MgJnlmVwYtrMykuddzYtyNTRiWR0CrW72jiCcoWgV9UBCJ1V27hcZ5ekcErn2ZRVua4+ZIEJo9IpEOLRn5HC3sqAhGpVXsLipifks4bn+/CMH4+IIFJIxJp0yzG72hhS0UgIr7YnX+MecvTeHtDDpERxu0Du3Df8B7EN2nod7SwoyIQEV/tyjvKnI/TWLQ5h5joSO4Y1JWJQ7vTsnEDv6OFDRWBiNQJGbmHmbMsjb9u3UPjBlH8akg37h7SjeaNov2OVu+pCESkTtmxt5DZy1L58Ku9NIuJ4p6h3blrSDeaNKz1DyKGDRWBiNRJ2/YUMGtpGsu+3kfL2GjuvbwHv7ysC7ENVAg1TUUgInXaF9n5zFyaysrUXOKbNOD+4YncemlnYqIj/Y5Wb6gIRCQkbMw6yMylqaxJz6Nts4ZMGpHIhEsSaBilQjhXKgIRCSnrMvKYuXQHn2ceomOLRkwemcjP+nUiOrK6l1CRilQEIhJynHN8kn6AJ5aksiU7n86tYpkyKonr+3QgSoVwxmriCmUiIrXKzBia1JpFDwxi4Z39adYoit++/QVjZ63ivS27KS0LrTexdZmKQETqNDNj5E/a8tfJQ1hwWz8aREUw9Y0tXDVnFR98+R1lKoRzFtQiMLNMM/vSzLaY2T/sz7GAuWaWbmZbzeziYOYRkdBlZlx5QTs+mDKUeb/oS2mZ44FXN3H1k5+wdPs+Qm03d11SGx/WHeGcO3CKcVcBSd7tUuBp718RkUpFRBjXXNSBqy5oz+IvdjNnWRr3vLSB3p2aM31MMpcnt8bM/I4ZUvzeNTQeeMkFrAdamFl7nzOJSAiIjDBu6NuJZTMu57GfXsSBwye484XP+dmCdaxJP6AthDMQ7CJwwBIz22hmEysZ3xHILvc4xxv2I2Y20cw2mNmG3NzcIEUVkVAUFRnBzZckkPLb4fz++gvYfegYtz7/Kbc8u57Pvj3od7yQEOwiGOKcu5jALqBJZjbsbGbinHvWOdffOde/devWNZtQROqFBlER3DawCyt+N5xHrz2fnQeOcPMz67j9z5+yedchv+PVaUEtAufcbu/f/cAiYECFSXYDCeUed/KGiYiclZjoSO4c3I1VvxvBv4w7j217vueGp9byq798zle7C/yOVycFrQjMrLGZNT15HxgLfFVhssXAL71PDw0ECpxz3wUrk4iEj0YNIrlnWHdWPzSCh67sycasQ1zz5CdMfGkDX3/3vd/x6pRgfmqoLbDIO3ofBbzmnPu7md0H4JxbAHwAjAPSgaPAXUHMIyJhqHHDKB4YnsjtA7uw8JNMnl+9kyXbV3P1Re2ZPjqJxDZN/Y7oO51iQkTCSsHRYp5bvZMX1nzLseJSxvfpyJRRSXSLb+x3tKDSuYZERCo4eOQEz6zM4MV1mRSXOm7sGyiEhFaxfkcLChWBiMgp7C8sYsGKnbzyaRZlZY6bL0lg8o
hEOrRo5He0GqUiEBE5jb0FRcxPSeeNz3dhGL+4tDMPDO9Bm2YxfkerESoCEZFqyjl0lHnL03l7Yw5REcYvL+vCvZf3IL5JQ7+jnRMVgYjIGcrKO8Lcj9NZtDmHmOhI7hjUlYlDu9OycQO/o50VFYGIyFnKyD3MnGVp/HXrHho3iOJXQ7px95BuNG8U7Xe0M6IiEBE5Rzv2FjJ7WSoffrWXZjFRTBzWnTsHd6NJw9o4ifO5UxGIiNSQr3YXMHtZKsu+3k/L2Gjuu7wHt1/WhdgGdbsQVAQiIjVsS3Y+s5amsjI1l/gmDbl/eA9uvbQzMdGRfkerlIpARCRINmQeZObSVNZm5NG2WUMmj0jk5ksSaBhVtwpBRSAiEmTrMvKYuXQHn2ceomOLRkwemcjP+nUiOtLv638FqAhERGqBc47VaQd4YmkqX2Tn07lVLFNGJXF9nw5E+VwIVRVB3agqEZF6wMwYltya/3lgEH++oz9NY6L47dtfMHbWKt7bspuysrr5xltFICJSw8yMUee15W+/GcKC2/oRHRnB1De2cOWcVXz45Xd1rhBUBCIiQWJmXHlBOz6cOpQnf96X0jLH/a9u4ponP2Hp9n3UlV3zKgIRkSCLiDCu7d2BJdMvZ9aE3hw9UcI9L23g+vlrWLFjv++FoIPFIiK1rKS0jHc37WbOx2nszj9Gvy4teXBMMoMS44O2TH1qSESkDjpRUsZbG7KZtzydvd8XMbB7Kx4c25NLuraq8WWpCERE6rCi4lJe/2wX81MyOHD4OEOT4pkxJpm+nVvW2DJUBCIiIeDYiVJeWZ/F0yszOHjkBCN/0oYZY5K5oGPzc563ikBEJIQcOV7CX9Zm8uyqnRQcK+aKXm2ZPiaZn7Rrdtbz1BfKRERCSOOGUUwakcjqfxrBtNFJrE3P48rZq5m9LDUoy6vb500VEQljzWKimTY6mbsGdeO51TuDchAZVAQiInVe89hofntFz6DNX7uGRETCnIpARCTMqQhERMKcikBEJMypCEREwpyKQEQkzKkIRETCnIpARCTMBb0IzCzSzDab2d8qGXenmeWa2Rbv9utg5xERkR+rjW8WTwW+Bk51tqQ3nXOTayGHiIhUIqhbBGbWCbgaeD6YyxERkbMX7F1Ds4GHgLIqpvmpmW01s3fMLCHIeUREpIKg7Roys2uA/c65jWY2/BST/RV43Tl33MzuBV4ERlYyr4nARIDOnTsHJ7CI1GvFxcXk5ORQVFTkd5SgiomJoVOnTkRHR1f7OUG7MI2Z/SdwO1ACxBA4RvCuc+62U0wfCRx0zlV5KR5dmEZEzsa3335L06ZNiYuLw8z8jhMUzjny8vIoLCykW7duPxrny4VpnHOPOOc6Oee6ArcAyyuWgJm1L/fwOgIHlUVEalxRUVG9LgEAMyMuLu6Mt3pq/XoEZvbvwAbn3GJgipldR2Cr4SBwZ23nEZHwUZ9L4KSzWcdaKQLn3ApghXf/X8sNfwR4pDYyiIhI5fTNYhGRWpCfn89TTz11xs8bN24c+fn5NR+oHBWBiEgtOFURlJSUVPm8Dz74gBYtWgQpVYCuWSwiYeff/rqN7Xu+r9F5nt+hGf/n2l6nHP/www+TkZFBnz59iI6OJiYmhpYtW/LNN9+QmprK9ddfT3Z2NkVFRUydOpWJEycC0LVrVzZs2MDhw4e56qqrGDJkCGvXrqVjx4689957NGrU6Jyza4tARKQW/PGPf6RHjx5s2bKFxx9/nE2bNjFnzhxSU1MBWLhwIRs3bmTDhg3MnTuXvLy8f5hHWloakyZNYtu2bbRo0YL//u//rpFs2iIQkbBT1Tv32jJgwIAffdZ/7ty5LFq0CIDs7GzS0tKIi4v70XO6detGnz59AOjXrx+ZmZk1kkVFICLig8aNG/9wf8WKFSxbtox169YRGxvL8OHDK/0uQMOGDX+4HxkZybFjx2oki3YNiYjUgqZNm1JYWFjpuIKCAlq2bElsbCzffPMN69evr9Vs2iIQEakFcXFxDB48mAsuuIBGjRrRtm3bH8ZdeeWVLFiwgPPOO4+ePXsycODAWs0WtHMNBYvONSQiZ+Prr7/mvPPO8ztGrahsXX0515CIiIQGFYGISJhTEYiIhDkVgYhImFMRiIiEORWBiEiYUxGIiNRBTZo0qbVlqQhERMKcvlksIuHnw4dh75c1O892F8JVfzzl6IcffpiEhAQmTZoEwKOPPkpUVBQpKSkcOnSI4uJifv/73zN+/PiazVUN2iIQEakFEyZM4K233vrh8VtvvcUdd9zBokWL2LRpEykpKTz44IP4cbYHbRGISPip4p17sPTt25f9+/ezZ88ecnNzadmyJe3atWP69OmsWrWKiIgIdu/ezb59+2jXrl2tZlMRiIjUkptuuol33nmHvXv3MmHCBF599VVyc3PZuHEj0dHRdO3atdLTTwebikBEpJZMmDCBe+65hwMHDrBy5Ureeust2rRpQ3R0NCkpKWRlZfmSK3yKIG0ZfPTPfqcQEb8M+CPs9zdCr9YRFB46QMfWLWkfmc+tYy/h2pdf4MLzkunfpxc/SeoOB9Ih9hi4Mtj/9Y9nEBsHTdrUeK7wKYKYZtAmPE5BKyKViIiGqBi/U/Dlmo9+uB/ftj3rliyqdLrD2dv+cWBEcP5kh08RJAwI3EQkPH39NbTqdvrpwpA+PioiEuZUBCISNkLtioxn42zWUUUgImEhJiaGvLy8el0Gzjny8vKIiTmzYyHhc4xARMJap06dyMnJITc31+8oQRUTE0OnTp3O6DkqAhEJC9HR0XTrpoPFldGuIRGRMKciEBEJcyoCEZEwZ6F2BN3McoGzPSFHPHCgBuOEAq1zeNA6h4dzWecuzrnWlY0IuSI4F2a2wTnX3+8ctUnrHB60zuEhWOusXUMiImFORSAiEubCrQie9TuAD7TO4UHrHB6Css5hdYxARET+UbhtEYiISAUqAhGRMFcvi8DMrjSzHWaWbmYPVzK+oZm96Y3/1My6+hCzRlVjnWeY2XYz22pmH5tZFz9y1qTTrXO56X5qZs7MQv6jhtVZZzO72ftZbzOz12o7Y02rxu92ZzNLMbPN3u/3OD9y1hQzW2hm+83sq1OMNzOb670eW83s4nNeqHOuXt2ASCAD6A40AL4Azq8wzQPAAu/+LcCbfueuhXUeAcR69+8Ph3X2pmsKrALWA/39zl0LP+ckYDPQ0nvcxu/ctbDOzwL3e/fPBzL9zn2O6zwMuBj46hTjxwEfAgYMBD4912XWxy2CAUC6c26nc+4E8AYwvsI044EXvfvvAKPMzGoxY0077To751Kcc0e9h+uBMztPbd1TnZ8zwH8AfwKKajNckFRnne8B5jvnDgE453y+XPs5q846O6CZd785sKcW89U459wq4GAVk4wHXnIB64EWZtb+XJZZH4ugI5Bd7nGON6zSaZxzJUABEFcr6YKjOutc3t0E3lGEstOus7fJnOCce782gwVRdX7OyUCyma0xs/VmdmWtpQuO6qzzo8BtZpYDfAD8pnai+eZM/7+flq5HEGbM7DagP3C531mCycwig
JnAnT5HqW1RBHYPDSew1bfKzC50zuX7GSrIfg78xTn3hJldBrxsZhc458r8DhYq6uMWwW4godzjTt6wSqcxsygCm5N5tZIuOKqzzpjZaOBfgOucc8drKVuwnG6dmwIXACvMLJPAvtTFIX7AuDo/5xxgsXOu2Dn3LZBKoBhCVXXW+W7gLQDn3DoghsDJ2eqrav1/PxP1sQg+B5LMrJuZNSBwMHhxhWkWA3d4938GLHfeUZgQddp1NrO+wDMESiDU9xvDadbZOVfgnIt3znV1znUlcFzkOufcBn/i1ojq/G7/D4GtAcwsnsCuop21mLGmVWeddwGjAMzsPAJFUJ+vR7kY+KX36aGBQIFz7rtzmWG92zXknCsxs8nARwQ+cbDQObfNzP4d2OCcWwz8mcDmYzqBgzK3+Jf43FVznR8HmgBve8fFdznnrvMt9Dmq5jrXK9Vc54+AsWa2HSgFfuecC9mt3Wqu84PAc2Y2ncCB4ztD+Y2dmb1OoMzjveMe/weIBnDOLSBwHGQckA4cBe4652WG8OslIiI1oD7uGhIRkTOgIhARCXMqAhGRMKciEBEJcyoCEZEwpyIQqcDMSs1sS7nbKc9sehbz7nqqs0qK+KXefY9ApAYcc8718TuESG3RFoFINZlZppk9ZmZfmtlnZpboDe9qZsvLXeuhsze8rZktMrMvvNsgb1aRZvacd72AJWbWyLeVEkFFIFKZRhV2DU0oN67AOXchMA+Y7Q17EnjROXcR8Cow1xs+F1jpnOtN4Pzy27zhSQROFd0LyAd+GtS1ETkNfbNYpAIzO+yca1LJ8ExgpHNup5lFA3udc3FmdgBo75wr9oZ/55yLN7NcoFP5E/xZ4Gp4S51zSd7jfwKinXO/r4VVE6mUtghEzow7xf0zUf7Mr6XoWJ34TEUgcmYmlPt3nXd/Lf//xIW3Aqu9+x8TuCwoZhZpZs1rK6TImdA7EZF/1MjMtpR7/Hfn3MmPkLY0s60E3tX/3Bv2G+AFM/sdgdMfnzwb5FTgWTO7m8A7//uBczpdsEgw6BiBSDV5xwj6O+cO+J1FpCZp15CISJjTFoGISJjTFoGISJhTEYiIhDkVgYhImFMRiIiEORWBiEiY+3+oGmMnIGC6TgAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(history['loss'])\n", - "plt.plot(history['val_loss'])\n", - "plt.title('Losses over epochs')\n", - "plt.ylabel('Loss')\n", - "plt.xlabel('Epoch')\n", - "plt.legend(['train', 'val'], loc='lower right')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### PCA on embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "def get_inputs(ids, nb_per_class):\n", - " embeddings = []\n", - " idx_class = 0\n", - " for i in range(len(ids)):\n", - " for j in range(len(X_test)):\n", - " if y_test[j] == ids[i] and idx_class < nb_per_class:\n", - " embeddings.append(X_test[j])\n", - " idx_class += 1\n", - " idx_class = 0\n", - " return np.array(embeddings)\n", - "\n", - "def plot_embeddings(classes, nb_per_class=6):\n", - " inputs = get_inputs(classes, nb_per_class)\n", - " \n", - " assert len(inputs) == (len(classes) * nb_per_class)\n", - " \n", - " features = model(inputs)\n", - " features = PCA(n_components=2).fit_transform(features)\n", - " features = features.reshape((len(classes), nb_per_class, 2))\n", - "\n", - " # Plot\n", - " plt.figure(figsize=(6, 6))\n", - " for i in range(len(classes)):\n", - " plt.plot(features[i, :, 0],\n", - " features[i, :, 1],\n", - " 'o',\n", - " label=\"Class {}\".format(classes[i]))\n", - " plt.axis('off')\n", - " plt.legend(numpoints=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAV0AAAFUCAYAAACHh+9/AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAP50lEQVR4nO3dW4yV5bnA8WcWi5nCxk45DAcHGRCBcRBJSneqXlSpN20QE4W2SpraikTDRWto0m5JmlIbEUnbRNsSUq20SavS9KrBFEO8oCbF1tgEh4ODUhwoxwEVYcOe49oXDHVGAZlx+QxTfr+bcd6s9a3v4+LPx7vWeqwolUoBQI7CQJ8AwOVEdAESiS5AItEFSCS6AIlEFyCR6AIkEl2ARKILkEh0ARKJLkCi4kCfAHBpe/XVV8cWi8WnIuK6cKPWU1dEbOvo6Lhvzpw5Ry72SaILXFCxWHxq/Pjx19bU1LxTKBRMyOrW1dVV0dLS0nDo0KGnIuL2i32ev7WAj3JdTU3Ne4LbW6FQKNXU1ByPM/8CuPjnfULnA/znKAjuuXX/ufSpo6ILXPL27t1bvO22266+6qqrrps5c+a1N9988zWvvfZaVVNTU+W0adNmfhKvefr06Yp58+ZdPWnSpOuuv/76+qampspyHNeeLlBWv3u5edQTL75R23KitbLmiqq2b986bf/Xb6h7u7/H6+rqittvv/2aRYsWHduwYcM/IyK2bNky7MCBA0OnTJnSVr4z7+3xxx8fU11d3bF3795tv/rVr0YuW7Zs4vPPP//Pj3tcd7qDxYrqRbGi+q1YUd3V/XPRQJ8SfNDvXm4e9eMNO+qOnGitLEXEkROtlT/esKPudy83j+rvMTds2HBFsVgsfe9732s5u3bjjTee/tKXvnSy5+Oampoq58yZM6OhoeHahoaGazdt2vRfERHNzc1DP/e5z82or69vmDZt2syNGzeO6OjoiAULFkyeNm3azOnTpzf86Ec/GnuO1/3Mvffeeywi4lvf+tY7f/3rX6/o6urq72X8mzvdweBMYJ+MiOHdK3UR8WSsqI5YcfyZgTsx6O2JF9+obe3o6nUz19rRVXjixTdq+3u3+9prrw2bPXv2qY963JVXXtnx0ksv7Ro+fHipsbGx6u67775627ZtO59++ulRt9566/HHHnvsUEdHR5w4caKwZcuW4QcPHhz6xhtvbI+IOHr06JAPHu/w4cOVZ++khw4dGiNGjOg8fPhwccKECR39uY6zRHdwWBnvB/es4d3rosslo+VE6zn3Pc+3Xk5tbW0VixcvrtuxY8ewQqEQzc3NVRERN9xww//ef//9k9vb2wsLFy5856abbjpdX1/fum/fvqp77rnnqvnz5x+/44473vukz+8s2wuDw6Q+rsOAqLmi6px7rOdbvxizZs06vXXr1g/edHzII488Mm7s2LHtO3fu3NHY2Lijvb29EBHx5S9/+eRf/vKXptra2rZ77713yi9+8YvRNTU1ndu2bdsxd+7cE2vXrq256667Jn/weOPGjWvbs2dPZUREe3t7nDx5csi4ceM+1l1uhOgOFnv7uA4D4tu3TttfVSz02visKha6vn3rtP39Peb8+fNPtLW1VfzkJz8Zc3btb3/727CNGzeO6Pm448ePD5kwYUL7kCFDYs2aNaM7OzsjImLXrl2VEydObP/ud7979Bvf+EbLP/7xj+EHDx4sdnZ2xje/+c13H3300f2NjY0fivq8efPeffrpp0dHRKxbt27kjTfeeKJQ+PjJtL0wOCyP3nu6ERGnutfhknF237acn14oFArxpz/9affSpUuvevzxx8dXVVWVJk6c2Przn/98X8/HPfjgg0cWLFgw9bnnnhv9xS9+8fiwYcO6IiJeeOGFK5544onxxWKxNHz48M7f//7
3e956662hixcvntzV1VUREfHwww//64Ov+53vfOfoggULpkyaNOm66urqzvXr1+/u7zX0VFEq+czzoHDmzbSVcWZLYW9ELPcmGhm2bt361uzZs48O9HlcqrZu3Tpm9uzZky/28e50B4szgRVZGOTs6QIkEl2ARKILkEh0ARKJLkAi0QUueQMx2vHPf/7ziIaGhmuLxeKcdevWjSzXcX1kDCivV349KjY/Vhsnj1TGiLFtcfP398d/Lx50ox2vvvrqtnXr1r21atWqceU8rjtdoHxe+fWoeOGhujh5uDKiFHHycGW88FBdvPLrQTfaccaMGW2f//znT5fjq789udMFymfzY7XR0dq7Uh2thdj8WG1/73YHarTjJ0V0gfI5eeTcIxzPt15GRjsCl58RY8+9x3q+9YswUKMdPymiC5TPzd/fH8Wq3v9Pm2JVV9z8/UE32vGTYnsBKJ+z+7Zl/PTCQI123Lx58/CvfvWr17z33ntDXnzxxc888sgjV7755pvb+3sdZxntCFyQ0Y4X1tfRjrYXABKJLkAi0QVIJLoAiUQXIJHoAiQSXeCSNxCjHVevXl0zffr0hvr6+oY5c+bMePXVVz9VjuOKLlBW65vWj5r7h7mzrv/t9XPm/mHurPVN6/s9YSzi/dGOX/jCF07s27dv2/bt23euWrVq/4EDB4aW65zP5b777ju2a9euHa+//vqOZcuWHXrwwQevKsdxRRcom/VN60etfmV13dHTRytLUYqjp49Wrn5ldd3HCe9AjXYcNWrUv7/OfPLkySEVFRX9vYRefA0YKJu1W9fWtnW29bqZa+tsK6zdurb2azO+NuhGOz766KM1a9asGdfe3l7YtGlTU3/O/4Pc6QJlc+z0sXOOcDzfejm1tbVVLFq0aPL06dMbvvKVr0zdvXv3pyLOjHZ89tlnxyxbtuzKv//978NGjhzZ1XO04x//+MdPjxw5svNcx3zooYda9u3bt23FihX/+uEPfzihHOcpukDZjB42+pwjHM+3fjEuhdGOS5YseXvTpk2f6e819CS6QNk8MPuB/ZVDKnuNdqwcUtn1wOwHBt1ox8bGxqqz/71+/frqurq61v5eQ0/2dIGyObtvu3br2tpjp49Vjh42uu2B2Q/s7+9+bsTAjXb82c9+Nvall176dLFYLFVXV3f85je/2dPfa+jJaEfggox2vDCjHQEuYaILkEh0ARKJLkAi0QVIJLoAiUQXuOQNxGjHiIinnnpq5NSpU2dec801M+fPnz+lHMf05QigrN5+9rlRx9asqe04erSyOGZM2+ilS/ePuvuufn854uxox0WLFh3bsGHDPyMitmzZMuzAgQNDp0yZ0u+vF3+UxsbGqp/+9KcTXn755ddramo69+/fX5ZeutMFyubtZ58bdWTVqrqOlpbKKJWio6Wl8siqVXVvP/vcoBvt+Mtf/rJmyZIlR2pqajojImprazv6ew09udMFyubYmjW1pdbWXjdzpdbWwrE1a2r7e7c7UKMd33zzzaqIiM9+9rP1nZ2d8YMf/ODAwoUL3+vPNfQkukDZdBw9es4RjudbL6e2traKxYsX1+3YsWNYoVCI5ubmqogzox3vv//+ye3t7YWFCxe+c9NNN53uOdpx/vz5x++4444PxbSzs7Ni9+7dVVu2bGnas2fP0FtuuaX+lltu2T5mzJhzjoG8WLYXgLIpjhlzzj3W861fjIEa7ThhwoS222677d2qqqpSfX1925QpU/5v+/btVR964T4SXaBsRi9dur+iqqrXaMeKqqqu0UuXDrrRjnfeeee7mzdvviIi4uDBg8U9e/Z8asaMGR97vKPtBaBszu7blvPTCwM12vHOO+98b+PGjZ+eOnXqzCFDhpQefvjhfePHj/9YWwsRRjsCH8Foxwsz2hHgEia6AIlEFyCR6AIfpevsG0701v3n0vWRD+xBdIGPsq2lpaVaeHvr6uqqaGlpqY6IbX15no+MARfU0dFx36FDh546dOjQdeFGraeuiNjW0dFxX1+e5CNjAIn8rQWQSHQBEokuQCLRBUgkugCJRBcgkegCJBJdgESiC5BIdAESiS5AItEFSCS6AIlEFyCR6AIkEl2ARKILkEh0ARKJLkAi0QVIJLoAiUQXIJHoAiQSXYBEoguQSHQBEokuQCLRBUgkugCJRBcgkegCJBJdgESiC5BIdAESiS5AItEFSCS6AIlEFyCR6AIkEl2ARKILkEh0ARKJLkAi0QVIJLoAiUQXIJHoAiQSXYBEoguQSHQBEokuQCLRBUgkugCJRBcgkegCJBJdgESiC5BIdAESiS5AItEFSCS6AIlEFyCR6AIkEl2ARKILkEh0ARKJLkAi0QVIJLoAiUQXIJHoAiQSXYBEoguQSHQBEokuQCLRBUgkugCJRBcgkegCJBJdgESiC5BIdAESiS5AItEFSCS6AIlEFyCR6AIkEl2ARKILkEh0ARKJLkAi0QVIJLoAiUQXIJHoAiQSXYBEoguQSHQBEokuQCLRBUgkugCJRBcgkegCJBJdgESiC5BIdAESiS5AItEFSCS6AIlEFyCR6AIkEl2ARKILkEh0ARKJLkAi0QVIJLoAiUQXIJHoAiQSXYBEoguQSHQBEokuQCLRBUgkugCJRBcgkegCJBJdgESiC5BIdAESiS5AItEFSCS6AIlEFyCR6AIkEl2ARKILkEh0ARKJLkAi0QVIJLoAiYoDfQIAl5Kd9dcuioiVETEpIvZGxPJrX9/5TLmOX1Eqlcp1LIBBrTu4T0bE8B7LpyJiSbnCa3sB4H0ro3dwo/v3leV6AdEFeN+kPq73megCvG9vH9f7THQB3rc8zuzh9nSqe70sRBegW/ebZUsiojkiSt0/y/YmWoRPLwCkcqcLkEh0ARKJLkAi0QVIJLoAiUQXIJHoAiQSXYBEoguQSHQBEokuQCLRBUgkugCJRBcgkegCJBJdgESiC5BIdAESiS5AItEFSCS6AIlEFyCR6AIkEl2ARKILkEh0ARKJLkAi0QVIJLoAiUQXIFFxoE8ALieT/+f5RRGxMiImRcTeiFj+1qp5zwzsWZGpolQqDfQ5wGWhO7hPRsTwHsunImKJ8F4+bC9AnpXRO7jR/fvKATgXBojoQp5JfVznP5DoQp69fVznP5DoQp7lcWYPt6dT3etcJkQXknS/WbYkIpojotT905tolxmfXgBI5E4XIJHoAiQSXYBEoguQSHQBEokuQCLRBUgkugCJRBcgkegCJBJdgESiC5BIdAESiS5AItEFSCS6AIlEFyCR6AIkEl2ARKILkEh0ARKJLkAi0QVIJLoAiUQXIJHoAiQSXYBEoguQSHQBEokuQCLRBUgkugCJRBcgkegCJBJdgESiC5BIdAESiS5AItEFSCS6AIlEFyCR6AIkEl2ARKILkEh0ARKJLkAi0QVIJLoAiUQXIJHoAiQSXYBExXIfcNZvZy2KiJURMSki9kbE8sZ7Gp8p9+sADEYVpVKpbAfrDu6TETG8x/KpiFgivADl315YGb2DG92/ryzz6wAMSuWO7qQ+rgNcVsod3b19XAe4rJQ7us
vjzB5uT6e61wEue2WNbvebZUsiojkiSt0/vYkG0K2sn14A4MJ8OQIgkegCJBJdgESiC5BIdAESiS5AItEFSCS6AIlEFyCR6AIkEl2ARKILkEh0ARKJLkAi0QVIJLoAiUQXINH/A9EdjDAVP9BcAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "speakers = [0, 1, 3, 6]\n", - "plot_embeddings(speakers, nb_per_class=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Speaker / language recognition" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Scores" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1/1 [==============================] - 1s 1s/step - loss: 2.6310 - accuracy: 0.0909\n" - ] - }, - { - "data": { - "text/plain": [ - "[2.6309814453125, 0.09090909361839294]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model_evaluate.evaluate(X_test, y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " 0.0 0.09 1.00 0.17 1\n", - " 1.0 0.00 0.00 0.00 2\n", - " 2.0 0.00 0.00 0.00 1\n", - " 3.0 0.00 0.00 0.00 2\n", - " 4.0 0.00 0.00 0.00 1\n", - " 6.0 0.00 0.00 0.00 1\n", - " 7.0 0.00 0.00 0.00 1\n", - " 8.0 0.00 0.00 0.00 2\n", - "\n", - " accuracy 0.09 11\n", - " macro avg 0.01 0.12 0.02 11\n", - "weighted avg 0.01 0.09 0.02 11\n", - "\n" - ] - } - ], - "source": [ - "report = classification_report(y_test, y_test_pred, zero_division=0)\n", - "print(report)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Learning curves" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAbdElEQVR4nO3df5xd873v8ddbMkyCNpMfgiQ1KepHOA1Gyqk6Du0Rrl+n5YRq0dNLe2ivo5yHnPbcWxy9R+k5VHFIe9HeUiKuH/1F0fhxb6lMUkUIiQiZECZpQlJCIp/7x/pObJPvTPYks2bPZN7Px2M/stb6rh+f72z2e6/13XttRQRmZmbtbVHrAszMrHdyQJiZWZYDwszMshwQZmaW5YAwM7MsB4SZmWU5IMwSSZ+S9Fyt6+jLJC2Q9Ola12HdwwFhpZD0oKRlkraqdS3ViohHImK3Wtdh1ls4IKzbSWoEPgUEcEwPH3tgTx6vt+iv/bZyOSCsDKcAjwE3AqdWNkgaI+n/SGqVtFTSVRVtp0t6VtIKSc9I2jctD0m7VKx3o6SL0/QhkloknS9pMXCDpAZJv0jHWJamR1dsP1TSDZJeSe13Vu6rYr0dJd2e9vOipP9W0TZBUrOkNyW9Juk/OvpjpH7Nk/QnSXdL2jEt/09J32u37l2SvlHF8S+QNE3STyW9CZyWOe5Wkr4n6eVU47WSBrX7u31T0pJ0aejkim0/LOkn6dgvSfoXSVtUtGefq2S8pCclvSHpVkn1aZvh6blYnv4Wj1Tu03qhiPDDj259APOAM4H9gNXAyLR8APBH4HJga6AeOCi1nQAsAvYHBOwC7JTaAtilYv83Ahen6UOANcB3ga2AQcAw4HPAYGBb4DbgzortfwncCjQAdcBfVeyrJU1vAcwE/gewJfBRYD5weGp/FPhimt4GOKCDv8WhwBJg31TfD4CHU9vBwEJAab4BeBvYsYrjX5D+tseldQdljn05cDcwNP0dfg78W7u/23+kuv4K+DOwW2r/CXBX2q4ReB74chXP1QLg8dSHocCzwFdT278B16a/eR3FWaZq/d+rH538v1zrAvzYvB7AQemFa3ianwOck6YPBFqBgZnt7gXO7mCfGwqId4H6TmoaDyxL0zsAa4GGzHqVAfEJ4OV27f8M3JCmHwYubOtnJ8f+X8ClFfPbpL9PY3pxfRk4OLWdDvy2yuNfQAqaDo6r9IK/c8WyA4EXK/q6Bti6on0q8N8pgvxdYM+Ktq8AD1bxXC0AvlAxfylwbZq+iCJ0dumobj9618Ond9bdTgV+ExFL0vzNvH+ZaQzwUkSsyWw3BnhhI4/ZGhGr2mYkDZZ0Xbo08ibFi/kQSQPScf4UEcs2sM+dgB3T5ZDlkpYD3wRGpvYvAx8D5kiaIemoDvazI/BS20xErASWAqOieNW8BTgpNX8euKnK40Nx9tGRERRnUDMrtr8nLW+zLCL+XDH/Uqp3OMU7/JfatY1K0xt6rhZXTL9FEYoAl1GcXf5G0nxJkzvZh/UCHtiybpOub/8dMCCNB0Bx+WKIpI9TvKB9RNLATEgsBHbuYNdvUbzYtdkeaKmYb39L4nOB3YBPRMRiSeOBP1C8q14IDJU0JCKWd9KdhRTvtnfNNUbEXOCkdA39s8A0ScPaveACvELxYg+ApK0pLoEtSot+RvGCeQnFWcPfVnP8tjI6aVtCcblqXEQs6mCdBklbV9T8EeDptO3qVPczFW1t++nsueq42IgVFM/NuZL2An4raUZEPNDVfVnP8BmEdafjgPeAPS
ku64wH9gAeoRi4fhx4FbhE0taS6iV9Mm37I+A8SfupsIukthfWJ4DPSxogaSLF9fLObEvx4rhc0lDg220NEfEq8GvgmjSYXSfp4Mw+HgdWqBj8HpSOvZek/QEkfUHSiIhYCyxP26zN7OdnwJckjVfxkd//Cfw+Ihakev5A8YL8I+DeitDq9Pgbkur6IXC5pO1SzaMkHd5u1QslbSnpU8BRwG0R8R7F5abvSNo2PQ/fAH6atunsueqQpKPSugLeoPhvJfc3s17CAWHd6VSKa+QvR8TitgdwFXAyxTv4oykGNV+mOAuYBBARtwHfobgktQK4k2KQE+DstN3ytJ87N1DHFRSD1UsoPk11T7v2L1K8Q54DvA78Y/sdpBfJoyhC7kXefxH/cFplIjBb0krg+8CJEfF2Zj/3U1zXv50iHHcGTmy32s3Ap9O/1R6/GudTXNJ5LF1qu5/izKrNYmAZxVnOTRSDyXNS29cpxjDmA/831XZ9qq2z56ozu6YaVlIM8l8TEdO70B/rYW2fnjCzfkTSIcBPI2L0Bla1fsxnEGZmluWAMDOzLF9iMjOzLJ9BmJlZ1mbzPYjhw4dHY2NjrcswM+tTZs6cuSQiRuTaNpuAaGxspLm5udZlmJn1KZJe6qjNl5jMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZTkgzMwsywFhZmZZDggzM8tyQJiZWZYDwszMshwQZmaW5YAwM7MsB4SZmWU5IMzMLMsBYWZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZTkgzMwsywFhZmZZDggzM8tyQJiZWVapASFpoqTnJM2TNDnTfrCkWZLWSDo+0/4hSS2SriqzTjMzW19pASFpAHA1cASwJ3CSpD3brfYycBpwcwe7+Vfg4bJqNDOzjpV5BjEBmBcR8yPiXeAW4NjKFSJiQUQ8Caxtv7Gk/YCRwG9KrNHMzDpQZkCMAhZWzLekZRskaQvg34HzNrDeGZKaJTW3trZudKFmZra+3jpIfSbwq4ho6WyliJgSEU0R0TRixIgeKs3MrH8YWOK+FwFjKuZHp2XVOBD4lKQzgW2ALSWtjIj1BrrNzKwcZQbEDGBXSWMpguFE4PPVbBgRJ7dNSzoNaHI4mJn1rNIuMUXEGuBrwL3As8DUiJgt6SJJxwBI2l9SC3ACcJ2k2WXVY2ZmXaOIqHUN3aKpqSmam5trXYaZWZ8iaWZENOXaeusgtZmZ1ZgDwszMshwQZmaW5YAwM7MsB4SZmWU5IMzMLMsBYWZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZTkgzMwsywFhZmZZDggzM8tyQJiZWZYDwszMshwQZmaW5YAwM7MsB4SZmWU5IMzMLMsBYWZmWQ4IMzPLckCYmVmWA8LMzLJKDQhJEyU9J2mepMmZ9oMlzZK0RtLxFct3SsufkDRb0lfLrNPMzNY3sKwdSxoAXA18BmgBZki6OyKeqVjtZeA04Lx2m78KHBgR70jaBng6bftKWfWamdkHlRYQwARgXkTMB5B0C3AssC4gImJBaltbuWFEvFsxuxW+FGZm1uPKfOEdBSysmG9Jy6oiaYykJ9M+vuuzBzOzntVr35lHxMKI+AtgF+BUSSPbryPpDEnNkppbW1t7vkgzs81YmQGxCBhTMT86LeuSdObwNPCpTNuUiGiKiKYRI0ZsdKFmZra+MgNiBrCrpLGStgROBO6uZkNJoyUNStMNwEHAc6VVamZm6yktICJiDfA14F7gWWBqRMyWdJGkYwAk7S+pBTgBuE7S7LT5HsDvJf0ReAj4XkQ8VVatZma2PkVE5ytIRwO/jIi1na5YY01NTdHc3FzrMszM+hRJMyOiKddWzRnEJGCupEsl7d69pZmZWW+1wYCIiC8A+wAvADdKejR9emjb0qszM7OaqWoMIiLeBKYBtwA7AH8LzJL09RJrMzOzGtpgQEg6RtIdwINAHTAhIo4APg6cW255ZmZWK9XcauNzwOUR8XDlwoh4S9KXyynLzMxqrZqAuIDi5nkApO8njIyIBRHxQFmFmZlZbVUzBnEbUPkR1/fSMjMz24xVExADK++umqa3LK8kMzPrDaoJiNa2bz4DSDoWWFJeSWZm1htUMwbxVeAmSVcBorj99imlVmVmZjW3wYCIiBeAA9IvuxERK0uvyszMaq6qX5ST9F+AcUC9JAAi4qIS6zIzsxqr5oty11Lcj+nrFJeYTgB2KrkuMzOrsWoGqf8yIk4BlkXEhcCBwMfKLcvMzGqtmoBYlf59S9KOwGqK+zGZmdlmrJoxiJ9LGgJcBswCAvhhmUWZmVntdRoQkrYAHoiI5cDtkn4B1EfEGz1RnJmZ1U6nl5jSr8hdXTH/jsPBzKx/qGYM4gFJn1Pb51vNzKxfqCYgvkJxc753JL0paYWkN0uuy8zMaqyab1L7p0XNzPqhDQaEpINzy9v/gJCZmW1eqvmY6z9VTNcDE4CZwKGlVGRmZr1CNZeYjq6clzQGuKKsgszMrHeoZpC6vRZgj+4uxMzMepdqxiB+QPHtaSgCZTzFN6rNzGwzVs0YRHPF9BrgZxHx/0qqx8zMeolqAmIasCoi3gOQNEDS4Ih4q9zSzMyslqr6JjUwqGJ+EHB/OeWYmVlvUU1A1Ff+zGiaHlxeSWZm1htUExB/lrRv24yk/YC3yyvJzMx6g2rGIP4RuE3SKxQ/Obo9xU+QmpnZZqyaL8rNkLQ7sFta9FxErC63LDMzq7UNXmKSdBawdUQ8HRFPA9tIOrP80szMrJaqGYM4Pf2iHAARsQw4vbSKzMysV6gmIAZU/liQpAHAltXsXNJESc9Jmidpcqb9YEmzJK2RdHzF8vGSHpU0W9KTkjzmYWbWw6oZpL4HuFXSdWn+K8CvN7RRCpKrgc9Q3L9phqS7I+KZitVeBk4Dzmu3+VvAKRExV9KOwExJ91aeyZiZWbmqCYjzgTOAr6b5Jyk+ybQhE4B5ETEfQNItwLHAuoCIiAWpbW3lhhHxfMX0K5JeB0YAy6s4rpmZdYMNXmKKiLXA74EFFC/6hwLPVrHvUcDCivmWtKxLJE2guKT1QqbtDEnNkppbW1u7umszM+tEh2cQkj4GnJQeS4BbASLir3umNJC0A/C/gVNTUH1AREwBpgA0NTVF+3YzM9t4nV1imgM8AhwVEfMAJJ3ThX0vAsZUzI9Oy6oi6UPAL4FvRcRjXTiumZl1g84uMX0WeBWYLumHkg6j+CZ1tWYAu0oaK2lL4ETg7mo2TOvfAfwkIqZ14ZhmZtZNOgyIiLgzIk4EdgemU9xyYztJ/ynpbza044hYA3wNuJdizGJqRMyWdJGkYwAk7S+pBTgBuE7S7LT53wEHA6dJeiI9xm90L83MrMsUUf2le0kNFC/mkyLisNKq2ghNT
U3R3Ny84RXNzGwdSTMjoinX1qXfpI6IZRExpbeFg5mZdb8uBYSZmfUfDggzM8tyQJiZWZYDwszMshwQZmaW5YAwM7MsB4SZmWU5IMzMLMsBYWZmWQ4IMzPLckCYmVmWA8LMzLIcEGZmluWAMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzMyyHBBmZpblgDAzsywHhJmZZTkgzMwsywFhZmZZDggzM8tyQJiZWZYDwszMshwQZmaW5YAwM7MsB4SZmWU5IMzMLMsBYWZmWaUGhKSJkp6TNE/S5Ez7wZJmSVoj6fh2bfdIWi7pF2XWaGZmeaUFhKQBwNXAEcCewEmS9my32svAacDNmV1cBnyxrPrMzKxzZZ5BTADmRcT8iHgXuAU4tnKFiFgQEU8Ca9tvHBEPACtKrM/MzDpRZkCMAhZWzLekZd1G0hmSmiU1t7a2dueuzcz6vT49SB0RUyKiKSKaRowYUetyzMw2K2UGxCJgTMX86LTMzMz6gDIDYgawq6SxkrYETgTuLvF4ZmbWjUoLiIhYA3wNuBd4FpgaEbMlXSTpGABJ+0tqAU4ArpM0u217SY8AtwGHSWqRdHhZtZqZ2foUEbWuoVs0NTVFc3NzrcswM+tTJM2MiKZcW58epDYzs/I4IMzMLMsBYWZmWQNrXYCZWS2tXr2alpYWVq1aVetSSlVfX8/o0aOpq6urehsHhJn1ay0tLWy77bY0NjYiqdbllCIiWLp0KS0tLYwdO7bq7XyJycz6tVWrVjFs2LDNNhwAJDFs2LAunyU5IMys39ucw6HNxvTRAWFmZlkOCDOzGlq+fDnXXHNNl7c78sgjWb58efcXVMEBYWZWQx0FxJo1azrd7le/+hVDhgwpqaqCP8VkZpZc+PPZPPPKm926zz13/BDfPnpch+2TJ0/mhRdeYPz48dTV1VFfX09DQwNz5szh+eef57jjjmPhwoWsWrWKs88+mzPOOAOAxsZGmpubWblyJUcccQQHHXQQv/vd7xg1ahR33XUXgwYN2uTafQZhZlZDl1xyCTvvvDNPPPEEl112GbNmzeL73/8+zz//PADXX389M2fOpLm5mSuvvJKlS5eut4+5c+dy1llnMXv2bIYMGcLtt9/eLbX5DMLMLOnsnX5PmTBhwge+q3DllVdyxx13ALBw4ULmzp3LsGHDPrDN2LFjGT9+PAD77bcfCxYs6JZaHBBmZr3I1ltvvW76wQcf5P777+fRRx9l8ODBHHLIIdnvMmy11VbrpgcMGMDbb7/dLbX4EpOZWQ1tu+22rFixItv2xhtv0NDQwODBg5kzZw6PPfZYj9bmMwgzsxoaNmwYn/zkJ9lrr70YNGgQI0eOXNc2ceJErr32WvbYYw922203DjjggB6tzT8YZGb92rPPPssee+xR6zJ6RK6v/sEgMzPrMgeEmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCzKwP2WabbXrsWA4IMzPL8jepzcza/HoyLH6qe/e5/d5wxCUdNk+ePJkxY8Zw1llnAXDBBRcwcOBApk+fzrJly1i9ejUXX3wxxx57bPfWVQWfQZiZ1dCkSZOYOnXquvmpU6dy6qmncscddzBr1iymT5/OueeeSy3ueuEzCDOzNp280y/LPvvsw+uvv84rr7xCa2srDQ0NbL/99pxzzjk8/PDDbLHFFixatIjXXnuN7bffvkdrc0CYmdXYCSecwLRp01i8eDGTJk3ipptuorW1lZkzZ1JXV0djY2P2Nt9lc0CYmdXYpEmTOP3001myZAkPPfQQU6dOZbvttqOuro7p06fz0ksv1aQuB4SZWY2NGzeOFStWMGrUKHbYYQdOPvlkjj76aPbee2+amprYfffda1KXA8LMrBd46qn3Pz01fPhwHn300ex6K1eu7KmS/CkmMzPLc0CYmVmWA8LM+r3N5Zc1O7MxfSw1ICRNlPScpHmSJmfaD5Y0S9IaSce3aztV0tz0OLXMOs2s/6qvr2fp0qWbdUhEBEuXLqW+vr5L25U2SC1pAHA18BmgBZgh6e6IeKZitZeB04Dz2m07FPg20AQEMDNtu6yses2sfxo9ejQtLS20trbWupRS1dfXM3r06C5tU+anmCYA8yJiPoCkW4BjgXUBERELUtvadtseDtwXEX9K7fcBE4GflVivmfVDdXV1jB07ttZl9EplXmIaBSysmG9Jy7ptW0lnSGqW1Ly5p7+ZWU/r04PUETElIpoiomnEiBG1LsfMbLNSZkAsAsZUzI9Oy8re1szMuoHKGrmXNBB4HjiM4sV9BvD5iJidWfdG4BcRMS3NDwVmAvumVWYB+7WNSXRwvFagNjcs2TTDgSW1LqKHuc/9g/vcN+wUEdlLMKUFBICkI4ErgAHA9RHxHUkXAc0Rcbek/YE7gAZgFbA4Isalbf8e+Gba1Xci4obSCq0hSc0R0VTrOnqS+9w/uM99X6kBYRu2uf0HVQ33uX9wn/u+Pj1IbWZm5XFA1N6UWhdQA+5z/+A+93G+xGRmZlk+gzAzsywHhJmZZTkgeoCkoZLuS3emvU9SQwfrdXoHW0l3S3q6/Io33ab0WdJgSb+UNEfSbEmX9Gz11avijsVbSbo1tf9eUmNF2z+n5c9JOrxHC98EG9tnSZ+RNFPSU+nfQ3u8+I20Kc9zav+IpJWSzmu/ba8WEX6U/AAuBSan6cnAdzPrDAXmp38b0nRDRftngZuBp2vdn7L7DAwG/jqtsyXwCHBErfuUqX8A8ALw0VTnH4E9261zJnBtmj4RuDVN75nW3woYm/YzoNZ9KrnP+wA7pum9gEW17k/Zfa5onwbcBpxX6/505eEziJ5xLPDjNP1j4LjMOuvuYBvFbc3b7mCLpG2AbwAXl19qt9noPkfEWxExHSAi3qX4Jn3X7lPcM9bdsTjV2XbH4kqVf4dpwGGSlJbfEhHvRMSLwLy0v95uo/scEX+IiFfS8tnAIElb9UjVm2ZTnmckHQe8SNHnPsUB0TNGRsSraXoxMDKzTmd3sP1X4N+Bt0qrsPttap8BkDQEOBp4oIQaN1U1dx1et05ErAHeAIZVuW1vtCl9rvQ5YFZEvFNSnd1po/uc3tydD1zYA3V2uzJ/D6JfkXQ/sH2m6VuVMxERkqr+bLGk8cDOEXFO++uatVZWnyv2P5DiN0CujPS7Itb3SRoHfBf4m1rX0gMuAC6PiJXphKJPcUB0k4j4dEdtkl6TtENEvCppB+D1zGqLgEMq5kcDDwIHAk2SFlA8X9tJejAiDqHGSuxzmynA3Ii4YtOrLUU1dx1uW6clBd6HgaVVbtsbbUqfkTSa4v5rp0TEC+WX2y02pc+fAI6XdCkwBFgraVVEXFV61d2h1oMg/eEBXMYHB2wvzawzlOI6ZUN6vAgMbbdOI31nkHqT+kwx3nI7sEWt+9JJHwdSDKyP5f3By3Ht1jmLDw5eTk3T4/jgIPV8+sYg9ab0eUha/7O17kdP9bndOhfQxwapa15Af3hQXH99AJgL3F/xItgE/Khivb+nGKycB3wps5++FBAb3WeKd2gBPAs8kR7/
tdZ96qCfR1Lc1v4F4Ftp2UXAMWm6nuLTK/OAx4GPVmz7rbTdc/TCT2l1d5+BfwH+XPGcPgFsV+v+lP08V+yjzwWEb7VhZmZZ/hSTmZllOSDMzCzLAWFmZlkOCDMzy3JAmJlZlgPCrAskvSfpiYrHenf23IR9N/aVu/Va/+BvUpt1zdsRMb7WRZj1BJ9BmHUDSQskXZp+6+BxSbuk5Y2SfivpSUkPSPpIWj5S0h2S/pgef5l2NUDSD9PvYPxG0qCadcr6PQeEWdcManeJaVJF2xsRsTdwFXBFWvYD4McR8RfATcCVafmVwEMR8XFgX96/FfSuwNURMQ5YTnHXU7Oa8DepzbpA0sqI2CazfAFwaETMl1QHLI6IYZKWADtExOq0/NWIGC6pFRgdFbe7TnfrvS8idk3z5wN1EdGXfgfENiM+gzDrPtHBdFdU/j7Ce3ic0GrIAWHWfSZV/Ptomv4dxd09AU6m+PlUKG5k+A8AkgZI+nBPFWlWLb87MeuaQZKeqJi/JyLaPuraIOlJirOAk9KyrwM3SPonoBX4Ulp+NjBF0pcpzhT+AXgVs17EYxBm3SCNQTRFxJJa12LWXXyJyczMsnwGYWZmWT6DMDOzLAeEmZllOSDMzCzLAWFmZlkOCDMzy/r/rRxmpfrnGOAAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(history_evaluate['accuracy'])\n", - "plt.plot(history_evaluate['val_accuracy'])\n", - "plt.title('Accuracies over epochs')\n", - "plt.ylabel('Accuracy')\n", - "plt.xlabel('Epoch')\n", - "plt.legend(['train', 'val'], loc='lower right')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX4AAAEWCAYAAABhffzLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZT0lEQVR4nO3df5RV5X3v8fcHmDiARBBGQH44qAlR/IE3U2MS0+CPNsiNgZhrKdfecBMrNdekamwSEnMjaUxrtLlJuGlqyZVor4ohWq42SVt/FKVrSUwHigqKogbDoMiAoqCSon7vH2ePOY5nZg4Me29mns9rrb3mnGfvfZ7vc2atD5tn79lbEYGZmaVjQNkFmJlZsRz8ZmaJcfCbmSXGwW9mlhgHv5lZYhz8ZmaJcfCbJUjSAkk3ll2HlcPBb7mRtFHSmWXXYWZv5eA3K5CkQWXXYObgt8JJOkjSdyU9ky3flXRQtm6UpJ9K2iHpeUn/KmlAtu5LkjZL2inpMUlnZO0DJM2X9KSk7ZKWSjo0W9co6casfYekf5M0uou6jpF0b7bdOkkfy9rfJ2mLpIFV235c0kN19N8sKSSdL+nXwL900fdHJa3J+r5f0glV6zZK+rKkRyS9IOlHkhqr1l8g6Yns+7pD0uFV66ZIuitb95ykr1R1+w5Jf5d9n+sktVTtV/O7tv7BwW9luBw4BZgKnAicDHw1W3cZ0AY0AaOBrwAhaTLwWeB3ImIY8BFgY7bP54BZwIeBw4EXgL/O1s0FDgEmACOBC4FXOxckqQH4B+BO4LDsM2+SNDkiHgBeBk6v2uW/AjfX0X+HDwPHZHV37vskYDHwJ1mNfwvc0fGPYea8bN+jgHd3fF+STgf+EvgDYCzwNHBLtm4YcDfwT1ldRwP3VH3mx7JthwN3AN/P9uvuu7b+ICK8eMlloRIWZ9ZofxKYUfX+I8DG7PWfA7cDR3fa52hgK3Am0NBp3aPAGVXvxwJ7gEHAp4H7gRN6qPVDwBZgQFXbEmBB9vpKYHH2ehiVfwiOqKP/ZiCAI7vp+2+Ab3Rqewz4cNX3eGHVuhnAk9nr64Crq9YdnPXdDMwB/r2LPhcAd1e9PxZ4tafv2kv/WHzEb2U4nMqRaYenszaAa4AngDslPSVpPkBEPAFcQiWwtkq6pWpK4whgWTZNsoNKEL9O5X8M/xf4Z+CWbFrp6uzovlZNmyLijU51jcte3wyckx2FnwOsjoiOMXTXf4dN3XwfRwCXdeyffcaEqu+k8/7V39dbvsuI2AVsz+qeQOUf2a5sqXr9CtAoaVAP37X1Aw5+K8MzVMKuw8SsjYjYGRGXRcSRVKYiPt8xvxwRN0fEqdm+AXwr238TcFZEDK9aGiNic0TsiYivR8SxwAeAjwKf7KKmCR3nE6rq2pz1/QiVgD2Lt07zdNt/1Tbd3QZ3E/DNTvsPiYglVdtMqPV90em7lDSUynTR5uxzj+ym3y51811bP+Dgt7w1ZCdYO5ZBVKZQviqpSdIo4GvAjfDmSc6jJQl4kcqR8xuSJks6PTvi3k1lnr7j6Pxa4JuSjsg+o0nSzOz1aZKOz07MvkRlGqT6qL7DA1SOer8oqUHSNOBssvnyzM3AxcDvAj+pau+y/zr9ELgwO4ksSUMl/edsjr7DRZLGZyeNLwd+nLUvAT4laWr23fwF8EBEbAR+CoyVdIkqJ9SHSXpfT8X08F1bf1D2XJOX/rtQmZuOTsuVQCOwEHg2WxYCjdk+l2b7vUzlJO//zNpPAH4J7ASepxJqh2frBgCfpzIvvpPK9MZfZOvmZO0vA89lfQ3qot4pwH1U/sF5BPh4p/UTqQTgzzq1d9d/czbumn1WfcZ04N+AHdl38hNgWNX3+OWsph3ADcCQqn0vzPrs+F7GV607jsoJ3ReoTO3Mz9oXADdWbfdmnd191176x6Lsl25mByhJG4E/joi7y67F+gdP9ZiZJcbBb2aWGE/1mJklxkf8ZmaJ6RM3jBo1alQ0NzeXXYaZWZ+yatWqbRHR1Lm9TwR/c3Mzra2tZZdhZtanSHq6VruneszMEuPgNzNLjIPfzCwxDn4zs8Q4+M3MEpNb8EtaLGmrpLWd2j8naX32qLer8+rfzMxqy/OI/3oqdxx8k6TTgJnAiRExBfirHPs3M7Macgv+iFhB5Zau1T4DXBURv8m22ZpX/2ZmVlvRc/zvBj4k6QFJ90n6na42lDRPUquk1vb29gJLNDPr34oO/kHAocApwBeApdmTlt4mIhZFREtEtDQ1ve0vjs3MbB8VHfxtwN9HxS+pPM1oVME1mJklrejg/3/AaQCS3g28A9hWcA1mZknL7SZtkpYA04BRktqAK4DFwOLsEs//AOaGHwhgZlao3II/IuZ0seqP8urTzMx65r/cNTNLjIPfzCwxDn4zs8Q4+M3MEuPgNzNLjIPfzCwxDn4zs8Q4+M3MEuPgNzNLjIPfzCwxDn4zs8Q4+M3MEuPgNzNLjIPfzCwxDn4zs8Q4+M3MEpNb8EtaLGlr9rStjrYFkjZLWpMtM/Lq38zMasvziP96YHqN9u9ExNRs+XmO/ZuZWQ25BX9ErACez+vzzcxs35Qxx/9ZSQ9lU0EjutpI0jxJrZJa29vbi6zPzKxfKzr4/wY4CpgKPAt8u6sNI2JRRLREREtTU1NB5ZmZ9X+FBn9EPBcRr0fEG8APgZOL7N/MzAoOfkljq95+HFjb1bZmZpaPQXl9sKQlwDRglKQ24ApgmqSpQAAbgT/Jq38zM6stt+CPiDk1mq/Lqz8zM6uP/3LXzCwxDn4zs8Q4+M3MEuPgNzNLjIPfzCwxDn4zs8Q4+M3MEuPgNzNLjIPfzCwxDn4zs8Q4+M3MEuPgNzNLjIPfzCwxDn4zs8Q4+M3MEuPgNzNLTG7BL2mxpK2S3vZ4RUmXSQpJo/Lq38zMasvziP96YHrnRkkTgN8Hfp1j32Zm1oXcgj8iVgDP11j1HeCLVJ67a2ZmBSt0jl/STGBzRDxYx7bzJLVKam1vby+gOjOzNBQW/JKGAF8BvlbP9hGxKCJaIqKlqakp3+LMzB
JS5BH/UcAk4EFJG4HxwGpJYwqswcwseYOK6igiHgYO63ifhX9LRGwrqgYzM8v3cs4lwEpgsqQ2Sefn1ZeZmdUvtyP+iJjTw/rmvPo2M7Ou+S93zcwS4+A3M0uMg9/MLDEOfjOzxDj4zcwS4+A3M0uMg9/MLDEOfjOzxDj4zcwS4+A3M0uMg9/MLDEOfjOzxDj4zcwS4+A3M0uMg9/MLDEOfjOzxDj4zcwSk+ejFxdL2ippbVXbNyQ9JGmNpDslHZ5X/2ZmVlueR/zXA9M7tV0TESdExFTgp8DXcuzfzMxqyC34I2IF8Hyntpeq3g4FIq/+zcysttwett4VSd8EPgm8CJzWzXbzgHkAEydOLKY4M7MEFH5yNyIuj4gJwE3AZ7vZblFEtERES1NTU3EFmpn1c2Ve1XMT8IkS+zczS1KhwS/pXVVvZwLri+zfzMxynOOXtASYBoyS1AZcAcyQNBl4A3gauDCv/s3MrLbcgj8i5tRovi6v/szMrD7+y10zs8Q4+M3MEuPgNzNLjIPfzCwxDn4zs8Q4+M3MEuPgNzNLjIPfzCwxDn4zs8TUFfyShkoakL1+t6SPSWrItzQzM8tDvUf8K4BGSeOAO4H/RuUJW2Zm1sfUG/yKiFeAc4AfRMS5wJT8yjIzs7zUHfyS3g+cB/wsaxuYT0lmZpaneoP/EuDLwLKIWCfpSGB5blWZmVlu6rotc0TcB9wHkJ3k3RYRf5pnYWZmlo96r+q5WdI7JQ0F1gKPSPpCvqWZmVke6p3qOTYiXgJmAf8ITKJyZU+XJC2WtFXS2qq2ayStl/SQpGWShu9j3WZmto/qDf6G7Lr9WcAdEbEHiB72uR6Y3qntLuC4iDgBeJzKeQMzMytQvcH/t8BGYCiwQtIRwEvd7RARK4DnO7XdGRGvZW9/AYzfq2rNzKzX6gr+iFgYEeMiYkZUPA2c1su+P01l2qgmSfMktUpqbW9v72VXZmbWod6Tu4dI+l8dQSzp21SO/veJpMuB14CbutomIhZFREtEtDQ1Ne1rV2Zm1km9Uz2LgZ3AH2TLS8CP9qVDSf8d+ChwXkT0dJ7AzMz2s7qu4weOiohPVL3/uqQ1e9uZpOnAF4EPZ7eAMDOzgtV7xP+qpFM73kj6IPBqdztIWgKsBCZLapN0PvB9YBhwl6Q1kq7dx7rNzGwf1XvEfyHwd5IOyd6/AMztboeImFOj+bq9qM3MzHJQ7y0bHgROlPTO7P1Lki4BHsqxNjMzy8FePYErIl7K/oIX4PM51GNmZjnrzaMXtd+qMDOzwvQm+H0ppplZH9TtHL+kndQOeAGDc6nIzMxy1W3wR8SwogoxM7Ni9Gaqx8zM+iAHv5lZYhz8ZmaJcfCbmSXGwW9mlhgHv5lZYhz8ZmaJcfCbmSXGwW9mlhgHv5lZYnILfkmLJW2VtLaq7VxJ6yS9Iaklr77NzKxreR7xXw9M79S2FjgHWJFjv2Zm1o16H7241yJihaTmTm2PAki+lb+ZWVkO2Dl+SfMktUpqbW9vL7scM7N+44AN/ohYFBEtEdHS1NRUdjlmZv3GARv8ZmaWDwe/mVli8ryccwmwEpgsqU3S+ZI+LqkNeD/wM0n/nFf/ZmZWW55X9czpYtWyvPo0M7OeearHzCwxDn4zs8Q4+M3MEuPgNzNLjIPfzCwxDn4zs8Q4+M3MEuPgNzNLjIPfzCwxDn4zs8Q4+M3MEuPgNzNLjIPfzCwxDn4zs8Q4+M3MEuPgNzNLTJ5P4FosaauktVVth0q6S9KG7OeIvPo3M7Pa8jzivx6Y3qltPnBPRLwLuCd7b2ZmBcot+CNiBfB8p+aZwA3Z6xuAWXn1b2ZmtRU9xz86Ip7NXm8BRne1oaR5kloltba3txdTnZlZAko7uRsRAUQ36xdFREtEtDQ1NRVYmZlZ/1Z08D8naSxA9nNrwf2bmSWv6OC/A5ibvZ4L3F5w/2Zmycvzcs4lwEpgsqQ2SecDVwG/J2kDcGb23szMCjQorw+OiDldrDojrz7NzKxn/stdM7PEOPjNzBLj4DczS4yD38wsMQ5+M7PEOPjNzBLj4DczS4yD38wsMQ5+M7PEOPjNzBLj4DczS4yD38wsMQ5+M7PE5HZ3TjOzMu3Zs4e2tjZ2795ddim5a2xsZPz48TQ0NNS1vYPfzPqltrY2hg0bRnNzM5LKLic3EcH27dtpa2tj0qRJde3jqR4z65d2797NyJEj+3XoA0hi5MiRe/U/m1KCX9LFktZKWifpkjJqMLP+r7+Hfoe9HWfhwS/pOOAC4GTgROCjko4uug4zs1SVccR/DPBARLwSEa8B9wHnlFCHmVmuduzYwQ9+8IO93m/GjBns2LFj/xeUKSP41wIfkjRS0hBgBjChhDrMzHLVVfC/9tpr3e7385//nOHDh+dUVQlX9UTEo5K+BdwJvAysAV7vvJ2kecA8gIkTJxZZopn1M1//h3U88sxL+/Uzjz38nVxx9pRut5k/fz5PPvkkU6dOpaGhgcbGRkaMGMH69et5/PHHmTVrFps2bWL37t1cfPHFzJs3D4Dm5mZaW1vZtWsXZ511Fqeeeir3338/48aN4/bbb2fw4MG9qr2Uk7sRcV1EvDcifhd4AXi8xjaLIqIlIlqampqKL9LMrJeuuuoqjjrqKNasWcM111zD6tWr+d73vsfjj1cib/HixaxatYrW1lYWLlzI9u3b3/YZGzZs4KKLLmLdunUMHz6c2267rdd1lXIdv6TDImKrpIlU5vdPKaMOM0tDT0fmRTn55JPfcq39woULWbZsGQCbNm1iw4YNjBw58i37TJo0ialTpwLw3ve+l40bN/a6jrL+gOs2SSOBPcBFEbGjpDrMzAozdOjQN1/fe++93H333axcuZIhQ4Ywbdq0mtfiH3TQQW++HjhwIK+++mqv6ygl+CPiQ2X0a2ZWpGHDhrFz586a61588UVGjBjBkCFDWL9+Pb/4xS8Kq8u3bDAzy8nIkSP54Ac/yHHHHcfgwYMZPXr0m+umT5/OtddeyzHHHMPkyZM55ZTiZrwVEYV1tq9aWlqitbW17DLMrA959NFHOeaYY8ouozC1xitpVUS0dN7W9+oxM0uMg9/MLDEOfjOzxDj4zcwS4+A3M0uMg9/MLDEOfjOzA8TBBx9cSD8OfjOzxPgvd82s//vH+bDl4f37mWOOh7Ou6naT+fPnM2HCBC666CIAFixYwKBBg1i+fDkvvPACe/bs4corr2TmzJn7t7Ye+IjfzCwns2fPZunSpW++X7p0KXPnzmXZsmWsXr2a5cuXc9lll1H0HRR8xG9m/V8PR+Z5Oemkk9i6dSvPPPMM7e3tjBgxgjFjxnDppZeyYsUKBgwYwObNm3nuuecYM2ZMYXU5+M3McnTuuedy6623smXLFmbPns1NN91Ee3s7q1atoqGhgebm5pq3Y86Tg9/MLEezZ8/mggsuYNu2bdx3330sXbqUww47jIaGBpYvX87TTz9deE0OfjOzHE2ZMoWdO3cybtw4xo4dy3nnncfZZ5/N8ccfT0tLC+95z3sKr8nBb2aWs4cf/u0VRaNGjWLly
pU1t9u1a1ch9ZRyVY+kSyWtk7RW0hJJjWXUYWaWosKDX9I44E+Blog4DhgI/GHRdZiZpaqs6/gHAYMlDQKGAM+UVIeZ9WN94QmD+8PejrPw4I+IzcBfAb8GngVejIg7O28naZ6kVkmt7e3tRZdpZn1cY2Mj27dv7/fhHxFs376dxsb6Z8wLP7kraQQwE5gE7AB+IumPIuLG6u0iYhGwCCrP3C26TjPr28aPH09bWxspHDg2NjYyfvz4urcv46qeM4FfRUQ7gKS/Bz4A3NjtXmZme6GhoYFJkyaVXcYBqYw5/l8Dp0gaIknAGcCjJdRhZpakMub4HwBuBVYDD2c1LCq6DjOzVJXyB1wRcQVwRRl9m5mlTn3hjLekdqD4G1r03ihgW9lFFCi18YLHnIq+OuYjIqKpc2OfCP6+SlJrRLSUXUdRUhsveMyp6G9j9oNYzMwS4+A3M0uMgz9fqV2tlNp4wWNORb8as+f4zcwS4yN+M7PEOPjNzBLj4O8lSYdKukvShuzniC62m5tts0HS3Brr75C0Nv+Ke6c3481u0/EzSeuzB/FcVWz1e0fSdEmPSXpC0vwa6w+S9ONs/QOSmqvWfTlrf0zSRwotvBf2dcySfk/SKkkPZz9PL7z4fdSb33O2fqKkXZL+rLCieysivPRiAa4G5mev5wPfqrHNocBT2c8R2esRVevPAW4G1pY9njzHS+XZC6dl27wD+FfgrLLH1MU4BwJPAkdmtT4IHNtpm/8BXJu9/kPgx9nrY7PtD6JyF9ongYFljynnMZ8EHJ69Pg7YXPZ48h5z1fpbgZ8Af1b2eOpdfMTfezOBG7LXNwCzamzzEeCuiHg+Il4A7gKmA0g6GPg8cGX+pe4X+zzeiHglIpYDRMR/ULlfU/33ki3WycATEfFUVustVMZerfq7uBU4I7vx4Ezgloj4TUT8Cngi+7wD3T6POSL+PSI6Hqi0jsqDlg4qpOre6c3vGUmzgF9RGXOf4eDvvdER8Wz2egswusY244BNVe/bsjaAbwDfBl7JrcL9q7fjBUDScOBs4J4catwfehxD9TYR8RrwIjCyzn0PRL0Zc7VPAKsj4jc51bk/7fOYs4O2LwFfL6DO/aqUm7T1NZLuBsbUWHV59ZuICEl1Xx8raSpwVERc2nnesEx5jbfq8wcBS4CFEfHUvlVpByJJU4BvAb9fdi0FWAB8JyJ2Zf8B6DMc/HWIiDO7WifpOUljI+JZSWOBrTU22wxMq3o/HrgXeD/QImkjld/FYZLujYhplCjH8XZYBGyIiO/2vtrcbAYmVL0fn7XV2qYt+8fsEGB7nfseiHozZiSNB5YBn4yIJ/Mvd7/ozZjfB/wXSVcDw4E3JO2OiO/nXnVvlX2Soa8vwDW89WTn1TW2OZTKPOCIbPkVcGinbZrpGyd3ezVeKucybgMGlD2WHsY5iMpJ6Un89qTflE7bXMRbT/otzV5P4a0nd5+ib5zc7c2Yh2fbn1P2OIoac6dtFtCHTu6WXkBfX6jMb94DbADurgq4FuD/VG33aSon+Z4APlXjc/pK8O/zeKkcTQWVJ66tyZY/LntM3Yx1BvA4las+Ls/a/hz4WPa6kcrVHE8AvwSOrNr38my/xzhAr1zan2MGvgq8XPV7XQMcVvZ48v49V31Gnwp+37LBzCwxvqrHzCwxDn4zs8Q4+M3MEuPgNzNLjIPfzCwxDn4zQNLrktZULW+7S2MvPru5L9x51dLhv9w1q3g1IqaWXYRZEXzEb9YNSRslXZ3dZ/6Xko7O2psl/YukhyTdI2li1j5a0jJJD2bLB7KPGijph9lzCO6UNLi0QVnyHPxmFYM7TfXMrlr3YkQcD3wf+G7W9r+BGyLiBOAmYGHWvhC4LyJOBP4Tv71d77uAv46IKcAOKnewNCuF/3LXDJC0KyIOrtG+ETg9Ip6S1ABsiYiRkrYBYyNiT9b+bESMktQOjI+qWxJnd169KyLelb3/EtAQEX3lGQzWz/iI36xn0cXrvVF9b/rX8fk1K5GD36xns6t+rsxe30/lTo0A51F5jCRUbmD3GQBJAyUdUlSRZvXyUYdZxWBJa6re/1NEdFzSOULSQ1SO2udkbZ8DfiTpC0A78Kms/WJgkaTzqRzZfwZ4FrMDiOf4zbqRzfG3RMS2smsx21881WNmlhgf8ZuZJcZH/GZmiXHwm5klxsFvZpYYB7+ZWWIc/GZmifn/YFtc8n1zVREAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(history_evaluate['loss'])\n", - "plt.plot(history_evaluate['val_loss'])\n", - "plt.title('Losses over epochs')\n", - "plt.ylabel('Loss')\n", - "plt.xlabel('Epoch')\n", - "plt.legend(['train', 'val'], loc='lower right')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Confusion matrix" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(
,\n", - " )" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAGJCAYAAAB2Nm/HAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAA/F0lEQVR4nO3dfXwV9Z33/9fHHO4sN6HIXRJXCFFSQlKEABULrq13Vwl0t9wuqFBstbv02q274i7dXeDSrbTlakutXD8W/F2VS1RYpF4RpEEWFxWtDcKKAVG5iwtJBaQJESEh4Of640wwgQQGPDGD5/18POZBZuY7831PDjmfM9+Zc465OyIiImFc1tIBRETk0qGiISIioaloiIhIaCoaIiISmoqGiIiEFmvpAC3NYu3cWndo6RjndO2X/qSlI4hIEnnvvVI++OADa2ydikbrDrTpO76lY5zTK79/pKUjiEgSuX5ofpPrNDwlIiKhqWiIiEhoKhoiIhKaioaIiISmoiEiIqGpaIiISGgqGiIiEpqKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoKhoiIhKaisYFWDh7Mu+tn8vrK37YZJuf3T+WbYWzKV4+kwHZGaeXTx41lJLCWZQUzmLyqKHNlvH5tUXk5fQlJzuLeT/98Vnra2pquH3SBHKysxg+bCjvlZaeXjfvJ3PJyc4iL6cv655fm7QZo55PGZMnYyTzuXuzTEAPYBmwG9gMrAGuAXoB25qpzzbAcmAX8Hug1/m2sXZdve2A6aGmr0/7uX9l4lzftrOs0fXf/P4CL9q4zdsOmO4j7pjnxW/u9bYDpnvPETN8z75D3nPEDO8x/D7fs++Q9xh+X+h+j9d6qOlo9UnvnZnpb72z2498VOO5uXm+Zev2Bm3mP7zAv/Pde/x4rfuSpU/5mHHj/Xit+5at2z03N88rj1b7jnf3eO/MTD9afTJ035+XjFHPp4zJk7El8w0cOMibes5sljMNMzPgGWCDu/dx90HATKB7c/RXz11AhbtnAb8AfpLInb+yZTd/PHKsyfUFN+Tx5OpiAIpLSunUoR09rujIzcO+xPrX3qai6hiVHx5n/Wtvc8v1/RIZDYBNxcX06ZNF78xMWrduzbgJE1m9qrBBm9WrCpl8xxQAvjVmLBteWI+7s3pVIeMmTKRNmzb06t2bPn2y2FRcnHQZo55PGZMnY1TzNdfw1I1ArbsvrFvg7lvd/eX6jcysl5m9bGZbgmlYsLynmb1kZm+Y2TYzG25mKWb2WDBfYmb3NtLvN4Elwc9PA18PCthnIq1bKvvfrzg9X3agkrRuqaR1TWX/gXrLD1aS1jU14f2Xl5eRkXHl6fn09AzKysrObnNlvE0sFqNjp04cPnyYsrKzty0vb7htMmSMej5lTJ6MUc3XXN8R3p/4kNT5HARudvdqM7saeArIByYBa939R2aWAlwODADS3b0/gJmlNrK/dGAfgLufNLMjQBfgg/qNzOxu4G4AWrW/0GMTEUlaLX0hvBWw2MxKgBVA3ZjNJuDbZjYHyHX3D4E9QKaZ/crMbgOqLrZTd1/k7vnunm+xdp/uCOopP1hJRo/Op+fTu6dSfrCS8kOVZHSvt7xbKuWHKhPWb520tHT27993er6sbD/p6elnt9kXb3Py5EmqjhyhS5cupKefvW1aWsNtkyFj1PMpY/JkjGq+5ioa24FBIdrdCxwAvkz8DKM1gLu/BIwAyoDHzOxOd68I2m0Avgc82sj+yoArAcwsBnQCDn+aA7kQz71YwqSCIQAMye1F1dHjvP9BFete3cFN12WT2qEdqR3acdN12ax7dUfC+88fPJhdu3ZSuncvJ06cYMXyZYwsGN2gzciC0TzxeHwE7zcrn+aGG7+GmTGyYDQrli+jpqaG0r172bVrJ4OHDEm6jFHPp4zJkzGq+ZpreOoF4CEzu9vdFwGYWR7xJ/F99dp1Ava7+8dmNgVICdpeFSxfbGZtgIFmtgY44e4rzewdYGkj/T4LTAF+B4wFXvDgtqpEWDJ3KsMHXc0Vqe3ZVfQgDy5cQ6tYCgCPPr2Roo3bufWrOWx/djbHqmu5Z048YkXVMeYuLmLj0vsBeGhRERVVTV9Qv1ixWIxf/PIRRo28lVOnTjFl6jT65eTwwJxZDByUT8Go0UyddhfTpt5BTnYWnTt/kcefWAZAv5wcxowbz7V5/YjFYsx/eAEpKSlJlzHq+ZQxeTJGNZ8l8Dm14Y7N0oD5xM84qoFS4AdALbDa3fsH1zFWAg4UAdPdvX1QQGYEbY8CdwIdgV/zydnRTHf/7Rl9tgUeB64F/ghMdPc958p52eXdvE3f8Z/2cJtVxaZHWjqCiCSR64fms3nz643eRNRsReNSoaIhItLQuYpGS18IFxGRS4iKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoKhoiIhKaioaIiISmoiEiIqGpaIiISGgqGiIiEpqKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoKhoiIhKaioaIiISmoiEiIqGpaIiISGgqGiIiEpqKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqJxARbOnsx76+fy+oofNtnmZ/ePZVvhbIqXz2RAdsbp5ZNHDaWkcBYlhbOYPGpos2V8fm0ReTl9ycnOYt5Pf3zW+pqaGm6fNIGc7CyGDxvKe6Wlp9fN+8lccrKzyMvpy7rn1yZtxqjnU8bkyRjJfO7eLBPQA1gG7AY2A2uAa4BewLZm6nMEsAU4CYwNs4216+ptB0wPNX192s/9KxPn+radZY2u/+b3F3jRxm3edsB0H3HHPC9+c6+3HTDde46Y4Xv2HfKeI2Z4j+H3+Z59h7zH8PtC93u81kNNR6tPeu/MTH/rnd1+5KMaz83N8y1btzdoM//hBf6d797jx2vdlyx9yseMG+/Ha923bN3uubl5Xnm02ne8u8d7Z2b60eqTofv+vGSMej5lTJ6MLZlv4MBB3tRzZrOcaZiZAc8AG9y9j7sPAmYC3Zujv3r+C5gKPNkcO39ly27+eORYk+sLbsjjydXFABSXlNKpQzt6XNGRm4d9ifWvvU1F1TEqPzzO+tfe5pbr+yU836biYvr0yaJ3ZiatW7dm3ISJrF5V2KDN6lWFTL5jCgDfGjOWDS+sx91ZvaqQcRMm0qZNG3r17k2fPllsKi5OuoxRz6eMyZMxqvmaa3jqRqDW3RfWLXD3re7+cv1GZtbLzF42sy3BNCxY3tPMXjKzN8xsm5kNN7MUM3ssmC8xs3vP7NTdS939TeDjZjquc0rrlsr+9ytOz5cdqCStWyppXVPZf6De8oOVpHVNTXj/5eVlZGRceXo+PT2DsrKys9tcGW8Ti8X
o2KkThw8fpqzs7G3LyxtumwwZo55PGZMnY1TzxRKyl7P1Jz4kdT4HgZvdvdrMrgaeAvKBScBad/+RmaUAlwMDgHR37w9gZqkXG87M7gbuBqBV+4vdjYhI0mnpC+GtgMVmVgKsAOrGbDYB3zazOUCuu38I7AEyzexXZnYbUHWxnbr7InfPd/d8i7X7dEdQT/nBSjJ6dD49n949lfKDlZQfqiSje73l3VIpP1SZsH7rpKWls3//vtPzZWX7SU9PP7vNvnibkydPUnXkCF26dCE9/ext09IabpsMGaOeTxmTJ2NU8zVX0dgODArR7l7gAPBl4mcYrQHc/SXiF7XLgMfM7E53rwjabQC+Bzya+NifznMvljCpYAgAQ3J7UXX0OO9/UMW6V3dw03XZpHZoR2qHdtx0XTbrXt2R8P7zBw9m166dlO7dy4kTJ1ixfBkjC0Y3aDOyYDRPPL4EgN+sfJobbvwaZsbIgtGsWL6MmpoaSvfuZdeunQweMiTpMkY9nzImT8ao5muu4akXgIfM7G53XwRgZnlAJ2BfvXadgP3u/rGZTQFSgrZXBcsXm1kbYKCZrQFOuPtKM3sHWNpM2Zu0ZO5Uhg+6mitS27Or6EEeXLiGVrEUAB59eiNFG7dz61dz2P7sbI5V13LPnHjEiqpjzF1cxMal9wPw0KIiKqqavqB+sWKxGL/45SOMGnkrp06dYsrUafTLyeGBObMYOCifglGjmTrtLqZNvYOc7Cw6d/4ijz+xDIB+OTmMGTeea/P6EYvFmP/wAlJSUpIuY9TzKWPyZIxqPgtuVU04M0sD5hM/46gGSoEfALXAanfvH1zHWAk4UARMd/f2QQGZEbQ9CtwJdAR+zSdnRzPd/bdn9DmY+F1bnYM+33f3nHPlvOzybt6m7/hPe7jNqmLTIy0dQUSSyPVD89m8+XVrbF2zFY1LhYqGiEhD5yoaLX0hXERELiEqGiIiEpqKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoKhoiIhKaioaIiISmoiEiIqGpaIiISGgqGiIiEpqKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoKhoiIhKaioaIiISmoiEiIqGpaIiISGgqGiIiEpqKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqJxARbOnsx76+fy+oofNtnmZ/ePZVvhbIqXz2RAdsbp5ZNHDaWkcBYlhbOYPGpos2V8fm0ReTl9ycnOYt5Pf3zW+pqaGm6fNIGc7CyGDxvKe6Wlp9fN+8lccrKzyMvpy7rn1yZtxqjnU8bkyRjJfO7eLBPQA1gG7AY2A2uAa4BewLZm6vNvgbeAN4H1wFXn28badfW2A6aHmr4+7ef+lYlzfdvOskbXf/P7C7xo4zZvO2C6j7hjnhe/udfbDpjuPUfM8D37DnnPETO8x/D7fM++Q95j+H2h+z1e66Gmo9UnvXdmpr/1zm4/8lGN5+bm+Zat2xu0mf/wAv/Od+/x47XuS5Y+5WPGjffjte5btm733Nw8rzxa7Tve3eO9MzP9aPXJ0H1/XjJGPZ8yJk/Glsw3cOAgb+o5s1nONMzMgGeADe7ex90HATOB7s3RXz3/CeS7ex7wNPDTRO78lS27+eORY02uL7ghjydXFwNQXFJKpw7t6HFFR24e9iXWv/Y2FVXHqPzwOOtfe5tbru+XyGgAbCoupk+fLHpnZtK6dWvGTZjI6lWFDdqsXlXI5DumAPCtMWPZ8MJ63J3VqwoZN2Eibdq0oVfv3vTpk8Wm4uKkyxj1fMqYPBmjmq+5hqduBGrdfWHdAnff6u4v129kZr3M7GUz2xJMw4LlPc3sJTN7w8y2mdlwM0sxs8eC+RIzu/fMTt39P9y97ln9NSDjzDbNKa1bKvvfrzg9X3agkrRuqaR1TWX/gXrLD1aS1jU14f2Xl5eRkXHl6fn09AzKysrObnNlvE0sFqNjp04cPnyYsrKzty0vb7htMmSMej5lTJ6MUc0XS8heztaf+JDU+RwEbnb3ajO7GngKyAcmAWvd/UdmlgJcDgwA0t29P4CZpZ5n33cBv21shZndDdwNQKv2IWKKiAi0/IXwVsBiMysBVgB1YzabgG+b2Rwg190/BPYAmWb2KzO7Dahqaqdmdjvx4jOvsfXuvsjd890932LtEnYw5QcryejR+fR8evdUyg9WUn6okozu9ZZ3S6X8UGXC+q2TlpbO/v37Ts+Xle0nPT397Db74m1OnjxJ1ZEjdOnShfT0s7dNS2u4bTJkjHo+ZUyejFHN11xFYzswKES7e4EDwJeJP8m3BnD3l4ARQBnwmJnd6e4VQbsNwPeARxvboZndBPwjMNrdaz7dYVyY514sYVLBEACG5Pai6uhx3v+ginWv7uCm67JJ7dCO1A7tuOm6bNa9uiPh/ecPHsyuXTsp3buXEydOsGL5MkYWjG7QZmTBaJ54fAkAv1n5NDfc+DXMjJEFo1mxfBk1NTWU7t3Lrl07GTxkSNJljHo+ZUyejFHN11zDUy8AD5nZ3e6+CMDM8oBOwL567ToB+939YzObAqQEba8Kli82szbAQDNbA5xw95Vm9g6w9MxOzexa4F+B29z9YKIPasncqQwfdDVXpLZnV9GDPLhwDa1iKQA8+vRGijZu59av5rD92dkcq67lnjnxiBVVx5i7uIiNS+8H4KFFRVRUNX1B/WLFYjF+8ctHGDXyVk6dOsWUqdPol5PDA3NmMXBQPgWjRjN12l1Mm3oHOdlZdO78RR5/YhkA/XJyGDNuPNfm9SMWizH/4QWkpKQkXcao51PG5MkY1XwW3KqacGaWBswnfsZRDZQCPwBqgdXu3j+4jrEScKAImO7u7YMCMiNoexS4E+gI/JpPzo5munuDaxZm9u9ALvCHYNF/uXvD0nyGyy7v5m36jv9Ux9rcKjY90tIRRCSJXD80n82bX7fG1jVb0bhUqGiIiDR0rqLR0hfCRUTkEqKiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoKhoiIhKaioaIiISmoiEiIqGpaIiISGgqGiIiEpqKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoKhoiIhKaioaIiISmoiEiIqGpaIiISGgqGiIiEpqKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoKhoXYOHsyby3fi6vr/hhk21+dv9YthXOpnj5TAZkZ5xePnnUUEoKZ1FSOIvJo4Y2W8bn1xaRl9OXnOws5v30x2etr6mp4fZJE8jJzmL4sKG8V1p6et28n8wlJzuLvJy+rHt+bdJmjHo+ZUyejJHM5+7NMgE9gGXAbmAzsAa4BugFbGumPr8HlABvABuBfufbxtp19bYDpoeavj7t5/6ViXN9286yRtd/8/sLvGjjNm87YLqPuGOeF7+519sOmO49R8zwPfsOec8RM7zH8Pt8z75D3mP4faH7PV7roaaj1Se9d2amv/XObj/yUY3n5u
b5lq3bG7SZ//AC/8537/Hjte5Llj7lY8aN9+O17lu2bvfc3DyvPFrtO97d470zM/1o9cnQfX9eMkY9nzImT8aWzDdw4CBv6jmzWc40zMyAZ4AN7t7H3QcBM4HuzdFfPU+6e667DwB+Cvw8kTt/Zctu/njkWJPrC27I48nVxQAUl5TSqUM7elzRkZuHfYn1r71NRdUxKj88zvrX3uaW6/slMhoAm4qL6dMni96ZmbRu3ZpxEyayelVhgzarVxUy+Y4pAHxrzFg2vLAed2f1qkLGTZhImzZt6NW7N336ZLGpuDjpMkY9nzImT8ao5muyaJjZh2ZWFUwf1pv/0MyqzrPfG4Fad19Yt8Ddt7r7y2f00cvMXjazLcE0LFje08xeMrM3zGybmQ03sxQzeyyYLzGze8/s1N3r5/oC4GF+CYmS1i2V/e9XnJ4vO1BJWrdU0rqmsv9AveUHK0nrmprw/svLy8jIuPL0fHp6BmVlZWe3uTLeJhaL0bFTJw4fPkxZ2dnblpc33DYZMkY9nzImT8ao5os1tcLdO3yK/fYnPiR1PgeBm9292syuBp4C8oFJwFp3/5GZpQCXAwOAdHfvD2BmqY3t0MymA38LtAa+1kSbu4G7AWjVPvRBiYgku1DDU2b2VTP7dvDzFWbWO0H9twIWm1kJsAKoG7PZBHzbzOYAue7+IbAHyDSzX5nZbUCjZzvuvsDd+wB/D/xTE20WuXu+u+dbrF2CDgXKD1aS0aPz6fn07qmUH6yk/FAlGd3rLe+WSvmhyoT1WyctLZ39+/edni8r2096evrZbfbF25w8eZKqI0fo0qUL6elnb5uW1nDbZMgY9XzKmDwZo5rvvEXDzGYTfwKeGSxqDSw9z2bbgUEh+r8XOAB8mfgZRmsAd38JGAGUAY+Z2Z3uXhG020D8gvej59n3MuDPQmRImOdeLGFSwRAAhuT2ourocd7/oIp1r+7gpuuySe3QjtQO7bjpumzWvboj4f3nDx7Mrl07Kd27lxMnTrBi+TJGFoxu0GZkwWieeHwJAL9Z+TQ33Pg1zIyRBaNZsXwZNTU1lO7dy65dOxk8ZEjSZYx6PmVMnoxRzdfk8FQ9fw5cC2wBcPdyMzvf0NULwENmdre7LwIwszygE7CvXrtOwH53/9jMpgApQdurguWLzawNMNDM1gAn3H2lmb1DI4XLzK52953B7Ehg55ltPo0lc6cyfNDVXJHanl1FD/LgwjW0iqUA8OjTGynauJ1bv5rD9mdnc6y6lnvmxCNWVB1j7uIiNi69H4CHFhVRUdX0BfWLFYvF+MUvH2HUyFs5deoUU6ZOo19ODg/MmcXAQfkUjBrN1Gl3MW3qHeRkZ9G58xd5/IllAPTLyWHMuPFcm9ePWCzG/IcXkJKSknQZo55PGZMnY1TzWXCratMNzIrdfYiZbXH3gWb2BeB37p53nu3SgPnEzziqgVLgB0AtsNrd+wfXMVYSv2BdBEx39/ZBAZkRtD0K3Al0BH7NJ2dHM939t2f0+UvgpmC7CuD77r79XDkvu7ybt+k7/py/g5ZWsemRlo4gIknk+qH5bN78ujW2LsyZxr+Z2b8CqWb2XWAasPh8G7l7OdDUs3H/oM1OoH7x+ftg+RJgSSPbDTxPn39zvlwiInLxzls03P1/mtnNxC88XwPMcvd1zZ5MREQiJ8yZBsTfZd2O+DBSSfPFERGRKAtz99R3gGLgW8BY4DUzm9bcwUREJHrCnGnMAK5198MAZtYFeBX4380ZTEREoifMm/sOAx/Wm/8wWCYiIkmmyTMNM/vb4MddwO/NrJD4NY1vAm9+BtlERCRizjU8VfcGvt3BVKewkbYiIpIEzvWBhf/jswwiIiLRd94L4WbWFbgfyAHa1i1390Y/QVZERD6/wlwIfwJ4G+gN/A/iHweyqRkziYhIRIUpGl3c/f8n/qVKL7r7NJr4ngoREfl8C/M+jdrg3z+Y2UigHPhi80USEZGoClM0/sXMOgF/B/yK+KfNnvVVqyIi8vkX5gMLVwc/HiH+3d8iIpKkzvXmvl8RfzNfo9z9r5slkYiIRNa5zjRe/8xSiIjIJeFcb+5r7EuQREQkiYW55VZERARQ0RARkQugoiEiIqGF+ea+a8xsvZltC+bzzOyfmj+aiIhETZgzjcXATIJ3hrv7m8DE5gwlIiLRFKZoXO7uxWcsO9kcYUREJNrCFI0PzKwPwRv9zGws8IdmTSUiIpEU5rOnpgOLgGwzKwP2Arc3ayoREYmkMJ89tQe4ycy+AFzm7h82fywREYmiMN/cN+uMeQDc/YFmyiQiIhEVZnjqo3o/twUKgB3NE0dERKIszPDUz+rPm9n/BNY2WyIREYmsi3lH+OVARqKDiIhI9IW5plHCJ9+rkQJ0BXQ9Q0QkCYU50ygARgXTLUCauz/SrKkiauHsyby3fi6vr/hhk21+dv9YthXOpnj5TAZkf3JCNnnUUEoKZ1FSOIvJo4Y2W8bn1xaRl9OXnOws5v30x2etr6mp4fZJE8jJzmL4sKG8V1p6et28n8wlJzuLvJy+rHu++UYgo54x6vmUMXkyRjKfuzc5ET+zePtcbc6xbQ9gGbAb2AysAa4BegHbLmafF9D3GOJnR/nna2vtunrbAdNDTV+f9nP/ysS5vm1nWaPrv/n9BV60cZu3HTDdR9wxz4vf3OttB0z3niNm+J59h7zniBneY/h9vmffIe8x/L7Q/R6v9VDT0eqT3jsz0996Z7cf+ajGc3PzfMvW7Q3azH94gX/nu/f48Vr3JUuf8jHjxvvxWvctW7d7bm6eVx6t9h3v7vHemZl+tPpk6L4/Lxmjnk8ZkydjS+YbOHCQN/Wcec4zDXc/BbxjZn9yIYXI4vflPgNscPc+7j6I+OdXdb+Q/VwMM+sA/A3w+0Tv+5Utu/njkWNNri+4IY8nV8c/caW4pJROHdrR44qO3DzsS6x/7W0qqo5R+eFx1r/2Nrdc3y/R8dhUXEyfPln0zsykdevWjJswkdWrChu0Wb2qkMl3TAHgW2PGsuGF9bg7q1cVMm7CRNq0aUOv3r3p0yeLTcVnfnrM5z9j1PMpY/JkjGq+MMNTnYHtwSfdPls3nWebG4Fad19Yt8Ddt7r7y/UbmVkvM3vZzLYE07BgeU8ze8nM3jCzbWY23MxSzOyxYL7EzO5tou8HgZ8A1SGOLaHSuqWy//2K0/NlBypJ65ZKWtdU9h+ot/xgJWldUxPef3l5GRkZV56eT0/PoKys7Ow2V8bbxGIxOnbqxOHDhykrO3vb8vKG2yZDxqjnU8bkyRjVfGHep/HPF7Hf/sSHpM7nIHCzu1eb2dXAU0A+MAlY6+4/MrMU4ndsDQDS3b0/gJmlnrkzMxsIXOnuz5nZjKY6NbO7gbsBaNX+Ag5LRCS5hTnT+Ia7v1h/Ar6RoP5bAYuDO7RWAHVjNpuAb5vZHCA3+OiSPUCmmf3KzG4DqurvyMwuA34O/N35OnX3Re6e7+75FmuXoEOB8oOVZPTofHo+vXsq5QcrKT9USUb3esu7pVJ+qDJh/dZJS0tn//59p+fLyvaTnp5+dpt98
TYnT56k6sgRunTpQnr62dumpTXcNhkyRj2fMiZPxqjmC1M0bm5k2X87zzbbgUEh9n0vcAD4MvEzjNYA7v4SMAIoAx4zszvdvSJotwH4HvDoGfvqQPwMZ4OZlQJfAZ41s/wQORLiuRdLmFQwBIAhub2oOnqc9z+oYt2rO7jpumxSO7QjtUM7broum3WvJv5N9fmDB7Nr105K9+7lxIkTrFi+jJEFoxu0GVkwmiceXwLAb1Y+zQ03fg0zY2TBaFYsX0ZNTQ2le/eya9dOBg8ZknQZo55PGZMnY1TzNTk8ZWZ/CfwV8Vf3b9Zb1QF45Tz7fQF4yMzudvdFwf7ygE7AvnrtOgH73f1jM5tC/G4tzOyqYPliM2sDDDSzNcAJd19pZu8AS+t36O5HgCvq5d8A3Ofur58na2hL5k5l+KCruSK1PbuKHuTBhWtoFUsB4NGnN1K0cTu3fjWH7c/O5lh1LffMiUesqDrG3MVFbFx6PwAPLSqioqrpC+oXKxaL8YtfPsKokbdy6tQppkydRr+cHB6YM4uBg/IpGDWaqdPuYtrUO8jJzqJz5y/y+BPLAOiXk8OYceO5Nq8fsViM+Q8vICUlJekyRj2fMiZPxqjms+AW1bNXmHUifhF8LvAP9VZ96O5/PO+OzdKA+cTPOKqBUuAHxL8BcLW79w+uY6wkfntsETDd3dsHBWRG0PYocCfQEfg1n5wdzXT3356j/w2EKBqXXd7N2/Qdf77DaVEVm5LybTEi0kKuH5rP5s2vW2PrmiwayUJFQ0SkoXMVjYv57CkREUlSKhoiIhKaioaIiISmoiEiIqGpaIiISGgqGiIiEpqKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoKhoiIhKaioaIiISmoiEiIqGpaIiISGgqGiIiEpqKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoKhoiIhKaioaIiISmoiEiIqGpaIiISGgqGiIiEpqKxgVYOHsy762fy+srfthkm5/dP5ZthbMpXj6TAdkZp5dPHjWUksJZlBTOYvKooc2W8fm1ReTl9CUnO4t5P/3xWetramq4fdIEcrKzGD5sKO+Vlp5eN+8nc8nJziIvpy/rnl+btBmjnk8ZkydjJPO5e7NMQA9gGbAb2AysAa4BegHbmqnPqcAh4I1g+s75trF2Xb3tgOmhpq9P+7l/ZeJc37azrNH13/z+Ai/auM3bDpjuI+6Y58Vv7vW2A6Z7zxEzfM++Q95zxAzvMfw+37PvkPcYfl/ofo/XeqjpaPVJ752Z6W+9s9uPfFTjubl5vmXr9gZt5j+8wL/z3Xv8eK37kqVP+Zhx4/14rfuWrds9NzfPK49W+45393jvzEw/Wn0ydN+fl4xRz6eMyZOxJfMNHDjIm3rObJYzDTMz4Blgg7v3cfdBwEyge3P0d4bl7j4gmB5N5I5f2bKbPx451uT6ghvyeHJ1MQDFJaV06tCOHld05OZhX2L9a29TUXWMyg+Ps/61t7nl+n6JjAbApuJi+vTJondmJq1bt2bchImsXlXYoM3qVYVMvmMKAN8aM5YNL6zH3Vm9qpBxEybSpk0bevXuTZ8+WWwqLk66jFHPp4zJkzGq+ZpreOpGoNbdF9YtcPet7v5y/UZm1svMXjazLcE0LFje08xeMrM3zGybmQ03sxQzeyyYLzGze5sp+0VL65bK/vcrTs+XHagkrVsqaV1T2X+g3vKDlaR1TU14/+XlZWRkXHl6Pj09g7KysrPbXBlvE4vF6NipE4cPH6as7Oxty8sbbpsMGaOeTxmTJ2NU88USspez9Sc+JHU+B4Gb3b3azK4GngLygUnAWnf/kZmlAJcDA4B0d+8PYGapTexzjJmNAN4F7nX3fWc2MLO7gbsBaNX+Ag5LRCS5tfSF8FbAYjMrAVYAdWM2m4Bvm9kcINfdPwT2AJlm9iszuw2oamR/q4Be7p4HrAOWNNapuy9y93x3z7dYu4QdTPnBSjJ6dD49n949lfKDlZQfqiSje73l3VIpP1SZsH7rpKWls3//JzWyrGw/6enpZ7fZF29z8uRJqo4coUuXLqSnn71tWlrDbZMhY9TzKWPyZIxqvuYqGtuBQSHa3QscAL5M/AyjNYC7vwSMAMqAx8zsTnevCNptAL4HnHW9wt0Pu3tNMPtoyAwJ89yLJUwqGALAkNxeVB09zvsfVLHu1R3cdF02qR3akdqhHTddl826V3ckvP/8wYPZtWsnpXv3cuLECVYsX8bIgtEN2owsGM0Tj8dr6W9WPs0NN34NM2NkwWhWLF9GTU0NpXv3smvXTgYPGZJ0GaOeTxmTJ2NU8zXX8NQLwENmdre7LwIwszygE1B/uKgTsN/dPzazKUBK0PaqYPliM2sDDDSzNcAJd19pZu8AS8/s1Mx6uvsfgtnRQEKfmZfMncrwQVdzRWp7dhU9yIML19AqlgLAo09vpGjjdm79ag7bn53Nsepa7pkTj1hRdYy5i4vYuPR+AB5aVERFVdMX1C9WLBbjF798hFEjb+XUqVNMmTqNfjk5PDBnFgMH5VMwajRTp93FtKl3kJOdRefOX+TxJ5YB0C8nhzHjxnNtXj9isRjzH15ASkpK0mWMej5lTJ6MUc1nwa2qCWdmacB84q/2q4FS4AdALbDa3fsH1zFWAg4UAdPdvX1QQGYEbY8CdwIdgV/zydnRTHf/7Rl9ziVeLE4CfwT+0t3fPlfOyy7v5m36jv+0h9usKjY90tIRRCSJXD80n82bX7fG1jVb0bhUqGiIiDR0rqLR0hfCRUTkEqKiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoKhoiIhKaioaIiISmoiEiIqGpaIiISGgqGiIiEpqKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoKhoiIhKaioaIiISmoiEiIqGpaIiISGgqGiIiEpqKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoKhoXYOHsyby3fi6vr/hhk21+dv9YthXOpnj5TAZkZ5xePnnUUEoKZ1FSOIvJo4Y2W8bn1xaRl9OXnOws5v30x2etr6mp4fZJE8jJzmL4sKG8V1p6et28n8wlJzuLvJy+rHt+bdJmjHo+ZUyejJHM5+7NMgE9gGXAbmAzsAa4BugFbGvGfscDbwHbgSfP197adfW2A6aHmr4+7ef+lYlzfdvOskbXf/P7C7xo4zZvO2C6j7hjnhe/udfbDpjuPUfM8D37DnnPETO8x/D7fM++Q95j+H2h+z1e66Gmo9UnvXdmpr/1zm4/8lGN5+bm+Zat2xu0mf/wAv/Od+/x47XuS5Y+5WPGjffjte5btm733Nw8rzxa7Tve3eO9MzP9aPXJ0H1/XjJGPZ8yJk/Glsw3cOAgb+o5s1nONMzMgGeADe7ex90HATOB7s3RX71+rw76ud7dc4AfJHL/r2zZzR+PHGtyfcENeTy5uhiA4pJSOnVoR48rOnLzsC+x/rW3
qag6RuWHx1n/2tvccn2/REYDYFNxMX36ZNE7M5PWrVszbsJEVq8qbNBm9apCJt8xBYBvjRnLhhfW4+6sXlXIuAkTadOmDb1696ZPnyw2FRcnXcao51PG5MkY1XzNNTx1I1Dr7gvrFrj7Vnd/uX4jM+tlZi+b2ZZgGhYs72lmL5nZG2a2zcyGm1mKmT0WzJeY2b2N9PtdYIG7VwR9Hmym42tUWrdU9r9fcXq+7EAlad1SSeuayv4D9ZYfrCSta2rC+y8vLyMj48rT8+npGZSVlZ3d5sp4m1gsRsdOnTh8+DBlZWdvW17ecNtkyBj1fMqYPBmjmi+WkL2crT/xIanzOQjc7O7VwVnCU0A+MAlY6+4/MrMU4HJgAJDu7v0BzCy1kf1dE6x7BUgB5rh70ZmNzOxu4G4AWrW/oAMTEUlmLX0hvBWw2MxKgBVA3ZjNJuDbZjYHyHX3D4E9QKaZ/crMbgOqGtlfDLga+FPgL4J9p57ZyN0XuXu+u+dbrF3CDqb8YCUZPTqfnk/vnkr5wUrKD1WS0b3e8m6plB+qTFi/ddLS0tm/f9/p+bKy/aSnp5/dZl+8zcmTJ6k6coQuXbqQnn72tmlpDbdNhoxRz6eMyZMxqvmaq2hsBwaFaHcvcAD4MvEzjNYA7v4SMAIoAx4zszuDIacvAxuA7wGPNrK//cCz7l7r7nuBd4kXkc/Ecy+WMKlgCABDcntRdfQ4739QxbpXd3DTddmkdmhHaod23HRdNute3ZHw/vMHD2bXrp2U7t3LiRMnWLF8GSMLRjdoM7JgNE88vgSA36x8mhtu/BpmxsiC0axYvoyamhpK9+5l166dDB4yJOkyRj2fMiZPxqjma67hqReAh8zsbndfBGBmeUAnYF+9dp2A/e7+sZlNIT6khJldFSxfbGZtgIFmtgY44e4rzewdYGkj/f5f4mcYvzazK4gPV+1J1EEtmTuV4YOu5orU9uwqepAHF66hVSwFgEef3kjRxu3c+tUctj87m2PVtdwzJx6xouoYcxcXsXHp/QA8tKiIiqqmL6hfrFgsxi9++QijRt7KqVOnmDJ1Gv1ycnhgziwGDsqnYNRopk67i2lT7yAnO4vOnb/I408sA6BfTg5jxo3n2rx+xGIx5j+8gJSUlKTLGPV8ypg8GaOaz4LbVBPOzNKA+cTPOKqBUuJ3M9UCq929f3AdYyXgQBEw3d3bBwVkRtD2KHAn0BH4NZ+cHc1099+e0acBPwNuA04BP3L3ZefKednl3bxN3/Gf9nCbVcWmR1o6gogkkeuH5rN58+vW2LpmKxqXChUNEZGGzlU0WvpCuIiIXEJUNEREJDQVDRERCU1FQ0REQlPREBGR0FQ0REQkNBUNEREJTUVDRERCU9EQEZHQVDRERCQ0FQ0REQlNRUNEREJT0RARkdBUNEREJDQVDRERCU1FQ0REQlPREBGR0FQ0REQkNBUNEREJTUVDRERCU9EQEZHQVDRERCQ0FQ0REQlNRUNEREJT0RARkdBUNEREJDQVDRERCU1FQ0REQlPREBGR0FQ0REQkNBUNEREJTUXjAiycPZn31s/l9RU/bLLNz+4fy7bC2RQvn8mA7IzTyyePGkpJ4SxKCmcxedTQZsv4/Noi8nL6kpOdxbyf/vis9TU1Ndw+aQI52VkMHzaU90pLT6+b95O55GRnkZfTl3XPr03ajFHPp4zJkzGS+dy9WSagB7AM2A1sBtYA1wC9gG3N1OcvgDeC6V2g8nzbWLuu3nbA9FDT16f93L8yca5v21nW6Ppvfn+BF23c5m0HTPcRd8zz4jf3etsB073niBm+Z98h7zlihvcYfp/v2XfIewy/L3S/x2s91HS0+qT3zsz0t97Z7Uc+qvHc3DzfsnV7gzbzH17g3/nuPX681n3J0qd8zLjxfrzWfcvW7Z6bm+eVR6t9x7t7vHdmph+tPhm6789LxqjnU8bkydiS+QYOHORNPWc2y5mGmRnwDLDB3fu4+yBgJtC9Ofqr4+73uvsAdx8A/Ar4TSL3/8qW3fzxyLEm1xfckMeTq4sBKC4ppVOHdvS4oiM3D/sS6197m4qqY1R+eJz1r73NLdf3S2Q0ADYVF9OnTxa9MzNp3bo14yZMZPWqwgZtVq8qZPIdUwD41pixbHhhPe7O6lWFjJswkTZt2tCrd2/69MliU3Fx0mWMej5lTJ6MUc3XXMNTNwK17r6wboG7b3X3l+s3MrNeZvaymW0JpmHB8p5m9pKZvWFm28xsuJmlmNljwXyJmd17ngx/ATyV8CM7h7Ruqex/v+L0fNmBStK6pZLWNZX9B+otP1hJWtfUhPdfXl5GRsaVp+fT0zMoKys7u82V8TaxWIyOnTpx+PBhysrO3ra8vOG2yZAx6vmUMXkyRjVfLCF7OVt/4kNS53MQuNndq83sauJP8vnAJGCtu//IzFKAy4EBQLq79wcws9SmdmpmVwG9gReaWH83cDcArdqHOyIREWnxC+GtgMVmVgKsAOrGbDYB3zazOUCuu38I7AEyzexXZnYbUHWO/U4Ennb3U42tdPdF7p7v7vkWa5eoY6H8YCUZPTqfnk/vnkr5wUrKD1WS0b3e8m6plB+qTFi/ddLS0tm/f9/p+bKy/aSnp5/dZl+8zcmTJ6k6coQuXbqQnn72tmlpDbdNhoxRz6eMyZMxqvmaq2hsBwaFaHcvcAD4MvEzjNYA7v4SMAIoAx4zszvdvSJotwH4HvDoOfY7kc94aArguRdLmFQwBIAhub2oOnqc9z+oYt2rO7jpumxSO7QjtUM7broum3Wv7kh4//mDB7Nr105K9+7lxIkTrFi+jJEFoxu0GVkwmiceXwLAb1Y+zQ03fg0zY2TBaFYsX0ZNTQ2le/eya9dOBg8ZknQZo55PGZMnY1TzNdfw1AvAQ2Z2t7svAjCzPKATsK9eu07Afnf/2MymAClB26uC5YvNrA0w0MzWACfcfaWZvQMsbaxjM8sGOgO/S/RBLZk7leGDruaK1PbsKnqQBxeuoVUsBYBHn95I0cbt3PrVHLY/O5tj1bXcMycesaLqGHMXF7Fx6f0APLSoiIqqpi+oX6xYLMYvfvkIo0beyqlTp5gydRr9cnJ4YM4sBg7Kp2DUaKZOu4tpU+8gJzuLzp2/yONPLAOgX04OY8aN59q8fsRiMeY/vICUlJSkyxj1fMqYPBmjms+CW1UTzszSgPnEzziqgVLgB0AtsNrd+wfXMVYCDhQB0929fVBAZgRtjwJ3Ah2BX/PJ2dFMd/9tI/3OAdq6+z+EyXnZ5d28Td/xF3eQn5GKTY+0dAQRSSLXD81n8+bXrbF1zVY0LhUqGiIiDZ2raLT0hXAREbmEqGiIiEhoKhoiIhKaioaIiISmoiEiIqGpaIiISGgqGiIiEpqKhoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIiEhoSf/R6GZ2CHgvgbu8AvgggftrDsqYGMr46UU9HyRnxqvcvWtjK5K+aCSamb3u7vktneNclDExlPHTi3o+UMYzaXhKRERCU9EQEZHQVDQSb1FLBwhBGRNDGT+
9qOcDZWxA1zRERCQ0nWmIiEhoKhoiIp9jZmaJ3J+KRgtJ9AMp0aTH+dO5FH5/ZhZr6Qzn0QrAzBLyfK+i8Rkys95m1tnMOri7J+pBTCQza93SGc7FzP4k+B22b+ksTTGzPmaWbmadIvw4DzWzjJbOcS5mNgwYHeXCYWY3Az+N6t+NmRUA/2lmXd3940TsM3L/mT+vggfv34EfA0VmlpmoBzFRzGwk8M9m1qelszTGzG4DCoGfAf9mZt1bONJZzOxW4P8C/wg8Z2adI/g4dwLWAg+YWe+WztOY4Pe4CPiD17tbJ0oFxMz+G/C/gH939xP1lkciY/A7fBA4AHwjWPapn/NVNJqZxfUA/gmY5u73AGuIP6EMCtq0+ONgZgOAZcDXgIKoFQ4zGw48DPwAmAm8CdwVrIvKH+lw4JfA3wD3AtuBdmaW0OGBBDgBFANXA/ebWWYL52nAzK4DngL+yt2LzayDmaWY2eVROXML/s99Ffg7d19jZl80s0wz6+4RuCXVzL4OzCP+f3ExQdFw948/7d9Li//yP+887n1gG1AbLPsR8K/Ak2bWKxEPZAJcBowFpgMDgW/VLxwtmc/MUoBs4J/d/UV3P0D8CTkL4r/jlspWJ8j4J8B33P0FIA0YB/w98BszuyoijzPufhx4gng2B/7azP48eOXcooLfT0/gP4DLzSwd+D/E/17WR+X3GPyfawdcY2ZpBGduwMbgjL3F/maCobIRwF+6+0vAM0BfM5tRL/tFU9FoZsGZRmviHyY2vG65u88n/of7aN0rqBaKWKcEeNXd3yD+ajkHGFOvcLTYmK27nwKWA6/WW1wM9KibMbM2n3Wu+oKM/+buG83scuD7wFzgn4FNwDoz6xiBx7nOFcA33P2vgL7ASqBby0Y6/YS2lvjfxl8Q/3/5H8CsYPmaumuCLZfytOeATsBfAf/b3W8nPhz0czO7sqUyBkNl/+Lur5hZK3evJv77ywjO2nSmEWXBmcYJ4FFgrJndW+9B+xdgT8ul+4S717r7h8HPW4BHiBeOG83sAeKvllNaMF+Vu++rtygF6AVgZlOBX7T0sIW7151JHgN+6e7zgtwPEC94Lfb7a8TjwGEzu4Z40VgHXG9mV7VsLHD3j4gXiOeAv3X3h9293N3nAK8DLX62FngL6A3cSHzID3f/P8DviZ9ptph6/xdrg0XvEh9OG/5pi1nUbxX7XDCzFHffZWZTgMeIj3P/Hsgg/kC2A461YMSzuPvrZvb3wPPEX019M3g1HRUHgK1m9hfAXxK/XtTiF5zNzIIXCv9Vb9ntxJ+Yo/QizYm/Qv4R8Gfu/ryZ/S+CIdSWFPwOPzKzZ6hXIOr9Hlv8ecvMLnP3P5jZTOAhINvMJhHPO4z4dbfIcPe3zexh4je6vOHu5Re7L32MSILVPWkEP7d396PBz5cFY7F9gNuJvxK5Gvgbdy+JQsZG2g0n/mrvOnffHqV8wcXlMuLFY4K7v/VZ5buAjF8AJgH/HZgYlYx1y81sMNDe3f/jzPYtnfGMNinE/2buI0KPdb2/6TTgT4Fc4te15rr7tpbO10i7q4m/UHjA3Ssuuj8VjcQ548GbBnQmfktedfBHmuLup8ws5u4ngzHuqihlPKNtX6DW3T+zIbSw+YIn5EeBOe7+zmeV7wIzdiR+t9e/ufvbUcpY94RX1xY++xsKLvCx/ivg2ag91nV/0/XaXx4MT0YiXyPtO7n7kU/VqbtrSvBE/D/4ZiAzmI/VPa51U1QzRmU6Vz7gsuDfVpdAxpQoZozC/8GQv8e6F7aXRTFjVP6mz/f3nMh8URpjvWTVvxvBzL4IjAbuBD4IrmMsNLNveD1RzfhZ57qYfP7JtYuTUc/on/F1oAv5v/hZ5rqYjPDJ2Y9/xterov43faF/z4nMp6LxKZ1xepju7n8kfovgKmAh8GXidy7caS10W2jUM15svs/yDzXqv0NlTJ6MLZ6vJU+pPk8T8HfE30vQlfjdHaOBbsG6Pwd+A7RWxks3nzIqY5QytlS+FntAPk8TMA14GbgimO8EdAh+ng5sAXKV8dLNp4zKGKWMLZmvxe93/pzoDKwAcoPbGG8FtpnZIuLvpL7DP8NbVpsQ9YxRzwfKmCjKeAnn0y23F6j+eGK9ZbcCU4AriY8pfgTcAPzY45+TpIyXUD5lVMYoZYxaPhWNC3DGBajpxKt9K3efbfE3IH3B3avMbDTxDy/7b+7+B2W8dPIpozJGKWMk87XUmNylOPFJkf0b4EXiH2nwPrCyXps7iX8Ca44yXnr5lFEZo5Qxivl0y20IZna9md3s7m7x78YYBPwZMIr4xaieZrY2aF4CjPTPeLwz6hmjnk8ZlTFKGSOdryWq+6U2AZOBvcDXg/kvEv+8+t8F838CfAz8WhkvzXzKqIxRyhjlfLp76hws/s16KcS/ae8EMN/MfuDu682smvjdCu2B64A5xL9tTBkvoXzKqIxRyhj1fIDONM5R6W8j/tn9dwJZwbLbiZ8Kfp34BakngCeBfcDVynhp5VNGZYxSxqjnO52zJTqN+kT81rWdwOBG1tU9iAOBVsQ/DrmXMl5a+ZRRGaOUMer5GuRpqY6jPAF/S/x7LqCRTwUFxgP7gT9VxksznzIqY5QyRj1f/UnXNOqpd090b6DuM+dPwScfjmdm/YEi4t989l+N7SeZM0Y9nzIqY5QyRj1fY3TLbT11DxLwDPAVMxvkHv/CGvvk+6e/BvR29xX+GX450aWSMer5lFEZo5Qx6vkao6LRuN8DG4EJwYP4sce/1nEi8YtUFS0bD4h+xqjnA2VMFGX89KKe7zR9jEgTzCwduIv4XQuvA8eBscBY/wy///dcop4x6vlAGRNFGT+9qOero6JxDmbWjvg7MW8C/gD8h7u/27KpGop6xqjnA2VMFGX89KKeD1Q0RETkAuiahoiIhKaiISIioaloiIhIaCoaIiISmoqGiIiEpqIhIiKhqWiIJICZ/amZrQ5+Hm1m/3COtqlm9lcX0cccM7sv7PIz2jxmZmMvoK9eZhaZN5RJdKhoiJyDmaVc6Dbu/qy7//gcTVKBCy4aIlGgoiFJKXgl/baZPWFmO8zsaTO7PFhXamY/MbMtwDgzu8XMfmdmW8xsRfDNaZjZbcE+tgDfqrfvqWb2SPBzdzN7xsy2BtMw4MdAHzN7w8zmBe1mmNkmM3vTzP5HvX39o5m9a2Ybgb4hjuu7wX62mtnKumMK3GRmrwf7Kwjap5jZvHp93/Npf7fy+aaiIcmsL/C/3P1LQBUNX/0fdveBwL8D/wTcFMy/DvytmbUFFgOjiH/sQ48m+ngYeNHdv0z8S3S2A/8A7Hb3Ae4+w8xuAa4GhgADgEFmNsLiX/05MVj2DWBwiGP6jbsPDvrbQfyzjOr0CvoYCSwMjuEu4Ii7Dw72/10z6x2iH0lS+j4NSWb73P2V4OelwF8D/zOYXx78+xWgH/CKmQG0Bn4HZAN73X0ngJktBe5upI+vEf+UUtz9FHDEzDqf0eaWYPrPYL498SLSAXjG3Y8FfTwb4pj6m9m/EB8Caw+srbfu39z9Y2Cnme0Jju
EWIK/e9Y5OQd+R+rwjiQ4VDUlmZ37wWv35j4J/DVjn7n9Rv6GZDUhgDgPmuvu/ntHHDy5iX48Bf+buW81sKvCn9dY1drwG/Hd3r19cMLNeF9G3JAENT0ky+xMzuy74eRLx7zM402vA9WaWBWBmXzCza4C3gV5m1ido9xeNbAuwHvjLYNsUM+sEfEj8LKLOWmBavWsl6WbWDXgJ+DMza2dmHYgPhZ1PB+APZtYKmHzGunEW/3KfPkAm8E7Q918G7TGza8zsCyH6kSSloiHJ7B1gupntADoD/9+ZDdz9EDAVeMrM3iQYmnL3auLDUc8FF8IPNtHH3wA3mlkJsBno5+6HiQ93bTOzee7+PPAk8Lug3dNAB3ffQnyYbCvwW2BTiGP6Z+Jf6PMK8cJW338BxcG+vhccw6PAW8CW4Bbbf0UjEHIO+mh0SUrB8Mtqd+/f0llELiU60xARkdB0piEiIqHpTENEREJT0RARkdBUNEREJDQVDRERCU1FQ0REQvt/V9XvUb1qHvAAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "cm = confusion_matrix(y_test, y_test_pred)\n", - "\n", - "nb_classes = len(np.unique(y_test))\n", - "class_names = ['Class {}'.format(i) for i in range(0, nb_classes)]\n", - "\n", - "plot_confusion_matrix(cm,\n", - " colorbar=False,\n", - " show_normed=True,\n", - " show_absolute=False,\n", - " class_names=class_names,\n", - " figsize=(6, 6))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Speaker recognition: error analysis" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import glob\n", - "from IPython.display import display, Audio\n", - "\n", - "def get_speaker_info(id, dataset_path):\n", - " speakers = glob.glob(dataset_path + '/*')\n", - " speaker = speakers[id]\n", - " utterances = glob.glob(speaker + '/*/*.flac')\n", - " utterance = utterances[0]\n", - " return speaker.split('\\\\')[-1], utterance\n", - "\n", - "def show_errors(count=3, dataset_path=''):\n", - " idx_error = 0\n", - " for i in range(len(X_test)):\n", - " if y_test[i] != y_test_pred[i] and idx_error < count:\n", - " idx_error += 1\n", - "\n", - " print('Error {}'.format(i))\n", - "\n", - " id, path = get_speaker_info(int(y_test_pred[i]), dataset_path)\n", - " print('Predicted speaker: {}'.format(id))\n", - " display(Audio(path))\n", - "\n", - " id, path = get_speaker_info(int(y_test[i]), dataset_path)\n", - " print('Actual speaker: {}'.format(id))\n", - " display(Audio(path))\n", - "\n", - " print('=' * 40)" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Error 1\n", - "Predicted speaker: 103\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Actual speaker: 1034\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========================================\n", - "Error 2\n", - "Predicted speaker: 103\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Actual speaker: 1034\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========================================\n", - "Error 3\n", - "Predicted speaker: 103\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Actual speaker: 1040\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "========================================\n" - ] - } - ], - "source": [ - "show_errors(count=3,\n", - " 
dataset_path='D:/Datasets/LibriSpeech/train-clean-100')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Speaker verification" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "import soundfile as sf\n", - "from pathlib import Path\n", - "from scipy.spatial.distance import cosine\n", - "from tensorflow.keras.layers import GlobalAveragePooling1D\n", - "from sklearn.preprocessing import normalize\n", - "from sklearn.metrics import roc_curve, det_curve\n", - "\n", - "def get_frames(signal):\n", - " signal_length = len(signal)\n", - " frame_length = 20480\n", - " frame_step = 20480\n", - "\n", - " num_frames = int(1 + np.ceil((signal_length - frame_length) / frame_step))\n", - "\n", - " zeros = np.zeros((num_frames * frame_length - signal_length))\n", - " signal_padded = np.append(signal, zeros)\n", - "\n", - " indices_a = np.tile(np.arange(0, frame_length),\n", - " (num_frames, 1))\n", - " indices_b = np.tile(np.arange(0, num_frames * frame_step, frame_step),\n", - " (frame_length, 1))\n", - " indices = indices_a + indices_b.T\n", - "\n", - " frames = signal_padded[indices.astype(np.int32)]\n", - " return frames\n", - "\n", - "def evaluate_spk_verification(y, y_scores):\n", - " # ROC curve\n", - " fpr, tpr, thresholds = roc_curve(y, y_scores)\n", - " plt.title('ROC curve')\n", - " plt.xlabel('False Positive Rate')\n", - " plt.ylabel('True Positive Rate')\n", - " plt.plot(fpr, tpr)\n", - " plt.plot([0, 1], [0, 1], color='navy', linestyle='--')\n", - " plt.show()\n", - " \n", - " # DET curve\n", - " fpr, fnr, thresholds = det_curve(y, y_scores)\n", - " plt.title('DET curve')\n", - " plt.xlabel('False Positive Rate')\n", - " plt.ylabel('False Negative Rate')\n", - " plt.plot(fpr, fnr)\n", - " plt.show()\n", - " \n", - " eer_threshold = thresholds[np.nanargmin(np.absolute((fnr - fpr)))]\n", - " eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))]\n", - " print('Equal Error Rate (EER):', eer)\n", - " \n", - " y_pred = y_scores >= eer_threshold\n", - " \n", - " # Classification report\n", - " report = classification_report(y, y_pred, zero_division=0)\n", - " print(report)\n", - "\n", - " # Confusion matrix\n", - " cm = confusion_matrix(y, y_pred)\n", - " plot_confusion_matrix(cm,\n", - " colorbar=False,\n", - " show_normed=True,\n", - " show_absolute=False,\n", - " figsize=(3, 3))\n", - "\n", - "def spk_verification(dataset_path, trial_list_path, limit=5):\n", - " y = []\n", - " y_scores = []\n", - "\n", - " with open(trial_list_path) as fp:\n", - " line = fp.readline()\n", - " count = 0\n", - "\n", - " while line and count != limit:\n", - " target, path1, path2 = line.rstrip('\\n').split(' ')\n", - "\n", - " data1, _ = sf.read(Path(dataset_path).joinpath(path1))\n", - " data2, _ = sf.read(Path(dataset_path).joinpath(path2))\n", - " \n", - " # Split data in chunks of frame_length and\n", - " # apply average pooling on resulting embeddings\n", - "\n", - " data1 = get_frames(data1)\n", - " data2 = get_frames(data2)\n", - " \n", - " data1 = np.expand_dims(data1, axis=-1)\n", - " data2 = np.expand_dims(data2, axis=-1)\n", - " \n", - " features1 = np.mean(model(data1), axis=0, keepdims=True)\n", - " features2 = np.mean(model(data2), axis=0, keepdims=True)\n", - " \n", - " dist = 1.0 / cosine(normalize(features1), normalize(features2))\n", - " y.append(int(target))\n", - " y_scores.append(dist)\n", - "\n", - " count += 1\n", - " line = fp.readline()\n", - "\n", - " return np.array(y), np.array(y_scores)" - ] - }, - 
{ - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "y, y_scores = spk_verification(dataset_path='D:/Datasets/vox1_test_wav/wav',\n", - " trial_list_path='./docs/voxcelebs_trial_list.txt',\n", - " limit=3)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEWCAYAAABrDZDcAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAArzElEQVR4nO3dd3wVZfbH8c8hgIjSwUrvTUSMBRAQUZpg+bkodnejiIhlbWtFZdVdCzYEBATBXthVoyDoWpZdKwiIiCIRqYIU6UhLzu+PmbgxhnCBTG7uvd/365UXM3Ofe+cMgZw8zzNzHnN3REQkdZWKdwAiIhJfSgQiIilOiUBEJMUpEYiIpDglAhGRFKdEICKS4pQIRERSnBKBJB0zW2hmv5jZJjNbYWbjzOzAfG3amdn7ZrbRzNab2Ztm1jxfm4pm9qiZLQ4/6/twv3rxXpFItJQIJFn1dvcDgdbAUcAtuS+YWVvgHeAN4DCgHvAl8JGZ1Q/blAXeA1oA3YGKQFtgDXBsVEGbWemoPltkV5QIJKm5+wpgCkFCyPUA8Iy7P+buG939Z3e/HfgUuCtscxFQGzjT3ee6e467r3T3v7r7pILOZWYtzOxdM/vZzH4ys1vD4+PM7J487U40s6V59hea2V/MbDawOdyekO+zHzOzx8PtSmY2xsyWm9kyM7vHzNL27W9KUpkSgSQ1M6sJ9ACywv3yQDvg1QKavwKcEm6fDEx2900xnqcC8C9gMkEvoyFBjyJW5wKnApWBl4Ce4WcS/pA/G3ghbDsO2Bme4yigK3DpHpxL5DeUCCRZvW5mG4ElwErgzvB4VYJ/98sLeM9yIHf8v9ou2uxKL2CFuw9x961hT+OzPXj/4+6+xN1/cfdFwAzgzPC1k4At7v6pmR0M9ASudffN7r4SeATouwfnEvkNJQJJVme4ewXgRKAp//sBvxbIAQ4t4D2HAqvD7TW7aLMrtYDv9yrSwJJ8+y8Q9BIAzuN/vYE6QBlguZmtM7N1wEjgoH04t6Q4JQJJau7+b4KhlIfC/c3AJ0CfApqfzf+Gc/4FdDOzA2I81RKg/i5e2wyUz7N/SEGh5tt/FTgxHNo6k/8lgiXANqC6u1cOvyq6e4sY4xT5HSUCSQWPAqeY2ZHh/s3AxWZ2tZlVMLMq4WRuW+DusM2zBD90/2FmTc2slJlVM7NbzaxnAed4CzjUzK41s/3Czz0ufG0WwZh/VTM7BLh2dwG7+yrgQ+Bp4Ad3/yY8vpzgjqch4e2tpcysgZl12tO/FJFcSgSS9MIfqs8Ag8L9/wLdgP8jmAdYRDDpeoK7zw/bbCOYMP4WeBfYAHxOMMT0u7F/d99IMNHcG1gBzAc6hy8/S3B76kKCH+Ivxxj6C2EML+Q7fhFQFphLMNQ1gT0bxhL5DdPCNCIiqU09AhGRFKdEICKS4pQIRERSnBKBiEiKS7gCV9WrV/e6devGOwwRkYTyxRdfrHb3GgW9lnCJoG7dukyfPj3eYYiIJBQzW7Sr1zQ0JCKS4pQIRERSnBKBiEiKUyIQEUlxSgQiIikuskRgZmPNbKWZzdnF62Zmj5tZlpnNNrM2UcUiIiK7FmWPYBzBot+70gNoFH71A0ZEGIuIiOxCZM8RuPtUM6tbSJPTCRYQd+BTM6tsZoeG9daL3AufLeaNWcui+GgRkUjt3LaTbRt3cEyrg7mzd9GvQRTPOYLD+e3yfEvDY79jZv3MbLqZTV+1atVeneyNWcuYu3zDXr1XRCRefvp2DVP++gkfjZxFTk40ywYkxJPF7j4KGAWQnp6+138TzQ+tyMuXty2yuEREorJu3VZuvPEdXnlqJg0bVuWpp3rTqVPdSM4Vz0SwjGDB71w1w2MiIiktOzuHdu3GMG/eGm66qR133XUi++9fJrLzxTMRZAIDzewl4DhgfVTzAyIiiWDNmi1Urbo/aWmluPfek6hVqxLp6YdFft4obx99EfgEaGJmS80sw8z6m1n/sMkkYAGQBYwGBkQVi4hISebuPPfcbBo3foKnnpoBwJlnNiuWJADR3jV07m5ed+DKqM4vIpIIlixZT//+E5k0aT7HH1+T9u1rF3sMCTFZLCKSjF588Ssuv/wtsrOdRx/txsCBx5KWVvw3cyoRiIjESZUq+3PccTUZNaoX9epViVscSgQiIsVk584cHnnkE7Zvz+a22zrSvXtDunVrgJnFNS4lAhGRYvDllyvIyMjkiy+Wc/bZLXB3zCzuSQBUfVREJFLbtu3kjjveJz19NEuWbODVV/vw0ktnlYgEkEs9AhGRCM2f/zP33/8R5513BA8/3JVq1crHO6TfUSIQESlimzZt5403vuX881vRsuVBfPvtQOrXj99k8O5oaEhEpAi9++73HHHECC688DW++SYoklmSkwAoEYiIFIm1a38hI+MNunZ9jrJl0/j3vy+hWbMa8Q4rJhoaEhHZR9nZObRvP5bvvlvDLbecwKBBnShXLnF+vCZOpCIiJczq1f8rEnfffV2oXbsSbdocGu+w9piGhkRE9pC788wzX9K48dBfi8SdcUbThEwCoB6BiMgeWbRoHZdf/hZTpnxPu3a16NixTrxD2mdKBCIiMXruudlcccVE3J2hQ3swYMAxlCpVch4M21tKBCIiMapRozzt29di5Mhe1KlTOd7hFBklAhGRXdixI5shQz5hx45s7rijE926NaRr1/gXiStqSgQiIgWYOXM5GRmZzJy5gr59W5aoInFFTXcNiYjksXXrTm699T2OOWY0P/64kX/842xefLFkFYkrauoRiIjkkZX1Mw899DEXXXQkQ4Z0pUqV/eMdUuSUCEQk5W3atJ3XXvuGCy88kpYtD2LevIFxXTGsuGloSERS2pQpWbRoMZyLL3791yJxqZQEQIlARFLUmjVbuPji1+ne/XnKly/Df/7zx4QpElfUNDQkIiknt0hcVtbP3HZbB26/vWNCFYkraql75SKSclat2ky1auVJSyvF/fefTJ06lWnd+pB4hxV3GhoSkaTn7jz99EwaN36C0aO/AOD005sqCYTUIxCRpLZw4Tr69XuTd99dQIcOtencuV68QypxlAhEJGk9++yXXHHFRMyM4cN7cvnl6UlRJK6oKRGISNI6+OAD6dixDk8+2YvatSvFO5wSS4lARJLGjh3ZPPDAR2RnO4MGdaJr1wZ07dog3
mGVeJosFpGkMGPGco45ZjS33/4B8+atwd3jHVLCUCIQkYT2yy87uPnmf3HssaP56afNvPbaOTz//P8ldZG4ohZpIjCz7mY2z8yyzOzmAl6vbWYfmNlMM5ttZj2jjEdEks+CBWt5+OFPuOSS1sydO4Azzmga75ASTmSJwMzSgGFAD6A5cK6ZNc/X7HbgFXc/CugLDI8qHhFJHhs2bGPcuFkAtGhxEPPnX8VTT52WEpVCoxBlj+BYIMvdF7j7duAl4PR8bRyoGG5XAn6MMB4RSQKTJs2nZcvhZGRk/lokLpmWjYyHKBPB4cCSPPtLw2N53QVcYGZLgUnAVQV9kJn1M7PpZjZ91apVUcQqIiXc6tVbuPDC1zj11BeoUGE/PvroTylbJK6oxXuy+FxgnLvXBHoCz5rZ72Jy91Hunu7u6TVq6Bsvkmpyi8S99NIcBg3qyIwZ/Tj++JrxDitpRPkcwTKgVp79muGxvDKA7gDu/omZlQOqAysjjEtEEsRPP22iRo0DSEsrxUMPnUKdOpVp1ergeIeVdKLsEUwDGplZPTMrSzAZnJmvzWKgC4CZNQPKARr7EUlx7s6YMTNo0uQJRo0KisT17t1ESSAikfUI3H2nmQ0EpgBpwFh3/9rMBgPT3T0TuB4YbWZ/Jpg4vsT1FIhISluwYC2XXfYm77//A5061eHkk+vHO6SkF2mJCXefRDAJnPfYoDzbc4H2UcYgIolj/PhZDBgwibQ048knT+Wyy45WkbhioFpDIlJiHHZYBU46qR4jRpxKzZoVd/8GKRJKBCISN9u3Z/P3v/+XnBznrrtO5JRTGnDKKSoSV9ziffuoiKSoadOWcfTRo7jzzg9ZsGCtisTFkRKBiBSrLVt2cMMN73D88WNYu/YXMjP78swzZ6pIXBxpaEhEitUPP6xl6NDPueyyNtx//8lUqlQu3iGlPCUCEYnc+vVb+ec/v+GPfzyKFi0OIivrKmrV0ophJYWGhkQkUhMnfkeLFsO59NI3+fbb1QBKAiWMEoGIRGLVqs2cf/4/6dXrRapU2Z9PPsmgadPq8Q5LCqChIREpctnZOZxwwtP88MNa7r77RG6++QTKlk2Ld1iyC0oEIlJkVqzYxEEHBUXihgzpSt26lWnZ8qB4hyW7EfPQkJmVjzIQEUlcOTnOyJHTadx4KCNHTgegV6/GSgIJYreJwMzamdlc4Ntw/0gz05KSIgJAVtbPdOnyDP37T+SYYw6nW7eG8Q5J9lAsQ0OPAN0IS0i7+5dm1jHSqEQkITz99EwGDJhE2bJpjB7dm4yMo/RgWAKKaY7A3Zfk++ZmRxOOiCSS2rUr0a1bA4YN68nhh6tIXKKKJREsMbN2gJtZGeAa4JtowxKRkmjbtp387W9BkbjBgzvTpUt9unTRegGJLpbJ4v7AlQQLzy8DWgMDIoxJREqgzz5bytFHj+Luu//N4sXrVSQuicTSI2ji7ufnPWBm7YGPoglJREqSzZu3c8cdH/Doo59y+OEVeeutczn11MbxDkuKUCw9gqExHhORJLRo0XqGD59G//7pfP31ACWBJLTLHoGZtQXaATXM7Lo8L1UkWINYRJLUunVbmTBhLpde2obmzWuQlXW1VgxLYoUNDZUFDgzbVMhzfAPwhyiDEpH4eeONb7niiomsXLmZE06oTdOm1ZUEktwuE4G7/xv4t5mNc/dFxRiTiMTBypWbufrqt3n55a9p1epgMjPPVZG4FBHLZPEWM3sQaAH8uoKEu58UWVQiUqyys3No334sixev5557OnPTTe0pU0YjwKkilkTwPPAy0IvgVtKLgVVRBiUixePHHzdyyCEHkpZWisce607dupVp3rxGvMOSYhbLXUPV3H0MsMPd/+3ufwLUGxBJYDk5zogR02ja9AmefDIoEtezZyMlgRQVS49gR/jncjM7FfgRqBpdSCISpe++W8Nll73J1KmLOPnk+vTooSJxqS6WRHCPmVUCrid4fqAicG2UQYlINMaMmcHAgW9Trlxpxo49jUsuaa0icbL7RODub4Wb64HO8OuTxSKSYOrWrUyPHg0ZNqwnhx5aYfdvkJRQ2ANlacDZBDWGJrv7HDPrBdwK7A8cVTwhisje2rZtJ3/961QA7rnnJBWJkwIV1iMYA9QCPgceN7MfgXTgZnd/vRhiE5F98PHHS8jIyOTbb1fzpz+1xt01DCQFKiwRpAOt3D3HzMoBK4AG7r6meEITkb2xadN2brvtPYYO/ZxatSoxefL5WjVMClXY7aPb3T0HwN23Agv2NAmYWXczm2dmWWZ28y7anG1mc83sazN7YU8+X0R+b/Hi9Ywc+QVXXnkMc+ZcoSQgu1VYj6Cpmc0Otw1oEO4b4O7eqrAPDucYhgGnAEuBaWaW6e5z87RpBNwCtHf3tWamla5F9sLatb/w6qtz6dfvaJo3r8GCBddw2GGaDJbYFJYImu3jZx8LZLn7AgAzewk4HZibp81lwDB3Xwvg7iv38ZwiKee1175hwIBJrFq1mU6d6tCkSXUlAdkjhRWd29dCc4cDS/LsLwWOy9emMYCZfURQ2voud5+c/4PMrB/QD6B27dr7GJZIclixYhNXXfU2EybMpXXrQ5g48TyaNFGRONlzMS1eH/H5GwEnAjWBqWZ2hLuvy9vI3UcBowDS09O1Pp6kvOzsHDp0eJolS9Zz330nccMN7VQkTvZalIlgGcHtp7lqhsfyWgp85u47gB/M7DuCxDAtwrhEEtbSpRs47LAKpKWV4vHHu1OvXhWVipZ9FkvROcxsfzNrsoefPQ1oZGb1zKws0BfIzNfmdYLeAGZWnWCoaMEenkck6eXkOEOHfkbTpk8wYkTwe1KPHo2UBKRI7DYRmFlvYBYwOdxvbWb5f6D/jrvvBAYCU4BvgFfc/WszG2xmp4XNpgBrzGwu8AFwo55TEPmtb79dTceOT3P11ZM54YTa9OqlNYOlaMUyNHQXwR1AHwK4+ywzqxfLh7v7JGBSvmOD8mw7cF34JSL5PPXUDAYOnET58mUYP/4MLrywlZ4OliIXUxlqd1+f7x+fJmxFikGDBlXo3bsJTzzRg4MPPjDe4UiSiiURfG1m5wFp4QNgVwMfRxuWSGraunUngwf/G4D77utC58716Nw5pg64yF6LZbL4KoL1ircBLxCUo742wphEUtJHHy2mdesn+dvf/suqVZsJRk5FohdLj6Cpu98G3BZ1MCKpaOPGbdx663sMGzaNOnUqM2XKBXTt2iDeYUkKiSURDDGzQ4AJwMvuPifimERSytKlG3jqqZlcddWx3HtvFw48sGy8Q5IUs9uhIXfvTLAy2SpgpJl9ZWa3Rx6ZSBJbs2bLr88DNGtWgwULruaxx3ooCUhcxPRAmbuvcPfHgf4EzxQMKvwdIlIQd2fChLk0bz6cq6+ezLx5qwG0bKTEVSwPlDUzs7vM7CuCxes/JigXISJ7YPnyjZx11iv06fMqtWpVZPr0y1QkTkqEWOYIxgIvA93c/ceI4xFJSrlF4pYt28gDD5zMn//cltKlY+qQi0Rut4nA3dsWRyAiyWjJkvUcfnhF0tJK
MWxYT+rVq0LjxtXiHZbIb+zyVxIzeyX88yszm53n66s8K5eJSAGys3N4/PHPaNp02K+Twt26NVQSkBKpsB7BNeGfvYojEJFk8c03q8jIyOSTT5bSo0dDevfe08K9IsVrlz0Cd18ebg5w90V5v4ABxROeSGIZNeoLWrceyXffreHZZ89k4sTzqF27UrzDEilULLNVpxRwrEdRByKSDBo1qsqZZzZl7twrueACVQqVxLDLoSEzu4LgN//6+eYEKgAfRR2YSCL45Zcd3HXXh5gZf//7ySoSJwmpsDmCF4C3gb8BN+c5vtHdf440KpEEMHXqIi69NJP583+mf/+jcXf1ACQhFTY05O6+ELgS2JjnCzOrGn1oIiXThg3bGDBgIp06jSM723nvvYsYMaKXkoAkrN31CHoBXxAsRJP3X7kD9SOMS6TE+vHHjYwbN4vrrjuewYM7c8ABqg8kiW2XicDde4V/asBTUt7q1Vt45ZWvGTDgGJo2rc4PP1yjFcMkacRSa6i9mR0Qbl9gZg+bWe3oQxOJP3fn5Zfn0Lz5MK69djLffbcGQElAkkost4+OALaY2ZHA9cD3wLORRiVSAvz440bOOONl+vb9B3XqVOaLL/rpyWBJSrEUndvp7m5mpwNPuPsYM8uIOjCReMrOzqFjx6BI3EMPncI11xyvInGStGJJBBvN7BbgQqCDmZUCykQblkh8LFq0jpo1gyJxw4efSv36VWjYUDfJSXKL5VeccwgWrv+Tu68gWIvgwUijEilm2dk5PPzwJzRrNowRI6YD0LVrAyUBSQmxLFW5AngeqGRmvYCt7v5M5JGJFJM5c1bSrt1Yrr/+Hbp0qc8ZZzSNd0gixSqWu4bOBj4H+gBnA5+Z2R+iDkykODz55HTatBnJggVreeGF/yMzsy81a1aMd1gixSqWOYLbgGPcfSWAmdUA/gVMiDIwkSjlloNo1qw6ffq04NFHu1GjxgHxDkskLmJJBKVyk0BoDTEuei9S0mzZsoNBgz4gLc24//5T6NSpLp061Y13WCJxFcsP9MlmNsXMLjGzS4CJwKRowxIpeh9+uJBWrUYwZMgnbNq0HXePd0giJUIsaxbfaGb/B5wQHhrl7q9FG5ZI0Vm/fis33fQuo0bNoEGDKrz//kUqFS2SR2HrETQCHgIaAF8BN7j7suIKTKSoLF++ieee+4obbmjL3Xd3pnx5PQYjkldhQ0NjgbeAswgqkA7d0w83s+5mNs/Msszs5kLanWVmbmbpe3oOkYKsWrWZoUM/A6Bp0+osXHgNDz7YVUlApACFDQ1VcPfR4fY8M5uxJx9sZmnAMIKlLpcC08ws093n5mtXAbgG+GxPPl+kIO7Oiy/O4eqr32bDhm1069aQxo2r6Y4gkUIU1iMoZ2ZHmVkbM2sD7J9vf3eOBbLcfYG7bwdeAk4voN1fgfuBrXscvUgeS5asp3fvFzn//H/SsGFVZs68XEXiRGJQWI9gOfBwnv0VefYdOGk3n304sCTP/lLguLwNwoRSy90nmtmNu/ogM+sH9AOoXVsVsOX3du7M4cQTx7NixSYeeaQbV111LGlpustZJBaFLUzTOcoTh8XrHgYu2V1bdx8FjAJIT0/XPX/yq4UL11GrVkVKly7FyJG9qF+/CvXrV4l3WCIJJcpfmZYBtfLs1wyP5aoAtAQ+NLOFwPFApiaMJRY7d+bw0EMf06zZMIYPnwbAySfXVxIQ2QuxPFm8t6YBjcysHkEC6Aucl/uiu68Hqufum9mHBLeoTo8wJkkCs2f/REZGJtOn/8jppzfhrLOaxzskkYQWWSJw951mNhCYAqQBY939azMbDEx398yozi3Ja/jwaVxzzWSqVCnHyy//gT59mmNm8Q5LJKHtNhFY8L/sfKC+uw8O1ys+xN0/39173X0S+cpRuPugXbQ9MaaIJSXlFolr2fIg+vZtySOPdKN69fLxDkskKcTSIxgO5BDcJTQY2Aj8AzgmwrhEANi8eTu33/4+pUuX4sEHu9KxYx06dqwT77BEkkosk8XHufuVhPf5u/taoGykUYkA7723gCOOGMGjj37Gtm3ZKhInEpFYegQ7wqeEHX5djyAn0qgkpa1bt5UbbniHMWNm0qhRVaZOvYQOHdQLEIlKLD2Cx4HXgIPM7F7gv8B9kUYlKe2nnzbx0ktz+Mtf2vPll/2VBEQiFksZ6ufN7AugC2DAGe7+TeSRSUrJ/eF/zTXH06RJdRYuvFaTwSLFJJa7hmoDW4A38x5z98VRBiapwd15/vmvuOaayWzatJ2ePRvRqFE1JQGRYhTLHMFEgvkBA8oB9YB5QIsI45IUsHjxevr3f4u3386ibduajBlzGo0aqUicSHGLZWjoiLz7YaG4AZFFJCkhKBI3jpUrN/P4490ZMOAYFYkTiZM9frLY3WeY2XG7bynyewsWrKVOnUqULl2K0aN706BBVerWrRzvsERSWixzBNfl2S0FtAF+jCwiSUo7d+YwZMjH3HnnhzzwwClcffVxdOlSP95hiQix9Qgq5NneSTBn8I9owpFkNGvWCjIyMpkxYzlnntmUPn1UJE6kJCk0EYQPklVw9xuKKR5JMk888Tl//vMUqlXbnwkT+qhSqEgJtMtEYGalwwqi7YszIEkOuUXiWrU6mPPPP4KHH+5G1ar7xzssESlAYT2CzwnmA2aZWSbwKrA590V3/2fEsUkC2rRpO7fd9h5lyqTx0EMqEieSCGK5X68csIag+mgvoHf4p8hvvPPO97RsOZyhQz9nxw4ViRNJFIX1CA4K7xiaw/8eKMul/+Hyq7Vrf+G6695h3LhZNGlSjalT/8gJJ9SOd1giEqPCEkEacCC/TQC5lAjkVytXbmbChLnccssJDBrUiXLlolwBVUSKWmH/Y5e7++Bii0QSyooVm3jxxa/485/bhkXirqFaNdUHEklEhc0RaCFY+R13Z/z4WTRvPoxbbnmP+fPXACgJiCSwwhJBl2KLQhLCwoXr6N79eS655A2aN6/BrFn9VSROJAnscmjI3X8uzkCkZNu5M4fOncezevUWhg3rSf/+6ZQqpU6jSDLQrJ4UKivrZ+rVq0zp0qUYO/Y06tevQp06leMdlogUIdX9lQLt2JHNfff9hxYthjNs2DQAOneupyQgkoTUI5DfmTFjORkZmcyatYI+fZpzzjlag0gkmSkRyG88/vhnXHfdFGrUOIB//vNszjyzWbxDEpGIKREI8L8icUcddQgXXXQkQ4Z0pUoVFYkTSQVKBClu48Zt3HLLe+y3XxpDhnSjQ4c6dOigInEiqUSTxSls8uQsWrYcwfDh03BHReJEUpR6BClozZotXHfdOzzzzJc0a1adjz76E23b1op3WCISJ0oEKWjNml947bVvuOOOjtx2Wwf220//DERSWaRDQ2bW3czmmVmWmd1cwOvXmdlcM5ttZu+ZmQanI7J8+UYeeuhj3J3GjauxaNG1DB7cWUlARKJLBOF6x8OAHkBz4Fwzy79g7Uwg3d1bAROAB6KKJ1W5O2PHzqRZs2HccccHZGUFlUN0R5CI5IqyR3AskOXuC9x9O/A
ScHreBu7+gbtvCXc/BWpGGE/K+eGHtXTt+hwZGZkceeQhfPmlisSJyO9FOS5wOLAkz/5S4LhC2mcAbxf0gpn1A/oB1K6tla9isXNnDied9Axr1mxhxIhT6dfvaBWJE5EClYgBYjO7AEgHOhX0uruPAkYBpKen6x7HQsyfv4b69atQunQpnn76dBo0qEKtWpXiHZaIlGBRDg0tA/Lek1gzPPYbZnYycBtwmrtvizCepLZjRzb33DOVli1H8MQTnwNw4ol1lQREZLei7BFMAxqZWT2CBNAXOC9vAzM7ChgJdHf3lRHGktSmT/+RjIxMZs/+ib59W3LuuUfEOyQRSSCRJQJ332lmA4EpQBow1t2/NrPBwHR3zwQeBA4EXjUzgMXuflpUMSWjxx77lOuue4dDDjmQN97oy2mnNYl3SCKSYCKdI3D3ScCkfMcG5dk+OcrzJ7PcInHp6YeRkXEUDzxwCpUrl4t3WCKSgErEZLHEbsOGbfzlL+9SrlxpHnmkO+3b16Z9e91JJSJ7T0XnEsikSfNp0WI4o0bNoHTpUioSJyJFQj2CBLB69RauvXYyzz//FS1a1GDChD4cd5yevRORoqFEkADWrv2FN9/8jjvv7MStt3agbNm0eIckIklEiaCEWrZsA88//xU33tiORo2CInGaDBaRKGiOoIRxd0aP/oLmzYdz110f8v33awGUBEQkMkoEJcj33/9Mly7P0K/fW7RpcyizZ19Bw4ZV4x2WiCQ5DQ2VEDt35tClyzP8/PMvjBzZi0svbaMicSJSLJQI4mzevNU0aFCV0qVLMX78GTRoUJWaNSvGOywRSSEaGoqT7duzufvuDzniiBEMGxYUievUqa6SgIgUO/UI4uDzz5eRkZHJnDkrOe+8Izj//FbxDklEUpgSQTF79NFPuf76dzj00AN5881z6dWrcbxDEpEUp0RQTHKLxB177OFcdlkb7r//ZCpV0i2hIhJ/SgQRW79+Kzfd9C7771+GRx/tTrt2tWjXrtbu3ygiUkw0WRyhN9+cR/Pmw3nqqZnst1+aisSJSImkHkEEVq3azDXXTObFF+dwxBEH8frr53DMMYfHOywRkQIpEURg/fptTJo0n7vvPpGbbz5BReJEpERTIigiS5as57nnZnPzzSfQsGFVFi26VpPBIpIQNEewj3JynCefnE6LFsO5557//FokTklARBKFEsE+mD9/DSedNJ4rrpjIsccezldfqUiciCQeDQ3tpZ07czjllGdZt24rY8acxh//2BozFYkTkcSjRLCHvvlmFY0aVaN06VI8++yZNGhQlcMOqxDvsERE9pqGhmK0bdtO7rzzA1q1epInngiKxHXoUEdJQEQSnnoEMfj006VkZGQyd+4qLrywFRdeqCJxIpI8lAh2Y8iQj7nxxnepWbMikyadR48ejeIdkohIkVIi2IWcHKdUKaNt21r075/O3/9+MhUr7hfvsEREipwSQT7r1m3l+uunUL58GYYO7akicSKS9DRZnMfrr39L8+bDGD/+SypU2E9F4kQkJahHAKxcuZmBAyfx6qtzad36EN566zzatDk03mGJiBQLJQJgw4ZtvPvuAu699yRuvLEdZcqoSJyIpI6UTQSLF6/n2We/5NZbO9CwYVUWL76WChU0GSwiqSfSOQIz625m88wsy8xuLuD1/czs5fD1z8ysbpTxQHA30PDh02jRYjj33fffX4vEKQmISKqKLBGYWRowDOgBNAfONbPm+ZplAGvdvSHwCHB/VPEAbFixmRNPHMeVV06ibduafP31ABWJE5GUF+XQ0LFAlrsvADCzl4DTgbl52pwO3BVuTwCeMDPzCG7XycnOYerjX1A2B55++nQuvvhIFYkTESHaRHA4sCTP/lLguF21cfedZrYeqAasztvIzPoB/QBq1669V8G0rFWZCjcdz70XteHQQ1UfSEQkV0JMFrv7KGAUQHp6+l71Fu7s3QJ6F2lYIiJJIcrJ4mVA3kdya4bHCmxjZqWBSsCaCGMSEZF8okwE04BGZlbPzMoCfYHMfG0ygYvD7T8A70cxPyAiIrsW2dBQOOY/EJgCpAFj3f1rMxsMTHf3TGAM8KyZZQE/EyQLEREpRpHOEbj7JGBSvmOD8mxvBfpEGYOIiBRORedERFKcEoGISIpTIhARSXFKBCIiKc4S7W5NM1sFLNrLt1cn31PLKUDXnBp0zalhX665jrvXKOiFhEsE+8LMprt7erzjKE665tSga04NUV2zhoZERFKcEoGISIpLtUQwKt4BxIGuOTXomlNDJNecUnMEIiLye6nWIxARkXyUCEREUlxSJgIz625m88wsy8xuLuD1/czs5fD1z8ysbhzCLFIxXPN1ZjbXzGab2XtmVicecRal3V1znnZnmZmbWcLfahjLNZvZ2eH3+msze6G4YyxqMfzbrm1mH5jZzPDfd894xFlUzGysma00szm7eN3M7PHw72O2mbXZ55O6e1J9EZS8/h6oD5QFvgSa52szAHgy3O4LvBzvuIvhmjsD5cPtK1LhmsN2FYCpwKdAerzjLobvcyNgJlAl3D8o3nEXwzWPAq4It5sDC+Md9z5ec0egDTBnF6/3BN4GDDge+Gxfz5mMPYJjgSx3X+Du24GXgNPztTkdGB9uTwC6WGKvZL/ba3b3D9x9S7j7KcGKcYkslu8zwF+B+4GtxRlcRGK55suAYe6+FsDdVxZzjEUtlmt2oGK4XQn4sRjjK3LuPpVgfZZdOR14xgOfApXN7NB9OWcyJoLDgSV59peGxwps4+47gfVAtWKJLhqxXHNeGQS/USSy3V5z2GWu5e4TizOwCMXyfW4MNDazj8zsUzPrXmzRRSOWa74LuMDMlhKsf3JV8YQWN3v6/323EmLxeik6ZnYBkA50incsUTKzUsDDwCVxDqW4lSYYHjqRoNc31cyOcPd18QwqYucC49x9iJm1JVj1sKW758Q7sESRjD2CZUCtPPs1w2MFtjGz0gTdyTXFEl00YrlmzOxk4DbgNHffVkyxRWV311wBaAl8aGYLCcZSMxN8wjiW7/NSINPdd7j7D8B3BIkhUcVyzRnAKwDu/glQjqA4W7KK6f/7nkjGRDANaGRm9cysLMFkcGa+NpnAxeH2H4D3PZyFSVC7vWYzOwoYSZAEEn3cGHZzze6+3t2ru3tdd69LMC9ymrtPj0+4RSKWf9uvE/QGMLPqBENFC4oxxqIWyzUvBroAmFkzgkSwqlijLF6ZwEXh3UPHA+vdffm+fGDSDQ25+04zGwhMIbjjYKy7f21mg4Hp7p4JjCHoPmYRTMr0jV/E+y7Ga34QOBB4NZwXX+zup8Ut6H0U4zUnlRiveQrQ1czmAtnAje6esL3dGK/5emC0mf2ZYOL4kkT+xc7MXiRI5tXDeY87gTIA7v4kwTxITyAL2AL8cZ/PmcB/XyIiUgSScWhIRET2gBKBiEiKUyIQEUlxSgQiIilOiUBEJMUpEUiJZGbZZjYrz1fdQtpuKoLzjTOzH8JzzQifUN3Tz3jKzJqH27fme+3jfY0x/Jzcv5c5ZvammVXeTfvWiV6NU6
Kn20elRDKzTe5+YFG3LeQzxgFvufsEM+sKPOTurfbh8/Y5pt19rpmNB75z93sLaX8JQdXVgUUdiyQP9QgkIZjZgeE6CjPM7Csz+12lUTM71Mym5vmNuUN4vKuZfRK+91Uz290P6KlAw/C914WfNcfMrg2PHWBmE83sy/D4OeHxD80s3cz+DuwfxvF8+Nqm8M+XzOzUPDGPM7M/mFmamT1oZtPCGvOXx/DX8glhsTEzOza8xplm9rGZNQmfxB0MnBPGck4Y+1gz+zxsW1DFVkk18a69rS99FfRF8FTsrPDrNYKn4CuGr1UneKoyt0e7KfzzeuC2cDuNoN5QdYIf7AeEx/8CDCrgfOOAP4TbfYDPgKOBr4ADCJ7K/ho4CjgLGJ3nvZXCPz8kXPMgN6Y8bXJjPBMYH26XJagiuT/QD7g9PL4fMB2oV0Ccm/Jc36tA93C/IlA63D4Z+Ee4fQnwRJ733wdcEG5XJqhFdEC8v9/6iu9X0pWYkKTxi7u3zt0xszLAfWbWEcgh+E34YGBFnvdMA8aGbV9391lm1olgsZKPwtIaZQl+ky7Ig2Z2O0GdmgyC+jWvufvmMIZ/Ah2AycAQM7ufYDjpP3twXW8Dj5nZfkB3YKq7/xIOR7Uysz+E7SoRFIv7Id/79zezWeH1fwO8m6f9eDNrRFBmocwuzt8VOM3Mbgj3ywG1w8+SFKVEIInifKAGcLS777Cgomi5vA3cfWqYKE4FxpnZw8Ba4F13PzeGc9zo7hNyd8ysS0GN3P07C9Y66AncY2bvufvgWC7C3bea2YdAN+AcgoVWIFht6ip3n7Kbj/jF3VubWXmC+jtXAo8TLMDzgbufGU6sf7iL9xtwlrvPiyVeSQ2aI5BEUQlYGSaBzsDv1ly2YB3mn9x9NPAUwXJ/nwLtzSx3zP8AM2sc4zn/A5xhZuXN7ACCYZ3/mNlhwBZ3f46gmF9Ba8buCHsmBXmZoFBYbu8Cgh/qV+S+x8wah+cskAerzV0NXG//K6WeW4r4kjxNNxIMkeWaAlxlYffIgqq0kuKUCCRRPA+km9lXwEXAtwW0ORH40sxmEvy2/Zi7ryL4wfiimc0mGBZqGssJ3X0GwdzB5wRzBk+5+0zgCODzcIjmTuCeAt4+CpidO1mczzsECwP9y4PlFyFIXHOBGRYsWj6S3fTYw1hmEyzM8gDwt/Da877vA6B57mQxQc+hTBjb1+G+pDjdPioikuLUIxARSXFKBCIiKU6JQEQkxSkRiIikOCUCEZEUp0QgIpLilAhERFLc/wPCamObnj3m/AAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZAAAAEWCAYAAABIVsEJAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAa+ElEQVR4nO3deZQldX338fcHRlmFAQVEBhwEogeMonYgHB7jAgJGcVDJg5joaDTE59E8BqMR4gJBkihu0aNZkBAmigHFhTFGyYACmqjQA7igIiOIzIg6LCIDssn3+aOq4dLe6blT3bcX+v06p07X8rtV39/0OfPpql/dqlQVkiRtrE1mugBJ0txkgEiSOjFAJEmdGCCSpE4MEElSJwaIJKkTA0SS1IkBIvVI8qMkv0pyW5JfJPmfJK9JsklPmzOS3J1kXc/0zSRP71m+PUmNa7PbTPZNmmoLZroAaRY6vKrOT7It8AzgA8D+wCt72pxSVW/t89mtAZIsBq4FFlbVvcMsNkmAVNV9wzyONJ5nINJ6VNWtVbUcOApYmuSJU32MJLsm+XSStUluSvKhdv2JST7W025xe0azoF2+MMnfJPlv4A7gTUlGx+372CTL2/nNkrwnyY+T/CzJPyXZYqr7o/nFAJE2oKouAVYDT5/K/SbZFPgP4DpgMbALcNZG7OJlwDHAI4B/Ah6fZK+e7S8FPt7OvxP4LWBfYM/2WG/vXr1kgEiD+gmwfc/yG9sxkrFpWYd97gc8BnhTVd1eVXdW1Vc34vNnVNWVVXVvVd0KnAscDdAGyROA5e0lrmOAY6vq5qq6Dfhb4CUdapbuZ4BIg9kFuLln+T1VtbBnWtphn7sC101ijOT6ccsfpw0QmrOPz1bVHcAOwJbAyrHAA77Yrpc6cxBd2oAkv0MTIBtzdjCI64HdkizoEyK30/ynP+bRfT4//lHaK4AdkuxLEyTHtutvBH4F7FNVayZdtdTyDERajyTbJHk+zbjEx6rq21N8iEuAG4B3JtkqyeZJDmy3XQH8XpLd2rvBjt/QzqrqHuCTwLtpLretaNffB3wEeH+SHQGS7JLk0Cnuj+YZA0T6TZ9LchvNGcJbgPfx4Ft4Af5y3Hc8btzYg1TVr4HDaQa1f0wzUH9Uu20FcDbwLWAlzWD7ID4OHAx8ctxZzZuBVcDXk/wSOB94/MbWLPWKL5SSJHXhGYgkqRMDRJLUiQEiSerEAJEkdTKvvgfyqEc9qhYvXjzTZUjSnLJy5cobq+o3vng6rwJk8eLFjI6ObrihJOl+Sa7rt95LWJKkTgwQSVInBogkqRMDRJLUiQEiSerEAJEkdWKASJI6MUAkSZ0YIJKkTgwQSVInBogkqRMDRJLUiQEiSerEAJEkdWKASJI6MUAkSZ0YIJKkTgwQSVInBogkqRMDRJLUiQEiSerEAJEkdWKASJI6MUAkSZ0YIJKkTmY0QJIcluSqJKuSHNdn+2ZJzm63fyPJ4nHbd0uyLskbp61oSRIwgwGSZFPgw8Bzgb2Bo5PsPa7Zq4BbqmpP4P3Au8Ztfx/whWHXKkn6TTN5BrIfsKqqrqmqu4GzgCXj2iwBlrXz5wAHJQlAkiOAa4Erp6dcSVKvmQyQXYDre5ZXt+v6tqmqe4FbgUcm2Rp4M/DXGzpIkmOSjCYZXbt27ZQULkmau4PoJwLvr6p1G2pYVadW1UhVjeywww7Dr0yS5okFM3jsNcCuPcuL2nX92qxOsgDYFrgJ2B84MskpwELgviR3VtWHhl61JAmY2QC5FNgrye40QfES4KXj2iwHlgJfA44EvlRVBTx9rEGSE4F1hockTa8ZC5CqujfJ64DzgE2B06vqyiQnAaNVtRz4F+CjSVYBN9OEjCRpFkjzB/38MDIyUqOjozNdhiTNKUlWVtXI+PVzdRBdkjTDDBBJUicGiCSpEwNEktSJASJJ6sQAkSR1YoBIkjoxQCRJnRggkqRODBBJUicGiCSpEwNEktSJASJJ6sQAkSR1YoBIkjoxQCRJnRggkqRODBBJUicGiCSpEwNEktSJASJJ6sQAkSR1YoBIkjoxQCRJnRggkqRONhggafxRkre3y7sl2W/4pUmSZrNBzkD+ATgAOLpdvg348NAqkiTNCQsGaLN/VT01yeUAVXVLkocPuS5J0iw3yBnIPUk2BQogyQ7AfUOtSpI06w0SIB8EPgPsmORvgK8CfzfUqiRJs94GA6SqzgT+kiY0bgCOqKpPTMXBkxyW5Kokq5Ic12f7ZknObrd/I8nidv1zkqxM8u3257Onoh5J0uA2OAaS5KNV9TLg+33WddZeFvsw8BxgNXBpkuVV9d2eZq8CbqmqPZO8BHgXcBRwI3B4Vf0kyROB84BdJlOPJGnjDHIJa5/ehfY//qdNwbH3A1ZV1TVVdTdwFrBkXJslwLJ2/hzgoCSpqsur6ift+iuBLZJsNgU1SZIGtN4ASXJ8ktuAJyX5ZZLb2uWfA+dOwbF3Aa7vWV7Nb55F3N+mqu4FbgUeOa7Ni4HLququKahJkjSg9QZIVf1dVT0CeHdVbVNVj2inR1bV8dNY43ol2YfmstafTtDmmCSjSUbXrl07fcVJ0kPcBsdAqur4JNsBewGb96y/eJLHXgPs2rO8qF3Xr83qJAuAbYGbAJIsork77OVV9cMJ6j8VOBVgZGSkJlmzJKk1yCD6q4HX0/wHfwXwu8DXgMne+XQpsFeS3WmC4iXAS8e1WQ4sbY93JPClqqokC4HPA8dV1X9Psg5JUgeDDKK/Hvgd4LqqehbwFOAXkz1wO6bxOpo7qL4HfKKqrkxyUpIXtM3+BXhkklXAG4CxW31fB+wJvD3JFe2042RrkiQNbpBHmdxZVXcmIclmVfX9JI+fioNX1X8C/zlu3dt75u8E/qDP504GTp6KGiRJ3QwSIKvbS0afBVYkuQW4bphFSZJmv0EG0V/Yzp6Y5Ms0A9lfGGpVkqRZb6NeKFVVFwF3Mu6ykyRp/pnoi4TPTvKDJOuSfCzJbycZpXkm1j9OX4mSpNloojOQ9wLH0Hzz+xyaW2nPqKqnVdWnp6M4SdLsNdEYSFXVhe38Z5OsqaoPTUNNkqQ5YKIAWZjkRb1te5c9C5Gk+W2iALkIOLxn+eKe5QIMEEmax9YbIFX1yuksRJI0t2zUbbySJI0xQCRJnRggkqRONhggSbZM8rYkH2mX90ry/OGXJkmazQY5A/lX4C7ggHZ5DT4JV5LmvUECZI+qOgW4B6Cq7gAy1KokSbPeIAFyd5ItaL77QZI9aM5IJEnz2CDvAzkR+CKwa5IzgQOBVwyxJknSHDDI+0D+K8lKmnehB3h9Vd049MokSbPaBgMkyeeAjwPLq+r24ZckSZoLBhkDeQ/wdOC7Sc5JcmSSzYdclyRplhvkEtZFwEVJNgWeDfwJcDqwzZBrkyTNYoMMotPehXU4
cBTwVGDZMIuSJM1+g4yBfALYj+ZOrA8BF1XVfcMuTJI0uw1yBvIvwNFV9ethFyNJmjvWGyBJnl1VXwK2ApYkD/7yuW8klKT5baIzkGcAX+LBbyUc4xsJJWmem+iNhCe0sydV1bW925LsPtSqJEmz3iDfA/lUn3XnTHUhkqS5ZaIxkCcA+wDbJnlRz6ZtAL9IKEnz3ERjII8Hng8s5MHjILfRfJlQkjSPTTQGci5wbpIDqupr01iTJGkOGGQM5PIkr03yD0lOH5um4uBJDktyVZJVSY7rs32zJGe327+RZHHPtuPb9VclOXQq6pEkDW6QAPko8GjgUOAiYBHNZaxJaZ+t9WHgucDewNFJ9h7X7FXALVW1J/B+4F3tZ/cGXkIzRnMY8A/t/iRJ02SQANmzqt4G3F5Vy4DnAftPwbH3A1ZV1TVVdTdwFrBkXJslPPDcrXOAg9J8o3EJcFZV3dXeYryq3Z8kaZoMEiD3tD9/keSJwLbAjlNw7F2A63uWV7fr+rapqnuBW4FHDvhZAJIck2Q0yejatWunoGxJEgwWIKcm2Q54G7Ac+C5wylCrmkJVdWpVjVTVyA477DDT5UjSQ8Yg7wM5rZ29CHjcFB57DbBrz/Kidl2/NquTLKA5+7lpwM9KkoZokMe5v6HP6luBlVV1xSSOfSmwV/tYlDU0g+IvHddmObAU+BpwJPClqqoky4GPJ3kf8BhgL+CSSdQiSdpIgzzOfaSdPtcuPx/4FvCaJJ+sqk6Xs6rq3iSvA84DNgVOr6ork5wEjFbVcppHyX80ySrgZpqQoW33CZrLafcCr/Vx85I0vVJVEzdILgZ+v6rWtctbA5+nuX12ZVWNv/V21hoZGanR0dGZLkOS5pQkK6tqZPz6QQbRdwTu6lm+B9ipqn41br0kaR4Z5BLWmcA3kpzbLh9OM/6wFc0lJEnSPDTIXVjvSPIF4MB21Wuqauw60B8OrTJJ0qw2yCUsaB7f/suq+gBwnS+UkiRtMECSnAC8GTi+XfUw4GPDLEqSNPsNcgbyQuAFwO0AVfUT4BHDLEqSNPsNEiB3V3OvbwG0g+eSpHlukAD5RJJ/BhYm+RPgfOAjwy1LkjTbDXIX1nuSPAf4Jc1rbt9eVSuGXpkkaVYb5HsgtIFhaEiS7rfeAElyLe24Rx9VVXsMpyRJ0lww0RnI+OeebAL8b+CNwOVDq0iSNCesN0Cq6iaAJJsALwPeBFwBPK+qfISJJM1zE13Cehjwx8CxwFeBI6pq1XQVJkma3Sa6hHUtzbs2/h74MfCkJE8a21hVnx5uaZKk2WyiADmfZhD9ye3UqwADRJLmsYnGQF4xjXVIkuaYQZ/GK0nSgxggkqRODBBJUieDvA9kyyRvS/KRdnmvJM8ffmmSpNlskDOQfwXuAg5ol9cAJw+tIknSnDBIgOxRVacA9wBU1R1AhlqVJGnWG+iFUkm24IEXSu1Bc0YiSZrHBnmc+wnAF4Fdk5wJHAi8YphFSZJmv0FeKLUiyWXA79Jcunp9Vd049MokSbPaIHdhHQjcWVWfBxYCf5XkscMuTJI0uw0yBvKPwB1Jngy8Afgh8G9DrUqSNOsNEiD3VlUBS4APV9WHgUcMtyxJ0mw3yCD6bUmOB/4I+L32BVMPG25ZkqTZbpAzkKNobtt9VVX9FFgEvHsyB02yfZIVSa5uf263nnZL2zZXJ1nartsyyeeTfD/JlUneOZlaJEndbDBAquqnVfW+qvpKu/zjqprsGMhxwAVVtRdwQbv8IEm2p7mFeH9gP+CEnqB5T1U9AXgKcGCS506yHknSRlpvgCS5Lckv+0y3JfnlJI+7BFjWzi8DjujT5lBgRVXdXFW3ACuAw6rqjqr6MkBV3Q1cRnNWJEmaRhO9UGqYA+U7VdUN7fxPgZ36tNkFuL5neXW77n5JFgKHAx8YQo2SpAkMMogOQJIdgc3Hlqvqxxtofz7w6D6b3tK7UFWVpAato2f/C4B/Bz5YVddM0O4Y4BiA3XbbbWMPI0lajw0GSJIXAO8FHgP8HHgs8D1gn4k+V1UHT7DPnyXZuapuSLJzu9/x1gDP7FleBFzYs3wqcHVV/f0G6ji1bcvIyMhGB5Ukqb9B7sJ6B81jTH5QVbsDBwFfn+RxlwNL2/mlwLl92pwHHJJku3bw/JB2HUlOBrYF/nySdUiSOhokQO6pqpuATZJs0g5gj0zyuO8EnpPkauDgdpkkI0lOA6iqm2nC69J2Oqmqbk6yiOYy2N7AZUmuSPLqSdYjSdpIg4yB/CLJ1sDFwJlJfg7cPpmDtoF0UJ/1o8Cre5ZPB04f12Y1vo9EkmbcRLfxjo04LwHuAI6leaz7D2nufJIkzWMTnYF8FnhqVd2e5FNV9WIe+O6GJGmem2gMpPcy0eOGXYgkaW6ZKEBqPfOSJE14CevJ7SNLAmzR8/iS0Hz/b5uhVydJmrUmepTJptNZiCRpbhnkeyCSJP0GA0SS1IkBIknqxACRJHVigEiSOjFAJEmdGCCSpE4MEElSJwaIJKkTA0SS1IkBIknqxACRJHVigEiSOjFAJEmdGCCSpE4MEElSJwaIJKkTA0SS1IkBIknqxACRJHVigEiSOjFAJEmdGCCSpE4MEElSJwaIJKmTGQmQJNsnWZHk6vbndutpt7Rtc3WSpX22L0/yneFXLEkab6bOQI4DLqiqvYAL2uUHSbI9cAKwP7AfcEJv0CR5EbBuesqVJI03UwGyBFjWzi8DjujT5lBgRVXdXFW3ACuAwwCSbA28ATh5+KVKkvqZqQDZqapuaOd/CuzUp80uwPU9y6vbdQDvAN4L3LGhAyU5JsloktG1a9dOomRJUq8Fw9pxkvOBR/fZ9JbehaqqJLUR+90X2KOqjk2yeEPtq+pU4FSAkZGRgY8jSZrY0AKkqg5e37YkP0uyc1XdkGRn4Od9mq0BntmzvAi4EDgAGEnyI5r6d0xyYVU9E0nStJmpS1jLgbG7qpYC5/Zpcx5wSJLt2sHzQ4Dzquofq+oxVbUY+F/ADwwPSZp+MxUg7wSek+Rq4OB2mSQjSU4DqKqbacY6Lm2nk9p1kqRZIFXzZ1hgZGSkRkdHZ7oMSZpTkqysqpHx6/0muiSpEwNEktSJASJJ6sQAkSR1YoBIkjoxQCRJnRggkqRODBBJUicGiCSpEwNEktSJASJJ6sQAkSR1YoBIkjoxQCRJnRggkqRODBBJUicGiCSpEwNEktSJASJJ6sQAkSR1YoBIkjoxQCRJnRggkqRODBBJUiepqpmuYdokWQtcN9N1bKRHATfOdBHTzD7PD/Z57nhsVe0wfuW8CpC5KMloVY3MdB3TyT7PD/Z57vMSliSpEwNEktSJATL7nTrTBcwA+zw/2Oc5zjEQSVInnoFIkjoxQCRJnRggs0CS7ZOsSHJ1+3O79bRb2ra5OsnSPtuXJ/nO8CuevMn0OcmWST6f5PtJrkzyzumtfuMkOSzJVUlWJTmuz/bNkpzdbv9GksU9245v11+V5NBpLXwSuvY5yXOSrEzy7fbns6e9+A4m8ztut++WZF2SN05b0VOhqpxmeAJOAY5r548
D3tWnzfbANe3P7dr57Xq2vwj4OPCdme7PsPsMbAk8q23zcOArwHNnuk/r6eemwA+Bx7W1fhPYe1yb/wv8Uzv/EuDsdn7vtv1mwO7tfjad6T4Nuc9PAR7Tzj8RWDPT/Rlmf3u2nwN8EnjjTPdnYybPQGaHJcCydn4ZcESfNocCK6rq5qq6BVgBHAaQZGvgDcDJwy91ynTuc1XdUVVfBqiqu4HLgEXDL7mT/YBVVXVNW+tZNH3v1ftvcQ5wUJK068+qqruq6lpgVbu/2a5zn6vq8qr6Sbv+SmCLJJtNS9XdTeZ3TJIjgGtp+junGCCzw05VdUM7/1Ngpz5tdgGu71le3a4DeAfwXuCOoVU49SbbZwCSLAQOBy4YQo1TYYN96G1TVfcCtwKPHPCzs9Fk+tzrxcBlVXXXkOqcKp372/7x92bgr6ehzim3YKYLmC+SnA88us+mt/QuVFUlGfje6iT7AntU1bHjr6vOtGH1uWf/C4B/Bz5YVdd0q1KzUZJ9gHcBh8x0LUN2IvD+qlrXnpDMKQbINKmqg9e3LcnPkuxcVTck2Rn4eZ9ma4Bn9iwvAi4EDgBGkvyI5ve5Y5ILq+qZzLAh9nnMqcDVVfX3k692aNYAu/YsL2rX9Wuzug3FbYGbBvzsbDSZPpNkEfAZ4OVV9cPhlztpk+nv/sCRSU4BFgL3Jbmzqj409KqnwkwPwjgVwLt58IDyKX3abE9znXS7droW2H5cm8XMnUH0SfWZZrznU8AmM92XDfRzAc3g/+48MMC6z7g2r+XBA6yfaOf34cGD6NcwNwbRJ9PnhW37F810P6ajv+PanMgcG0Sf8QKcCpprvxcAVwPn9/wnOQKc1tPuj2kGUlcBr+yzn7kUIJ37TPMXXgHfA65op1fPdJ8m6OvvAz+guVPnLe26k4AXtPOb09yBswq4BHhcz2ff0n7uKmbpnWZT2WfgrcDtPb/XK4AdZ7o/w/wd9+xjzgWIjzKRJHXiXViSpE4MEElSJwaIJKkTA0SS1IkBIknqxADRQ06SXye5omdaPEHbdVNwvDOSXNse67IkB3TYx2lJ9m7n/2rctv+ZbI3tfsb+Xb6T5HPtY2Amar9vkt+fimProcnbePWQk2RdVW091W0n2McZwH9U1TlJDgHeU1VPmsT+Jl3ThvabZBnwg6r6mwnavwIYqarXTXUtemjwDEQPeUm2TnJBe3bw7STjn5RKkp2TXNzzF/rT2/WHJPla+9lPtg+/m8jFwJ7tZ9/Q7us7Sf68XbdV+y6Tb7brj2rXX5hkJM27TbZo6ziz3bau/XlWkuf11HxGkiOTbJrk3UkuTfKtJH86wD/L12gf+Jdkv7aPlyf5nySPT/Jwmi/CHdXWclRb++lJLmnb/sa/o+aZmf4mo5PTVE/Ar3ngW8yfoXnUxDbttkfRfBt47Ox7XfvzL3jgG8SbAo9o214MbNWufzPw9j7HOwM4sp3/A+AbwNOAbwNbAVvTPKr7KTRPmP1Iz2e3bX9eSPPX/v019bQZq/GFwLJ2/uE0T3fdAjgGeGu7fjNgFNi9T53revr3SZpH4wNsAyxo5w8GPtXOvwL4UM/n/xb4o3Z+Ic03r7ea6d+308xNPkxRD0W/qqp9xxaSPAz42yS/B9xH85f3TjSPkR9zKXB62/azVXVFkmfQvNTpv9snpT6c5i/3ft6d5K3AWuBVwEHAZ6rq9raGTwNPB74IvDfJu2gue31lI/r1BeAD7fsxDgMurqpftZfNnpTkyLbdtsBeNM8O67VFkiva/n+P5v0qY+2XJdmL5hExD1vP8Q8BXpAH3pq3ObBbuy/NQwaI5oM/BHYAnlZV97RPLt68t0FVXdwGzPOAM5K8D7iF5oVWRw9wjDdV1TljC0kO6teoqn6Q5Kk0z046OckFVXXSIJ2oqjuTXEjzoq2jaF5cBBDgz6rqvA3s4ldVtW+SLYHzaB7w90Ga98l8uape2N5wcOF6Ph/gxVV11SD16qHPMRDNB9sCP2/D41nAY8c3SPJY4GdV9RHgNOCpwNeBA5OMjWlsleS3BjzmV4Aj0ry/fSuay09fSfIY4I6q+hjNE4mf2uez97RnQv2cDbySB85moAmD/zP2mSS/1R6zr6q6A/h/wF/0PFp87PHjr+hpehvNpbwx5wF/ltz/Jr2nrO8Ymh8MEM0HZ9K8M+XbwMuB7/dp80zgm0kup/nr/gNVtZbmP9R/T/ItmstXTxjkgFV1Gc3YyCU0YyKnVdXlwG8Dl7SXkk6g/2uITwW+NTaIPs5/Ac8Azq/m9anQBN53gcuSfAf4ZzZwdaGt5VvA0TTvp/+7tu+9n/sysPfYIDrNmcrD2tqubJc1j3kbrySpE89AJEmdGCCSpE4MEElSJwaIJKkTA0SS1IkBIknqxACRJHXy/wGm/no+C+7INgAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Equal Error Rate (EER): 0.0\n", - " precision recall f1-score support\n", - "\n", - " 0 1.00 1.00 1.00 1\n", - " 1 1.00 1.00 1.00 2\n", - "\n", - " accuracy 1.00 3\n", - " macro avg 1.00 1.00 1.00 3\n", - "weighted avg 1.00 1.00 1.00 3\n", - "\n" - ] - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAMwAAADQCAYAAABLNo4SAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAQ6UlEQVR4nO3dfXBV9Z3H8ffXpFCsgZvyoJNEF0yUmFhW5Wl9qtXBByRAV0FRdGVxS7c+tt2ttdKCVlu06NRa3fWhOjqigFg7kaggpVM7YDFQuoo81SihJqnyICG02CDpd/+4B7wkEO6PeHPvJZ/XTCb3/M7vnN/33JlPzjm555xr7o6IJOeIdBcgkk0UGJEACoxIAAVGJIACIxJAgREJkJvuAhJZbg+3bnnpLiPrnXrScekuIett3FjLli1brHV7ZgWmWx7dB16W7jKy3tI3Hkx3CVnvzOFD9tuuQzKRAAqMSAAFRiSAAiMSQIERCaDAiARQYEQCKDAiARQYkQAKjEgABUYkgAIjEkCBEQmgwIgEUGBEAigwIgEUGJEACoxIAAVGJIACIxJAgREJoMCIBFBgRAIoMCIBFBiRAAqMSAAFRiSAAiMSQIERCaDAiARQYEQCKDAiARQYkQAKjEgABQZ4ePpENi6ewYp5tx2wz323jOPtyulUz/0ep5QW7W2fOHo4qyqnsapyGhNHD++McjPaqwsXMKh8IOWlJcz8yd1t5jc3N3PVlZdTXlrC2WcMZ2Nt7d55M++ZQXlpCYPKB7Lo1YWdWHXyUhoYM7vIzNabWY2Z3ZrKsTri6fnLGHv9Qwecf+FZZRQf15eTx97BDXfN5oHbJgCQ3/NIpk4ZyZevvpezr5rJ1CkjieX16KyyM05LSwvfvOl6Kue/wh/fWsO8ObNZu2bNPn2efOJx8mP5rF5Xw403f4upt30XgLVr1jBv7hxWvrmaF6sWcPON19HS0pKOzWhXygJjZjnAQ8BIoAy4wszKUjVeRyxd+S4fbd95wPkV5wzi2apqAKpX1dIrrwfH9OnJ+WecxOJl69jWtJPGHR+zeNk6LjgzIzexUyyvrqa4uIQBxx9Pt27dGH/5BKrmV+7Tp2p+JROvvgaASy4dx29/sxh3p2p+JeMvn0D37t3pP2AAxcUlLK+uTsdmtCuVe5hhQI27v+fuu4A5wNgUjpcyBf1i1H2wbe90/YeNFPSLUdA3Rt2HCe2bGinoG0tDhZmhoaGeoqJj904XFhZRX1/fts+x8T65ubn07NWLrVu3Ul/fdtmGhn2XzQSpDEwh8H7CdF3Utg8zm2JmK8xshe/+OIXliHRc2k/63f1Rdx/i7kMsNzOP/xs2NVJ0TP7e6cKjYzRsaqRhcyNFRye094vRsLkxDRVmhoKCQurqPv0bWV9fR2FhYds+78f77N69m6bt2+nduzeFhW2XLSho8/c17VIZmHrg2ITpoqgt67z02iqurBgGwLAv9afprx/zwZYmFr2+lhGnlxLL60EsrwcjTi9l0etr01xt+gwZOpSamneo3bCBXbt2MW/uHEZVjNmnz6iKMTzz9FMAvPDL5znn3PMwM0ZVjGHe3Dk0NzdTu2EDNTXvMHTYsHRsRrtyU7ju5cAJZjaAeFAmAFemcLxD9tSMSZw9+AT6xI6iZsGd3Pnwy3wuNweAXzy/hAVLVnPhWeWsfnE6O//+CV+/fRYA25p2MuOxBSyZdQsAP350AduaDvzPg8Ndbm4uP/3Zg4wedSEtLS1cM2kyZeXl/PD2aZw2eAgVo8cwafK1TJ50NeWlJeTnf5Gnn5kDQFl5OZeOv4xTB5WRm5vL/Q88RE5OTpq3qC1z99St3Oxi4H4gB3jC3X/UXv8jjuzn3QdelrJ6uoptyx9MdwlZ78zhQ/jDH1ZY6/ZU7mFw95eBl1M5hkhnSvtJv0g2UWBEAigwIgEUGJEACoxIAAVGJIACIxJAgREJoMCIBFBgRAIoMCIBFBiRAAqMSAAFRiTAAS/vN7MdwJ6bZfbcF+DRa3f3nimuTSTjHDAw7p7XmYWIZIOkDsnM7Cwz+/fodZ/otmORLueggTGz6cB3ge9FTd2AWaksSiRTJbOH+VdgDPA3AHdvAHS4Jl1SMoHZ5fEnZTiAmX0htSWJZK5kAvOcmT0CxMzsa8CvgcdSW5ZIZjroU2Pc/V4zOx9oAk4Eprn7opRXJpKBkn3M0iqgB/HDslWpK0cksyXzX7L/AKqBS4BxwDIzm5zqwkQyUTJ7mO8Ap7r7VgAz6w28DjyRysJEMlEyJ/1bgR0J0zuiNpEup71ryb4dvawB3jCzSuLnMGOBtzqhNpGM094h2Z4PJ9+Nfvao3E9fkS6hvYsv7+jMQkSywUFP+s2sL3ALUA58fk+7u5+XwrpEMlIyJ/3PAOuAAcAdQC3xL0sS6XKSCUxvd38c+MTdX3P3yYD2LtIlJfM5zCfR77+Y2SigAfhi6koSyVzJBOYuM+sF/Bfwc6An8K2UViWSoZK5+LIqerkdODe15YhktvY+uPw5nz4Eow13v+mzLubUk45j6Rv6QtOOyh96Q7pLyHrN6/+83/b29jArUlOKSPZq74PLpzqzEJFsoAf5iQRQYEQCKDAiAZK54/JEM1tsZm9H04PM7PupL00k8ySzh3mM+EP8PgFw97eACaksSiRTJROYI929ulXb7lQUI5LpkgnMFjMr5tMH+Y0D/pLSqkQyVDLXkl0PPAqUmlk9sAG4KqVViWSoZK4lew8YET0i9gh333GwZUQOV8nccTmt1TQA7v7DFNUkkrGSOST7W8LrzwMVwNrUlCOS2ZI5JLsvcdrM7gUWpqwikQx2KJ/0HwkUfdaFiGSDZM5hVvHpfTE5QF9A5y/SJSVzDlOR8Ho38KG764NL6ZLaDYyZ5QAL3b20k+oRyWjtnsO4ewuw3syO66R6RDJaModk+cBqM6sm4V/M7j4mZVWJZKhkAvODlFchkiWSCczF7v7dxAYzuwd4LTUliWSuZD6HOX8/bSM/60JEskF7zyX7BnAdcLyZJX6BUh6wNNWFiWSi9g7JngVeAWYAtya073D3j1JalUiGau+5ZNuJPx72is4rRySz6akxIgEUGJEACoxIAAVGJIACIx
JAgREJoMCIBFBgRAIoMCIBFBiRAApM5NWFCxhUPpDy0hJm/uTuNvObm5u56srLKS8t4ewzhrOxtnbvvJn3zKC8tIRB5QNZ9GrXfQLVw9MnsnHxDFbMu+2Afe67ZRxvV06neu73OKX004cPTRw9nFWV01hVOY2Jo4d3RrmHJGWBMbMnzGzTnu+VyWQtLS1886brqZz/Cn98aw3z5sxm7Zo1+/R58onHyY/ls3pdDTfe/C2m3ha/RWjtmjXMmzuHlW+u5sWqBdx843W0tLSkYzPS7un5yxh7/UMHnH/hWWUUH9eXk8fewQ13zeaB2+LfmpLf80imThnJl6++l7OvmsnUKSOJ5fXorLKDpHIP8yRwUQrX/5lZXl1NcXEJA44/nm7dujH+8glUza/cp0/V/EomXn0NAJdcOo7f/mYx7k7V/ErGXz6B7t2703/AAIqLS1he3frbQbqGpSvf5aPtOw84v+KcQTxbFX9vqlfV0iuvB8f06cn5Z5zE4mXr2Na0k8YdH7N42TouOLOss8oOkrLAuPvvgKy4DaChoZ6iomP3ThcWFlFfX9+2z7HxPrm5ufTs1YutW7dSX9922YaGfZeVuIJ+Meo+2LZ3uv7DRgr6xSjoG6Puw4T2TY0U9I2locKDS/s5jJlNMbMVZrZi85bN6S5HpF1pD4y7P+ruQ9x9SN8+fdNSQ0FBIXV17++drq+vo7CwsG2f9+N9du/eTdP27fTu3ZvCwrbLFhTsu6zENWxqpOiY/L3ThUfHaNjUSMPmRoqOTmjvF6Nhc2MaKjy4tAcmEwwZOpSamneo3bCBXbt2MW/uHEZV7PsUqVEVY3jm6acAeOGXz3POuedhZoyqGMO8uXNobm6mdsMGamreYeiwYenYjIz30muruLIi/t4M+1J/mv76MR9saWLR62sZcXopsbwexPJ6MOL0Uha9nplfEJHMU2MOe7m5ufz0Zw8yetSFtLS0cM2kyZSVl/PD26dx2uAhVIwew6TJ1zJ50tWUl5aQn/9Fnn5mDgBl5eVcOv4yTh1URm5uLvc/8BA5OTlp3qL0eGrGJM4efAJ9YkdRs+BO7nz4ZT6XG38vfvH8EhYsWc2FZ5Wz+sXp7Pz7J3z99lkAbGvayYzHFrBk1i0A/PjRBWxrOvA/D9LJ3P3gvQ5lxWazga8AfYAPgenu/nh7ywwePMSXvrEiJfV0JflDb0h3CVmvef1z/GPnJmvdnrI9jLvrWQBy2NE5jEgABUYkgAIjEkCBEQmgwIgEUGBEAigwIgEUGJEACoxIAAVGJIACIxJAgREJoMCIBFBgRAIoMCIBFBiRAAqMSAAFRiSAAiMSQIERCaDAiARQYEQCKDAiARQYkQAKjEgABUYkgAIjEkCBEQmgwIgEUGBEAigwIgEUGJEACoxIAAVGJIACIxIgZV8KeyjMbDOwMd11tKMPsCXdRRwGsuF9/Cd379u6MaMCk+nMbIW7D0l3Hdkum99HHZKJBFBgRAIoMGEeTXcBh4msfR91DiMSQHsYkQAKTJLM7CIzW29mNWZ2a7rryUZm9oSZbTKzt9Ndy6FSYJJgZjnAQ8BIoAy4wszK0ltVVnoSuCjdRXSEApOcYUCNu7/n7ruAOcDYNNeUddz9d8BH6a6jIxSY5BQC7ydM10Vt0sUoMCIBFJjk1APHJkwXRW3SxSgwyVkOnGBmA8ysGzABeDHNNUkaKDBJcPfdwA3AQmAt8Jy7r05vVdnHzGYDvwcGmlmdmV2b7ppC6ZN+kQDaw4gEUGBEAigwIgEUGJEACoxIAAUmg5jZV8ysKno9pr2ros0sZmbXHcIYt5vZfyfb3qrPk2Y2LmCs/tl8ZfL+KDCdILraOYi7v+jud7fTJQYEB0Y6RoHpgOgv6Doze8bM1prZ82Z2ZDSv1szuMbOVwHgzu8DMfm9mK81snpkdFfW7KFrHSuCShHVPMrMHo9dHm9mvzOzN6OcM4G6g2Mz+z8xmRv2+Y2bLzewtM7sjYV1TzexPZrYEGJjEdn0tWs+bZvbLPdsUGWFmK6L1VUT9c8xsZsLYX+/oe5upFJiOGwj8j7ufBDSx71/9re5+GvBr4PvAiGh6BfBtM/s88BgwGhgMHHOAMR4AXnP3fwZOA1YDtwLvuvsp7v4dM7sAOIH4rQinAIPN7MtmNpj4pTynABcDQ5PYphfcfWg03log8RP5/tEYo4CHo224Ftju7kOj9X/NzAYkMU7WyU13AYeB9919afR6FnATcG80PTf6/S/EbzxbamYA3YhfIlIKbHD3dwDMbBYwZT9jnAf8G4C7twDbzSy/VZ8Lop8/RtNHEQ9QHvArd98ZjZHMNXAnm9ldxA/7jiJ+SdAez7n7P4B3zOy9aBsuAAYlnN/0isb+UxJjZRUFpuNaX1uUOP236LcBi9z9isSOZnbKZ1iHATPc/ZFWY3zzENb1JPBVd3/TzCYBX0mYt7/tNeBGd08MFmbW/xDGzmg6JOu448zs9Oj1lcCS/fRZBpxpZiUAZvYFMzsRWAf0N7PiqN8V+1kWYDHwjWjZHDPrBewgvvfYYyEwOeHcqNDM+gG/A75qZj3MLI/44d/B5AF/MbPPARNbzRtvZkdENR8PrI/G/kbUHzM70cy+kMQ4WUeB6bj1wPVmthbIB/63dQd33wxMAmab2VtEh2Pu/nfih2AvRSf9mw4wxs3AuWa2CvgDUObuW4kf4r1tZjPd/VXgWeD3Ub/ngTx3X0n80PBN4BXityoczA+AN4ClxEOd6M9AdbSu/4y24RfAGmBl9G/kRzhMj150tXIHRIccVe5+crprkc6hPYxIAO1hRAJoDyMSQIERCaDAiARQYEQCKDAiARQYkQD/D6jSjysQzBEwAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "evaluate_spk_verification(y, y_scores)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/evaluate.py b/evaluate.py index 2d98fe2..04a47d3 100644 --- a/evaluate.py +++ b/evaluate.py @@ -1,40 +1,32 @@ -import argparse -from pathlib import Path -import numpy as np +import os +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +import argparse import tensorflow as tf -from tensorflow.keras import Model -from tensorflow.keras.layers import Input -from tensorflow.keras.optimizers import Adam - -from train_evaluate import create_classifier -from sslforslr.utils.helpers import load_config, load_dataset, load_model - -def load(config_path): - config, checkpoint_dir, eval_checkpoint_dir = load_config(config_path) - gens, input_shape, nb_categories = load_dataset(config, - eval_checkpoint_dir, - key='evaluate') +from sslforslr.utils.helpers import load_config, load_model +from sslforslr.utils.evaluate import speaker_verification_evaluate - model = load_model(config, input_shape) +def evaluate(config_path): + # Load model + config, checkpoint_dir = load_config(config_path) + model = load_model(config) - # Create classifier - classifier = create_classifier(config, input_shape, nb_categories, model) - - # Load pre-trained model - last_checkpoint_path = tf.train.latest_checkpoint(eval_checkpoint_dir) + # Load pre-trained weights + last_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir) if last_checkpoint_path: - classifier.load_weights(last_checkpoint_path) + mirrored_strategy = tf.distribute.MirroredStrategy() + with mirrored_strategy.scope(): + model.load_weights(last_checkpoint_path) else: - raise Exception('Evaluate: no checkpoints found.') - - # Load trainings history - history = np.load(checkpoint_dir + '/history.npy', allow_pickle=True).item() - history_evaluate = np.load(eval_checkpoint_dir + '/history.npy', allow_pickle=True).item() + raise Exception('%s has not been trained.' 
% config['name']) - _, _, test_gen = gens - return model, history, classifier, history_evaluate, test_gen + eer = speaker_verification_evaluate(model, config) + print('EER', eer) if __name__ == "__main__": - pass + parser = argparse.ArgumentParser() + parser.add_argument('config', help='Path to model config file.') + args = parser.parse_args() + + evaluate(args.config) \ No newline at end of file diff --git a/extract_embeddings.py b/extract_embeddings.py deleted file mode 100644 index de44a98..0000000 --- a/extract_embeddings.py +++ /dev/null @@ -1,57 +0,0 @@ -import os -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - -import argparse -import numpy as np -import kaldiio -import soundfile as sf -import tensorflow as tf -from tqdm import tqdm - -from train_evaluate import create_classifier - -from sslforslr.utils.helpers import load_config, load_dataset, load_model - -def get_frames(signal): - signal_length = len(signal) - frame_length = 20480 - frame_step = 20480 - - num_frames = int(1 + np.ceil((signal_length - frame_length) / frame_step)) - - zeros = np.zeros((num_frames * frame_length - signal_length)) - signal_padded = np.append(signal, zeros) - - indices_a = np.tile(np.arange(0, frame_length), (num_frames, 1)) - indices_b = np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)) - indices = indices_a + indices_b.T - - frames = signal_padded[indices.astype(np.int32)] - return frames - -def extract_embeddings(input_path, output_path, config_path): - config, checkpoint_dir, eval_checkpoint_dir = load_config(config_path) - gens, input_shape, nb_categories = load_dataset(config, eval_checkpoint_dir) - model = load_model(config, input_shape) - - #scp = kaldiio.load_scp(input_path) - for line in tqdm(open(input_path)): - utterance_id, audio_path = line.rstrip().split() - - data, sr = sf.read(audio_path) - frames = np.expand_dims(get_frames(data), axis=-1) - embeddings = np.mean(model.predict(frames), axis=0) - - kaldiio.save_ark(output_path, - {utterance_id: embeddings}, - scp=output_path[:-3] + 'scp', - append=True) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('input', help='Path to scp file of features generated by Kaldi.') - parser.add_argument('output', help='Path to output file containing speaker embeddings.') - parser.add_argument('config', help='Path to model config file.') - args = parser.parse_args() - - extract_embeddings(args.input, args.output, args.config) diff --git a/kaldi/.gitignore b/kaldi/.gitignore deleted file mode 100644 index 808de68..0000000 --- a/kaldi/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -data/ -exp/ diff --git a/kaldi/cmd.sh b/kaldi/cmd.sh deleted file mode 100755 index 6e3e6b5..0000000 --- a/kaldi/cmd.sh +++ /dev/null @@ -1,13 +0,0 @@ -# you can change cmd.sh depending on what type of queue you are using. -# If you have no queueing system and want to run on a local machine, you -# can change all instances 'queue.pl' to run.pl (but be careful and run -# commands one by one: most recipes will exhaust the memory on your -# machine). queue.pl works with GridEngine (qsub). slurm.pl works -# with slurm. Different queues are configured differently, with different -# queue names and different ways of specifying things like memory; -# to account for these differences you can create and edit the file -# conf/queue.conf to match your queue's configuration. 
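After this refactor the evaluation entry point is self-contained: python evaluate.py <config> loads the latest checkpoint and prints the EER. A minimal programmatic sketch of the same flow, assuming the sslforslr package is importable and a trained checkpoint exists (the config path below is one of the files added by this patch):

    import tensorflow as tf
    from sslforslr.utils.helpers import load_config, load_model
    from sslforslr.utils.evaluate import speaker_verification_evaluate

    config, checkpoint_dir = load_config('configs/simclr-base.json')
    model = load_model(config)
    model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
    print('EER', speaker_verification_evaluate(model, config))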
-# Search for conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
-# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
-
-export train_cmd="run.pl"
diff --git a/kaldi/conf/mfcc.conf b/kaldi/conf/mfcc.conf
deleted file mode 100644
index 9e12570..0000000
--- a/kaldi/conf/mfcc.conf
+++ /dev/null
@@ -1,7 +0,0 @@
---sample-frequency=16000
---frame-length=25 # the default is 25
---low-freq=20 # the default.
---high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case).
---num-mel-bins=30
---num-ceps=30
---snip-edges=false
diff --git a/kaldi/conf/vad.conf b/kaldi/conf/vad.conf
deleted file mode 100644
index c9f5e8b..0000000
--- a/kaldi/conf/vad.conf
+++ /dev/null
@@ -1,4 +0,0 @@
---vad-energy-threshold=5.5
---vad-energy-mean-scale=0.5
---vad-proportion-threshold=0.12
---vad-frames-context=2
diff --git a/kaldi/local/add_disambig.pl b/kaldi/local/add_disambig.pl
deleted file mode 100755
index 962ef38..0000000
--- a/kaldi/local/add_disambig.pl
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Adds some specified number of disambig symbols to a symbol table.
-# Adds these as #1, #2, etc.
-# If the --include-zero option is specified, includes an extra one
-# #0.
-
-$include_zero = 0;
-if($ARGV[0] eq "--include-zero") {
-    $include_zero = 1;
-    shift @ARGV;
-}
-
-if(@ARGV != 2) {
-    die "Usage: add_disambig.pl [--include-zero] symtab.txt num_extra > symtab_out.txt ";
-}
-
-$input = $ARGV[0];
-$nsyms = $ARGV[1];
-
-open(F, "<$input") || die "Opening file $input";
-
-while(<F>) {
-    @A = split(" ", $_);
-    @A == 2 || die "Bad line $_";
-    $lastsym = $A[1];
-    print;
-}
-
-if(!defined($lastsym)){
-    die "Empty symbol file?";
-}
-
-if($include_zero) {
-    $lastsym++;
-    print "#0 $lastsym\n";
-}
-
-for($n = 1; $n <= $nsyms; $n++) {
-    $y = $n + $lastsym;
-    print "#$n $y\n";
-}
diff --git a/kaldi/local/add_lex_disambig.pl b/kaldi/local/add_lex_disambig.pl
deleted file mode 100755
index dd8a25d..0000000
--- a/kaldi/local/add_lex_disambig.pl
+++ /dev/null
@@ -1,195 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-#           2013-2016 Johns Hopkins University (author: Daniel Povey)
-#                2015 Hainan Xu
-#                2015 Guoguo Chen
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Adds disambiguation symbols to a lexicon.
-# Outputs still in the normal lexicon format.
-# Disambig syms are numbered #1, #2, #3, etc. (#0
-# reserved for symbol in grammar).
-# Outputs the number of disambig syms to the standard output.
-# With the --pron-probs option, expects the second field
-# of each lexicon line to be a pron-prob.
-# With the --sil-probs option, expects three additional
-# fields after the pron-prob, representing various components
-# of the silence probability model.
-
-$pron_probs = 0;
-$sil_probs = 0;
-$first_allowed_disambig = 1;
-
-for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
-  if ($ARGV[0] eq "--pron-probs") {
-    $pron_probs = 1;
-    shift @ARGV;
-  }
-  if ($ARGV[0] eq "--sil-probs") {
-    $sil_probs = 1;
-    shift @ARGV;
-  }
-  if ($ARGV[0] eq "--first-allowed-disambig") {
-    $first_allowed_disambig = 0 + $ARGV[1];
-    if ($first_allowed_disambig < 1) {
-      die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
-    }
-    shift @ARGV;
-    shift @ARGV;
-  }
-}
-
-if (@ARGV != 2) {
-  die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
-      "This script adds disambiguation symbols to a lexicon in order to\n" .
-      "make decoding graphs determinizable; it adds pseudo-phone\n" .
-      "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
-      "to ensure that all pronunciations are different, and that none\n" .
-      "is a prefix of another.\n" .
-      "It prints to the standard output the number of the largest-numbered " .
-      "disambiguation symbol that was used.\n" .
-      "\n" .
-      "Options:  --pron-probs       Expect pronunciation probabilities in the 2nd field\n" .
-      "          --sil-probs        [should be with --pron-probs option]\n" .
-      "                             Expect 3 extra fields after the pron-probs, for aspects of\n" .
-      "                             the silence probability model\n" .
-      "          --first-allowed-disambig   The number of the first disambiguation symbol\n" .
-      "                             that this script is allowed to add. By default this is\n" .
-      "                             #1, but you can set this to a larger value using this option.\n" .
-      "e.g.:\n" .
-      " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
-      " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
-      " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
-}
-
-
-$lexfn = shift @ARGV;
-$lexoutfn = shift @ARGV;
-
-open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
-
-# (1) Read in the lexicon.
-@L = ( );
-while(<L>) {
-  @A = split(" ", $_);
-  push @L, join(" ", @A);
-}
-
-# (2) Work out the count of each phone-sequence in the
-# lexicon.
-
-foreach $l (@L) {
-  @A = split(" ", $l);
-  shift @A; # Remove word.
-  if ($pron_probs) {
-    $p = shift @A;
-    if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
-  }
-  if ($sil_probs) {
-    $silp = shift @A;
-    if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
-    $correction = shift @A;
-    if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
-    $correction = shift @A;
-    if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
-  }
-  if (!(@A)) {
-    die "Bad lexicon line $l, no phone in phone list";
-  }
-  $count{join(" ",@A)}++;
-}
-
-# (3) For each left sub-sequence of each phone-sequence, note down
-# that it exists (for identifying prefixes of longer strings).
-
-foreach $l (@L) {
-  @A = split(" ", $l);
-  shift @A; # Remove word.
-  if ($pron_probs) { shift @A; } # remove pron-prob.
-  if ($sil_probs) {
-    shift @A; # Remove silprob
-    shift @A; # Remove silprob
-  }
-  while(@A > 0) {
-    pop @A; # Remove last phone
-    $issubseq{join(" ",@A)} = 1;
-  }
-}
-
-# (4) For each entry in the lexicon:
-# if the phone sequence is unique and is not a
-# prefix of another word, no disambig symbol.
-# Else output #1, or #2, #3, ... if the same phone-seq
-# has already been assigned a disambig symbol.
-
-
-open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";
-
-# max_disambig will always be the highest-numbered disambiguation symbol that
-# has been used so far.
-$max_disambig = $first_allowed_disambig - 1;
-
-foreach $l (@L) {
-  @A = split(" ", $l);
-  $word = shift @A;
-  if ($pron_probs) {
-    $pron_prob = shift @A;
-  }
-  if ($sil_probs) {
-    $sil_word_prob = shift @A;
-    $word_sil_correction = shift @A;
-    $prev_nonsil_correction = shift @A
-  }
-  $phnseq = join(" ", @A);
-  if (!defined $issubseq{$phnseq}
-      && $count{$phnseq} == 1) {
-    ;  # Do nothing.
-  } else {
-    if ($phnseq eq "") { # need disambig symbols for the empty string
-      # that are not used anywhere else.
-      $max_disambig++;
-      $reserved_for_the_empty_string{$max_disambig} = 1;
-      $phnseq = "#$max_disambig";
-    } else {
-      $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
-      if (!defined $cur_disambig) {
-        $cur_disambig = $first_allowed_disambig;
-      } else {
-        $cur_disambig++;  # Get a number that has not been used yet for
-                          # this phone sequence.
-      }
-      while (defined $reserved_for_the_empty_string{$cur_disambig}) {
-        $cur_disambig++;
-      }
-      if ($cur_disambig > $max_disambig) {
-        $max_disambig = $cur_disambig;
-      }
-      $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
-      $phnseq = $phnseq . " #" . $cur_disambig;
-    }
-  }
-  if ($pron_probs) {
-    if ($sil_probs) {
-      print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
-    } else {
-      print O "$word\t$pron_prob\t$phnseq\n";
-    }
-  } else {
-    print O "$word\t$phnseq\n";
-  }
-}
-
-print $max_disambig . "\n";
diff --git a/kaldi/local/analyze_segments.pl b/kaldi/local/analyze_segments.pl
deleted file mode 100755
index 26805c3..0000000
--- a/kaldi/local/analyze_segments.pl
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/perl
-# Copyright 2015 GoVivace Inc. (Author: Nagendra Kumar Goel)
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-# Analyze a segments file and print important stats on it.
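The Perl that follows makes a single pass over a Kaldi segments file (columns: utterance-id, recording-id, start, end, with times in seconds). For orientation, an equivalent Python sketch (function and variable names are illustrative only):

    def analyze_segments(path):
        durs = {}
        with open(path) as f:
            for line in f:
                utt, _, start, end = line.split()
                durs[utt] = float(end) - float(start)
        total = sum(durs.values())
        print('Total %f hours of data' % (total / 3600))
        print('Average segment length %f seconds' % (total / len(durs)))
        print('Segment %s has length of %f seconds' % max(durs.items(), key=lambda kv: kv[1]))
        print('Segment %s has length of %f seconds' % min(durs.items(), key=lambda kv: kv[1]))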
-
-$dur = $total = 0;
-$maxDur = 0;
-$minDur = 9999999999;
-$n = 0;
-while(<>){
-  chomp;
-  @t = split(/\s+/);
-  $dur = $t[3] - $t[2];
-  $total += $dur;
-  if ($dur > $maxDur) {
-    $maxSegId = $t[0];
-    $maxDur = $dur;
-  }
-  if ($dur < $minDur) {
-    $minSegId = $t[0];
-    $minDur = $dur;
-  }
-  $n++;
-}
-$avg=$total/$n;
-$hrs = $total/3600;
-print "Total $hrs hours of data\n";
-print "Average segment length $avg seconds\n";
-print "Segment $maxSegId has length of $maxDur seconds\n";
-print "Segment $minSegId has length of $minDur seconds\n";
diff --git a/kaldi/local/apply_map.pl b/kaldi/local/apply_map.pl
deleted file mode 100755
index ff9507f..0000000
--- a/kaldi/local/apply_map.pl
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/usr/bin/env perl
-use warnings; #sed replacement for -w perl parameter
-# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
-# Apache 2.0.
-
-# This program is a bit like ./sym2int.pl in that it applies a map
-# to things in a file, but it's a bit more general in that it doesn't
-# assume the things being mapped to are single tokens, they could
-# be sequences of tokens. See the usage message.
-
-
-if (@ARGV > 0 && $ARGV[0] eq "-f") {
-  shift @ARGV;
-  $field_spec = shift @ARGV;
-  if ($field_spec =~ m/^\d+$/) {
-    $field_begin = $field_spec - 1; $field_end = $field_spec - 1;
-  }
-  if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesy (properly, 1-10)
-    if ($1 ne "") {
-      $field_begin = $1 - 1;    # Change to zero-based indexing.
-    }
-    if ($2 ne "") {
-      $field_end = $2 - 1;      # Change to zero-based indexing.
-    }
-  }
-  if (!defined $field_begin && !defined $field_end) {
-    die "Bad argument to -f option: $field_spec";
-  }
-}
-
-# Mapping is obligatory
-$permissive = 0;
-if (@ARGV > 0 && $ARGV[0] eq '--permissive') {
-  shift @ARGV;
-  # Mapping is optional (missing key is printed to output)
-  $permissive = 1;
-}
-
-if(@ARGV != 1) {
-  print STDERR "Invalid usage: " . join(" ", @ARGV) . "\n";
-  print STDERR "Usage: apply_map.pl [options] map <input >output\n" .
-    "options: [-f <field-range> ]\n" .
-    "Applies the map 'map' to all input text, where each line of the map\n" .
-    "is interpreted as a map from the first field to the list of the other fields\n" .
-    "Note: <field-range> can look like 4-5, or 4-, or 5-, or 1, it means the field\n" .
-    "range in the input to apply the map to.\n" .
-    "e.g.: echo A B | apply_map.pl a.txt\n" .
-    "where a.txt is:\n" .
-    "A a1 a2\n" .
-    "B b\n" .
-    "will produce:\n" .
-    "a1 a2 b\n";
-  exit(1);
-}
-
-($map_file) = @ARGV;
-open(M, "<$map_file") || die "Error opening map file $map_file: $!";
-
-while (<M>) {
-  @A = split(" ", $_);
-  @A >= 1 || die "apply_map.pl: empty line.";
-  $i = shift @A;
-  $o = join(" ", @A);
-  $map{$i} = $o;
-}
-
-while(<STDIN>) {
-  @A = split(" ", $_);
-  for ($x = 0; $x < @A; $x++) {
-    if ( (!defined $field_begin || $x >= $field_begin)
-         && (!defined $field_end || $x <= $field_end)) {
-      $a = $A[$x];
-      if (!defined $map{$a}) {
-        if (!$permissive) {
-          die "apply_map.pl: undefined key $a in $map_file\n";
-        } else {
-          print STDERR "apply_map.pl: warning! missing key $a in $map_file\n";
-        }
-      } else {
-        $A[$x] = $map{$a};
-      }
-    }
-  }
-  print join(" ", @A) . "\n";
-}
diff --git a/kaldi/local/best_wer.sh b/kaldi/local/best_wer.sh
deleted file mode 100755
index 45b855d..0000000
--- a/kaldi/local/best_wer.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-#
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-# To be run from one directory above this script.
-
-perl -e 'while(<>){
-  s/\|(\d)/\| $1/g; s/(\d)\|/$1 \|/g;
-  if (m/[WS]ER (\S+)/ && (!defined $bestwer || $bestwer > $1)){ $bestwer = $1; $bestline=$_; } # kaldi "compute-wer" tool.
-  elsif (m: (Mean|Sum/Avg|)\s*\|\s*\S+\s+\S+\s+\|\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\|:
-         && (!defined $bestwer || $bestwer > $2)){ $bestwer = $2; $bestline=$_; } } # sclite.
-  if (defined $bestline){ print $bestline; } ' | \
-  awk 'BEGIN{ FS="%WER"; } { if(NF == 2) { print FS$2" "$1; } else { print $0; }}' | \
-  awk 'BEGIN{ FS="Sum/Avg"; } { if(NF == 2) { print $2" "$1; } else { print $0; }}' | \
-  awk '{ if($1!~/%WER/) { print "%WER "$9" "$0; } else { print $0; }}' | \
-  sed -e 's|\s\s*| |g' -e 's|\:$||' -e 's|\:\s*\|\s*$||'
-
-
-
diff --git a/kaldi/local/build_const_arpa_lm.sh b/kaldi/local/build_const_arpa_lm.sh
deleted file mode 100755
index ec067df..0000000
--- a/kaldi/local/build_const_arpa_lm.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/bash
-
-# Copyright 2014  Guoguo Chen
-# Apache 2.0
-
-# This script reads in an Arpa format language model, and converts it into the
-# ConstArpaLm format language model.
-
-# begin configuration section
-# end configuration section
-
-[ -f path.sh ] && . ./path.sh;
-
-. utils/parse_options.sh
-
-if [ $# != 3 ]; then
-  echo "Usage: "
-  echo "  $0 [options] <arpa-lm-path> <old-lang-dir> <new-lang-dir>"
-  echo "e.g.:"
-  echo "  $0 data/local/lm/3-gram.full.arpa.gz data/lang/ data/lang_test_tgmed"
-  echo "Options"
-  exit 1;
-fi
-
-export LC_ALL=C
-
-arpa_lm=$1
-old_lang=$2
-new_lang=$3
-
-mkdir -p $new_lang
-
-mkdir -p $new_lang
-cp -r $old_lang/* $new_lang
-
-unk=`cat $new_lang/oov.int`
-bos=`grep -w "<s>" $new_lang/words.txt | awk '{print $2}'`
-eos=`grep "</s>" $new_lang/words.txt | awk '{print $2}'`
-if [[ -z $bos || -z $eos ]]; then
-  echo "$0: <s> and </s> symbols are not in $new_lang/words.txt"
-  exit 1
-fi
-
-
-arpa-to-const-arpa --bos-symbol=$bos \
-  --eos-symbol=$eos --unk-symbol=$unk \
-  "gunzip -c $arpa_lm | utils/map_arpa_lm.pl $new_lang/words.txt|" $new_lang/G.carpa || exit 1;
-
-exit 0;
diff --git a/kaldi/local/check_spk_emb_range.py b/kaldi/local/check_spk_emb_range.py
deleted file mode 100755
index ec4d872..0000000
--- a/kaldi/local/check_spk_emb_range.py
+++ /dev/null
@@ -1,9 +0,0 @@
-import sys
-import numpy as np
-from collections import defaultdict
-import kaldi_io
-
-for key, mat in kaldi_io.read_vec_flt_scp(sys.argv[1]):
-    mean = np.mean(mat, axis=0)
-    std = np.std(mat, axis=0)
-    print('key %s has mean %f and std %f' % (key, mean, std))
diff --git a/kaldi/local/combine_data.sh b/kaldi/local/combine_data.sh
deleted file mode 100755
index a43cf9d..0000000
--- a/kaldi/local/combine_data.sh
+++ /dev/null
@@ -1,128 +0,0 @@
-#!/bin/bash
-# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
-#           2014  David Snyder
-
-# This script combines the data from multiple source directories into
-# a single destination directory.
-
-# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information
-# about what these directories contain.
-
-# Begin configuration section.
-extra_files= # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..." -skip_fix=false # skip the fix_data_dir.sh in the end -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - -if [ $# -lt 2 ]; then - echo "Usage: combine_data.sh [--extra-files 'file1 file2'] ..." - echo "Note, files that don't appear in all source dirs will not be combined," - echo "with the exception of utt2uniq and segments, which are created where necessary." - exit 1 -fi - -dest=$1; -shift; - -first_src=$1; - -rm -r $dest 2>/dev/null -mkdir -p $dest; - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/utt2spk ]; then - echo "$0: no such file $dir/utt2spk" - exit 1; - fi -done - -# W.r.t. utt2uniq file the script has different behavior compared to other files -# it is not compulsary for it to exist in src directories, but if it exists in -# even one it should exist in all. We will create the files where necessary -has_utt2uniq=false -for in_dir in $*; do - if [ -f $in_dir/utt2uniq ]; then - has_utt2uniq=true - break - fi -done - -if $has_utt2uniq; then - # we are going to create an utt2uniq file in the destdir - for in_dir in $*; do - if [ ! -f $in_dir/utt2uniq ]; then - # we assume that utt2uniq is a one to one mapping - cat $in_dir/utt2spk | awk '{printf("%s %s\n", $1, $1);}' - else - cat $in_dir/utt2uniq - fi - done | sort -k1 > $dest/utt2uniq - echo "$0: combined utt2uniq" -else - echo "$0 [info]: not combining utt2uniq as it does not exist" -fi -# some of the old scripts might provide utt2uniq as an extrafile, so just remove it -extra_files=$(echo "$extra_files"|sed -e "s/utt2uniq//g") - -# segments are treated similarly to utt2uniq. If it exists in some, but not all -# src directories, then we generate segments where necessary. -has_segments=false -for in_dir in $*; do - if [ -f $in_dir/segments ]; then - has_segments=true - break - fi -done - -if $has_segments; then - for in_dir in $*; do - if [ ! -f $in_dir/segments ]; then - echo "$0 [info]: will generate missing segments for $in_dir" 1>&2 - utils/data/get_segments_for_data.sh $in_dir - else - cat $in_dir/segments - fi - done | sort -k1 > $dest/segments - echo "$0: combined segments" -else - echo "$0 [info]: not combining segments as it does not exist" -fi - -for file in utt2spk utt2lang utt2dur reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do - exists_somewhere=false - absent_somewhere=false - for d in $*; do - if [ -f $d/$file ]; then - exists_somewhere=true - else - absent_somewhere=true - fi - done - - if ! $absent_somewhere; then - set -o pipefail - ( for f in $*; do cat $f/$file; done ) | sort -k1 > $dest/$file || exit 1; - set +o pipefail - echo "$0: combined $file" - else - if ! $exists_somewhere; then - echo "$0 [info]: not combining $file as it does not exist" - else - echo "$0 [info]: **not combining $file as it does not exist everywhere**" - fi - fi -done - -utils/utt2spk_to_spk2utt.pl <$dest/utt2spk >$dest/spk2utt - -if ! 
$skip_fix ; then - utils/fix_data_dir.sh $dest || exit 1; -fi - -exit 0 diff --git a/kaldi/local/compute_min_dcf.py b/kaldi/local/compute_min_dcf.py deleted file mode 100755 index 41b7b0f..0000000 --- a/kaldi/local/compute_min_dcf.py +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2018 David Snyder -# Apache 2.0 - -# This script computes the minimum detection cost function, which is a common -# error metric used in speaker recognition. Compared to equal error-rate, -# which assigns equal weight to false negatives and false positives, this -# error-rate is usually used to assess performance in settings where achieving -# a low false positive rate is more important than achieving a low false -# negative rate. See the NIST 2016 Speaker Recognition Evaluation Plan at -# https://www.nist.gov/sites/default/files/documents/2016/10/07/sre16_eval_plan_v1.3.pdf -# for more details about the metric. -from __future__ import print_function -from operator import itemgetter -import sys, argparse, os - -def GetArgs(): - parser = argparse.ArgumentParser(description="Compute the minimum " - "detection cost function along with the threshold at which it occurs. " - "Usage: sid/compute_min_dcf.py [options...] " - " " - "E.g., sid/compute_min_dcf.py --p-target 0.01 --c-miss 1 --c-fa 1 " - "exp/scores/trials data/test/trials", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--p-target', type=float, dest = "p_target", - default = 0.01, - help='The prior probability of the target speaker in a trial.') - parser.add_argument('--c-miss', type=float, dest = "c_miss", default = 1, - help='Cost of a missed detection. This is usually not changed.') - parser.add_argument('--c-fa', type=float, dest = "c_fa", default = 1, - help='Cost of a spurious detection. This is usually not changed.') - parser.add_argument("scores_filename", - help="Input scores file, with columns of the form " - " ") - parser.add_argument("trials_filename", - help="Input trials file, with columns of the form " - " ") - sys.stderr.write(' '.join(sys.argv) + "\n") - args = parser.parse_args() - args = CheckArgs(args) - return args - -def CheckArgs(args): - if args.c_fa <= 0: - raise Exception("--c-fa must be greater than 0") - if args.c_miss <= 0: - raise Exception("--c-miss must be greater than 0") - if args.p_target <= 0 or args.p_target >= 1: - raise Exception("--p-target must be greater than 0 and less than 1") - return args - -# Creates a list of false-negative rates, a list of false-positive rates -# and a list of decision thresholds that give those error-rates. -def ComputeErrorRates(scores, labels): - - # Sort the scores from smallest to largest, and also get the corresponding - # indexes of the sorted scores. We will treat the sorted scores as the - # thresholds at which the the error-rates are evaluated. - sorted_indexes, thresholds = zip(*sorted( - [(index, threshold) for index, threshold in enumerate(scores)], - key=itemgetter(1))) - sorted_labels = [] - labels = [labels[i] for i in sorted_indexes] - fnrs = [] - fprs = [] - - # At the end of this loop, fnrs[i] is the number of errors made by - # incorrectly rejecting scores less than thresholds[i]. And, fprs[i] - # is the total number of times that we have correctly accepted scores - # greater than thresholds[i]. 
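The cumulative counting described in the comment above has a compact vectorized equivalent. A sketch with NumPy, under the same assumptions as the script (binary labels, one score per trial); the max(..., 1) guards are an addition to avoid division by zero on degenerate inputs:

import numpy as np

def error_rates(scores, labels):
    # Sort by score and treat each sorted score as a candidate threshold.
    order = np.argsort(scores)
    labels = np.asarray(labels)[order]
    # Cumulative count of rejected targets (misses) at or below each
    # threshold, and cumulative count of rejected non-targets.
    fnrs = np.cumsum(labels) / max(labels.sum(), 1)
    tnrs = np.cumsum(1 - labels) / max((1 - labels).sum(), 1)
    fprs = 1.0 - tnrs
    return fnrs, fprs, np.asarray(scores)[order]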
-    for i in range(0, len(labels)):
-        if i == 0:
-            fnrs.append(labels[i])
-            fprs.append(1 - labels[i])
-        else:
-            fnrs.append(fnrs[i-1] + labels[i])
-            fprs.append(fprs[i-1] + 1 - labels[i])
-    fnrs_norm = sum(labels)
-    fprs_norm = len(labels) - fnrs_norm
-
-    # Now divide by the total number of target trials to
-    # obtain the false negative rates across all thresholds.
-    fnrs = [x / float(fnrs_norm) for x in fnrs]
-
-    # Divide by the total number of non-target trials to get the
-    # true negative rate. Subtract these quantities from 1 to
-    # get the false positive rates.
-    fprs = [1 - x / float(fprs_norm) for x in fprs]
-    return fnrs, fprs, thresholds
-
-# Computes the minimum of the detection cost function. The comments refer to
-# equations in Section 3 of the NIST 2016 Speaker Recognition Evaluation Plan.
-def ComputeMinDcf(fnrs, fprs, thresholds, p_target, c_miss, c_fa):
-    min_c_det = float("inf")
-    min_c_det_threshold = thresholds[0]
-    for i in range(0, len(fnrs)):
-        # See Equation (2). It is a weighted sum of false negative
-        # and false positive errors.
-        c_det = c_miss * fnrs[i] * p_target + c_fa * fprs[i] * (1 - p_target)
-        if c_det < min_c_det:
-            min_c_det = c_det
-            min_c_det_threshold = thresholds[i]
-    # See Equations (3) and (4). Now we normalize the cost.
-    c_def = min(c_miss * p_target, c_fa * (1 - p_target))
-    min_dcf = min_c_det / c_def
-    return min_dcf, min_c_det_threshold
-
-def main():
-    args = GetArgs()
-    scores_file = open(args.scores_filename, 'r').readlines()
-    trials_file = open(args.trials_filename, 'r').readlines()
-    c_miss = args.c_miss
-    c_fa = args.c_fa
-    p_target = args.p_target
-
-    scores = []
-    labels = []
-
-    trials = {}
-    for line in trials_file:
-        utt1, utt2, target = line.rstrip().split()
-        trial = utt1 + " " + utt2
-        trials[trial] = target
-
-    for line in scores_file:
-        utt1, utt2, score = line.rstrip().split()
-        trial = utt1 + " " + utt2
-        if trial in trials:
-            scores.append(float(score))
-            if trials[trial] == "target":
-                labels.append(1)
-            else:
-                labels.append(0)
-        else:
-            raise Exception("Missing entry for " + utt1 + " and " + utt2
-                + " " + args.scores_filename)
-
-    fnrs, fprs, thresholds = ComputeErrorRates(scores, labels)
-    mindcf, threshold = ComputeMinDcf(fnrs, fprs, thresholds, p_target,
-        c_miss, c_fa)
-    sys.stdout.write("{0:.4f}\n".format(mindcf))
-    sys.stderr.write("minDCF is {0:.4f} at threshold {1:.4f} (p-target={2}, c-miss={3}, "
-        "c-fa={4})\n".format(mindcf, threshold, p_target, c_miss, c_fa))
-
-if __name__ == "__main__":
-    main()
diff --git a/kaldi/local/compute_vad_decision.sh b/kaldi/local/compute_vad_decision.sh
deleted file mode 100755
index 7099d06..0000000
--- a/kaldi/local/compute_vad_decision.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/bash
-
-# Copyright 2013 Daniel Povey
-# Apache 2.0
-# To be run from .. (one directory up from here)
-# see ../run.sh for example
-
-# Compute energy based VAD output
-# We do this in just one job; it's fast.
-#
-
-nj=2
-cmd=run.pl
-vad_config=conf/vad.conf
-
-echo "$0 $@"  # Print the command line for logging
-
-if [ -f path.sh ]; then . ./path.sh; fi
-. parse_options.sh || exit 1;
-
-if [ $# != 3 ]; then
-  echo "Usage: $0 [options] <data-dir> <log-dir> <path-to-vad-dir>";
-  echo "e.g.: $0 data/train exp/make_vad mfcc"
-  echo " Options:"
-  echo "  --vad-config <config-file>                       # config passed to compute-vad-energy"
-  echo "  --nj <nj>                                        # number of parallel jobs"
-  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
-  exit 1;
-fi
-
-data=$1
-logdir=$2
-vaddir=$3
-
-# make $vaddir an absolute pathname.
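To make the cost trade-off concrete, here is a small worked example calling the two functions from compute_min_dcf.py above on toy data (invented scores and labels, not real trials):

# Three target trials and three non-target trials with toy scores.
scores = [0.9, 0.6, 0.4, 0.55, 0.2, 0.1]
labels = [1, 1, 1, 0, 0, 0]

fnrs, fprs, thresholds = ComputeErrorRates(scores, labels)
mindcf, threshold = ComputeMinDcf(fnrs, fprs, thresholds,
                                  p_target=0.01, c_miss=1, c_fa=1)
# With p_target=0.01 false alarms dominate the cost, so the selected
# threshold (0.55 here) is high enough to reject every non-target
# score, at the price of missing one genuine target (0.4).
print(mindcf, threshold)  # -> 0.3333... 0.55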
-vaddir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $vaddir ${PWD}` - -# use "name" as part of name of the archive. -name=`basename $data` - -mkdir -p $vaddir || exit 1; -mkdir -p $logdir || exit 1; - - -for f in $data/feats.scp "$vad_config"; do - if [ ! -f $f ]; then - echo "compute_vad_decision.sh: no such file $f" - exit 1; - fi -done - -utils/split_data.sh $data $nj || exit 1; -sdata=$data/split$nj; - -$cmd JOB=1:$nj $logdir/vad_${name}.JOB.log \ - compute-vad --config=$vad_config scp:$sdata/JOB/feats.scp ark,scp:$vaddir/vad_${name}.JOB.ark,$vaddir/vad_${name}.JOB.scp \ - || exit 1; - -for ((n=1; n<=nj; n++)); do - cat $vaddir/vad_${name}.$n.scp || exit 1; -done > $data/vad.scp - -nc=`cat $data/vad.scp | wc -l` -nu=`cat $data/feats.scp | wc -l` -if [ $nc -ne $nu ]; then - echo "**Warning it seems not all of the speakers got VAD output ($nc != $nu);" - echo "**validate_data_dir.sh will fail; you might want to use fix_data_dir.sh" - [ $nc -eq 0 ] && exit 1; -fi - - -echo "Created VAD output for $name" diff --git a/kaldi/local/convert_ctm.pl b/kaldi/local/convert_ctm.pl deleted file mode 100755 index 7daec8e..0000000 --- a/kaldi/local/convert_ctm.pl +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. - -# This takes as standard input a ctm file that's "relative to the utterance", -# i.e. times are measured relative to the beginning of the segments, and it -# uses a "segments" file (format: -# utterance-id recording-id start-time end-time -# ) and a "reco2file_and_channel" file (format: -# recording-id basename-of-file - -$skip_unknown=undef; -if ( $ARGV[0] eq "--skip-unknown" ) { - $skip_unknown=1; - shift @ARGV; -} - -if (@ARGV < 2 || @ARGV > 3) { - print STDERR "Usage: convert_ctm.pl [] > real-ctm\n"; - exit(1); -} - -$segments = shift @ARGV; -$reco2file_and_channel = shift @ARGV; - -open(S, "<$segments") || die "opening segments file $segments"; -while() { - @A = split(" ", $_); - @A == 4 || die "Bad line in segments file: $_"; - ($utt, $recording_id, $begin_time, $end_time) = @A; - $utt2reco{$utt} = $recording_id; - $begin{$utt} = $begin_time; - $end{$utt} = $end_time; -} -close(S); -open(R, "<$reco2file_and_channel") || die "open reco2file_and_channel file $reco2file_and_channel"; -while() { - @A = split(" ", $_); - @A == 3 || die "Bad line in reco2file_and_channel file: $_"; - ($recording_id, $file, $channel) = @A; - $reco2file{$recording_id} = $file; - $reco2channel{$recording_id} = $channel; -} - - -# Now process the ctm file, which is either the standard input or the third -# command-line argument. -$num_done = 0; -while(<>) { - @A= split(" ", $_); - ( @A == 5 || @A == 6 ) || die "Unexpected ctm format: $_"; - # lines look like: - # 1 [ confidence ] - ($utt, $one, $wbegin, $wlen, $w, $conf) = @A; - $reco = $utt2reco{$utt}; - if (!defined $reco) { - next if defined $skip_unknown; - die "Utterance-id $utt not defined in segments file $segments"; - } - $file = $reco2file{$reco}; - $channel = $reco2channel{$reco}; - if (!defined $file || !defined $channel) { - die "Recording-id $reco not defined in reco2file_and_channel file $reco2file_and_channel"; - } - $b = $begin{$utt}; - $e = $end{$utt}; - $wbegin_r = $wbegin + $b; # Make it relative to beginning of the recording. 
- $wbegin_r = sprintf("%.2f", $wbegin_r); - $wlen = sprintf("%.2f", $wlen); - if (defined $conf) { - $line = "$file $channel $wbegin_r $wlen $w $conf\n"; - } else { - $line = "$file $channel $wbegin_r $wlen $w\n"; - } - if ($wbegin_r + $wlen > $e + 0.01) { - print STDERR "Warning: word appears to be past end of recording; line is $line"; - } - print $line; # goes to stdout. - $num_done++; -} - -if ($num_done == 0) { exit 1; } else { exit 0; } - -__END__ - -# Test example [also test it without the 0.5's] -echo utt reco 10.0 20.0 > segments -echo reco file A > reco2file_and_channel -echo utt 1 8.0 1.0 word 0.5 > ctm_in -echo file A 18.00 1.00 word 0.5 > ctm_out -utils/convert_ctm.pl segments reco2file_and_channel ctm_in | cmp - ctm_out || echo error -rm segments reco2file_and_channel ctm_in ctm_out - - - - diff --git a/kaldi/local/convert_slf.pl b/kaldi/local/convert_slf.pl deleted file mode 100755 index 1bc6421..0000000 --- a/kaldi/local/convert_slf.pl +++ /dev/null @@ -1,302 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2014 Brno University of Technology (author Karel Vesely) -# Copyright 2013 Korbinian Riedhammer - -# Convert a kaldi-lattice to HTK SLF format; if given an output -# directory, each lattice will be put in an individual gzipped file. - -# Internal representation of nodes, links: -# node hash: -# { W=>[word], t=>[time], n_out_arcs=>[number_of_outgoing_arcs] }; -# (Time internally represented as integer number of frames.) -# link hash: -# { S=>[start_node], E=>[end_node], W=>[word], v=>[0], a=>[acoustic_score], l=>[graph_score] } -# -# The HTK output supports: -# - words on links [default], -# - simpler, same as in kaldi lattices, node-ids in output correspond to kaldi lattices -# - words on nodes, -# - apart from original nodes, there are extra nodes containing the words. -# - each original ark is replaced by word-node and two links, connecting it with original nodes. - - -use utf8; -use List::Util qw(max); - -binmode(STDIN, ":encoding(utf8)"); -binmode(STDOUT, ":encoding(utf8)"); - -# defaults -$framerate=0.01; -$wordtonode=0; - -$usage="Convert kaldi lattices to HTK SLF (v1.1) format.\n". - "Usage: convert_slf.pl [options] lat-file.txt [out-dir]\n". - " e.g. lattice-align-words lang/phones/word_boundary.int final.mdl 'ark:gunzip -c lat.gz |' ark,t:- | utils/int2sym.pl -f 3 lang/words.txt | $0 - slf/\n". - "\n". - "Options regarding the SLF output:\n". - " --frame-rate x Frame rate to compute timing information (default: $framerate)\n". - " --word-to-node Print the word symbols on nodes (adds extra nodes+links; default: words at links)\n". 
- "\n"; - -# parse options -while (@ARGV gt 0 and $ARGV[0] =~ m/^--/) { - $param = shift @ARGV; - if ($param eq "--frame-rate") { $framerate = shift @ARGV; } - elsif ($param eq "--word-to-node") { $wordtonode = 1;} - else { - print STDERR "Unknown option $param\n"; - print STDERR; - print STDERR $usage; - exit 1; - } -} - -# check positional arg count -if (@ARGV < 1 || @ARGV > 2) { - print STDERR $usage; - exit 1; -} - -# store gzipped lattices individually to outdir: -$outdir = ""; -if (@ARGV == 2) { - $outdir = pop @ARGV; - unless (-d $outdir) { system("mkdir -p $outdir"); } - unless (-d $outdir) { - print STDERR "Could not create directory $outdir\n"; - exit 1; - } -} -# or we'll print lattices to stdout: -if ($outdir eq "") { - open(FH, ">-") or die "Could not write to stdout (???)\n"; -} - - -### parse kaldi lattices: - -$utt = ""; -$arc = 0; -$latest_time = 0.0; -@links = (); -%nodes = (); -%nodes_extra = (); -%accepting_states = (); - -open (FI, $ARGV[0]) or die "Could not read from file\n"; -binmode(FI, ":encoding(utf8)"); - -while() { - chomp; - - @A = split /\s+/; - - if (@A == 1 and $utt eq "") { - # new lattice - $utt = $A[0]; - $nodes{0} = { W=>"!NULL", t=>0.0, n_out_arcs=>0 }; #initial node - - } elsif (@A == 1) { - # accepting node without FST weight, store data for link to terminal super-state - $accepting_states{$A[0]} = { W=>"!NULL", v=>0, a=>0, l=>0 }; - - } elsif (@A == 2) { - # accepting state with FST weight on it, again store data for the link - ($s, $info) = @A; - ($gs, $as, $ss) = split(/,/, $info); - - # kaldi saves -log, but HTK does it the other way round - $gs *= -1; - $as *= -1; - - # the state sequence is something like 1_2_4_56_45, get number of tokens after splitting by '_': - $ss = scalar split(/_/, $ss); - - # update the end time - die "Node $s not yet visited, is lattice sorted topologically? $utt" unless exists $nodes{$s}{t}; - $time_end = $nodes{$s}{t} + $ss; - if ($latest_time < $time_end) { $latest_time = $time_end; } - - # add the link data - $accepting_states{$A[0]} = { W=>"!NULL", v=>0, a=>$as, l=>$gs }; - - } elsif (@A == 4 or @A == 3) { - # FSA arc - ($s, $e, $w, $info) = @A; - if ($info ne "") { - ($gs, $as, $ss) = split(/,/, $info); - } else { - $gs = 0; $as = 0; $ss = ""; - } - - # rename epsilons to null - $w = "!NULL" if $w eq ""; - - # kaldi saves -log, but HTK does it the other way round - $gs *= -1; - $as *= -1; - - # the state sequence is something like 1_2_4_56_45, get number of tokens after splitting by '_': - $ss = scalar split(/_/, $ss); - - # keep track of the number of outgoing arcs for each node - # (later, we will connect sinks to the terminal state) - $nodes{$s}{n_out_arcs} += 1; - - # keep track of timing - die "Node $s not yet visited, is lattice sorted topologically? $utt" unless exists $nodes{$s}; - $time_end = $nodes{$s}{t} + $ss; - if ($latest_time < $time_end) { $latest_time = $time_end; } - - # sanity check on already existing node - if (exists $nodes{$e}) { - die "Node $e previously stored with different time ".$nodes{$e}{t}." 
now $time_end, $utt.\n" - if $time_end ne $nodes{$e}{t}; - } - - # store internal representation of the arc - if (not $wordtonode) { - # The words on links, the lattice keeps it's original structure, - # add node; do not overwrite - $nodes{$e} = { t=>$time_end, n_out_arcs=>0 } unless defined $nodes{$e}; - # add the link data - push @links, { S=>$s, E=>$e, W=>$w, v=>0, a=>$as, l=>$gs }; - - } else { - # The problem here was that, if we have a node with several incoming links, - # the links can have different words on it, so we cannot simply put word from - # link into the node. - # - # The simple solution is: - # each FST arc gets replaced by extra node with word and two links, - # connecting it with original nodes. - # - # The lattice gets larger, and it is good to minimize the lattice during importing. - # - # During reading the FST, we don't know how many nodes there are in total, - # so the extra nodes are stored separately, indexed by arc number, - # and links have flags describing which type of node are they connected to. - - # add 'extra node' containing the word: - $nodes_extra{$arc} = { W=>$w, t=>$time_end }; - # add 'original node'; do not overwrite - $nodes{$e} = { W=>"!NULL", t=>$time_end, n_out_arcs=>0 } unless defined $nodes{$e}; - - # add the link from 'original node' to 'extra node' - push @links, { S=>$s, E=>$arc, W=>$w, v=>0, a=>$as, l=>$gs, to_extra_node=>1 }; - # add the link from 'extra node' to 'original node' - push @links, { S=>$arc, E=>$e, W=>$w, v=>0, a=>0, l=>0, from_extra_node=>1 }; - - # increase arc counter - $arc++; - } - - } elsif (@A == 0) { # end of lattice reading, we'll add terminal super-state, and print it soon... - # find sinks - %sinks = (); - for $n (keys %nodes) { - $sinks{$n} = 1 if ($nodes{$n}{n_out_arcs} == 0); - } - - # sanity check: lattices need at least one sink! - if (scalar keys %sinks == 0) { - print STDERR "Error: $utt does not have at least one sink node-- cyclic lattice??\n"; - } - - # add terminal super-state, - $last_node = max(keys(%nodes)) + 1; - $nodes{$last_node} = { W=>"!NULL", t=>$latest_time }; - - # connect all accepting states with terminal super-state, - for $accept (sort { $a <=> $b } keys %accepting_states) { - %a = %{$accepting_states{$accept}}; - push @links, { S=>$accept, E=>$last_node, W=>$a{W}, v=>$a{v}, a=>$a{a}, l=>$a{l} }; - } - - # connect also all sinks that are not accepting states, - for $sink (sort { $a <=> $b } keys %sinks) { - unless(exists($accepting_states{$sink})) { - print STDERR "WARNING: detected sink node which is not accepting state in lattice $utt, incomplete lattice?\n"; - $a = \$accepting_states{$accept}; - push @links, { S=>$accept, E=>$last_node, W=>"!NULL", v=>0, a=>0, l=>0 }; - } - } - - # print out the lattice; open file handle first - unless ($outdir eq "") { - open(FH, "|-", "gzip -c > $outdir/$utt.lat.gz") or die "Could not write to $outdir/$utt.lat.gz\n"; - binmode(FH, ":encoding(utf8)"); - } - - if (not $wordtonode) { - # print lattice with words on links: - - # header - print FH "VERSION=1.1\n"; - print FH "UTTERANCE=$utt\n"; - print FH "N=".(keys %nodes)."\tL=".(@links)."\n"; - - # nodes - for $n (sort { $a <=> $b } keys %nodes) { - printf FH "I=%d\tt=%.2f\n", $n, $nodes{$n}{t}*$framerate; - } - - # links/arks - for $i (0 .. $#links) { - %l = %{$links[$i]}; # get hash representing the link... 
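The arc-splitting trick described in the comments above (each word-bearing arc becomes an extra node plus two links, so a node's word no longer depends on which arc entered it) can be sketched in a few lines of Python; the data structures here are illustrative, not the Perl internals:

def split_arcs(arcs, num_nodes):
    # Each arc (src, dst, word) becomes: src -> word-node -> dst.
    nodes, links = {}, []
    for i, (src, dst, word) in enumerate(arcs):
        word_node = num_nodes + i  # id for the new word-bearing node
        nodes[word_node] = word
        links.append((src, word_node))
        links.append((word_node, dst))
    return nodes, links

# Two arcs entering node 2 with different words: the motivating case.
print(split_arcs([(0, 2, "a"), (1, 2, "b")], num_nodes=3))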
- printf FH "J=$i\tS=%d\tE=%d\tW=%s\tv=%f\ta=%f\tl=%f\n", $l{S}, $l{E}, $l{W}, $l{v}, $l{a}, $l{l}; - } - - } else { - # print lattice with words in the nodes: - - # header - print FH "VERSION=1.1\n"; - print FH "UTTERANCE=$utt\n"; - print FH "N=".(scalar(keys(%nodes))+scalar(keys(%nodes_extra)))."\tL=".(@links)."\n"; - - # number of original nodes, offset of extra_nodes - $node_id_offset = scalar keys %nodes; - - # nodes - for $n (sort { $a <=> $b } keys %nodes) { - printf FH "I=%d\tW=%s\tt=%.2f\n", $n, $nodes{$n}{W}, $nodes{$n}{t}*$framerate; - } - # extra nodes - for $n (sort { $a <=> $b } keys %nodes_extra) { - printf FH "I=%d\tW=%s\tt=%.2f\n", $n+$node_id_offset, $nodes_extra{$n}{W}, $nodes_extra{$n}{t}*$framerate; - } - - # links/arks - for $i (0 .. $#links) { - %l = %{$links[$i]}; # get hash representing the link... - if ($l{from_extra_node}) { $l{S} += $node_id_offset; } - if ($l{to_extra_node}) { $l{E} += $node_id_offset; } - printf FH "J=$i\tS=%d\tE=%d\tv=%f\ta=%f\tl=%f\n", $l{S}, $l{E}, $l{v}, $l{a}, $l{l}; - } - } - - print FH "\n"; - - # close handle if it was a file - close(FH) unless ($outdir eq ""); - - # clear data - $utt = ""; - $arc = 0; - $latest_time = 0.0; - @links = (); - %nodes = (); - %nodes_extra = (); - %accepting_states = (); - } else { - die "Unexpected column number of input line\n$_"; - } -} - -if ($utt != "") { - print STDERR "Last lattice was not printed as it might be incomplete? Missing empty line?\n"; -} - diff --git a/kaldi/local/convert_slf_parallel.sh b/kaldi/local/convert_slf_parallel.sh deleted file mode 100755 index 1b242ed..0000000 --- a/kaldi/local/convert_slf_parallel.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -# Copyright Brno University of Technology (Author: Karel Vesely) 2014. Apache 2.0. - -# This script converts lattices to HTK format compatible with other toolkits. -# We can choose to put words to nodes or arcs, as both is valid in the SLF format. - -# begin configuration section. -cmd=run.pl -dirname=lats-in-htk-slf -parallel_opts="--max-jobs-run 50" # We should limit disk stress -word_to_node=false # Words in arcs or nodes? [default:arcs] -#end configuration section. - -echo "$0 $@" - -[ -f ./path.sh ] && . ./path.sh -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: $0 [options] " - echo " Options:" - echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." - echo " --word-to-link (true|false) # put word symbols on links or nodes." - echo " --parallel-opts STR # parallelization options (def.: '--max-jobs-run 50')." - echo "e.g.:" - echo "$0 data/dev data/lang exp/tri4a/decode_dev" - exit 1; -fi - -data=$1 -lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. -dir=$3 - -model=$(dirname $dir)/final.mdl # assume model one level up from decoding dir. - -for f in $lang/words.txt $lang/phones/align_lexicon.int $model $dir/lat.1.gz; do - [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; -done - -[ ! -d $dir/$dirname/log ] && mkdir -p $dir/$dirname - -echo "$0: Converting lattices into '$dir/$dirname'" - -# Words in arcs or nodes? 
[default:arcs]
-word_to_node_arg=
-$word_to_node && word_to_node_arg="--word-to-node"
-
-nj=$(cat $dir/num_jobs)
-
-# convert the lattices (individually, gzipped)
-$cmd $parallel_opts JOB=1:$nj $dir/$dirname/log/lat_convert.JOB.log \
-  mkdir -p $dir/$dirname/JOB/ '&&' \
-  lattice-align-words-lexicon --output-error-lats=true --output-if-empty=true \
-    $lang/phones/align_lexicon.int $model "ark:gunzip -c $dir/lat.JOB.gz |" ark,t:- \| \
-  utils/int2sym.pl -f 3 $lang/words.txt \| \
-  utils/convert_slf.pl $word_to_node_arg - $dir/$dirname/JOB/ || exit 1
-
-# make list of lattices
-find -L $PWD/$dir/$dirname -name *.lat.gz > $dir/$dirname/lat_htk.scp || exit 1
-
-# check number of lattices:
-nseg=$(cat $data/segments | wc -l)
-nlat_out=$(cat $dir/$dirname/lat_htk.scp | wc -l)
-echo "segments $nseg, saved-lattices $nlat_out"
-#
-[ $nseg -ne $nlat_out ] && echo "WARNING: missing $((nseg-nlat_out)) lattices for some segments!" \
-  && exit 1
-
-echo "success, converted lats to HTK : $PWD/$dir/$dirname/lat_htk.scp"
-exit 0
-
diff --git a/kaldi/local/copy_data_dir.sh b/kaldi/local/copy_data_dir.sh
deleted file mode 100755
index f3b885c..0000000
--- a/kaldi/local/copy_data_dir.sh
+++ /dev/null
@@ -1,142 +0,0 @@
-#!/bin/bash
-
-# Copyright 2013 Johns Hopkins University (author: Daniel Povey)
-# Apache 2.0
-
-# This script operates on a directory, such as in data/train/,
-# that contains some subset of the following files:
-#  feats.scp
-#  wav.scp
-#  vad.scp
-#  spk2utt
-#  utt2spk
-#  text
-#
-# It copies to another directory, possibly adding a specified prefix or a suffix
-# to the utterance and/or speaker names. Note, the recording-ids stay the same.
-#
-
-# begin configuration section
-spk_prefix=
-utt_prefix=
-spk_suffix=
-utt_suffix=
-validate_opts=  # should rarely be needed.
-# end configuration section
-
-. utils/parse_options.sh
-
-if [ $# != 2 ]; then
-  echo "Usage: "
-  echo "  $0 [options] <srcdir> <destdir>"
-  echo "e.g.:"
-  echo " $0 --spk-prefix=1- --utt-prefix=1- data/train data/train_1"
-  echo "Options"
-  echo "   --spk-prefix=<prefix>   # Prefix for speaker ids, default empty"
-  echo "   --utt-prefix=<prefix>   # Prefix for utterance ids, default empty"
-  echo "   --spk-suffix=<suffix>   # Suffix for speaker ids, default empty"
-  echo "   --utt-suffix=<suffix>   # Suffix for utterance ids, default empty"
-  exit 1;
-fi
-
-
-export LC_ALL=C
-
-srcdir=$1
-destdir=$2
-
-if [ ! -f $srcdir/utt2spk ]; then
-  echo "copy_data_dir.sh: no such file $srcdir/utt2spk"
-  exit 1;
-fi
-
-if [ "$destdir" == "$srcdir" ]; then
-  echo "$0: this script requires <srcdir> and <destdir> to be different."
-  exit 1
-fi
-
-set -e;
-
-mkdir -p $destdir
-
-cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map
-cat $srcdir/spk2utt | awk -v p=$spk_prefix -v s=$spk_suffix '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/spk_map
-
-if [ ! -f $srcdir/utt2uniq ]; then
-  if [[ ! -z $utt_prefix || !
-z $utt_suffix ]]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $1);}' > $destdir/utt2uniq - fi -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix -v s=$utt_suffix '{printf("%s%s%s %s\n", p, $1, s, $2);}' > $destdir/utt2uniq -fi - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/feats.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/feats.scp >$destdir/feats.scp -fi - -if [ -f $srcdir/vad.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/vad.scp >$destdir/vad.scp -fi - -if [ -f $srcdir/segments ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments >$destdir/segments - cp $srcdir/wav.scp $destdir -else # no segments->wav indexed by utt. - if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp >$destdir/wav.scp - fi -fi - -if [ -f $srcdir/reco2file_and_channel ]; then - cp $srcdir/reco2file_and_channel $destdir/ -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/utt2dur ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur -fi -if [ -f $srcdir/reco2dur ]; then - if [ -f $srcdir/segments ]; then - cp $srcdir/reco2dur $destdir/reco2dur - else - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur - fi -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi -if [ -f $srcdir/cmvn.scp ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/cmvn.scp >$destdir/cmvn.scp -fi -for f in stm glm ctm; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $destdir - fi -done - -rm $destdir/spk_map $destdir/utt_map - -echo "$0: copied data from $srcdir to $destdir" - -for f in feats.scp cmvn.scp vad.scp utt2lang utt2uniq utt2dur utt2num_frames text wav.scp reco2file_and_channel stm glm ctm; do - if [ -f $destdir/$f ] && [ ! -f $srcdir/$f ]; then - echo "$0: file $f exists in dest $destdir but not in src $srcdir. Moving it to" - echo " ... $destdir/.backup/$f" - mkdir -p $destdir/.backup - mv $destdir/$f $destdir/.backup/ - fi -done - - -[ ! -f $srcdir/feats.scp ] && validate_opts="$validate_opts --no-feats" -[ ! -f $srcdir/text ] && validate_opts="$validate_opts --no-text" - -utils/validate_data_dir.sh $validate_opts $destdir diff --git a/kaldi/local/create_data_link.pl b/kaldi/local/create_data_link.pl deleted file mode 100755 index 850f29f..0000000 --- a/kaldi/local/create_data_link.pl +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2013 Guoguo Chen -# 2014 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0. -# -# This script distributes data onto different file systems by making symbolic -# links. It is supposed to use together with utils/create_split_dir.pl, which -# creates a "storage" directory that links to different file systems. -# -# If a sub-directory egs/storage does not exist, it does nothing. 
If it exists, -# then it selects pseudo-randomly a number from those available in egs/storage/* -# creates a link such as -# -# egs/egs.3.4.ark -> storage/4/egs.3.4.ark -# -use strict; -use warnings; -use File::Basename; -use File::Spec; -use Getopt::Long; - -sub GetGCD { - my ($a, $b) = @_; - while ($a != $b) { - if ($a > $b) { - $a = $a - $b; - } else { - $b = $b - $a; - } - } - return $a; -} - -my $Usage = < storage/4/egs.3.4.ark - -Usage: utils/create_data_link.pl [ ... ] - e.g.: utils/create_data_link.pl foo/bar/egs.3.4.ark foo/bar/egs.3.5.ark - (note: the dirname, e.g. foo/bar/, must be the same in all cases). - -See also utils/remove_data_links.sh -EOU - -GetOptions(); - -if (@ARGV == 0) { - die $Usage; -} - -my $example_fullpath = $ARGV[0]; - -# Check if the storage has been created. If so, do nothing. -my $dirname = dirname($example_fullpath); -if (! -d "$dirname/storage") { - exit(0); -} - -# Storage exists, create symbolic links in the next few steps. - -# First, get a list of the available storage directories, and check if they are -# properly created. -opendir(my $dh, "$dirname/storage/") || die "$0: Fail to open $dirname/storage/\n"; -my @storage_dirs = grep(/^[0-9]*$/, readdir($dh)); -closedir($dh); -my $num_storage = scalar(@storage_dirs); -for (my $x = 1; $x <= $num_storage; $x++) { - (-d "$dirname/storage/$x") || die "$0: $dirname/storage/$x does not exist\n"; -} - -# Second, get the coprime list. -my @coprimes; -for (my $n = 1; $n <= $num_storage; $n++) { - if (GetGCD($n, $num_storage) == 1) { - push(@coprimes, $n); - } -} - -my $ret = 0; - -foreach my $fullpath (@ARGV) { - if ($dirname ne dirname($fullpath)) { - die "Mismatch in directory names of arguments: $example_fullpath versus $fullpath"; - } - - # Finally, work out the directory index where we should put the data to. - my $basename = basename($fullpath); - my $filename_numbers = $basename; - $filename_numbers =~ s/[^0-9]+/ /g; - my @filename_numbers = split(" ", $filename_numbers); - my $total = 0; - my $index = 0; - foreach my $x (@filename_numbers) { - if ($index >= scalar(@coprimes)) { - $index = 0; - } - $total += $x * $coprimes[$index]; - $index++; - } - my $dir_index = $total % $num_storage + 1; - - # Make the symbolic link. - if (-e $fullpath) { - unlink($fullpath); - } - if (symlink("storage/$dir_index/$basename", $fullpath) != 1) { # failure - $ret = 1; # will exit with error status. - } -} - -exit($ret); - -## testing: -# rm -rf foo bar -# mkdir -p bar/{1,2,3,4} -# mkdir -p foo/storage -# for x in 1 2 3 4; do ln -s ../../bar/$x foo/storage/$x; done -# utils/create_data_link.pl utils/create_data_link.pl foo/1.3.ark foo/2.3.ark -# ls -l foo -# total 0 -# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 1.3.ark -> storage/3/1.3.ark -# lrwxrwxrwx 1 dpovey fax 17 Sep 2 17:41 2.3.ark -> storage/4/2.3.ark -# drwxr-xr-x 2 dpovey fax 38 Sep 2 17:40 storage diff --git a/kaldi/local/create_split_dir.pl b/kaldi/local/create_split_dir.pl deleted file mode 100755 index ab95235..0000000 --- a/kaldi/local/create_split_dir.pl +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2013 Guoguo Chen -# Apache 2.0. -# -# This script creates storage directories on different file systems, and creates -# symbolic links to those directories. For example, a command -# -# utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage -# -# will mkdir -p all of those directories, and will create links -# -# egs/storage/1 -> /export/gpu-03/egs/storage -# egs/storage/2 -> /export/gpu-03/egs/storage -# ... 
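For reference, the index computation implemented by GetGCD and the coprime list above can be restated compactly in Python; the helper name dir_index is illustrative:

import re
from math import gcd

def dir_index(basename, num_storage):
    # Coprimes of num_storage, as in the Perl script.
    coprimes = [n for n in range(1, num_storage + 1)
                if gcd(n, num_storage) == 1]
    # Weight the numeric fields of the file name by the coprimes,
    # cycling through the list, then map into [1, num_storage].
    numbers = [int(x) for x in re.findall(r"[0-9]+", basename)]
    total = sum(x * coprimes[i % len(coprimes)]
                for i, x in enumerate(numbers))
    return total % num_storage + 1

# Reproduces the link in the header comment:
print(dir_index("egs.3.4.ark", 4))  # -> 4, i.e. storage/4/egs.3.4.ark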
-# -use strict; -use warnings; -use File::Spec; -use Getopt::Long; - -my $Usage = < - e.g.: utils/create_split_dir.pl /export/gpu-0{3,4,5}/egs/storage egs/storage - -Allowed options: - --suffix : Common suffix to (string, default = "") - -See also create_data_link.pl, which is intended to work with the resulting -directory structure, and remove_data_links.sh -EOU - -my $suffix=""; -GetOptions('suffix=s' => \$suffix); - -if (@ARGV < 2) { - die $Usage; -} - -my $ans = 1; - -my $dir = pop(@ARGV); -system("mkdir -p $dir 2>/dev/null"); - -my @all_actual_storage = (); -foreach my $file (@ARGV) { - push @all_actual_storage, File::Spec->rel2abs($file . "/" . $suffix); -} - -my $index = 1; -foreach my $actual_storage (@all_actual_storage) { - my $pseudo_storage = "$dir/$index"; - - # If the symbolic link already exists, delete it. - if (-l $pseudo_storage) { - print STDERR "$0: link $pseudo_storage already exists, not overwriting.\n"; - $index++; - next; - } - - # Create the destination directory and make the link. - system("mkdir -p $actual_storage 2>/dev/null"); - if ($? != 0) { - print STDERR "$0: error creating directory $actual_storage\n"; - exit(1); - } - { # create a README file for easier deletion. - open(R, ">$actual_storage/README.txt"); - my $storage_dir = File::Spec->rel2abs($dir); - print R "# This directory is linked from $storage_dir, as part of Kaldi striped data\n"; - print R "# The full list of directories where this data resides is:\n"; - foreach my $d (@all_actual_storage) { - print R "$d\n"; - } - close(R); - } - my $ret = symlink($actual_storage, $pseudo_storage); - - # Process the returned values - $ans = $ans && $ret; - if (! $ret) { - print STDERR "Error linking $actual_storage to $pseudo_storage\n"; - } - - $index++; -} - -exit($ans == 1 ? 0 : 1); diff --git a/kaldi/local/dict_dir_add_pronprobs.sh b/kaldi/local/dict_dir_add_pronprobs.sh deleted file mode 100755 index 59ae4a4..0000000 --- a/kaldi/local/dict_dir_add_pronprobs.sh +++ /dev/null @@ -1,252 +0,0 @@ -#!/bin/bash - -# Apache 2.0. -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# 2014 Guoguo Chen -# 2015 Hainan Xu - - -# The thing that this script implements is described in the paper: -# "PRONUNCIATION AND SILENCE PROBABILITY MODELING FOR ASR" -# by Guoguo Chen et al, see -# http://www.danielpovey.com/files/2015_interspeech_silprob.pdf - -. ./path.sh || exit 1; - -# begin configuration -max_normalize=true -# end configuration - -. utils/parse_options.sh || exit 1; - -set -e - -if [[ $# -ne 3 && $# -ne 5 ]]; then - echo "Usage: $0 [options] \\" - echo " [input-sil-counts] [input-bigram-counts] " - echo " e.g.: $0 data/local/dict \\" - echo " exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \\" - echo " exp/tri3/pron_bigram_counts_nowb.txt data/local/dict_prons" - echo " e.g.: $0 data/local/dict \\" - echo " exp/tri3/pron_counts_nowb.txt data/local/dict_prons" - echo "" - echo "This script takes pronunciation counts, e.g. generated by aligning your training" - echo "data and getting the prons using steps/get_prons.sh, and creates a modified" - echo "dictionary directory with pronunciation probabilities. If the [input-sil-counts]" - echo "parameter is provided, it will also include silprobs in the generated lexicon." - echo "Options:" - echo " --max-normalize (true|false) # default true. If true," - echo " # divide each pron-prob by the" - echo " # most likely pron-prob per word." 
- exit 1; -fi - -if [ $# -eq 3 ]; then - srcdir=$1 - pron_counts=$2 - dir=$3 -elif [ $# -eq 5 ]; then - srcdir=$1 - pron_counts=$2 - sil_counts=$3 - bigram_counts=$4 - dir=$5 -fi - -if [ ! -s $pron_counts ]; then - echo "$0: expected file $pron_counts to exist"; - exit 1; -fi - -mkdir -p $dir || exit 1; -utils/validate_dict_dir.pl $srcdir; - -if [ -f $srcdir/lexicon.txt ]; then - src_lex=$srcdir/lexicon.txt - perl -ane 'print join(" ", split(" ", $_)) . "\n";' < $src_lex |\ - sort -u > $dir/lexicon.txt -elif [ -f $srcdir/lexiconp.txt ]; then - echo "$0: removing the pron-probs from $srcdir/lexiconp.txt to create $dir/lexicon.txt" - # the Perl command below normalizes the spaces (avoid double space). - src_lex=$srcdir/lexiconp.txt - awk '{$2 = ""; print $0;}' <$srcdir/lexiconp.txt |\ - perl -ane 'print join(" ", split(" " ,$_)) . "\n";' |\ - sort -u > $dir/lexicon.txt || exit 1; -fi - - -# the cat and awk commands below are implementing add-one smoothing. -cat <(awk '{print 1, $0;}' <$dir/lexicon.txt) $pron_counts | \ - awk '{ count = $1; $1 = ""; word_count[$2] += count; pron_count[$0] += count; pron2word[$0] = $2; } - END{ for (p in pron_count) { word = pron2word[p]; num = pron_count[p]; den = word_count[word]; - print num / den, p } } ' | \ - awk '{ word = $2; $2 = $1; $1 = word; print; }' | grep -v '^' |\ - sort -k1,1 -k2g,2 -k3 > $dir/lexiconp.txt - - -n_old=$(wc -l <$dir/lexicon.txt) -n_new=$(wc -l <$dir/lexiconp.txt) - -if [ "$n_old" != "$n_new" ]; then - echo "$0: number of lines differs from $dir/lexicon.txt $n_old vs $dir/lexiconp.txt $n_new" - echo "Probably something went wrong (e.g. input prons were generated from a different lexicon" - echo "than $srcdir, or you used pron_counts.txt when you should have used pron_counts_nowb.txt" - echo "or something else. Make sure the prons in $src_lex $pron_counts look" - echo "the same." - exit 1; -fi - -if $max_normalize; then - echo "$0: normalizing pronprobs so maximum is 1 for each word." - cat $dir/lexiconp.txt | awk '{if ($2 > max[$1]) { max[$1] = $2; }} END{for (w in max) { print w, max[w]; }}' > $dir/maxp.txt - - awk -v maxf=$dir/maxp.txt 'BEGIN{ while (getline $dir/lexicon_tmp.txt || exit 1; - - if ! [ $(wc -l <$dir/lexicon_tmp.txt) -eq $(wc -l <$dir/lexiconp.txt) ]; then - echo "$0: error max-normalizing pron-probs" - exit 1; - fi - mv $dir/lexicon_tmp.txt $dir/lexiconp.txt - rm $dir/maxp.txt -fi - -# Create $dir/lexiconp_silprob.txt and $dir/silprob.txt if silence counts file -# exists. The format of $dir/lexiconp_silprob.txt is: -# word pron-prob P(s_r | w) F(s_l | w) F(n_l | w) pron -# where: P(s_r | w) is the probability of silence to the right of the word -# F(s_l | w) is a factor which is greater than one if silence to the -# left of the word is more than averagely probable. -# F(n_l | w) is a factor which is greater than one if nonsilence to the -# left of the word is more than averagely probable. -if [ -n "$sil_counts" ]; then - if [ ! 
-s "$sil_counts" ]; then - echo "$0: expected file $sil_counts to exist and not empty" && exit 1; - fi - cat $sil_counts | perl -e ' - # Load silence counts - %sil_wpron = (); %nonsil_wpron = (); %wpron_sil = (); %wpron_nonsil = (); - $sil_count = 0; $nonsil_count = 0; - while () { - chomp; @col = split; @col >= 5 || die "'$0': bad line \"$_\"\n"; - $wpron = join(" ", @col[4..scalar(@col)-1]); - ($sil_wpron{$wpron}, $nonsil_wpron{$wpron}, - $wpron_sil{$wpron}, $wpron_nonsil{$wpron}) = @col[0..3]; - $sil_count += $sil_wpron{$wpron}; $nonsil_count += $nonsil_wpron{$wpron}; - } - - # Open files. - ($lexiconp, $bigram_counts, $lexiconp_silprob, $silprob) = @ARGV; - open(LP, "<$lexiconp") || die "'$0': fail to open $lexiconp\n"; - open(WPC, "<$bigram_counts") || die "'$0': fail to open $bigram_counts\n"; - open(SP, ">$silprob") || die "'$0': fail to open $silprob\n"; - open(LPSP, ">$lexiconp_silprob") || - die "'$0': fail to open $lexiconp_silprob\n"; - - # Computes P(s_r | w) in the paper. - $lambda2 = 2; # Smoothing term, \lambda_2 in the paper. - %P_w_sr = (); - %all_wprons = (); - $sil_prob = sprintf("%.2f", $sil_count / ($sil_count + $nonsil_count)); - while () { - chomp; @col = split; @col >= 3 || die "'$0': bad line \"$_\"\n"; - $word = shift @col; $pron_prob = shift @col; $pron = join(" ", @col); - unshift(@col, $word); $wpron = join(" ", @col); - - $wpron_sil_count = $wpron_sil{$wpron} + $sil_prob * $lambda2; - $wpron_nonsil_count = $wpron_nonsil{$wpron} + (1 - $sil_prob) * $lambda2; - $sil_after_prob = sprintf("%.2f", - $wpron_sil_count / ($wpron_sil_count + $wpron_nonsil_count)); - if ($sil_after_prob == "0.00") { $sil_after_prob = "0.01"; } - if ($sil_after_prob == "1.00") { $sil_after_prob = "0.99"; } - $P_w_sr{$wpron} = $sil_after_prob; - - $all_wprons{$wpron} = $pron_prob; - } - - # Reads C(v ? w) in the paper. - %wpron_pair_count = (); - while () { - chomp; @col = split("\t"); @col == 3 || die "'$0': bad line \"$_\"\n"; - $count = shift @col; $wpron1 = shift @col; $wpron2 = shift @col; - $key = "${wpron1}\t${wpron2}"; - $wpron_pair_count{$key} = $count; - } - - # Computes \bar{C}(s w) and \bar{C}(n w) in the paper. - %bar_C_s_w = (); - %bar_C_n_w = (); - foreach my $key (keys %wpron_pair_count) { - $count = $wpron_pair_count{$key}; - ($wpron1, $wpron2) = split("\t", $key); - $bar_C_s_w{$wpron2} += $count * $P_w_sr{$wpron1}; - $bar_C_n_w{$wpron2} += $count * (1 - $P_w_sr{$wpron1}); - } - - # Computes F(s_l | w) and F(n_l | w) in the paper. - $lambda3 = 2; # Smoothing term, \lambda_3 in the paper. 
- foreach my $wpron (keys %all_wprons) { - @col = split(" ", $wpron); - $word = shift @col; - $pron = join(" ", @col); - $pron_prob = $all_wprons{$wpron}; - - $F_sl_w = ($sil_wpron{$wpron} + $lambda3) / ($bar_C_s_w{$wpron} + $lambda3); - $F_nl_w = ($nonsil_wpron{$wpron} + $lambda3) / ($bar_C_n_w{$wpron} + $lambda3); - $F_sl_w = sprintf("%.2f", $F_sl_w); - $F_nl_w = sprintf("%.2f", $F_nl_w); - if ($F_sl_w == "0.00") { $F_sl_w = "0.01"; } - if ($F_nl_w == "0.00") { $F_nl_w = "0.01"; } - - print LPSP "$word $pron_prob $P_w_sr{$wpron} $F_sl_w $F_nl_w $pron\n"; - } - - # Create silprob.txt - $BOS_sil_count = $wpron_sil{""} + $sil_prob * $lambda2; - $BOS_nonsil_count = $wpron_nonsil{""} + (1 - $sil_prob) * $lambda2; - $P_BOS_sr = sprintf("%.2f", $BOS_sil_count / ($BOS_sil_count + $BOS_nonsil_count)); - $F_sl_EOS = ($sil_wpron{""} + $lambda3) / ($bar_C_s_w{""} + $lambda3); - $F_nl_EOS = ($nonsil_wpron{""} + $lambda3) / ($bar_C_n_w{""} + $lambda3); - if ($P_BOS_sr == "1.00") { $P_BOS_sr = "0.99"; } - if ($P_BOS_sr == "0.00") { $P_BOS_sr = "0.01"; } - if ($F_sl_EOS == "0.00") { $F_sl_EOS = "0.01"; } - if ($F_nl_EOS == "0.00") { $F_nl_EOS = "0.01"; } - print SP " $P_BOS_sr\n_s $F_sl_EOS\n_n $F_nl_EOS\noverall $sil_prob\n"; - ' $dir/lexiconp.txt $bigram_counts $dir/lexiconp_silprob_unsorted.txt $dir/silprob.txt - sort -k1,1 -k2g,2 -k6 $dir/lexiconp_silprob_unsorted.txt > $dir/lexiconp_silprob.txt -fi - -# now regenerate lexicon.txt from lexiconp.txt, to make sure the lines are -# in the same order. -cat $dir/lexiconp.txt | awk '{$2 = ""; print;}' | sed 's/ / /g' >$dir/lexicon.txt - - -# add mandatory files. -for f in silence_phones.txt nonsilence_phones.txt; do - if [ ! -f $srcdir/$f ]; then - echo "$0: expected $srcdir/$f to exist." - exit 1; - fi - cp $srcdir/$f $dir/ || exit 1; -done - - -# add optional files (at least, I think these are optional; would have to check the docs). -for f in optional_silence.txt extra_questions.txt; do - if [ -f $srcdir/$f ]; then - cp $srcdir/$f $dir || exit 1; - fi -done - - -echo "$0: produced dictionary directory with probabilities in $dir/" -echo "$0: validating $dir .." -sleep 1 -utils/validate_dict_dir.pl $dir || exit 1; - - -echo "Some low-probability prons include: " -echo "# sort -k2,2 -n $dir/lexiconp.txt | head -n 8" - -sort -k2,2 -n $dir/lexiconp.txt | head -n 8 - -exit 0 diff --git a/kaldi/local/eps2disambig.pl b/kaldi/local/eps2disambig.pl deleted file mode 100755 index 47f90b2..0000000 --- a/kaldi/local/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . 
- "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/kaldi/local/filt.py b/kaldi/local/filt.py deleted file mode 100755 index 2847c00..0000000 --- a/kaldi/local/filt.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python - -# Apache 2.0 - -import sys - -vocab=set() -with open(sys.argv[1]) as vocabfile: - for line in vocabfile: - vocab.add(line.strip()) - -with open(sys.argv[2]) as textfile: - for line in textfile: - print " ".join(map(lambda word: word if word in vocab else '', line.strip().split())) diff --git a/kaldi/local/filter_scp.pl b/kaldi/local/filter_scp.pl deleted file mode 100755 index b76d37f..0000000 --- a/kaldi/local/filter_scp.pl +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes a list of utterance-ids or any file whose first field -# of each line is an utterance-id, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in id_list. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - -$exclude = 0; -$field = 1; -$shifted = 0; - -do { - $shifted=0; - if ($ARGV[0] eq "--exclude") { - $exclude = 1; - shift @ARGV; - $shifted=1; - } - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted=1 - } -} while ($shifted); - -if(@ARGV < 1 || @ARGV > 2) { - die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . - "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . - "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . - "only the lines that were *not* in id_list.\n" . - "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . - "If your older scripts (written before Oct 2014) stopped working and you used the\n" . - "-f option, add 1 to the argument.\n" . - "See also: utils/filter_scp.pl .\n"; -} - - -$idlist = shift @ARGV; -open(F, "<$idlist") || die "Could not open id-list file $idlist"; -while() { - @A = split; - @A>=1 || die "Invalid id-list file line $_"; - $seen{$A[0]} = 1; -} - -if ($field == 1) { # Treat this as special case, since it is common. - while(<>) { - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. 
- if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { - print $_; - } - } -} else { - while(<>) { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { - print $_; - } - } -} - -# tests: -# the following should print "foo 1" -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) -# the following should print "bar 2". -# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) diff --git a/kaldi/local/filter_scps.pl b/kaldi/local/filter_scps.pl deleted file mode 100755 index 418f8f7..0000000 --- a/kaldi/local/filter_scps.pl +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation -# 2012-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Xiaohui Zhang - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# This script takes multiple lists of utterance-ids or any file whose first field -# of each line is an utterance-id, as filters, and filters an scp -# file (or any file whose "n-th" field is an utterance id), printing -# out only those lines whose "n-th" field is in filter. The index of -# the "n-th" field is 1, by default, but can be changed by using -# the -f switch - - -$field = 1; -$shifted = 0; -$print_warnings = 1; -do { - $shifted=0; - if ($ARGV[0] eq "-f") { - $field = $ARGV[1]; - shift @ARGV; shift @ARGV; - $shifted = 1; - } - if (@ARGV[0] eq "--no-warn") { - $print_warnings = 0; - shift @ARGV; - $shifted = 1; - } -} while ($shifted); - - -if(@ARGV != 4) { - die "Usage: utils/filter_scps.pl [-f ] \n" . - "e.g.: utils/filter_scps.pl JOB=1:10 data/train/split10/JOB/spk2utt data/train/feats.scp data/train/split10/JOB/feats.scp\n" . - "similar to utils/filter_scp.pl, but it uses multiple filters and output multiple filtered files.\n". - "The -f option specifies the field in that we filter on (default: 1)." . - "See also: utils/filter_scp.pl\n"; -} - -if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:10 - $jobname = $1; - $jobstart = $2; - $jobend = $3; - shift; - if ($jobstart > $jobend) { - die "filter_scps.pl: invalid job range $ARGV[0]"; - } -} else { - die "filter_scps.pl: bad job-range specifier $ARGV[0]: expected e.g. JOB=1:10"; -} - -$idlist = shift @ARGV; - -if ($idlist !~ m/$jobname/ && - $jobend > $jobstart) { - print STDERR "filter_scps.pl: you are trying to use multiple filter files as filter patterns but " - . "you are providing just one filter file ($idlist)\n"; - exit(1); -} - - -$infile = shift @ARGV; - -$outfile = shift @ARGV; - -if ($outfile !~ m/$jobname/ && $jobend > $jobstart) { - print STDERR "filter_scps.pl: you are trying to create multiple filtered files but " - . "you are providing just one output file ($outfile)\n"; - exit(1); -} - -# This hashes from the id (e.g. utterance-id) to an array of the relevant -# job-ids (which are integers). 
In any normal use-case, this array will contain -# exactly one job-id for any given id, but we want to be agnostic about this. -%id2jobs = ( ); - -# Some variables that we set to produce a warning. -$warn_uncovered = 0; -$warn_multiply_covered = 0; - -for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { - $idlist_n = $idlist; - $idlist_n =~ s/$jobname/$jobid/g; - - open(F, "<$idlist_n") || die "Could not open id-list file $idlist_n"; - - while() { - @A = split; - @A >= 1 || die "Invalid line $_ in id-list file $idlist_n"; - $id = $A[0]; - if (! defined $id2jobs{$id}) { - $id2jobs{$id} = [ ]; # new anonymous array. - } - push @{$id2jobs{$id}}, $jobid; - } - close(F); -} - -# job2output hashes from the job-id, to an anonymous array containing -# a sequence of output lines. -%job2output = ( ); -for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { - $job2output{$jobid} = [ ]; # new anonymous array. -} - -open (F, "< $infile") or die "Can't open $infile for read: $!"; -while () { - if ($field == 1) { # Treat this as special case, since it is common. - $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; - # $1 is what we filter on. - $id = $1; - } else { - @A = split; - @A > 0 || die "Invalid scp file line $_"; - @A >= $field || die "Invalid scp file line $_"; - $id = $A[$field-1]; - } - if ( ! defined $id2jobs{$id}) { - $warn_uncovered = 1; - } else { - @jobs = @{$id2jobs{$id}}; # this dereferences the array reference. - if (@jobs > 1) { - $warn_multiply_covered = 1; - } - foreach $job_id (@jobs) { - if (!defined $job2output{$job_id}) { - die "Likely code error"; - } - push @{$job2output{$job_id}}, $_; - } - } -} -close(F); - -for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { - $outfile_n = $outfile; - $outfile_n =~ s/$jobname/$jobid/g; - open(FW, ">$outfile_n") || die "Could not open output file $outfile_n"; - $printed = 0; - foreach $line (@{$job2output{$jobid}}) { - print FW $line; - $printed = 1; - } - if (!printed) { - print STDERR "filter_scps.pl: warning: output to $outfile_n is empty\n"; - } - close(FW); -} - -if ($warn_uncovered && $print_warnings) { - print STDERR "filter_scps.pl: warning: some input lines did not get output\n"; -} -if ($warn_multiply_covered && $print_warnings) { - print STDERR "filter_scps.pl: warning: some input lines were output to multiple files [OK if splitting per utt] " . - join(" ", @ARGV) . "\n"; -} diff --git a/kaldi/local/find_arpa_oovs.pl b/kaldi/local/find_arpa_oovs.pl deleted file mode 100755 index cdf6d73..0000000 --- a/kaldi/local/find_arpa_oovs.pl +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: find_arpa_oovs.pl words.txt [lm.arpa]\n"; - # This program finds words in the arpa file that are not symbols - # in the OpenFst-format symbol table words.txt. It prints them - # on the standard output, one per line. 
-} - -$symtab = shift @ARGV; -open(S, "<$symtab") || die "Failed opening symbol table file $symtab\n"; -while(){ - @A = split(" ", $_); - @A == 2 || die "Bad line in symbol table file: $_"; - $seen{$A[0]} = 1; -} - -$found_data=0; -$curgram=0; -while(<>) { # Find the \data\ marker. - if(m:^\\data\\\s*$:) { $found_data=1; last; } -} - -if ($found_data==0) { - print STDERR "find_arpa_oovs.pl: found no \\data\\ marker in the ARPA input.\n"; - exit(1); -} - -while(<>) { - if(m/^\\(\d+)\-grams:\s*$/) { - $curgram = $1; - if($curgram > 1) { - last; # This is an optimization as we can get the vocab from the 1-grams - } - } elsif($curgram > 0) { - @A = split(" ", $_); - if(@A > 1) { - shift @A; - for($n=0;$n<$curgram;$n++) { - $word = $A[$n]; - if(!defined $word) { print STDERR "Unusual line $_ (line $.) in arpa file.\n"; } - $in_arpa{$word} = 1; - } - } else { - if(@A > 0 && $A[0] !~ m:\\end\\:) { - print STDERR "Unusual line $_ (line $.) in arpa file\n"; - } - } - } -} - -foreach $w (keys %in_arpa) { - if(!defined $seen{$w} && $w ne "" && $w ne "") { - print "$w\n"; - } -} diff --git a/kaldi/local/fix_ctm.sh b/kaldi/local/fix_ctm.sh deleted file mode 100755 index 7bab9f6..0000000 --- a/kaldi/local/fix_ctm.sh +++ /dev/null @@ -1,32 +0,0 @@ -#! /bin/bash - -stmfile=$1 -ctmfile=$2 - -segments_stm=`cat $stmfile | cut -f 1 -d ' ' | sort -u` -segments_ctm=`cat $ctmfile | cut -f 1 -d ' ' | sort -u` - -segments_stm_count=`echo "$segments_stm" | wc -l ` -segments_ctm_count=`echo "$segments_ctm" | wc -l ` - -#echo $segments_stm_count -#echo $segments_ctm_count - -if [ "$segments_stm_count" -gt "$segments_ctm_count" ] ; then - pp=$( diff <(echo "$segments_stm") <(echo "$segments_ctm" ) | grep "^<" | sed "s/^< *//g") - ( - for elem in $pp ; do - echo "$elem 1 0 0 EMPTY_RECOGNIZED_PHRASE" - done - ) >> $ctmfile - echo "FIXED CTM FILE" - exit 0 -elif [ "$segments_stm_count" -lt "$segments_ctm_count" ] ; then - echo "Segment STM count: $segments_stm_count" - echo "Segment CTM count: $segments_ctm_count" - echo "FAILURE FIXING CTM FILE" - exit 1 -else - exit 0 -fi - diff --git a/kaldi/local/fix_data_dir.sh b/kaldi/local/fix_data_dir.sh deleted file mode 100755 index 103a417..0000000 --- a/kaldi/local/fix_data_dir.sh +++ /dev/null @@ -1,189 +0,0 @@ -#!/bin/bash - -# This script makes sure that only the segments present in -# all of "feats.scp", "wav.scp" [if present], segments [if present] -# text, and utt2spk are present in any of them. -# It puts the original contents of data-dir into -# data-dir/.backup - -utt_extra_files= -spk_extra_files= - -. utils/parse_options.sh - -if [ $# != 1 ]; then - echo "Usage: utils/data/fix_data_dir.sh " - echo "e.g.: utils/data/fix_data_dir.sh data/train" - echo "This script helps ensure that the various files in a data directory" - echo "are correctly sorted and filtered, for example removing utterances" - echo "that have no features (if feats.scp is present)" - exit 1 -fi - -data=$1 -mkdir -p $data/.backup - -[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; - -[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; - -set -e -o pipefail -u - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted { - file=$1 - sort -k1,1 -u <$file >$file.tmp - if ! 
cmp -s $file $file.tmp; then - echo "$0: file $1 is not in sorted order or not unique, sorting it" - mv $file.tmp $file - else - rm $file.tmp - fi -} - -for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - check_sorted $data/$x - fi -done - - -function filter_file { - filter=$1 - file_to_filter=$2 - cp $file_to_filter ${file_to_filter}.tmp - utils/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter - if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then - length1=$(cat ${file_to_filter}.tmp | wc -l) - length2=$(cat ${file_to_filter} | wc -l) - if [ $length1 -ne $length2 ]; then - echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." - fi - fi - rm $file_to_filter.tmp -} - -function filter_recordings { - # We call this once before the stage when we filter on utterance-id, and once - # after. - - if [ -f $data/segments ]; then - # We have a segments file -> we need to filter this and the file wav.scp, and - # reco2file_and_utt, if it exists, to make sure they have the same list of - # recording-ids. - - if [ ! -f $data/wav.scp ]; then - echo "$0: $data/segments exists but not $data/wav.scp" - exit 1; - fi - awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings - n1=$(cat $tmpdir/recordings | wc -l) - [ ! -s $tmpdir/recordings ] && \ - echo "Empty list of recordings (bad file $data/segments)?" && exit 1; - utils/filter_scp.pl $data/wav.scp $tmpdir/recordings > $tmpdir/recordings.tmp - mv $tmpdir/recordings.tmp $tmpdir/recordings - - - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - filter_file $tmpdir/recordings $data/segments - cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments - rm $data/segments.tmp - - filter_file $tmpdir/recordings $data/wav.scp - [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel - [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur - true - fi -} - -function filter_speakers { - # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... - utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do - f=$data/$s - if [ -f $f ]; then - filter_file $f $tmpdir/speakers - fi - done - - filter_file $tmpdir/speakers $data/spk2utt - utils/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - - for s in cmvn.scp spk2gender $spk_extra_files; do - f=$data/$s - if [ -f $f ]; then - filter_file $tmpdir/speakers $f - fi - done -} - -function filter_utts { - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - - ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; - - ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ - echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; - - ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ - echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; - - if [ -f $data/utt2uniq ]; then - ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ - echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; - fi - - maybe_wav= - maybe_reco2dur= - [ ! 
-f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. - [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts - for x in feats.scp text segments utt2lang $maybe_wav; do - if [ -f $data/$x ]; then - utils/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp - mv $tmpdir/utts.tmp $tmpdir/utts - fi - done - [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ - rm $tmpdir/utts && exit 1; - - - if [ -f $data/utt2spk ]; then - new_nutts=$(cat $tmpdir/utts | wc -l) - old_nutts=$(cat $data/utt2spk | wc -l) - if [ $new_nutts -ne $old_nutts ]; then - echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" - else - echo "fix_data_dir.sh: kept all $old_nutts utterances." - fi - fi - - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do - if [ -f $data/$x ]; then - cp $data/$x $data/.backup/$x - if ! cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then - utils/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x - fi - fi - done - -} - -filter_recordings -filter_speakers -filter_utts -filter_speakers -filter_recordings - -utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt - -echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/kaldi/local/format_lm.sh b/kaldi/local/format_lm.sh deleted file mode 100755 index 1558c44..0000000 --- a/kaldi/local/format_lm.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2012 Arnab Ghoshal -# 2010-2011 Microsoft Corporation -# 2016-2018 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -set -e - -if [ $# -ne 4 ]; then - echo "Usage: $0 " - echo "E.g.: $0 data/lang data/local/lm/foo.kn.gz data/local/dict/lexicon.txt data/lang_test" - echo "Convert ARPA-format language models to FSTs."; - exit 1; -fi - -lang_dir=$1 -lm=$2 -lexicon=$3 -out_dir=$4 -mkdir -p $out_dir - -[ -f ./path.sh ] && . ./path.sh - -echo "Converting '$lm' to FST" - -# the -ef test checks if source and target directory -# are two different directories in the filesystem -# if they are the same, the section guarded by the test -# would be actually harmfull (deleting the phones/ subdirectory) -if [ -e $out_dir ] && [ ! 
$lang_dir -ef $out_dir ] ; then - if [ -e $out_dir/phones ] ; then - rm -r $out_dir/phones - fi - - for f in phones.txt words.txt topo L.fst L_disambig.fst phones oov.int oov.txt; do - cp -r $lang_dir/$f $out_dir - done -fi - -lm_base=$(basename $lm '.gz') -gunzip -c $lm \ - | arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=$out_dir/words.txt - $out_dir/G.fst -set +e -fstisstochastic $out_dir/G.fst -set -e -# The output is like: -# 9.14233e-05 -0.259833 -# we do expect the first of these 2 numbers to be close to zero (the second is -# nonzero because the backoff weights make the states sum to >1). - -# Everything below is only for diagnostic. -# Checking that G has no cycles with empty words on them (e.g. , ); -# this might cause determinization failure of CLG. -# #0 is treated as an empty word. -mkdir -p $out_dir/tmpdir.g -awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }} - END{print "0 0 #0 #0"; print "0";}' \ - < "$lexicon" > $out_dir/tmpdir.g/select_empty.fst.txt - -fstcompile --isymbols=$out_dir/words.txt --osymbols=$out_dir/words.txt \ - $out_dir/tmpdir.g/select_empty.fst.txt \ - | fstarcsort --sort_type=olabel \ - | fstcompose - $out_dir/G.fst > $out_dir/tmpdir.g/empty_words.fst - -fstinfo $out_dir/tmpdir.g/empty_words.fst | grep cyclic | grep -w 'y' \ - && echo "Language model has cycles with empty words" && exit 1 - -rm -r $out_dir/tmpdir.g - - -echo "Succeeded in formatting LM: '$lm'" diff --git a/kaldi/local/format_lm_sri.sh b/kaldi/local/format_lm_sri.sh deleted file mode 100755 index 4ef31d9..0000000 --- a/kaldi/local/format_lm_sri.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Arnab Ghoshal -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# Begin configuration section. -srilm_opts="-subset -prune-lowprobs -unk -tolower" -# end configuration sections - - -. utils/parse_options.sh - -if [ $# -ne 4 ] && [ $# -ne 3 ]; then - echo "Usage: $0 [options] [] " - echo "The argument is no longer needed but is supported for back compatibility" - echo "E.g.: utils/format_lm_sri.sh data/lang data/local/lm/foo.kn.gz data/local/dict/lexicon.txt data/lang_test" - echo "Converts ARPA-format language models to FSTs. Change the LM vocabulary using SRILM." - echo "Note: if you want to just convert ARPA LMs to FSTs, there is a simpler way to do this" - echo "that doesn't require SRILM: see utils/format_lm.sh" - echo "options:" - echo " --help # print this message and exit" - echo " --srilm-opts STRING # options to pass to SRILM tools (default: '$srilm_opts')" - exit 1; -fi - - -if [ $# -eq 4 ] ; then - lang_dir=$1 - lm=$2 - lexicon=$3 - out_dir=$4 -else - lang_dir=$1 - lm=$2 - out_dir=$3 -fi - -mkdir -p $out_dir - -for f in $lm $lang_dir/words.txt; do - if [ ! -f $f ]; then - echo "$0: expected input file $f to exist." - exit 1; - fi -done - -[ -f ./path.sh ] && . 
./path.sh - -loc=`which change-lm-vocab` -if [ -z $loc ]; then - echo You appear to not have SRILM tools installed. - echo cd to $KALDI_ROOT/tools and run extras/install_srilm.sh. - exit 1 -fi - -echo "Converting '$lm' to FST" -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT - -mkdir -p $out_dir -cp -r $lang_dir/* $out_dir || exit 1; - -lm_base=$(basename $lm '.gz') -awk '{print $1}' $out_dir/words.txt > $tmpdir/voc || exit 1; - -# Change the LM vocabulary to be the intersection of the current LM vocabulary -# and the set of words in the pronunciation lexicon. This also renormalizes the -# LM by recomputing the backoff weights, and remove those ngrams whose -# probabilities are lower than the backed-off estimates. -change-lm-vocab -vocab $tmpdir/voc -lm $lm -write-lm - $srilm_opts | \ - arpa2fst --disambig-symbol=#0 \ - --read-symbol-table=$out_dir/words.txt - $out_dir/G.fst || exit 1 - -fstisstochastic $out_dir/G.fst - -# The output is like: -# 9.14233e-05 -0.259833 -# we do expect the first of these 2 numbers to be close to zero (the second is -# nonzero because the backoff weights make the states sum to >1). - -echo "Succeeded in formatting LM '$lm' -> '$out_dir/G.fst'" diff --git a/kaldi/local/gen_topo.pl b/kaldi/local/gen_topo.pl deleted file mode 100755 index 1c02ed0..0000000 --- a/kaldi/local/gen_topo.pl +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2012 Johns Hopkins University (author: Daniel Povey) - -# Generate a topology file. This allows control of the number of states in the -# non-silence HMMs, and in the silence HMMs. - -if (@ARGV != 4) { - print STDERR "Usage: utils/gen_topo.pl \n"; - print STDERR "e.g.: utils/gen_topo.pl 3 5 4:5:6:7:8:9:10 1:2:3\n"; - exit (1); -} - -($num_nonsil_states, $num_sil_states, $nonsil_phones, $sil_phones) = @ARGV; - -( $num_nonsil_states >= 1 && $num_nonsil_states <= 100 ) || - die "Unexpected number of nonsilence-model states $num_nonsil_states\n"; -(( $num_sil_states == 1 || $num_sil_states >= 3) && $num_sil_states <= 100 ) || - die "Unexpected number of silence-model states $num_sil_states\n"; - -$nonsil_phones =~ s/:/ /g; -$sil_phones =~ s/:/ /g; -$nonsil_phones =~ m/^\d[ \d]+$/ || die "$0: bad arguments @ARGV\n"; -$sil_phones =~ m/^\d[ \d]*$/ || die "$0: bad arguments @ARGV\n"; - -print "\n"; -print "\n"; -print "\n"; -print "$nonsil_phones\n"; -print "\n"; -for ($state = 0; $state < $num_nonsil_states; $state++) { - $statep1 = $state+1; - print " $state $state $state 0.75 $statep1 0.25 \n"; -} -print " $num_nonsil_states \n"; # non-emitting final state. -print "\n"; -# Now silence phones. They have a different topology-- apart from the first and -# last states, it's fully connected, as long as you have >= 3 states. - -if ($num_sil_states > 1) { - $transp = 1.0 / ($num_sil_states-1); - print "\n"; - print "\n"; - print "$sil_phones\n"; - print "\n"; - print " 0 0 "; - for ($nextstate = 0; $nextstate < $num_sil_states-1; $nextstate++) { # Transitions to all but last - # emitting state. - print " $nextstate $transp "; - } - print "\n"; - for ($state = 1; $state < $num_sil_states-1; $state++) { # the central states all have transitions to - # themselves and to the last emitting state. - print " $state $state "; - for ($nextstate = 1; $nextstate < $num_sil_states; $nextstate++) { - print " $nextstate $transp "; - } - print "\n"; - } - # Final emitting state (non-skippable). 
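The print statements in gen_topo.pl emit Kaldi's angle-bracketed topology markup, which has not survived in this rendering of the patch; that is why the strings above look empty. Below, a sketch of what the non-silence entry is meant to contain, with the tag names being a reconstruction of the standard Kaldi topology format (the 0.75 self-loop and 0.25 forward probabilities that do survive corroborate it):

    def nonsilence_topology_entry(num_states, phone_ids):
        # one TopologyEntry: num_states emitting states, each with a 0.75
        # self-loop and a 0.25 forward transition, plus a non-emitting final
        lines = ["<TopologyEntry>", "<ForPhones>",
                 " ".join(str(p) for p in phone_ids), "</ForPhones>"]
        for s in range(num_states):
            lines.append("<State> %d <PdfClass> %d "
                         "<Transition> %d 0.75 <Transition> %d 0.25 </State>"
                         % (s, s, s, s + 1))
        lines.append("<State> %d </State>" % num_states)  # non-emitting final
        lines.append("</TopologyEntry>")
        return "\n".join(lines)

    print(nonsilence_topology_entry(3, [4, 5, 6]))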
- $state = $num_sil_states-1; - print " $state $state $state 0.75 $num_sil_states 0.25 \n"; - # Final nonemitting state: - print " $num_sil_states \n"; - print "\n"; -} else { - print "\n"; - print "\n"; - print "$sil_phones\n"; - print "\n"; - print " 0 0 "; - print " 0 0.75 "; - print " 1 0.25 "; - print "\n"; - print " $num_sil_states \n"; # non-emitting final state. - print "\n"; -} - -print "\n"; diff --git a/kaldi/local/generate_vctk_wav.py b/kaldi/local/generate_vctk_wav.py deleted file mode 100755 index 7bb1838..0000000 --- a/kaldi/local/generate_vctk_wav.py +++ /dev/null @@ -1,22 +0,0 @@ -import os -import sys - -# wav.scp -with open(os.path.join(sys.argv[2], 'wav.scp'), 'w') as f: - for root, directory, files in os.walk(sys.argv[1]): - for file in files: - utt = file[:-4].split('_')[0] - seg = file[:-4].split('_')[1] - key = utt + '-' + utt + '-' + seg - rxfile = os.path.join(root, file) - #f.write('%s %s\n' % (key, rxfile)) - f.write('%s sox %s -t wav -c 1 -r 16000 -b 16 -e signed-integer - |\n' % (key, rxfile)) - -# utt2spk -with open(os.path.join(sys.argv[2], 'utt2spk'), 'w') as f: - for root, directory, files in os.walk(sys.argv[1]): - for file in files: - utt = file[:-4].split('_')[0] - seg = file[:-4].split('_')[1] - key = utt + '-' + utt + '-' + seg - f.write('%s %s\n' % (key, utt)) diff --git a/kaldi/local/get_spk_emb.py b/kaldi/local/get_spk_emb.py deleted file mode 100755 index 1b7cb29..0000000 --- a/kaldi/local/get_spk_emb.py +++ /dev/null @@ -1,29 +0,0 @@ -from kaldi_io import read_vec_flt, write_vec_flt, open_or_fd, write_mat -import sys -import numpy as np -from collections import defaultdict - -dev_test_spk = ['p311', 'p226', 'p303', 'p234', 'p302', 'p237', 'p294', 'p225'] - -with open(sys.argv[1], 'r') as f: - content = f.readlines() -content = [x.strip() for x in content] - -spk2mat = defaultdict(list) -for line in content: - (key,rxfile) = line.split() - spk = key.split('-')[0] - if spk in dev_test_spk: - seg = int(key.split('-')[2]) - if seg < 25: - continue - spk2mat[spk].append(read_vec_flt(rxfile)) - -out_file = sys.argv[2] -ark_scp_output = 'ark:| copy-feats --compress=true ark:- ark,scp:' + out_file + '.ark,' + out_file + '.scp' -with open_or_fd(ark_scp_output, 'wb') as f: - for spk,mat in spk2mat.items(): - spk_emb = np.mean(mat, axis=0).reshape(-1, 1) - #print(spk) - #print(spk_emb.shape) - write_mat(f, spk_emb, key=spk) diff --git a/kaldi/local/get_spk_emb_2.py b/kaldi/local/get_spk_emb_2.py deleted file mode 100755 index 2d6825b..0000000 --- a/kaldi/local/get_spk_emb_2.py +++ /dev/null @@ -1,45 +0,0 @@ -from kaldi_io import read_vec_flt, write_vec_flt, open_or_fd, write_mat -import sys -import numpy as np -from collections import defaultdict - -# first read the dev/test set -with open(sys.argv[3], 'r') as f: - content = f.readlines() -content = [x.strip() for x in content] - -dev_test_spk = defaultdict(list) -for uttid in content: - spk = uttid.split('_')[0] - dev_test_spk[spk].append(uttid) - -# read utterance embeddings -with open(sys.argv[1], 'r') as f: - content = f.readlines() -content = [x.strip() for x in content] - -# speaker to utterances mapping -spk2mat = defaultdict(list) -for line in content: - (key,rxfile) = line.split() - spk = key.split('-')[0] - if spk in dev_test_spk.keys(): - uttid = key.split('-')[1] + '_' + key.split('-')[2] - if uttid not in dev_test_spk[spk]: - continue - spk2mat[spk].append(read_vec_flt(rxfile)) - -#for i in spk2mat.keys(): -# if i in dev_test_spk.keys(): -# print(len(spk2mat[i])) - -# create speaker embeddings 
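Both get_spk_emb scripts reduce to one operation: average a speaker's utterance-level embedding vectors (they differ only in whether the result is written back through kaldi_io as a one-column matrix or as a flat vector). A dependency-light sketch of that core step, with spk2vecs standing in for the vectors read from the scp file:

    import numpy as np

    def speaker_embeddings(spk2vecs):
        # spk2vecs: speaker -> list of equal-length 1-D numpy arrays
        return {spk: np.mean(np.stack(vecs), axis=0)
                for spk, vecs in spk2vecs.items()}

    emb = speaker_embeddings({"p225": [np.array([1.0, 2.0]),
                                       np.array([3.0, 4.0])]})
    print(emb["p225"])  # -> [2. 3.]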
-out_file = sys.argv[2] -ark_scp_output = 'ark:| copy-vector ark:- ark,scp:' + out_file + '.ark,' + out_file + '.scp' -with open_or_fd(ark_scp_output, 'wb') as f: - for spk,mat in spk2mat.items(): - spk_emb = np.mean(mat, axis=0).reshape(-1,) # get speaker embedding (vector) - #print(spk_emb.shape) - #print(spk) - #print(spk_emb.shape) - write_vec_flt(f, spk_emb, key=spk) diff --git a/kaldi/local/get_utt2num_frames.sh b/kaldi/local/get_utt2num_frames.sh deleted file mode 100755 index 9ef66a7..0000000 --- a/kaldi/local/get_utt2num_frames.sh +++ /dev/null @@ -1,47 +0,0 @@ -#! /bin/bash - -# Copyright 2016 Vimal Manohar -# Apache 2.0. - -cmd=run.pl -nj=4 - -frame_shift=0.01 -frame_overlap=0.015 - -. utils/parse_options.sh -. ./path.sh - -if [ $# -ne 1 ]; then - echo "This script writes a file utt2num_frames with the " - echo "number of frames in each utterance as measured based on the " - echo "duration of the utterances (in utt2dur) and the specified " - echo "frame_shift and frame_overlap." - echo "Usage: $0 " - exit 1 -fi - -data=$1 - -if [ -s $data/utt2num_frames ]; then - echo "$0: $data/utt2num_frames already present!" - exit 0; -fi - -if [ ! -f $data/feats.scp ]; then - utils/data/get_utt2dur.sh $data - awk -v fs=$frame_shift -v fovlp=$frame_overlap \ - '{print $1" "int( ($2 - fovlp) / fs)}' $data/utt2dur > $data/utt2num_frames - exit 0 -fi - -utils/split_data.sh --per-utt $data $nj || exit 1 -$cmd JOB=1:$nj $data/log/get_utt2num_frames.JOB.log \ - feat-to-len scp:$data/split${nj}utt/JOB/feats.scp ark,t:$data/split${nj}utt/JOB/utt2num_frames || exit 1 - -for n in `seq $nj`; do - cat $data/split${nj}utt/$n/utt2num_frames -done > $data/utt2num_frames - -echo "$0: Computed and wrote $data/utt2num_frames" - diff --git a/kaldi/local/int2sym.pl b/kaldi/local/int2sym.pl deleted file mode 100755 index d618939..0000000 --- a/kaldi/local/int2sym.pl +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0. - -undef $field_begin; -undef $field_end; - - -if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } -} -$symtab = shift @ARGV; -if(!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input] > output\n" . - "options: [-f (|-)]\n" . - "e.g.: -f 2, or -f 3-4\n"; - exit(1); -} - -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $int2sym{$A[1]} = $A[0]; -} - -sub int2sym { - my $a = shift @_; - my $pos = shift @_; - if($a !~ m:^\d+$:) { # not all digits.. - $pos1 = $pos+1; # make it one-based. 
- die "int2sym.pl: found noninteger token $a [in position $pos1]\n"; - } - $s = $int2sym{$a}; - if(!defined ($s)) { - die "int2sym.pl: integer $a not in symbol table $symtab."; - } - return $s; -} - -$error = 0; -while (<>) { - @A = split(" ", $_); - for ($pos = 0; $pos <= $#A; $pos++) { - $a = $A[$pos]; - if ( (!defined $field_begin || $pos >= $field_begin) - && (!defined $field_end || $pos <= $field_end)) { - $a = int2sym($a, $pos); - } - print $a . " "; - } - print "\n"; -} - - - diff --git a/kaldi/local/kwslist_post_process.pl b/kaldi/local/kwslist_post_process.pl deleted file mode 100755 index 8f8fcf3..0000000 --- a/kaldi/local/kwslist_post_process.pl +++ /dev/null @@ -1,291 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen) -# Apache 2.0. -# - -use strict; -use warnings; -use Getopt::Long; - -sub ReadKwslist { - my $kwslist_in = shift @_; - - my $source = "STDIN"; - if ($kwslist_in ne "-") { - open(I, "<$kwslist_in") || die "$0: Fail to open kwslist $kwslist_in\n"; - $source = "I"; - } - - # Read in the kwslist and parse it. Note that this is a naive parse -- I simply - # assume that the kwslist is "properly" generated - my @KWS; - my (@info, $kwid, $tbeg, $dur, $file, $score, $channel); - my ($kwlist_filename, $language, $system_id) = ("", "", ""); - while (<$source>) { - chomp; - - if (/[0]\" language=\"$info->[1]\" system_id=\"$info->[2]\">\n"; - my $prev_kw = ""; - foreach my $kwentry (@{$KWS}) { - if ($prev_kw ne $kwentry->[0]) { - if ($prev_kw ne "") {$kwslist .= " \n";} - $kwslist .= " [0]\" oov_count=\"0\">\n"; - $prev_kw = $kwentry->[0]; - } - $kwslist .= " [1]\" channel=\"$kwentry->[2]\" tbeg=\"$kwentry->[3]\" dur=\"$kwentry->[4]\" score=\"$kwentry->[5]\" decision=\"$kwentry->[6]\""; - if (defined($kwentry->[7])) {$kwslist .= " threshold=\"$kwentry->[7]\"";} - if (defined($kwentry->[8])) {$kwslist .= " raw_score=\"$kwentry->[8]\"";} - $kwslist .= "/>\n"; - } - $kwslist .= " \n"; - $kwslist .= "\n"; - - return $kwslist; -} - -sub KwslistOutputSort { - if ($a->[0] ne $b->[0]) { - if ($a->[0] =~ m/[0-9]+$/ and $b->[0] =~ m/[0-9]+$/) { - ($a->[0] =~ /([0-9]*)$/)[0] <=> ($b->[0] =~ /([0-9]*)$/)[0] - } else { - $a->[0] cmp $b->[0]; - } - } elsif ($a->[5] ne $b->[5]) { - $b->[5] <=> $a->[5]; - } else { - $a->[1] cmp $b->[1]; - } -} -sub KwslistDupSort { - my ($a, $b, $duptime) = @_; - if ($a->[0] ne $b->[0]) { - $a->[0] cmp $b->[0]; - } elsif ($a->[1] ne $b->[1]) { - $a->[1] cmp $b->[1]; - } elsif ($a->[2] ne $b->[2]) { - $a->[2] cmp $b->[2]; - } elsif (abs($a->[3]-$b->[3]) >= $duptime){ - $a->[3] <=> $b->[3]; - } elsif ($a->[5] ne $b->[5]) { - $b->[5] <=> $a->[5]; - } else { - $b->[4] <=> $a->[4]; - } -} - -my $Usage = < - e.g.: kwslist_post_process.pl kwslist.in.xml kwslist.out.xml - -Allowed options: - --beta : Beta value when computing ATWV (float, default = 999.9) - --digits : How many digits should the score use (int, default = "infinite") - --duptime : Tolerance for duplicates (float, default = 0.5) - --duration : Duration of the audio (Actural length/2) (float, default = 3600) - --normalize : Normalize scores or not (boolean, default = false) - --Ntrue-scale : Keyword independent scale factor for Ntrue (float, default = 1.0) - --remove-dup : Remove duplicates (boolean, default = false) - --remove-NO : Remove the "NO" decision instances (boolean, default = false) - --verbose : Verbose level (higher --> more kws section) (integer, default 0) - --YES-cutoff : Only keep "\$YES-cutoff" yeses for each kw (int, default = -1) - -EOU - 
-my $beta = 999.9; -my $duration = 3600; -my $normalize = "false"; -my $verbose = 0; -my $Ntrue_scale = 1.0; -my $remove_dup = "false"; -my $duptime = 0.5; -my $remove_NO = "false"; -my $digits = 0; -my $YES_cutoff = -1; -GetOptions('beta=f' => \$beta, - 'duration=f' => \$duration, - 'normalize=s' => \$normalize, - 'verbose=i' => \$verbose, - 'Ntrue-scale=f' => \$Ntrue_scale, - 'remove-dup=s' => \$remove_dup, - 'duptime=f' => \$duptime, - 'remove-NO=s' => \$remove_NO, - 'digits=i' => \$digits, - 'YES-cutoff=i' => \$YES_cutoff); - -($normalize eq "true" || $normalize eq "false") || die "$0: Bad value for option --normalize\n"; -($remove_dup eq "true" || $remove_dup eq "false") || die "$0: Bad value for option --remove-dup\n"; -($remove_NO eq "true" || $remove_NO eq "false") || die "$0: Bad value for option --remove-NO\n"; - -@ARGV == 2 || die $Usage; - -# Workout the input/output source -my $kwslist_in = shift @ARGV; -my $kwslist_out = shift @ARGV; - -my ($info, $KWS) = @{ReadKwslist($kwslist_in)}; - -# Work out the Ntrue -my %Ntrue; -foreach my $kwentry (@{$KWS}) { - if (!defined($Ntrue{$kwentry->[0]})) { - $Ntrue{$kwentry->[0]} = 0.0; - } - $Ntrue{$kwentry->[0]} += $kwentry->[5]; -} - -# Scale the Ntrue and work out the expected count based threshold -my %threshold; -foreach my $key (keys %Ntrue) { - $Ntrue{$key} *= $Ntrue_scale; - $threshold{$key} = $Ntrue{$key}/($duration/$beta+($beta-1)/$beta*$Ntrue{$key}); -} - -# Removing duplicates -if ($remove_dup eq "true") { - my @tmp = sort {KwslistDupSort($a, $b, $duptime)} @{$KWS}; - my @KWS = (); - push(@KWS, $tmp[0]); - for (my $i = 1; $i < scalar(@tmp); $i ++) { - my $prev = $KWS[-1]; - my $curr = $tmp[$i]; - if ((abs($prev->[3]-$curr->[3]) < $duptime ) && - ($prev->[2] eq $curr->[2]) && - ($prev->[1] eq $curr->[1]) && - ($prev->[0] eq $curr->[0])) { - next; - } else { - push(@KWS, $curr); - } - } - $KWS = \@KWS; -} - -my $format_string = "%g"; -if ($digits gt 0 ) { - $format_string = "%." . $digits ."f"; -} - -# Making decisions... -my %YES_count; -foreach my $kwentry (@{$KWS}) { - my $threshold = $threshold{$kwentry->[0]}; - if ($kwentry->[5] > $threshold) { - $kwentry->[6] = "YES"; - if (defined($YES_count{$kwentry->[0]})) { - $YES_count{$kwentry->[0]} ++; - } else { - $YES_count{$kwentry->[0]} = 1; - } - } else { - $kwentry->[6] = "NO"; - if (!defined($YES_count{$kwentry->[0]})) { - $YES_count{$kwentry->[0]} = 0; - } - } - if ($verbose > 0) { - push(@{$kwentry}, sprintf("%g", $threshold)); - } - if ($normalize eq "true") { - if ($verbose > 0) { - push(@{$kwentry}, $kwentry->[5]); - } - my $numerator = (1-$threshold)*$kwentry->[5]; - my $denominator = (1-$threshold)*$kwentry->[5]+(1-$kwentry->[5])*$threshold; - if ($denominator != 0) { - $kwentry->[5] = sprintf($format_string, $numerator/$denominator); - } else { - $kwentry->[5] = sprintf($format_string, $kwentry->[5]); - } - } else { - $kwentry->[5] = sprintf($format_string, $kwentry->[5]); - } -} - -# Sorting and printing -my @tmp = sort KwslistOutputSort @{$KWS}; - -# Process the YES-cutoff. 
Note that you don't need this for the normal cases where -# hits and false alarms are balanced -if ($YES_cutoff != -1) { - my $count = 1; - for (my $i = 1; $i < scalar(@tmp); $i ++) { - if ($tmp[$i]->[0] ne $tmp[$i-1]->[0]) { - $count = 1; - next; - } - if ($YES_count{$tmp[$i]->[0]} > $YES_cutoff*2) { - $tmp[$i]->[6] = "NO"; - $tmp[$i]->[5] = 0; - next; - } - if (($count == $YES_cutoff) && ($tmp[$i]->[6] eq "YES")) { - $tmp[$i]->[6] = "NO"; - $tmp[$i]->[5] = 0; - next; - } - if ($tmp[$i]->[6] eq "YES") { - $count ++; - } - } -} - -# Process the remove-NO decision -if ($remove_NO eq "true") { - my @KWS = @tmp; - @tmp = (); - for (my $i = 0; $i < scalar(@KWS); $i ++) { - if ($KWS[$i]->[6] eq "YES") { - push(@tmp, $KWS[$i]); - } - } -} - -# Printing -my $kwslist = PrintKwslist($info, \@tmp); - -if ($kwslist_out eq "-") { - print $kwslist; -} else { - open(O, ">$kwslist_out") || die "$0: Fail to open output file $kwslist_out\n"; - print O $kwslist; - close(O); -} diff --git a/kaldi/local/ln.pl b/kaldi/local/ln.pl deleted file mode 100755 index 634b439..0000000 --- a/kaldi/local/ln.pl +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env perl -use File::Spec; - -if ( @ARGV < 2 ) { - print STDERR "usage: ln.pl input1 input2 dest-dir\n" . - "This script does a soft link of input1, input2, etc." . - "to dest-dir, using relative links where possible\n" . - "Note: input-n and dest-dir may both be absolute pathnames,\n" . - "or relative pathnames, relative to the current directlory.\n"; - exit(1); -} - -$dir = pop @ARGV; -if ( ! -d $dir ) { - print STDERR "ln.pl: last argument must be a directory ($dir is not a directory)\n"; - exit(1); -} - -$ans = 1; # true. - -$absdir = File::Spec->rel2abs($dir); # Get $dir as abs path. -defined $absdir || die "No such directory $dir"; -foreach $file (@ARGV) { - $absfile = File::Spec->rel2abs($file); # Get $file as abs path. - defined $absfile || die "No such file or directory: $file"; - @absdir_split = split("/", $absdir); - @absfile_split = split("/", $absfile); - - $newfile = $absdir . "/" . $absfile_split[$#absfile_split]; # we'll use this - # as the destination in the link command. - $num_removed = 0; - while (@absdir_split > 0 && $absdir_split[0] eq $absfile_split[0]) { - shift @absdir_split; - shift @absfile_split; - $num_removed++; - } - if (-l $newfile) { # newfile is already a link -> safe to delete it. - unlink($newfile); # "unlink" just means delete. - } - if ($num_removed == 0) { # will use absolute pathnames. - $oldfile = "/" . join("/", @absfile_split); - $ret = symlink($oldfile, $newfile); - } else { - $num_dots = @absdir_split; - $oldfile = join("/", @absfile_split); - for ($n = 0; $n < $num_dots; $n++) { - $oldfile = "../" . $oldfile; - } - $ret = symlink($oldfile, $newfile); - } - $ans = $ans && $ret; - if (! $ret) { - print STDERR "Error linking $oldfile to $newfile\n"; - } -} - -exit ($ans == 1 ? 0 : 1); - diff --git a/kaldi/local/make_absolute.sh b/kaldi/local/make_absolute.sh deleted file mode 100755 index 523e19a..0000000 --- a/kaldi/local/make_absolute.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# This script replaces the command readlink -f (which is not portable). -# It turns a pathname into an absolute pathname, including following soft links. 
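For reference, the equivalent canonicalisation is a one-liner in Python: os.path.realpath resolves every link in the chain and returns an absolute physical path, which is what the shell loop below does by hand:

    import os.path

    def make_absolute(path):
        # follow all symlinks; return the absolute, physical pathname
        return os.path.realpath(path)

    print(make_absolute("."))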
-target_file=$1 - -cd $(dirname $target_file) -target_file=$(basename $target_file) - -# Iterate down a (possible) chain of symlinks -while [ -L "$target_file" ]; do - target_file=$(readlink $target_file) - cd $(dirname $target_file) - target_file=$(basename $target_file) -done - -# Compute the canonicalized name by finding the physical path -# for the directory we're in and appending the target file. -phys_dir=$(pwd -P) -result=$phys_dir/$target_file -echo $result diff --git a/kaldi/local/make_fbank.sh b/kaldi/local/make_fbank.sh deleted file mode 100755 index 77c48be..0000000 --- a/kaldi/local/make_fbank.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/bin/bash - -# Copyright 2012-2016 Karel Vesely Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 -# To be run from .. (one directory up from here) -# see ../run.sh for example - -# Begin configuration section. -nj=4 -cmd=run.pl -fbank_config=conf/fbank.conf -compress=true -write_utt2num_frames=false # if true writes utt2num_frames -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - -if [ $# -lt 1 ] || [ $# -gt 3 ]; then - echo "Usage: $0 [options] [ [] ]"; - echo "e.g.: $0 data/train exp/make_fbank/train mfcc" - echo "Note: defaults to /log, and defaults to /data" - echo "Options: " - echo " --fbank-config # config passed to compute-fbank-feats " - echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --write-utt2num-frames # If true, write utt2num_frames file." - exit 1; -fi - -data=$1 -if [ $# -ge 2 ]; then - logdir=$2 -else - logdir=$data/log -fi -if [ $# -ge 3 ]; then - fbankdir=$3 -else - fbankdir=$data/data -fi - - -# make $fbankdir an absolute pathname. -fbankdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $fbankdir ${PWD}` - -# use "name" as part of name of the archive. -name=`basename $data` - -mkdir -p $fbankdir || exit 1; -mkdir -p $logdir || exit 1; - -if [ -f $data/feats.scp ]; then - mkdir -p $data/.backup - echo "$0: moving $data/feats.scp to $data/.backup" - mv $data/feats.scp $data/.backup -fi - -scp=$data/wav.scp - -required="$scp $fbank_config" - -for f in $required; do - if [ ! -f $f ]; then - echo "make_fbank.sh: no such file $f" - exit 1; - fi -done - -utils/validate_data_dir.sh --no-text --no-feats $data || exit 1; - -if [ -f $data/spk2warp ]; then - echo "$0 [info]: using VTLN warp factors from $data/spk2warp" - vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk" -elif [ -f $data/utt2warp ]; then - echo "$0 [info]: using VTLN warp factors from $data/utt2warp" - vtln_opts="--vtln-map=ark:$data/utt2warp" -fi - -for n in $(seq $nj); do - # the next command does nothing unless $fbankdir/storage/ exists, see - # utils/create_data_link.pl for more info. - utils/create_data_link.pl $fbankdir/raw_fbank_$name.$n.ark -done - -if $write_utt2num_frames; then - write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB" -else - write_num_frames_opt= -fi - -if [ -f $data/segments ]; then - echo "$0 [info]: segments file exists: using that." 
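The segments list (or wav.scp, in the other branch) is then split into nj pieces so the extraction jobs can run in parallel. A simplified stand-in for the utils/split_scp.pl call used below; the real script has extra behaviour (such as keeping all utterances of one speaker in the same split) that is omitted here:

    def split_scp(lines, nj):
        # nj contiguous, near-equal chunks; earlier chunks get the remainder
        n, splits, start = len(lines), [], 0
        for job in range(nj):
            end = start + n // nj + (1 if job < n % nj else 0)
            splits.append(lines[start:end])
            start = end
        return splits

    parts = split_scp(["utt%d wav%d.wav" % (i, i) for i in range(10)], 4)
    print([len(p) for p in parts])  # -> [3, 3, 2, 2]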
- split_segments="" - for n in $(seq $nj); do - split_segments="$split_segments $logdir/segments.$n" - done - - utils/split_scp.pl $data/segments $split_segments || exit 1; - rm $logdir/.error 2>/dev/null - - $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \ - extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \ - compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config ark:- ark:- \| \ - copy-feats --compress=$compress $write_num_frames_opt ark:- \ - ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \ - || exit 1; - -else - echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." - split_scps="" - for n in $(seq $nj); do - split_scps="$split_scps $logdir/wav.$n.scp" - done - - utils/split_scp.pl $scp $split_scps || exit 1; - - $cmd JOB=1:$nj $logdir/make_fbank_${name}.JOB.log \ - compute-fbank-feats $vtln_opts --verbose=2 --config=$fbank_config scp,p:$logdir/wav.JOB.scp ark:- \| \ - copy-feats --compress=$compress $write_num_frames_opt ark:- \ - ark,scp:$fbankdir/raw_fbank_$name.JOB.ark,$fbankdir/raw_fbank_$name.JOB.scp \ - || exit 1; - -fi - - -if [ -f $logdir/.error.$name ]; then - echo "Error producing fbank features for $name:" - tail $logdir/make_fbank_${name}.1.log - exit 1; -fi - -# concatenate the .scp files together. -for n in $(seq $nj); do - cat $fbankdir/raw_fbank_$name.$n.scp || exit 1; -done > $data/feats.scp - -if $write_utt2num_frames; then - for n in $(seq $nj); do - cat $logdir/utt2num_frames.$n || exit 1; - done > $data/utt2num_frames || exit 1 - rm $logdir/utt2num_frames.* -fi - -rm $logdir/wav.*.scp $logdir/segments.* 2>/dev/null - -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` -if [ $nf -ne $nu ]; then - echo "It seems not all of the feature files were successfully ($nf != $nu);" - echo "consider using utils/fix_data_dir.sh $data" -fi - -echo "Succeeded creating filterbank features for $name" diff --git a/kaldi/local/make_lexicon_fst.pl b/kaldi/local/make_lexicon_fst.pl deleted file mode 100755 index f97129c..0000000 --- a/kaldi/local/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... 
phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. - if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. 
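The arc-printing loop above is easier to see without the silence bookkeeping. Below, a sketch of the plain no-optional-silence branch from earlier in this file: one loop state, phones on the input side, the word on the first arc's output and the epsilon symbol on the rest (the angle-bracketed epsilon token is another casualty of this rendering of the patch, which is why the Perl appears to assign empty strings):

    import math

    def lexicon_fst_lines(lexicon):
        # lexicon: iterable of (word, pron_prob, [phones]); OpenFst text output
        loop, nextstate, lines = 0, 1, []
        for word, prob, phones in lexicon:
            cost = -math.log(prob)      # pronunciation cost, first arc only
            src, olabel = loop, word
            for i, phone in enumerate(phones):
                last = i == len(phones) - 1
                dst = loop if last else nextstate
                if not last:
                    nextstate += 1
                weight = "\t%g" % cost if cost != 0.0 else ""
                lines.append("%d\t%d\t%s\t%s%s"
                             % (src, dst, phone, olabel, weight))
                olabel, cost, src = "<eps>", 0.0, dst
        lines.append("%d\t0" % loop)    # final state, zero cost
        return lines

    for line in lexicon_fst_lines([("cat", 1.0, ["k", "ae", "t"])]):
        print(line)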
- $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/kaldi/local/make_lexicon_fst_silprob.pl b/kaldi/local/make_lexicon_fst_silprob.pl deleted file mode 100755 index 557af4f..0000000 --- a/kaldi/local/make_lexicon_fst_silprob.pl +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon which contains (optional) -# probabilities of pronuniations, and (mandatory) probabilities of silence -# before and after the pronunciation. This script is almost the same with -# the make_lexicon_fst.pl script except for the word-dependent silprobs part - -if (@ARGV != 4) { - print STDERR "Usage: $0 lexiconp_silprob_disambig.txt \\\n"; - print STDERR " silprob.txt silphone_string sil_disambig_sym > lexiconfst.txt \n"; - print STDERR "\n"; - print STDERR "This script is almost the same as the utils/make_lexicon_fst.pl\n"; - print STDERR "except here we include word-dependent silence probabilities\n"; - print STDERR "when making the lexicon FSTs. "; - print STDERR "For details, see paper \nhttp://danielpovey.com/files/2015_interspeech_silprob.pdf\n\n"; - print STDERR "The lexiconp_silprob_disambig.txt file should have each line like \n\n"; - print STDERR "word p(pronunciation|word) p(sil-after|word) correction-term-for-sil "; - print STDERR "correction-term-for-no-sil phone-1 phone-2 ... phone-N\n\n"; - print STDERR "The pronunciation would have to include disambiguation symbols;\n"; - print STDERR "the 2 correction terms above are computed to reflect how much a \n"; - print STDERR "word affects the probability of a [non-]silence before it. 
\n"; - print STDERR "Please see the paper (link given above) for detailed descriptions\n"; - print STDERR "for how the 2 terms are computed.\n\n"; - print STDERR "The silprob.txt file contains 4 lines, \n\n"; - print STDERR " p(sil-after|)\n"; - print STDERR "_s correction-term-for-sil-for-\n"; - print STDERR "_n correction-term-for-no-sil-for-\n"; - print STDERR "overall p(overall-sil)\n\n"; - print STDERR "Other files are the same as utils/make_lexicon_fst.pl\n"; - - exit(1); -} - -$lexfn = shift @ARGV; -$silprobfile = shift @ARGV; - -($silphone,$sildisambig) = @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; -open(SP, "<$silprobfile") || die "Error opening word-sil-probs $SP"; - -$silbeginprob = -1; -$silendcorrection = -1; -$nonsilendcorrection = -1; -$siloverallprob = -1; - -while () { - @A = split(" ", $_); - $w = shift @A; - if ($w eq "") { - $silbeginprob = shift @A; - } - if ($w eq "_s") { - $silendcorrection = shift @A; - } - if ($w eq "_n") { - $nonsilendcorrection = shift @A; - } - if ($w eq "overall") { - $siloverallprob = shift @A; - } -} - -$startstate = 0; -$nonsilstart = 1; -$silstart = 2; -$nextstate = 3; - -$cost = -log($silbeginprob); -print "$startstate\t$silstart\t$silphone\t\t$cost\n"; # will change these -$cost = -log(1 - $silbeginprob); -print "$startstate\t$nonsilstart\t$sildisambig\t\t$cost\n"; - -while () { - @A = split(" ", $_); - $w = shift @A; - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - - $wordsilprob = shift @A; - $silwordcorrection = shift @A; - $nonsilwordcorrection = shift @A; - - $pron_cost = -log($pron_prob); - $wordsilcost = -log($wordsilprob); - $wordnonsilcost = -log(1.0 - $wordsilprob); - $silwordcost = -log($silwordcorrection); - $nonsilwordcost = -log($nonsilwordcorrection); - - $first = 1; # used as a bool, to handle the first phone (adding sils) - while (@A > 0) { - $p = shift @A; - - if ($first == 1) { - $newstate = $nextstate++; - - # for nonsil before w - $cost = $nonsilwordcost + $pron_cost; - print "$nonsilstart\t$newstate\t$p\t$w\t$cost\n"; - - # for sil before w - $cost = $silwordcost + $pron_cost; - print "$silstart\t$newstate\t$p\t$w\t$cost\n"; - $first = 0; - } - else { - $oldstate = $nextstate - 1; - print "$oldstate\t$nextstate\t$p\t\n"; - $nextstate++; - } - if (@A == 0) { - $oldstate = $nextstate - 1; - # for no sil after w - $cost = $wordnonsilcost; - print "$oldstate\t$nonsilstart\t$sildisambig\t\t$cost\n"; - - # for sil after w - $cost = $wordsilcost; - print "$oldstate\t$silstart\t$silphone\t\t$cost\n"; - } - } -} -$cost = -log($silendcorrection); -print "$silstart\t$cost\n"; -$cost = -log($nonsilendcorrection); -print "$nonsilstart\t$cost\n"; diff --git a/kaldi/local/make_mfcc.sh b/kaldi/local/make_mfcc.sh deleted file mode 100755 index c88e0d6..0000000 --- a/kaldi/local/make_mfcc.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash - -# Copyright 2012-2016 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 -# To be run from .. (one directory up from here) -# see ../run.sh for example - -# Begin configuration section. -nj=4 -cmd=run.pl -mfcc_config=conf/mfcc.conf -compress=true -write_utt2num_frames=false # if true writes utt2num_frames -# End configuration section. - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -. 
parse_options.sh || exit 1; - -if [ $# -lt 1 ] || [ $# -gt 3 ]; then - echo "Usage: $0 [options] [ [] ]"; - echo "e.g.: $0 data/train exp/make_mfcc/train mfcc" - echo "Note: defaults to /log, and defaults to /data" - echo "Options: " - echo " --mfcc-config # config passed to compute-mfcc-feats " - echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --write-utt2num-frames # If true, write utt2num_frames file." - exit 1; -fi - -data=$1 -if [ $# -ge 2 ]; then - logdir=$2 -else - logdir=$data/log -fi -if [ $# -ge 3 ]; then - mfccdir=$3 -else - mfccdir=$data/data -fi - -# make $mfccdir an absolute pathname. -mfccdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $mfccdir ${PWD}` - -# use "name" as part of name of the archive. -name=`basename $data` - -mkdir -p $mfccdir || exit 1; -mkdir -p $logdir || exit 1; - -if [ -f $data/feats.scp ]; then - mkdir -p $data/.backup - echo "$0: moving $data/feats.scp to $data/.backup" - mv $data/feats.scp $data/.backup -fi - -scp=$data/wav.scp - -required="$scp $mfcc_config" - -for f in $required; do - if [ ! -f $f ]; then - echo "make_mfcc.sh: no such file $f" - exit 1; - fi -done -utils/validate_data_dir.sh --no-text --no-feats $data || exit 1; - -if [ -f $data/spk2warp ]; then - echo "$0 [info]: using VTLN warp factors from $data/spk2warp" - vtln_opts="--vtln-map=ark:$data/spk2warp --utt2spk=ark:$data/utt2spk" -elif [ -f $data/utt2warp ]; then - echo "$0 [info]: using VTLN warp factors from $data/utt2warp" - vtln_opts="--vtln-map=ark:$data/utt2warp" -fi - -for n in $(seq $nj); do - # the next command does nothing unless $mfccdir/storage/ exists, see - # utils/create_data_link.pl for more info. - utils/create_data_link.pl $mfccdir/raw_mfcc_$name.$n.ark -done - - -if $write_utt2num_frames; then - write_num_frames_opt="--write-num-frames=ark,t:$logdir/utt2num_frames.JOB" -else - write_num_frames_opt= -fi - - -if [ -f $data/segments ]; then - echo "$0 [info]: segments file exists: using that." - - split_segments="" - for n in $(seq $nj); do - split_segments="$split_segments $logdir/segments.$n" - done - - utils/split_scp.pl $data/segments $split_segments || exit 1; - rm $logdir/.error 2>/dev/null - - $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \ - extract-segments scp,p:$scp $logdir/segments.JOB ark:- \| \ - compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config ark:- ark:- \| \ - copy-feats --compress=$compress $write_num_frames_opt ark:- \ - ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \ - || exit 1; - -else - echo "$0: [info]: no segments file exists: assuming wav.scp indexed by utterance." - split_scps="" - for n in $(seq $nj); do - split_scps="$split_scps $logdir/wav_${name}.$n.scp" - done - - utils/split_scp.pl $scp $split_scps || exit 1; - - - # add ,p to the input rspecifier so that we can just skip over - # utterances that have bad wave data. - - $cmd JOB=1:$nj $logdir/make_mfcc_${name}.JOB.log \ - compute-mfcc-feats $vtln_opts --verbose=2 --config=$mfcc_config \ - scp,p:$logdir/wav_${name}.JOB.scp ark:- \| \ - copy-feats $write_num_frames_opt --compress=$compress ark:- \ - ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \ - || exit 1; -fi - - -if [ -f $logdir/.error.$name ]; then - echo "Error producing mfcc features for $name:" - tail $logdir/make_mfcc_${name}.1.log - exit 1; -fi - -# concatenate the .scp files together. 
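Once the parallel jobs finish, the per-job scp fragments are concatenated in job order into a single feats.scp, and two consistency checks follow. A sketch of the merge plus the checks (the hard failure fires when fewer than roughly 95% of utterances produced features, matching the $[$nu - ($nu/20)] arithmetic below; paths are illustrative):

    def merge_scps_and_check(fragment_paths, out_path, num_utts):
        lines = []
        for path in fragment_paths:     # keep job order for reproducibility
            with open(path) as part:
                lines.extend(part.readlines())
        with open(out_path, "w") as out:
            out.writelines(lines)
        if len(lines) != num_utts:
            print("warning: not all feature files were successfully processed")
        if len(lines) < num_utts - num_utts // 20:
            raise RuntimeError("less than 95% of the features were generated")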
-for n in $(seq $nj); do - cat $mfccdir/raw_mfcc_$name.$n.scp || exit 1; -done > $data/feats.scp || exit 1 - -if $write_utt2num_frames; then - for n in $(seq $nj); do - cat $logdir/utt2num_frames.$n || exit 1; - done > $data/utt2num_frames || exit 1 - rm $logdir/utt2num_frames.* -fi - -rm $logdir/wav_${name}.*.scp $logdir/segments.* 2>/dev/null - -nf=`cat $data/feats.scp | wc -l` -nu=`cat $data/utt2spk | wc -l` -if [ $nf -ne $nu ]; then - echo "It seems not all of the feature files were successfully processed ($nf != $nu);" - echo "consider using utils/fix_data_dir.sh $data" -fi - -if [ $nf -lt $[$nu - ($nu/20)] ]; then - echo "Less than 95% the features were successfully generated. Probably a serious error." - exit 1; -fi - -echo "Succeeded creating MFCC features for $name" diff --git a/kaldi/local/make_musan.py b/kaldi/local/make_musan.py deleted file mode 100755 index 74c4349..0000000 --- a/kaldi/local/make_musan.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2015 David Snyder -# 2018 Ewald Enzinger -# Apache 2.0. -# -# Modified version of egs/sre16/v1/local/make_musan.py (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). -# This version uses the raw MUSAN audio files (16 kHz) and does not use sox to resample at 8 kHz. -# -# This file is meant to be invoked by make_musan.sh. - -import os, sys - -def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals - -def prepare_music(root_dir, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def prepare_speech(root_dir): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def prepare_noise(root_dir): - utt2spk = {} - utt2wav = 
{} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str - -def main(): - in_dir = sys.argv[1] - out_dir = sys.argv[2] - use_vocals = sys.argv[3] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) - - -if __name__=="__main__": - main() diff --git a/kaldi/local/make_musan.sh b/kaldi/local/make_musan.sh deleted file mode 100755 index 1565ef0..0000000 --- a/kaldi/local/make_musan.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -# Copyright 2015 David Snyder -# Apache 2.0. -# -# Copy of egs/sre16/v1/local/make_musan.sh (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). -# -# This script, called by ../run.sh, creates the MUSAN -# data directory. The required dataset is freely available at -# http://www.openslr.org/17/ - -set -e -in_dir=$1 -data_dir=$2 -use_vocals='Y' - -mkdir -p local/musan.tmp - -echo "Preparing ${data_dir}/musan..." -mkdir -p ${data_dir}/musan -local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} - -utils/fix_data_dir.sh ${data_dir}/musan - -grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music -grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech -grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ - ${data_dir}/musan ${data_dir}/musan_music -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ - ${data_dir}/musan ${data_dir}/musan_speech -utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ - ${data_dir}/musan ${data_dir}/musan_noise - -utils/fix_data_dir.sh ${data_dir}/musan_music -utils/fix_data_dir.sh ${data_dir}/musan_speech -utils/fix_data_dir.sh ${data_dir}/musan_noise - -rm -rf local/musan.tmp - diff --git a/kaldi/local/make_unigram_grammar.pl b/kaldi/local/make_unigram_grammar.pl deleted file mode 100755 index 6ca740f..0000000 --- a/kaldi/local/make_unigram_grammar.pl +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script is used in discriminative training. -# This script makes a simple unigram-loop version of G.fst -# using a unigram grammar estimated from some training transcripts. -# This is for MMI training. -# We don't have any silences in G.fst; these are supplied by the -# optional silences in the lexicon. - -# Note: the symbols in the transcripts become the input and output -# symbols of G.txt; these can be numeric or not. - -if(@ARGV != 0) { - die "Usage: make_unigram_grammar.pl < text-transcripts > G.txt" -} - -$totcount = 0; -$nl = 0; -while (<>) { - @A = split(" ", $_); - foreach $a (@A) { - $count{$a}++; - $totcount++; - } - $nl++; - $totcount++; # Treat end-of-sentence as a symbol for purposes of - # $totcount, so the grammar is properly stochastic. This doesn't - # become , it just becomes the final-prob. -} - -foreach $a (keys %count) { - $prob = $count{$a} / $totcount; - $cost = -log($prob); # Negated natural-log probs. - print "0\t0\t$a\t$a\t$cost\n"; -} -# Zero final-cost. -$final_prob = $nl / $totcount; -$final_cost = -log($final_prob); -print "0\t$final_cost\n"; - diff --git a/kaldi/local/make_vctk.pl b/kaldi/local/make_vctk.pl deleted file mode 100755 index 23c668e..0000000 --- a/kaldi/local/make_vctk.pl +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/perl -# -# Copyright 2018 Ewald Enzinger -# 2018 David Snyder -# -# Usage: make_voxceleb1.pl /export/voxceleb1 data/ - -if (@ARGV != 2) { - print STDERR "Usage: $0 \n"; - print STDERR "e.g. $0 /export/voxceleb1 data/\n"; - exit(1); -} - -($data_base, $out_dir) = @ARGV; -my $out_vctk_dir = "$out_dir/vctk"; - -if (system("mkdir -p $out_vctk_dir") != 0) { - die "Error making directory $out_vctk_dir"; -} - -opendir my $dh, "$data_base/wav" or die "Cannot open directory: $!"; -my @spkr_dirs = grep {-d "$data_base/wav/$_" && ! /^\.{1,2}$/} readdir($dh); -closedir $dh; - -open(SPKR_VCTK, ">", "$out_vctk_dir/utt2spk") or die "Could not open the output file $out_vctk_dir/utt2spk"; -open(WAV_VCTK, ">", "$out_vctk_dir/wav.scp") or die "Could not open the output file $out_vctk_dir/wav.scp"; - -foreach (@spkr_dirs) { - my $spkr_id = $_; - my $new_spkr_id = $spkr_id; - # If we're using a newer version of VoxCeleb1, we need to "deanonymize" - # the speaker labels. 
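Referring back to make_unigram_grammar.pl above: the grammar it writes is a single-state loop whose arcs carry negated natural-log unigram probabilities, with one end-of-sentence event counted per transcript line so that the arc probabilities plus the final probability sum to one. A compact sketch of that estimation on hypothetical transcripts:

    import math
    from collections import Counter

    def unigram_grammar(transcripts):
        # transcripts: list of token lists; returns OpenFst text-format lines
        counts, total, num_lines = Counter(), 0, 0
        for tokens in transcripts:
            counts.update(tokens)
            total += len(tokens) + 1    # +1 counts the end-of-sentence event
            num_lines += 1
        lines = ["0\t0\t%s\t%s\t%g" % (w, w, -math.log(c / total))
                 for w, c in sorted(counts.items())]
        lines.append("0\t%g" % -math.log(num_lines / total))  # final cost
        return lines

    for line in unigram_grammar([["a", "b"], ["a"]]):
        print(line)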
-  if (exists $id2spkr{$spkr_id}) {
-    $new_spkr_id = $id2spkr{$spkr_id};
-  }
-  opendir my $dh, "$data_base/wav/$spkr_id/" or die "Cannot open directory: $!";
-  my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
-  closedir $dh;
-  foreach (@files) {
-    my $filename = $_;
-    my $rec_id = substr($filename, 0, 4);
-    my $segment = substr($filename, 5, 3);
-    my $wav = "$data_base/wav/$spkr_id/$filename.wav";
-    my $utt_id = "$new_spkr_id-$rec_id-$segment";
-
-    print WAV_VCTK "$utt_id", " $wav", "\n";
-    print SPKR_VCTK "$utt_id", " $new_spkr_id", "\n";
-  }
-}
-
-close(SPKR_VCTK) or die;
-close(WAV_VCTK) or die;
-
-if (system(
-  "utils/utt2spk_to_spk2utt.pl $out_vctk_dir/utt2spk >$out_vctk_dir/spk2utt") != 0) {
-  die "Error creating spk2utt file in directory $out_vctk_dir";
-}
-system("env LC_COLLATE=C utils/fix_data_dir.sh $out_vctk_dir");
-if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_vctk_dir") != 0) {
-  die "Error validating directory $out_vctk_dir";
-}
diff --git a/kaldi/local/make_vctk_wav.py b/kaldi/local/make_vctk_wav.py
deleted file mode 100755
index 92d4011..0000000
--- a/kaldi/local/make_vctk_wav.py
+++ /dev/null
@@ -1,4 +0,0 @@
-import os
-import sys
-
-path
diff --git a/kaldi/local/make_vctk_wav.sh b/kaldi/local/make_vctk_wav.sh
deleted file mode 100755
index e69de29..0000000
diff --git a/kaldi/local/make_voxceleb1.pl b/kaldi/local/make_voxceleb1.pl
deleted file mode 100755
index 33d513a..0000000
--- a/kaldi/local/make_voxceleb1.pl
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/usr/bin/perl
-#
-# Copyright 2018  Ewald Enzinger
-#           2018  David Snyder
-#
-# Usage: make_voxceleb1.pl /export/voxceleb1 data/
-
-if (@ARGV != 2) {
-  print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
-  print STDERR "e.g. $0 /export/voxceleb1 data/\n";
-  exit(1);
-}
-
-($data_base, $out_dir) = @ARGV;
-my $out_test_dir = "$out_dir/voxceleb1_test";
-my $out_train_dir = "$out_dir/voxceleb1_train";
-
-if (! -e "$data_base/voxceleb1_test.txt") {
-  system("wget -O $data_base/voxceleb1_test.txt http://www.openslr.org/resources/49/voxceleb1_test.txt");
-}
-
-if (system("mkdir -p $out_test_dir") != 0) {
-  die "Error making directory $out_test_dir";
-}
-
-if (system("mkdir -p $out_train_dir") != 0) {
-  die "Error making directory $out_train_dir";
-}
-
-opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!";
-my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && !
/^\.{1,2}$/} readdir($dh); -closedir $dh; - -open(TRIAL_IN, "<", "$data_base/voxceleb1_test.txt") or die "Could not open the verification trials file $data_base/voxceleb1_test.txt"; -open(SPKR_TEST, ">", "$out_test_dir/utt2spk") or die "Could not open the output file $out_test_dir/utt2spk"; -open(WAV_TEST, ">", "$out_test_dir/wav.scp") or die "Could not open the output file $out_test_dir/wav.scp"; -open(SPKR_TRAIN, ">", "$out_train_dir/utt2spk") or die "Could not open the output file $out_train_dir/utt2spk"; -open(WAV_TRAIN, ">", "$out_train_dir/wav.scp") or die "Could not open the output file $out_train_dir/wav.scp"; -open(TRIAL_OUT, ">", "$out_test_dir/trials") or die "Could not open the output file $out_test_dir/trials"; - -my $test_spkrs = (); -while () { - chomp; - my ($tar_or_none, $path1, $path2) = split; - - # Create entry for left-hand side of trial - my $wav = "$data_base/voxceleb1_wav/$path1"; - my ($spkr_id, $rec_id, $segment) = split('/', $path1); - #my $rec_id = substr($filename, 0, 11); - my $segment_ = substr($segment, 0, 5); - my $utt_id1 = "$spkr_id-$rec_id-$segment_"; - $test_spkrs{$spkr_id} = (); - - # Create entry for right-hand side of trial - my $wav = "$data_base/voxceleb1_wav/$path2"; - my ($spkr_id, $rec_id, $segment) = split('/', $path2); - #my $rec_id = substr($filename, 0, 11); - my $segment_ = substr($segment, 0, 5); - my $utt_id2 = "$spkr_id-$rec_id-$segment_"; - $test_spkrs{$spkr_id} = (); - - my $target = "nontarget"; - if ($tar_or_none eq "1") { - $target = "target"; - } - print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; -} - -foreach (@spkr_dirs) { - my $spkr_id = $_; - opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; - my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); - closedir $dh; - foreach (@files) { - my $filename = $_; - my $rec_id = substr($filename, 0, 11); - my $segment = substr($filename, 12, 7); - my $utt_id = "$spkr_id-$rec_id-$segment"; - my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; - if (exists $test_spkrs{$spkr_id}) { - print WAV_TEST "$utt_id", " $wav", "\n"; - print SPKR_TEST "$utt_id", " $spkr_id", "\n"; - } else { - print WAV_TRAIN "$utt_id", " $wav", "\n"; - print SPKR_TRAIN "$utt_id", " $spkr_id", "\n"; - } - } -} - -close(SPKR_TEST) or die; -close(WAV_TEST) or die; -close(SPKR_TRAIN) or die; -close(WAV_TRAIN) or die; -close(TRIAL_OUT) or die; -close(TRIAL_IN) or die; - -if (system( - "utils/utt2spk_to_spk2utt.pl $out_test_dir/utt2spk >$out_test_dir/spk2utt") != 0) { - die "Error creating spk2utt file in directory $out_test_dir"; -} -system("env LC_COLLATE=C utils/fix_data_dir.sh $out_test_dir"); -if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_test_dir") != 0) { - die "Error validating directory $out_test_dir"; -} - -if (system( - "utils/utt2spk_to_spk2utt.pl $out_train_dir/utt2spk >$out_train_dir/spk2utt") != 0) { - die "Error creating spk2utt file in directory $out_train_dir"; -} -system("env LC_COLLATE=C utils/fix_data_dir.sh $out_train_dir"); -if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_train_dir") != 0) { - die "Error validating directory $out_train_dir"; -} diff --git a/kaldi/local/make_voxceleb2.pl b/kaldi/local/make_voxceleb2.pl deleted file mode 100755 index 307bf73..0000000 --- a/kaldi/local/make_voxceleb2.pl +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/perl -# -# Copyright 2018 Ewald Enzinger -# -# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev -# -# Note: This script requires 
ffmpeg to be installed and its location included in $PATH.
-
-if (@ARGV != 3) {
-  print STDERR "Usage: $0 <path-to-voxceleb2> <dataset> <path-to-data-dir>\n";
-  print STDERR "e.g. $0 /export/voxceleb2 dev data/dev\n";
-  exit(1);
-}
-
-# Check that ffmpeg is installed.
-if (`which ffmpeg` eq "") {
-  die "Error: this script requires that ffmpeg is installed.";
-}
-
-($data_base, $dataset, $out_dir) = @ARGV;
-
-if ("$dataset" ne "dev" && "$dataset" ne "test") {
-  die "dataset parameter must be 'dev' or 'test'!";
-}
-
-opendir my $dh, "$data_base/$dataset/aac" or die "Cannot open directory: $!";
-my @spkr_dirs = grep {-d "$data_base/$dataset/aac/$_" && ! /^\.{1,2}$/} readdir($dh);
-closedir $dh;
-
-if (system("mkdir -p $out_dir") != 0) {
-  die "Error making directory $out_dir";
-}
-
-open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
-open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
-
-foreach (@spkr_dirs) {
-  my $spkr_id = $_;
-
-  opendir my $dh, "$data_base/$dataset/aac/$spkr_id/" or die "Cannot open directory: $!";
-  my @rec_dirs = grep {-d "$data_base/$dataset/aac/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh);
-  closedir $dh;
-
-  foreach (@rec_dirs) {
-    my $rec_id = $_;
-
-    opendir my $dh, "$data_base/$dataset/aac/$spkr_id/$rec_id/" or die "Cannot open directory: $!";
-    my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
-    closedir $dh;
-
-    foreach (@files) {
-      my $name = $_;
-      my $wav = "$data_base/$dataset/aac/$spkr_id/$rec_id/$name.wav";
-      my $utt_id = "$spkr_id-$rec_id-$name";
-      print WAV "$utt_id", " $wav", "\n";
-      print SPKR "$utt_id", " $spkr_id", "\n";
-    }
-  }
-}
-close(SPKR) or die;
-close(WAV) or die;
-
-if (system(
-  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
-  die "Error creating spk2utt file in directory $out_dir";
-}
-system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
-if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
-  die "Error validating directory $out_dir";
-}
diff --git a/kaldi/local/map_arpa_lm.pl b/kaldi/local/map_arpa_lm.pl
deleted file mode 100755
index 25b4781..0000000
--- a/kaldi/local/map_arpa_lm.pl
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/usr/bin/env perl
-
-# Copyright 2014  Guoguo Chen
-#           2014  Johns Hopkins University (author: Daniel Povey)
-# Apache 2.0.
-#
-use strict;
-use warnings;
-use Getopt::Long;
-
-my $Usage = <<EOU;
-Usage: utils/map_arpa_lm.pl [options] <symtab> < input-arpa >output-arpa
- e.g.: utils/map_arpa_lm.pl words.txt <arpa_lm.txt >arpa_lm.int
-
-Allowed options:
-  --sym2int   : If true, maps words to integers, otherwise maps integers to
-                words. (boolean, default = true)
-
-EOU
-
-my $sym2int = "true";
-GetOptions('sym2int=s' => \$sym2int);
-
-($sym2int eq "true" || $sym2int eq "false") ||
-  die "$0: Bad value for option --sym2int\n";
-
-if (@ARGV != 1) {
-  die $Usage;
-}
-
-# Gets parameters.
-my $symtab = shift @ARGV;
-my $arpa_in = shift @ARGV;
-my $arpa_out = shift @ARGV;
-
-# Opens files.
-open(M, "<$symtab") || die "$0: Fail to open $symtab\n";
-
-# Reads in the mapper.
-my %mapper;
-while (<M>) {
-  chomp;
-  my @col = split(/[\s]+/, $_);
-  @col == 2 || die "$0: Bad line in mapper file \"$_\"\n";
-  if ($sym2int eq "true") {
-    if (defined($mapper{$col[0]})) {
-      die "$0: Duplicate entry \"$col[0]\"\n";
-    }
-    $mapper{$col[0]} = $col[1];
-  } else {
-    if (defined($mapper{$col[1]})) {
-      die "$0: Duplicate entry \"$col[1]\"\n";
-    }
-    $mapper{$col[1]} = $col[0];
-  }
-}
-
-my $num_oov_lines = 0;
-my $max_oov_warn = 20;
-
-# Parses Arpa n-gram language model.
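
For clarity, the symbol-table handling above (and the per-token lookup applied to n-gram lines further down) mirrors the following Python; a sketch only, assuming a two-column words.txt-style table:

import sys

def load_mapper(symtab_path, sym2int=True):
    """Build the word<->integer table the way the loop above does."""
    mapper = {}
    with open(symtab_path) as f:
        for line in f:
            cols = line.split()
            if len(cols) != 2:
                sys.exit(f"bad line in mapper file: {line.rstrip()}")
            src, dst = cols if sym2int else reversed(cols)
            if src in mapper:
                sys.exit(f"duplicate entry: {src}")
            mapper[src] = dst
    return mapper

def map_ngram_tokens(tokens, mapper):
    """Map all tokens of one n-gram line; None flags an OOV line,
    which the script counts and drops instead of printing."""
    try:
        return [mapper[t] for t in tokens]
    except KeyError:
        return None
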
-my $arpa = ""; -my $current_order = -1; -my %head_ngram_count; -my %actual_ngram_count; -while () { - chomp; - my @col = split(" ", $_); - - if ($current_order == -1 and ! m/^\\data\\$/) { - next; - } - - if (m/^\\data\\$/) { - print STDERR "$0: Processing \"\\data\\\"\n"; - print "$_\n"; - $current_order = 0; - } elsif (m/^\\[0-9]*-grams:$/) { - $current_order = $_; - $current_order =~ s/-grams:$//g; - $current_order =~ s/^\\//g; - print "$_\n"; - print STDERR "$0: Processing \"\\$current_order-grams:\\\"\n"; - } elsif (m/^\\end\\/) { - print "$_\n"; - } elsif ($_ eq "") { - if ($current_order >= 1) { - print "\n"; - } - } else { - if ($current_order == 0) { - # echo head section. - print "$_\n"; - } else { - # Parses n-gram section. - if (@col > 2 + $current_order || @col < 1 + $current_order) { - die "$0: Bad line in arpa lm \"$_\"\n"; - } - my $prob = shift @col; - my $is_oov = 0; - for (my $i = 0; $i < $current_order; $i++) { - my $temp = $mapper{$col[$i]}; - if (!defined($temp)) { - $is_oov = 1; - $num_oov_lines++; - last; - } else { - $col[$i] = $temp; - } - } - if (!$is_oov) { - my $rest_of_line = join(" ", @col); - print "$prob\t$rest_of_line\n"; - } else { - if ($num_oov_lines < $max_oov_warn) { - print STDERR "$0: Warning: OOV line $_\n"; - } - } - } - } -} - -if ($num_oov_lines > 0) { - print STDERR "$0: $num_oov_lines lines of the Arpa file contained OOVs and "; - print STDERR "were not printed.\n"; -} - -close(M); diff --git a/kaldi/local/mkgraph.sh b/kaldi/local/mkgraph.sh deleted file mode 100755 index 1becfc4..0000000 --- a/kaldi/local/mkgraph.sh +++ /dev/null @@ -1,171 +0,0 @@ -#!/bin/bash -# Copyright 2010-2012 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - -# This script creates a fully expanded decoding graph (HCLG) that represents -# all the language-model, pronunciation dictionary (lexicon), context-dependency, -# and HMM structure in our model. The output is a Finite State Transducer -# that has word-ids on the output, and pdf-ids on the input (these are indexes -# that resolve to Gaussian Mixture Models). -# See -# http://kaldi-asr.org/doc/graph_recipe_test.html -# (this is compiled from this repository using Doxygen, -# the source for this part is in src/doc/graph_recipe_test.dox) - -set -o pipefail - -tscale=1.0 -loopscale=0.1 - -remove_oov=false - -for x in `seq 4`; do - [ "$1" == "--mono" -o "$1" == "--left-biphone" -o "$1" == "--quinphone" ] && shift && \ - echo "WARNING: the --mono, --left-biphone and --quinphone options are now deprecated and ignored." - [ "$1" == "--remove-oov" ] && remove_oov=true && shift; - [ "$1" == "--transition-scale" ] && tscale=$2 && shift 2; - [ "$1" == "--self-loop-scale" ] && loopscale=$2 && shift 2; -done - -if [ $# != 3 ]; then - echo "Usage: utils/mkgraph.sh [options] " - echo "e.g.: utils/mkgraph.sh data/lang_test exp/tri1/ exp/tri1/graph" - echo " Options:" - echo " --remove-oov # If true, any paths containing the OOV symbol (obtained from oov.int" - echo " # in the lang directory) are removed from the G.fst during compilation." - echo " --transition-scale # Scaling factor on transition probabilities." - echo " --self-loop-scale # Please see: http://kaldi-asr.org/doc/hmm.html#hmm_scale." - echo "Note: the --mono, --left-biphone and --quinphone options are now deprecated" - echo "and will be ignored." - exit 1; -fi - -if [ -f path.sh ]; then . 
./path.sh; fi - -lang=$1 -tree=$2/tree -model=$2/final.mdl -dir=$3 - -mkdir -p $dir - -# If $lang/tmp/LG.fst does not exist or is older than its sources, make it... -# (note: the [[ ]] brackets make the || type operators work (inside [ ], we -# would have to use -o instead), -f means file exists, and -ot means older than). - -required="$lang/L.fst $lang/G.fst $lang/phones.txt $lang/words.txt $lang/phones/silence.csl $lang/phones/disambig.int $model $tree" -for f in $required; do - [ ! -f $f ] && echo "mkgraph.sh: expected $f to exist" && exit 1; -done - -if [ -f $dir/HCLG.fst ]; then - # detect when the result already exists, and avoid overwriting it. - must_rebuild=false - for f in $required; do - [ $f -nt $dir/HCLG.fst ] && must_rebuild=true - done - if ! $must_rebuild; then - echo "$0: $dir/HCLG.fst is up to date." - exit 0 - fi -fi - - -N=$(tree-info $tree | grep "context-width" | cut -d' ' -f2) || { echo "Error when getting context-width"; exit 1; } -P=$(tree-info $tree | grep "central-position" | cut -d' ' -f2) || { echo "Error when getting central-position"; exit 1; } - -[[ -f $2/frame_subsampling_factor && "$loopscale" == "0.1" ]] && \ - echo "$0: WARNING: chain models need '--self-loop-scale 1.0'"; - -mkdir -p $lang/tmp -trap "rm -f $lang/tmp/LG.fst.$$" EXIT HUP INT PIPE TERM -# Note: [[ ]] is like [ ] but enables certain extra constructs, e.g. || in -# place of -o -if [[ ! -s $lang/tmp/LG.fst || $lang/tmp/LG.fst -ot $lang/G.fst || \ - $lang/tmp/LG.fst -ot $lang/L_disambig.fst ]]; then - fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstpushspecial | \ - fstarcsort --sort_type=ilabel > $lang/tmp/LG.fst.$$ || exit 1; - mv $lang/tmp/LG.fst.$$ $lang/tmp/LG.fst - fstisstochastic $lang/tmp/LG.fst || echo "[info]: LG not stochastic." -fi - -clg=$lang/tmp/CLG_${N}_${P}.fst -clg_tmp=$clg.$$ -ilabels=$lang/tmp/ilabels_${N}_${P} -ilabels_tmp=$ilabels.$$ -trap "rm -f $clg_tmp $ilabels_tmp" EXIT HUP INT PIPE TERM -if [[ ! -s $clg || $clg -ot $lang/tmp/LG.fst \ - || ! -s $ilabels || $ilabels -ot $lang/tmp/LG.fst ]]; then - fstcomposecontext --context-size=$N --central-position=$P \ - --read-disambig-syms=$lang/phones/disambig.int \ - --write-disambig-syms=$lang/tmp/disambig_ilabels_${N}_${P}.int \ - $ilabels_tmp < $lang/tmp/LG.fst |\ - fstarcsort --sort_type=ilabel > $clg_tmp - mv $clg_tmp $clg - mv $ilabels_tmp $ilabels - fstisstochastic $clg || echo "[info]: CLG not stochastic." -fi - -trap "rm -f $dir/Ha.fst.$$" EXIT HUP INT PIPE TERM -if [[ ! -s $dir/Ha.fst || $dir/Ha.fst -ot $model \ - || $dir/Ha.fst -ot $lang/tmp/ilabels_${N}_${P} ]]; then - make-h-transducer --disambig-syms-out=$dir/disambig_tid.int \ - --transition-scale=$tscale $lang/tmp/ilabels_${N}_${P} $tree $model \ - > $dir/Ha.fst.$$ || exit 1; - mv $dir/Ha.fst.$$ $dir/Ha.fst -fi - -trap "rm -f $dir/HCLGa.fst.$$" EXIT HUP INT PIPE TERM -if [[ ! -s $dir/HCLGa.fst || $dir/HCLGa.fst -ot $dir/Ha.fst || \ - $dir/HCLGa.fst -ot $clg ]]; then - if $remove_oov; then - [ ! 
-f $lang/oov.int ] && \ - echo "$0: --remove-oov option: no file $lang/oov.int" && exit 1; - clg="fstrmsymbols --remove-arcs=true --apply-to-output=true $lang/oov.int $clg|" - fi - fsttablecompose $dir/Ha.fst "$clg" | fstdeterminizestar --use-log=true \ - | fstrmsymbols $dir/disambig_tid.int | fstrmepslocal | \ - fstminimizeencoded > $dir/HCLGa.fst.$$ || exit 1; - mv $dir/HCLGa.fst.$$ $dir/HCLGa.fst - fstisstochastic $dir/HCLGa.fst || echo "HCLGa is not stochastic" -fi - -trap "rm -f $dir/HCLG.fst.$$" EXIT HUP INT PIPE TERM -if [[ ! -s $dir/HCLG.fst || $dir/HCLG.fst -ot $dir/HCLGa.fst ]]; then - add-self-loops --self-loop-scale=$loopscale --reorder=true \ - $model < $dir/HCLGa.fst | fstconvert --fst_type=const > $dir/HCLG.fst.$$ || exit 1; - mv $dir/HCLG.fst.$$ $dir/HCLG.fst - if [ $tscale == 1.0 -a $loopscale == 1.0 ]; then - # No point doing this test if transition-scale not 1, as it is bound to fail. - fstisstochastic $dir/HCLG.fst || echo "[info]: final HCLG is not stochastic." - fi -fi - -# note: the empty FST has 66 bytes. this check is for whether the final FST -# is the empty file or is the empty FST. -if ! [ $(head -c 67 $dir/HCLG.fst | wc -c) -eq 67 ]; then - echo "$0: it looks like the result in $dir/HCLG.fst is empty" - exit 1 -fi - -# save space. -rm $dir/HCLGa.fst $dir/Ha.fst 2>/dev/null || true - -# keep a copy of the lexicon and a list of silence phones with HCLG... -# this means we can decode without reference to the $lang directory. - - -cp $lang/words.txt $dir/ || exit 1; -mkdir -p $dir/phones -cp $lang/phones/word_boundary.* $dir/phones/ 2>/dev/null # might be needed for ctm scoring, -cp $lang/phones/align_lexicon.* $dir/phones/ 2>/dev/null # might be needed for ctm scoring, -cp $lang/phones/optional_silence.* $dir/phones/ 2>/dev/null # might be needed for analyzing alignments. - # but ignore the error if it's not there. - -cp $lang/phones/disambig.{txt,int} $dir/phones/ 2> /dev/null -cp $lang/phones/silence.csl $dir/phones/ || exit 1; -cp $lang/phones.txt $dir/ 2> /dev/null # ignore the error if it's not there. - -am-info --print-args=false $model | grep pdfs | awk '{print $NF}' > $dir/num_pdfs diff --git a/kaldi/local/nnet3/xvector/prepare_feats_for_egs.sh b/kaldi/local/nnet3/xvector/prepare_feats_for_egs.sh deleted file mode 100755 index 1609194..0000000 --- a/kaldi/local/nnet3/xvector/prepare_feats_for_egs.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/bash -# -# Copied from egs/sre16/v1/local/nnet3/xvector/prepare_feats_for_egs.sh (commit 3ea534070fd2cccd2e4ee21772132230033022ce). -# -# Apache 2.0. - -# This script applies sliding window cmvn and removes silence frames. This -# is performed on the raw features prior to generating examples for training -# the xvector system. - -nj=40 -cmd="run.pl" -stage=0 -norm_vars=false -center=true -compress=true -cmn_window=300 -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; -if [ $# != 3 ]; then - echo "Usage: $0 " - echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" - echo "Options: " - echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - echo " --norm-vars # If true, normalize variances in the sliding window cmvn" - exit 1; -fi - -data_in=$1 -data_out=$2 -dir=$3 - -name=`basename $data_in` - -for f in $data_in/feats.scp $data_in/vad.scp ; do - [ ! -f $f ] && echo "$0: No such file $f" && exit 1; -done - -# Set various variables. 
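
The two transforms this script chains (apply-cmvn-sliding, then select-voiced-frames) are easy to state in numpy; the sketch below only illustrates the semantics under simplified edge handling and is not a bit-exact reimplementation:

import numpy as np

def sliding_cmn(feats, cmn_window=300):
    """Subtract from each frame the mean over a centered window of up to
    cmn_window frames; variances stay untouched (--norm-vars=false above).
    Edge windows are simply clipped in this sketch."""
    num_frames = feats.shape[0]
    half = cmn_window // 2
    out = np.empty_like(feats)
    for t in range(num_frames):
        lo, hi = max(0, t - half), min(num_frames, t + half + 1)
        out[t] = feats[t] - feats[lo:hi].mean(axis=0)
    return out

def select_voiced_frames(feats, vad):
    """Keep only the frames whose VAD decision is nonzero."""
    return feats[np.asarray(vad) != 0]
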
-mkdir -p $dir/log -mkdir -p $data_out -featdir=$(utils/make_absolute.sh $dir) - -if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then - utils/create_split_dir.pl \ - /export/b{14,15,16,17}/$USER/kaldi-data/egs/voxceleb2/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage -fi - -for n in $(seq $nj); do - # the next command does nothing unless $featdir/storage/ exists, see - # utils/create_data_link.pl for more info. - utils/create_data_link.pl $featdir/xvector_feats_${name}.${n}.ark -done - -cp $data_in/utt2spk $data_out/utt2spk -cp $data_in/spk2utt $data_out/spk2utt -cp $data_in/wav.scp $data_out/wav.scp - -write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" - -sdata_in=$data_in/split$nj; -utils/split_data.sh $data_in $nj || exit 1; - -$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \ - apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ - scp:${sdata_in}/JOB/feats.scp ark:- \| \ - select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \ - copy-feats --compress=$compress $write_num_frames_opt ark:- \ - ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1; - -for n in $(seq $nj); do - cat $featdir/xvector_feats_${name}.$n.scp || exit 1; -done > ${data_out}/feats.scp || exit 1 - -for n in $(seq $nj); do - cat $featdir/log/utt2num_frames.$n || exit 1; -done > $data_out/utt2num_frames || exit 1 -rm $featdir/log/utt2num_frames.* - -echo "$0: Succeeded creating xvector features for $name" diff --git a/kaldi/local/nnet3/xvector/run_xvector.sh b/kaldi/local/nnet3/xvector/run_xvector.sh deleted file mode 100755 index 0c2c77b..0000000 --- a/kaldi/local/nnet3/xvector/run_xvector.sh +++ /dev/null @@ -1,155 +0,0 @@ -#!/bin/bash -# Copyright 2017 David Snyder -# 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) -# 2017 Johns Hopkins University (Author: Daniel Povey) -# -# Copied from egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh (commit e082c17d4a8f8a791428ae4d9f7ceb776aef3f0b). -# -# Apache 2.0. - -# This script trains a DNN similar to the recipe described in -# http://www.danielpovey.com/files/2018_icassp_xvectors.pdf - -. ./cmd.sh -set -e - -stage=1 -train_stage=0 -use_gpu=true -remove_egs=false - -data=data/train -nnet_dir=exp/xvector_nnet_1a/ -egs_dir=exp/xvector_nnet_1a/egs - -. ./path.sh -. ./cmd.sh -. ./utils/parse_options.sh - -num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l) - -# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh. -# The argument --num-repeats is related to the number of times a speaker -# repeats per archive. If it seems like you're getting too many archives -# (e.g., more than 200) try increasing the --frames-per-iter option. The -# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the -# minimum and maximum length (in terms of number of frames) of the features -# in the examples. -# -# To make sense of the egs script, it may be necessary to put an "exit 1" -# command immediately after stage 3. Then, inspect -# exp//egs/temp/ranges.* . The ranges files specify the examples that -# will be created, and which archives they will be stored in. 
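
Before letting stage 6 run, the archive count can be sanity-checked using the inverse relationship described above; a back-of-envelope helper (illustrative only, the real bookkeeping lives in sid/nnet3/xvector/get_egs.sh):

def approx_num_archives(total_frames, frames_per_iter=1_000_000_000):
    """Each archive holds on the order of frames_per_iter frames, so raising
    --frames-per-iter lowers the archive count, and vice versa."""
    return max(1, -(-total_frames // frames_per_iter))  # ceiling division

# e.g. roughly 5000 hours of features at 100 frames/second:
# approx_num_archives(5000 * 3600 * 100)  ->  2
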
Each line of -# ranges.* has the following form: -# -# For example: -# 100304-f-sre2006-kacg-A 1 2 4079 881 23 - -# If you're satisfied with the number of archives (e.g., 50-150 archives is -# reasonable) and with the number of examples per speaker (e.g., 1000-5000 -# is reasonable) then you can let the script continue to the later stages. -# Otherwise, try increasing or decreasing the --num-repeats option. You might -# need to fiddle with --frames-per-iter. Increasing this value decreases the -# the number of archives and increases the number of examples per archive. -# Decreasing this value increases the number of archives, while decreasing the -# number of examples per archive. -if [ $stage -le 6 ]; then - echo "$0: Getting neural network training egs"; - # dump egs. - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then - utils/create_split_dir.pl \ - /export/b{03,04,05,06}/$USER/kaldi-data/egs/voxceleb2/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage - fi - sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \ - --nj 8 \ - --stage 0 \ - --frames-per-iter 1000000000 \ - --frames-per-iter-diagnostic 100000 \ - --min-frames-per-chunk 200 \ - --max-frames-per-chunk 400 \ - --num-diagnostic-archives 3 \ - --num-repeats 50 \ - "$data" $egs_dir -fi - -if [ $stage -le 7 ]; then - echo "$0: creating neural net configs using the xconfig parser"; - num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}') - feat_dim=$(cat $egs_dir/info/feat_dim) - - # This chunk-size corresponds to the maximum number of frames the - # stats layer is able to pool over. In this script, it corresponds - # to 100 seconds. If the input recording is greater than 100 seconds, - # we will compute multiple xvectors from the same recording and average - # to produce the final xvector. - max_chunk_size=10000 - - # The smallest number of frames we're comfortable computing an xvector from. - # Note that the hard minimum is given by the left and right context of the - # frame-level layers. - min_chunk_size=25 - mkdir -p $nnet_dir/configs - cat < $nnet_dir/configs/network.xconfig - # please note that it is important to have input layer with the name=input - - # The frame-level layers - input dim=${feat_dim} name=input - relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512 - relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512 - relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512 - relu-batchnorm-layer name=tdnn4 dim=512 - relu-batchnorm-layer name=tdnn5 dim=1500 - - # The stats pooling layer. Layers after this are segment-level. - # In the config below, the first and last argument (0, and ${max_chunk_size}) - # means that we pool over an input segment starting at frame 0 - # and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1) - # mean that no subsampling is performed. - stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size}) - - # This is where we usually extract the embedding (aka xvector) from. - relu-batchnorm-layer name=tdnn6 dim=512 input=stats - - # This is where another layer the embedding could be extracted - # from, but usually the previous one works better. 
- relu-batchnorm-layer name=tdnn7 dim=512 - output-layer name=output include-log-softmax=true dim=${num_targets} -EOF - - steps/nnet3/xconfig_to_configs.py \ - --xconfig-file $nnet_dir/configs/network.xconfig \ - --config-dir $nnet_dir/configs/ - cp $nnet_dir/configs/final.config $nnet_dir/nnet.config - - # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh - echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config - echo "$max_chunk_size" > $nnet_dir/max_chunk_size - echo "$min_chunk_size" > $nnet_dir/min_chunk_size -fi - -dropout_schedule='0,0@0.20,0.1@0.50,0' -srand=123 -if [ $stage -le 8 ]; then - steps/nnet3/train_raw_dnn.py --stage=$train_stage \ - --cmd="$train_cmd" \ - --trainer.optimization.proportional-shrink 10 \ - --trainer.optimization.momentum=0.5 \ - --trainer.optimization.num-jobs-initial=3 \ - --trainer.optimization.num-jobs-final=8 \ - --trainer.optimization.initial-effective-lrate=0.001 \ - --trainer.optimization.final-effective-lrate=0.0001 \ - --trainer.optimization.minibatch-size=64 \ - --trainer.srand=$srand \ - --trainer.max-param-change=2 \ - --trainer.num-epochs=3 \ - --trainer.dropout-schedule="$dropout_schedule" \ - --trainer.shuffle-buffer-size=1000 \ - --egs.frames-per-eg=1 \ - --egs.dir="$egs_dir" \ - --cleanup.remove-egs $remove_egs \ - --cleanup.preserve-model-interval=10 \ - --use-gpu=true \ - --dir=$nnet_dir || exit 1; -fi - -exit 0; diff --git a/kaldi/local/nnet3/xvector/tuning/run_xvector_1a.sh b/kaldi/local/nnet3/xvector/tuning/run_xvector_1a.sh deleted file mode 100755 index 0c2c77b..0000000 --- a/kaldi/local/nnet3/xvector/tuning/run_xvector_1a.sh +++ /dev/null @@ -1,155 +0,0 @@ -#!/bin/bash -# Copyright 2017 David Snyder -# 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) -# 2017 Johns Hopkins University (Author: Daniel Povey) -# -# Copied from egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh (commit e082c17d4a8f8a791428ae4d9f7ceb776aef3f0b). -# -# Apache 2.0. - -# This script trains a DNN similar to the recipe described in -# http://www.danielpovey.com/files/2018_icassp_xvectors.pdf - -. ./cmd.sh -set -e - -stage=1 -train_stage=0 -use_gpu=true -remove_egs=false - -data=data/train -nnet_dir=exp/xvector_nnet_1a/ -egs_dir=exp/xvector_nnet_1a/egs - -. ./path.sh -. ./cmd.sh -. ./utils/parse_options.sh - -num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l) - -# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh. -# The argument --num-repeats is related to the number of times a speaker -# repeats per archive. If it seems like you're getting too many archives -# (e.g., more than 200) try increasing the --frames-per-iter option. The -# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the -# minimum and maximum length (in terms of number of frames) of the features -# in the examples. -# -# To make sense of the egs script, it may be necessary to put an "exit 1" -# command immediately after stage 3. Then, inspect -# exp//egs/temp/ranges.* . The ranges files specify the examples that -# will be created, and which archives they will be stored in. Each line of -# ranges.* has the following form: -# -# For example: -# 100304-f-sre2006-kacg-A 1 2 4079 881 23 - -# If you're satisfied with the number of archives (e.g., 50-150 archives is -# reasonable) and with the number of examples per speaker (e.g., 1000-5000 -# is reasonable) then you can let the script continue to the later stages. -# Otherwise, try increasing or decreasing the --num-repeats option. 
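
The max_chunk_size / min_chunk_size comments above imply the following extraction-time behavior: recordings longer than the stats layer can pool over are cut into chunks, each chunk is embedded separately, and the chunk embeddings are averaged. A hedged Python sketch, where embed_fn stands in for the nnet3 forward pass up to the embedding layer (the real extractor's handling of short trailing chunks may differ):

import numpy as np

def extract_xvector(feats, embed_fn, max_chunk=10000, min_chunk=25):
    """feats: (num_frames, feat_dim). Split into chunks of at most
    max_chunk frames, embed each, and average the embeddings."""
    num_frames = feats.shape[0]
    if num_frames < min_chunk:
        raise ValueError("recording shorter than the minimum chunk size")
    chunks = [feats[s:s + max_chunk] for s in range(0, num_frames, max_chunk)]
    # Trailing slivers below min_chunk are simply dropped in this sketch.
    embeddings = [embed_fn(c) for c in chunks if c.shape[0] >= min_chunk]
    return np.mean(embeddings, axis=0)
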
You might -# need to fiddle with --frames-per-iter. Increasing this value decreases the -# the number of archives and increases the number of examples per archive. -# Decreasing this value increases the number of archives, while decreasing the -# number of examples per archive. -if [ $stage -le 6 ]; then - echo "$0: Getting neural network training egs"; - # dump egs. - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then - utils/create_split_dir.pl \ - /export/b{03,04,05,06}/$USER/kaldi-data/egs/voxceleb2/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage - fi - sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \ - --nj 8 \ - --stage 0 \ - --frames-per-iter 1000000000 \ - --frames-per-iter-diagnostic 100000 \ - --min-frames-per-chunk 200 \ - --max-frames-per-chunk 400 \ - --num-diagnostic-archives 3 \ - --num-repeats 50 \ - "$data" $egs_dir -fi - -if [ $stage -le 7 ]; then - echo "$0: creating neural net configs using the xconfig parser"; - num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}') - feat_dim=$(cat $egs_dir/info/feat_dim) - - # This chunk-size corresponds to the maximum number of frames the - # stats layer is able to pool over. In this script, it corresponds - # to 100 seconds. If the input recording is greater than 100 seconds, - # we will compute multiple xvectors from the same recording and average - # to produce the final xvector. - max_chunk_size=10000 - - # The smallest number of frames we're comfortable computing an xvector from. - # Note that the hard minimum is given by the left and right context of the - # frame-level layers. - min_chunk_size=25 - mkdir -p $nnet_dir/configs - cat < $nnet_dir/configs/network.xconfig - # please note that it is important to have input layer with the name=input - - # The frame-level layers - input dim=${feat_dim} name=input - relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512 - relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512 - relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512 - relu-batchnorm-layer name=tdnn4 dim=512 - relu-batchnorm-layer name=tdnn5 dim=1500 - - # The stats pooling layer. Layers after this are segment-level. - # In the config below, the first and last argument (0, and ${max_chunk_size}) - # means that we pool over an input segment starting at frame 0 - # and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1) - # mean that no subsampling is performed. - stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size}) - - # This is where we usually extract the embedding (aka xvector) from. - relu-batchnorm-layer name=tdnn6 dim=512 input=stats - - # This is where another layer the embedding could be extracted - # from, but usually the previous one works better. 
- relu-batchnorm-layer name=tdnn7 dim=512 - output-layer name=output include-log-softmax=true dim=${num_targets} -EOF - - steps/nnet3/xconfig_to_configs.py \ - --xconfig-file $nnet_dir/configs/network.xconfig \ - --config-dir $nnet_dir/configs/ - cp $nnet_dir/configs/final.config $nnet_dir/nnet.config - - # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh - echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config - echo "$max_chunk_size" > $nnet_dir/max_chunk_size - echo "$min_chunk_size" > $nnet_dir/min_chunk_size -fi - -dropout_schedule='0,0@0.20,0.1@0.50,0' -srand=123 -if [ $stage -le 8 ]; then - steps/nnet3/train_raw_dnn.py --stage=$train_stage \ - --cmd="$train_cmd" \ - --trainer.optimization.proportional-shrink 10 \ - --trainer.optimization.momentum=0.5 \ - --trainer.optimization.num-jobs-initial=3 \ - --trainer.optimization.num-jobs-final=8 \ - --trainer.optimization.initial-effective-lrate=0.001 \ - --trainer.optimization.final-effective-lrate=0.0001 \ - --trainer.optimization.minibatch-size=64 \ - --trainer.srand=$srand \ - --trainer.max-param-change=2 \ - --trainer.num-epochs=3 \ - --trainer.dropout-schedule="$dropout_schedule" \ - --trainer.shuffle-buffer-size=1000 \ - --egs.frames-per-eg=1 \ - --egs.dir="$egs_dir" \ - --cleanup.remove-egs $remove_egs \ - --cleanup.preserve-model-interval=10 \ - --use-gpu=true \ - --dir=$nnet_dir || exit 1; -fi - -exit 0; diff --git a/kaldi/local/parse_options.sh b/kaldi/local/parse_options.sh deleted file mode 100755 index 34476fd..0000000 --- a/kaldi/local/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 
1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0. diff --git a/kaldi/local/pbs.pl b/kaldi/local/pbs.pl deleted file mode 100755 index 6c8d448..0000000 --- a/kaldi/local/pbs.pl +++ /dev/null @@ -1,587 +0,0 @@ -#!/usr/bin/env perl -use strict; -use warnings; - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). -# 2014 Johns Hopkins University (Author: Vimal Manohar) -# 2015 Queensland University of Technology (Author: Ahilan Kanagasundaram ) -# Apache 2.0. - -use File::Basename; -use Cwd; -use Getopt::Long; - -# This is a version of the queue.pl modified so that it works under PBS -# The PBS is one of the several "almost compatible" queueing systems. The -# command switches and environment variables are different, so we are adding -# a this script. An optimal solution might probably be to make the variable -# names and the commands configurable, as similar problems can be expected -# with Torque, Univa... and who knows what else -# -# queue.pl has the same functionality as run.pl, except that -# it runs the job in question on the queue (Sun GridEngine). -# This version of queue.pl uses the task array functionality -# of the grid engine. Note: it's different from the queue.pl -# in the s4 and earlier scripts. - -# The script now supports configuring the queue system using a config file -# (default in conf/pbs.conf; but can be passed specified with --config option) -# and a set of command line options. -# The current script handles: -# 1) Normal configuration arguments -# For e.g. a command line option of "--gpu 1" could be converted into the option -# "-q g.q -l gpu=1" to qsub. How the CLI option is handled is determined by a -# line in the config file like -# gpu=* -q g.q -l gpu=$0 -# $0 here in the line is replaced with the argument read from the CLI and the -# resulting string is passed to qsub. 
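
To make case 1 concrete before the remaining cases below: a wildcard rule substitutes the command-line value for $0 in the config line, while an exact-value rule wins outright. A small Python rendering of that lookup (a sketch of the rule as described, not of pbs.pl's full parser):

def qsub_opts_for(config_lines, option, value):
    """Expand one CLI option (e.g. --gpu 1) into qsub switches using
    'option name=value' and 'option name=*' config rules."""
    wildcard = None
    for line in config_lines:
        fields = line.split(None, 2)
        if len(fields) < 2 or fields[0] != "option":
            continue
        name, _, val = fields[1].partition("=")
        extra = fields[2] if len(fields) == 3 else ""
        if name != option:
            continue
        if val == str(value):   # exact rule, e.g. "option gpu=0"
            return extra        # may be empty: add nothing to qsub_opts
        if val == "*":
            wildcard = extra.replace("$0", str(value))
    if wildcard is None:
        raise ValueError(f"option --{option} {value} not described in config")
    return wildcard

# qsub_opts_for(["option gpu=0 -q all.q", "option gpu=* -l gpu=$0 -q g.q"],
#               "gpu", 1)  ->  "-l gpu=1 -q g.q"
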
-# 2) Special arguments to options such as -# gpu=0 -# If --gpu 0 is given in the command line, then no special "-q" is given. -# 3) Default argument -# default gpu=0 -# If --gpu option is not passed in the command line, then the script behaves as -# if --gpu 0 was passed since 0 is specified as the default argument for that -# option -# 4) Arbitrary options and arguments. -# Any command line option starting with '--' and its argument would be handled -# as long as its defined in the config file. -# 5) Default behavior -# If the config file that is passed using is not readable, then the script -# behaves as if the queue has the following config file: -# $ cat conf/pbs.conf -# # Default configuration -# command qsub -v PATH -S /bin/bash -l arch=*64* -# option mem=* -l mem_free=$0,ram_free=$0 -# option mem=0 # Do not add anything to qsub_opts -# option num_threads=* -pe smp $0 -# option num_threads=1 # Do not add anything to qsub_opts -# option max_jobs_run=* -tc $0 -# default gpu=0 -# option gpu=0 -q all.q -# option gpu=* -l gpu=$0 -q g.q - -my $qsub_opts = ""; -my $sync = 0; -my $num_threads = 1; -my $gpu = 0; - -my $config = "conf/pbs.conf"; - -my %cli_options = (); - -my $jobname; -my $jobstart; -my $jobend; - -my $array_job = 0; - -sub print_usage() { - print STDERR - "Usage: queue.pl [options] [JOB=1:n] log-file command-line arguments...\n" . - "e.g.: queue.pl foo.log echo baz\n" . - " (which will echo \"baz\", with stdout and stderr directed to foo.log)\n" . - "or: queue.pl -q all.q\@xyz foo.log echo bar \| sed s/bar/baz/ \n" . - " (which is an example of using a pipe; you can provide other escaped bash constructs)\n" . - "or: queue.pl -q all.q\@qyz JOB=1:10 foo.JOB.log echo JOB \n" . - " (which illustrates the mechanism to submit parallel jobs; note, you can use \n" . - " another string other than JOB)\n" . - "Note: if you pass the \"-sync y\" option to qsub, this script will take note\n" . - "and change its behavior. Otherwise it uses qstat to work out when the job finished\n" . - "Options:\n" . - " --config (default: $config)\n" . - " --mem (e.g. --mem 2G, --mem 500M, \n" . - " also support K and numbers mean bytes)\n" . - " --num-threads (default: $num_threads)\n" . - " --max-jobs-run \n" . - " --gpu <0|1> (default: $gpu)\n"; - exit 1; -} - -if (@ARGV < 2) { - print_usage(); -} - -for (my $x = 1; $x <= 2; $x++) { # This for-loop is to - # allow the JOB=1:n option to be interleaved with the - # options to qsub. - while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { - my $switch = shift @ARGV; - - if ($switch eq "-V") { - $qsub_opts .= "-V "; - } else { - my $argument = shift @ARGV; - if ($argument =~ m/^--/) { - print STDERR "queue.pl: Warning: suspicious argument '$argument' to $switch; starts with '-'\n"; - } - if ($switch eq "-sync" && $argument =~ m/^[yY]/) { - $sync = 1; - $qsub_opts .= "$switch $argument "; - } elsif ($switch eq "-pe") { # e.g. -pe smp 5 - my $argument2 = shift @ARGV; - $qsub_opts .= "$switch $argument $argument2 "; - $num_threads = $argument2; - } elsif ($switch =~ m/^--/) { # Config options - # Convert CLI option to variable name - # by removing '--' from the switch and replacing any - # '-' with a '_' - $switch =~ s/^--//; - $switch =~ s/-/_/g; - $cli_options{$switch} = $argument; - } else { # Other qsub options - passed as is - $qsub_opts .= "$switch $argument "; - } - } - } - if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. 
JOB=1:20 - $array_job = 1; - $jobname = $1; - $jobstart = $2; - $jobend = $3; - shift; - if ($jobstart > $jobend) { - die "queue.pl: invalid job range $ARGV[0]"; - } - if ($jobstart <= 0) { - die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is a GridEngine limitation)."; - } - } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. - $array_job = 1; - $jobname = $1; - $jobstart = $2; - $jobend = $2; - shift; - } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { - print STDERR "queue.pl: Warning: suspicious first argument to queue.pl: $ARGV[0]\n"; - } -} - -if (@ARGV < 2) { - print_usage(); -} - -if (exists $cli_options{"config"}) { - $config = $cli_options{"config"}; -} - -my $default_config_file = <<'EOF'; -# Default configuration -command qsub -V -v PATH -S /bin/bash -l mem=4G -option mem=* -l mem=$0 -option mem=0 # Do not add anything to qsub_opts -option num_threads=* -l ncpus=$0 -option num_threads=1 # Do not add anything to qsub_opts -default gpu=0 -option gpu=0 -option gpu=* -l ncpus=$0 -EOF - -# Here the configuration options specified by the user on the command line -# (e.g. --mem 2G) are converted to options to the qsub system as defined in -# the config file. (e.g. if the config file has the line -# "option mem=* -l ram_free=$0,mem_free=$0" -# and the user has specified '--mem 2G' on the command line, the options -# passed to queue system would be "-l ram_free=2G,mem_free=2G -# A more detailed description of the ways the options would be handled is at -# the top of this file. - -my $opened_config_file = 1; - -open CONFIG, "<$config" or $opened_config_file = 0; - -my %cli_config_options = (); -my %cli_default_options = (); - -if ($opened_config_file == 0 && exists($cli_options{"config"})) { - print STDERR "Could not open config file $config\n"; - exit(1); -} elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) { - # Open the default config file instead - open (CONFIG, "echo '$default_config_file' |") or die "Unable to open pipe\n"; - $config = "Default config"; -} - -my $qsub_cmd = ""; -my $read_command = 0; - -while() { - chomp; - my $line = $_; - $_ =~ s/\s*#.*//g; - if ($_ eq "") { next; } - if ($_ =~ /^command (.+)/) { - $read_command = 1; - $qsub_cmd = $1 . " "; - } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { - # Config option that needs replacement with parameter value read from CLI - # e.g.: option mem=* -l mem_free=$0,ram_free=$0 - my $option = $1; # mem - my $arg= $2; # -l mem_free=$0,ram_free=$0 - if ($arg !~ m:\$0:) { - die "Unable to parse line '$line' in config file ($config)\n"; - } - if (exists $cli_options{$option}) { - # Replace $0 with the argument read from command line. - # e.g. "-l mem_free=$0,ram_free=$0" -> "-l mem_free=2G,ram_free=2G" - $arg =~ s/\$0/$cli_options{$option}/g; - $cli_config_options{$option} = $arg; - } - } elsif ($_ =~ m/^option ([^=]+)=(\S+)\s?(.*)$/) { - # Config option that does not need replacement - # e.g. option gpu=0 -q all.q - my $option = $1; # gpu - my $value = $2; # 0 - my $arg = $3; # -q all.q - if (exists $cli_options{$option}) { - $cli_default_options{($option,$value)} = $arg; - } - } elsif ($_ =~ m/^default (\S+)=(\S+)/) { - # Default options. Used for setting default values to options i.e. when - # the user does not specify the option on the command line - # e.g. 
default gpu=0 - my $option = $1; # gpu - my $value = $2; # 0 - if (!exists $cli_options{$option}) { - # If the user has specified this option on the command line, then we - # don't have to do anything - $cli_options{$option} = $value; - } - } else { - print STDERR "queue.pl: unable to parse line '$line' in config file ($config)\n"; - exit(1); - } -} - -close(CONFIG); - -if ($read_command != 1) { - print STDERR "queue.pl: config file ($config) does not contain the line \"command .*\"\n"; - exit(1); -} - -for my $option (keys %cli_options) { - if ($option eq "config") { next; } - if ($option eq "max_jobs_run" && $array_job != 1) { next; } - my $value = $cli_options{$option}; - - if (exists $cli_default_options{($option,$value)}) { - $qsub_opts .= "$cli_default_options{($option,$value)} "; - } elsif (exists $cli_config_options{$option}) { - $qsub_opts .= "$cli_config_options{$option} "; - } else { - if ($opened_config_file == 0) { $config = "default config file"; } - die "queue.pl: Command line option $option not described in $config (or value '$value' not allowed)\n"; - } -} - -my $cwd = getcwd(); -my $logfile = shift @ARGV; - -if ($array_job == 1 && $logfile !~ m/$jobname/ - && $jobend > $jobstart) { - print STDERR "queue.pl: you are trying to run a parallel job but " - . "you are putting the output into just one log file ($logfile)\n"; - exit(1); -} - -# -# Work out the command; quote escaping is done here. -# Note: the rules for escaping stuff are worked out pretty -# arbitrarily, based on what we want it to do. Some things that -# we pass as arguments to queue.pl, such as "|", we want to be -# interpreted by bash, so we don't escape them. Other things, -# such as archive specifiers like 'ark:gunzip -c foo.gz|', we want -# to be passed, in quotes, to the Kaldi program. Our heuristic -# is that stuff with spaces in should be quoted. This doesn't -# always work. -# -my $cmd = ""; - -foreach my $x (@ARGV) { - if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take - # as-is. - elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single - else { $cmd .= "\"$x\" "; } # else use double. -} - -# -# Work out the location of the script file, and open it for writing. -# -my $dir = dirname($logfile); -my $base = basename($logfile); -my $qdir = "$dir/q"; -$qdir =~ s:/(log|LOG)/*q:/q:; # If qdir ends in .../log/q, make it just .../q. -my $queue_logfile = "$qdir/$base"; - -if (!-d $dir) { system "mkdir -p $dir 2>/dev/null"; } # another job may be doing this... -if (!-d $dir) { die "Cannot make the directory $dir\n"; } -# make a directory called "q", -# where we will put the log created by qsub... normally this doesn't contain -# anything interesting, evertyhing goes to $logfile. -if (! -d "$qdir") { - system "mkdir $qdir 2>/dev/null"; - sleep(5); ## This is to fix an issue we encountered in denominator lattice creation, - ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been - ## created and the job immediately ran, it would die with an error because nfs - ## had not yet synced. I'm also decreasing the acdirmin and acdirmax in our - ## NFS settings to something like 5 seconds. -} - -my $queue_array_opt = ""; -if ($array_job == 1) { # It's an array job. - $queue_array_opt = "-J $jobstart-$jobend"; - $logfile =~ s/$jobname/\$PBS_ARRAY_INDEX/g; # This variable will get - # replaced by qsub, in each job, with the job-id. - $cmd =~ s/$jobname/\$\{PBS_ARRAY_INDEX\}/g; # same for the command... 
- $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory - # is for the queue to put its log, and this doesn't need the task array subscript - # so we remove it. -} - -# queue_scriptfile is as $queue_logfile [e.g. dir/q/foo.log] but -# with the suffix .sh. -my $queue_scriptfile = $queue_logfile; -($queue_scriptfile =~ s/\.[a-zA-Z]{1,5}$/.sh/) || ($queue_scriptfile .= ".sh"); -if ($queue_scriptfile !~ m:^/:) { - $queue_scriptfile = $cwd . "/" . $queue_scriptfile; # just in case. -} - -# We'll write to the standard input of "qsub" (the file-handle Q), -# the job that we want it to execute. -# Also keep our current PATH around, just in case there was something -# in it that we need (although we also source ./path.sh) - -my $syncfile = "$qdir/done.$$"; - -system("rm $queue_logfile $syncfile 2>/dev/null"); -# -# Write to the script file, and then close it. -# -open(Q, ">$queue_scriptfile") || die "Failed to write to $queue_scriptfile"; - -print Q "#!/bin/bash\n"; -print Q "cd $cwd\n"; -print Q ". ./path.sh\n"; -print Q "( echo '#' Running on \`hostname\`\n"; -print Q " echo '#' Started at \`date\`\n"; -print Q " echo -n '# '; cat <$logfile\n"; -print Q "time1=\`date +\"%s\"\`\n"; -print Q " ( $cmd ) 2>>$logfile >>$logfile\n"; -print Q "ret=\$?\n"; -print Q "time2=\`date +\"%s\"\`\n"; -print Q "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=$num_threads >>$logfile\n"; -print Q "echo '#' Finished at \`date\` with status \$ret >>$logfile\n"; -print Q "[ \$ret -eq 137 ] && exit 100;\n"; # If process was killed (e.g. oom) it will exit with status 137; - # let the script return with status 100 which will put it to E state; more easily rerunnable. -if ($array_job == 0) { # not an array job - print Q "touch $syncfile\n"; # so we know it's done. -} else { - print Q "touch $syncfile.\$PBS_ARRAY_INDEX\n"; # touch a bunch of sync-files. -} -print Q "exit \$[\$ret ? 1 : 0]\n"; # avoid status 100 which grid-engine -print Q "## submitted with:\n"; # treats specially. -$qsub_cmd .= "-o $queue_logfile $qsub_opts $queue_array_opt $queue_scriptfile >>$queue_logfile 2>&1"; -print Q "# $qsub_cmd\n"; -if (!close(Q)) { # close was not successful... || die "Could not close script file $shfile"; - die "Failed to close the script file (full disk?)"; -} - -my $ret = system ($qsub_cmd); -if ($ret != 0) { - if ($sync && $ret == 256) { # this is the exit status when a job failed (bad exit status) - if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/*/g; } - print STDERR "queue.pl: job writing to $logfile failed\n"; - } else { - print STDERR "queue.pl: error submitting jobs to queue (return status was $ret)\n"; - print STDERR "queue log file is $queue_logfile, command was $qsub_cmd\n"; - print STDERR `tail $queue_logfile`; - } - exit(1); -} - -my $sge_job_id; -if (! $sync) { # We're not submitting with -sync y, so we - # need to wait for the jobs to finish. We wait for the - # sync-files we "touched" in the script to exist. - my @syncfiles = (); - if (!defined $jobname) { # not an array job. - push @syncfiles, $syncfile; - } else { - for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - push @syncfiles, "$syncfile.$jobid"; - } - } - # We will need the sge_job_id, to check that job still exists - { # Get the SGE job-id from the log file in q/ - open(L, "<$queue_logfile") || die "Error opening log file $queue_logfile"; - undef $sge_job_id; - while () { - if (m/Your job\S* (\d+)[. 
].+ has been submitted/) { - if (defined $sge_job_id) { - die "Error: your job was submitted more than once (see $queue_logfile)"; - } else { - $sge_job_id = $1; - } - } - } - close(L); - if (!defined $sge_job_id) { - die "Error: log file $queue_logfile does not specify the SGE job-id."; - } - } - my $check_sge_job_ctr=1; - # - my $wait = 0.1; - my $counter = 0; - foreach my $f (@syncfiles) { - # wait for them to finish one by one. - while (! -f $f) { - sleep($wait); - $wait *= 1.2; - if ($wait > 3.0) { - $wait = 3.0; # never wait more than 3 seconds. - # the following (.kick) commands are basically workarounds for NFS bugs. - if (rand() < 0.25) { # don't do this every time... - if (rand() > 0.5) { - system("touch $qdir/.kick"); - } else { - system("rm $qdir/.kick 2>/dev/null"); - } - } - if ($counter++ % 10 == 0) { - # This seems to kick NFS in the teeth to cause it to refresh the - # directory. I've seen cases where it would indefinitely fail to get - # updated, even though the file exists on the server. - # Only do this every 10 waits (every 30 seconds) though, or if there - # are many jobs waiting they can overwhelm the file server. - system("ls $qdir >/dev/null"); - } - } - - # Check that the job exists in SGE. Job can be killed if duration - # exceeds some hard limit, or in case of a machine shutdown. - if (($check_sge_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on SGE. - if ( -f $f ) { next; }; #syncfile appeared: OK. - $ret = system("qstat -t $sge_job_id >/dev/null 2>/dev/null"); - # system(...) : To get the actual exit value, shift $ret right by eight bits. - if ($ret>>8 == 1) { # Job does not seem to exist - # Don't consider immediately missing job as error, first wait some - # time to make sure it is not just delayed creation of the syncfile. - - sleep(3); - # Sometimes NFS gets confused and thinks it's transmitted the directory - # but it hasn't, due to timestamp issues. Changing something in the - # directory will usually fix that. - system("touch $qdir/.kick"); - system("rm $qdir/.kick 2>/dev/null"); - if ( -f $f ) { next; } #syncfile appeared, ok - sleep(7); - system("touch $qdir/.kick"); - sleep(1); - system("rm $qdir/.kick 2>/dev/null"); - if ( -f $f ) { next; } #syncfile appeared, ok - sleep(60); - system("touch $qdir/.kick"); - sleep(1); - system("rm $qdir/.kick 2>/dev/null"); - if ( -f $f ) { next; } #syncfile appeared, ok - $f =~ m/\.(\d+)$/ || die "Bad sync-file name $f"; - my $job_id = $1; - if (defined $jobname) { - $logfile =~ s/\$PBS_ARRAY_INDEX/$job_id/g; - } - my $last_line = `tail -n 1 $logfile`; - if ($last_line =~ m/status 0$/ && (-M $logfile) < 0) { - # if the last line of $logfile ended with "status 0" and - # $logfile is newer than this program [(-M $logfile) gives the - # time elapsed between file modification and the start of this - # program], then we assume the program really finished OK, - # and maybe something is up with the file system. - print STDERR "**queue.pl: syncfile $f was not created but job seems\n" . - "**to have finished OK. Probably your file-system has problems.\n" . - "**This is just a warning.\n"; - last; - } else { - chop $last_line; - print STDERR "queue.pl: Error, unfinished job no " . - "longer exists, log is in $logfile, last line is '$last_line', " . - "syncfile is $f, return status of qstat was $ret\n" . - "Possible reasons: a) Exceeded time limit? -> Use more jobs!" . - " b) Shutdown/Frozen machine? 
-> Run again!\n"; - exit(1); - } - } elsif ($ret != 0) { - print STDERR "queue.pl: Warning: qstat command returned status $ret (qstat -t $sge_job_id,$!)\n"; - } - } - } - } - my $all_syncfiles = join(" ", @syncfiles); - system("rm $all_syncfiles 2>/dev/null"); -} - -# OK, at this point we are synced; we know the job is done. -# But we don't know about its exit status. We'll look at $logfile for this. -# First work out an array @logfiles of file-locations we need to -# read (just one, unless it's an array job). -my @logfiles = (); -if (!defined $jobname) { # not an array job. - push @logfiles, $logfile; -} else { - for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - my $l = $logfile; - $l =~ s/\$PBS_ARRAY_INDEX/$jobid/g; - push @logfiles, $l; - } -} - -my $num_failed = 0; -my $status = 1; -foreach my $l (@logfiles) { - my @wait_times = (0.1, 0.2, 0.2, 0.3, 0.5, 0.5, 1.0, 2.0, 5.0, 5.0, 5.0, 10.0, 25.0); - for (my $iter = 0; $iter <= @wait_times; $iter++) { - my $line = `tail -10 $l 2>/dev/null`; # Note: although this line should be the last - # line of the file, I've seen cases where it was not quite the last line because - # of delayed output by the process that was running, or processes it had called. - # so tail -10 gives it a little leeway. - if ($line =~ m/with status (\d+)/) { - $status = $1; - last; - } else { - if ($iter < @wait_times) { - sleep($wait_times[$iter]); - } else { - if (! -f $l) { - print STDERR "Log-file $l does not exist.\n"; - } else { - print STDERR "The last line of log-file $l does not seem to indicate the " - . "return status as expected\n"; - } - exit(1); # Something went wrong with the queue, or the - # machine it was running on, probably. - } - } - } - # OK, now we have $status, which is the return-status of - # the command in the job. - if ($status != 0) { $num_failed++; } -} -if ($num_failed == 0) { exit(0); } -else { # we failed. - if (@logfiles == 1) { - if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/$jobstart/g; } - print STDERR "queue.pl: job failed with status $status, log is in $logfile\n"; - if ($logfile =~ m/JOB/) { - print STDERR "queue.pl: probably you forgot to put JOB=1:\$nj in your script.\n"; - } - } else { - if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/*/g; } - my $numjobs = 1 + $jobend - $jobstart; - print STDERR "queue.pl: $num_failed / $numjobs failed, log is in $logfile\n"; - } - exit(1); -} diff --git a/kaldi/local/perturb_data_dir_speed.sh b/kaldi/local/perturb_data_dir_speed.sh deleted file mode 100755 index a50cdb0..0000000 --- a/kaldi/local/perturb_data_dir_speed.sh +++ /dev/null @@ -1,125 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) -# 2014 Tom Ko -# 2018 Emotech LTD (author: Pawel Swietojanski) -# Apache 2.0 - -# This script operates on a directory, such as in data/train/, -# that contains some subset of the following files: -# wav.scp -# spk2utt -# utt2spk -# text -# utt2dur -# reco2dur -# -# It generates the files which are used for perturbing the speed of the original data. - -. utils/parse_options.sh - -if [ $# != 3 ]; then - echo "Usage: perturb_data_dir_speed.sh " - echo "e.g.:" - echo " $0 0.9 data/train_si284 data/train_si284p" - exit 1 -fi - -export LC_ALL=C - -factor=$1 -srcdir=$2 -destdir=$3 -label="sp" -spk_prefix=$label$factor"-" -utt_prefix=$label$factor"-" - -#check is sox on the path -which sox &>/dev/null -! [ $? -eq 0 ] && echo "sox: command not found" && exit 1; - -if [ ! 
-f $srcdir/utt2spk ]; then - echo "$0: no such file $srcdir/utt2spk" - exit 1; -fi - -if [ "$destdir" == "$srcdir" ]; then - echo "$0: this script requires and to be different." - exit 1 -fi - -set -e; -set -o pipefail - -mkdir -p $destdir - -cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/utt_map -cat $srcdir/spk2utt | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/spk_map -cat $srcdir/wav.scp | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/reco_map -if [ ! -f $srcdir/utt2uniq ]; then - cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $1);}' > $destdir/utt2uniq -else - cat $srcdir/utt2uniq | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $2);}' > $destdir/utt2uniq -fi - - -cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ - utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk - -utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt - -if [ -f $srcdir/segments ]; then - - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments | \ - utils/apply_map.pl -f 2 $destdir/reco_map | \ - awk -v factor=$factor \ - '{printf("%s %s %.2f %.2f\n", $1, $2, $3/factor, $4/factor);}' >$destdir/segments - - utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor=$factor \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox -t wav" $_ " -t wav - speed " factor " |"}}' > $destdir/wav.scp - if [ -f $srcdir/reco2file_and_channel ]; then - utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel - fi - -else # no segments->wav indexed by utterance. - if [ -f $srcdir/wav.scp ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \ - # Handle three cases of rxfilenames appropriately; "input piped command", "file offset" and "filename" - awk -v factor=$factor \ - '{wid=$1; $1=""; if ($NF=="|") {print wid $_ " sox -t wav - -t wav - speed " factor " |"} - else if (match($0, /:[0-9]+$/)) {print wid " wav-copy" $_ " - | sox -t wav - -t wav - speed " factor " |" } - else {print wid " sox -t wav" $_ " -t wav - speed " factor " |"}}' > $destdir/wav.scp - fi -fi - -if [ -f $srcdir/text ]; then - utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text -fi -if [ -f $srcdir/spk2gender ]; then - utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender -fi - -#prepare speed-perturbed utt2dur -if [ ! -f $srcdir/utt2dur ]; then - # generate utt2dur if it does not exist in srcdir - utils/data/get_utt2dur.sh $srcdir -fi -cat $srcdir/utt2dur | utils/apply_map.pl -f 1 $destdir/utt_map | \ - awk -v factor=$factor '{print $1, $2/factor;}' >$destdir/utt2dur - -#prepare speed-perturbed reco2dur -if [ ! 
-f $srcdir/reco2dur ]; then - # generate reco2dur if it does not exist in srcdir - utils/data/get_reco2dur.sh $srcdir -fi -cat $srcdir/reco2dur | utils/apply_map.pl -f 1 $destdir/reco_map | \ - awk -v factor=$factor '{print $1, $2/factor;}' >$destdir/reco2dur - -rm $destdir/spk_map $destdir/utt_map $destdir/reco_map 2>/dev/null -echo "$0: generated speed-perturbed version of data in $srcdir, in $destdir" - -utils/validate_data_dir.sh --no-feats --no-text $destdir diff --git a/kaldi/local/pinyin_map.pl b/kaldi/local/pinyin_map.pl deleted file mode 100755 index 8210ec2..0000000 --- a/kaldi/local/pinyin_map.pl +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter - -$num_args = $#ARGV + 1; -if ($num_args != 1) { - print "\nUsage: pinyin2phone.pl pinyin2phone\n"; - exit; -} - -open(MAPS, $ARGV[0]) or die("Could not open pinyin map file."); -my %py2ph; foreach $line () { @A = split(" ", $line); - $py = shift(@A); - $py2ph{$py} = [@A]; -} - -#foreach $word ( keys %py2ph ) { - #foreach $i ( 0 .. $#{ $py2ph{$word} } ) { - # print " $word = $py2ph{$word}[$i]"; - #} - #print " $#{ $py2ph{$word} }"; - #print "\n"; -#} - -my @entry; - -while () { - @A = split(" ", $_); - @entry = (); - $W = shift(@A); - push(@entry, $W); - for($i = 0; $i < @A; $i++) { - $initial= $A[$i]; $final = $A[$i]; - #print $initial, " ", $final, "\n"; - if ($A[$i] =~ /^CH[A-Z0-9]+$/) {$initial =~ s:(CH)[A-Z0-9]+:$1:; $final =~ s:CH([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^SH[A-Z0-9]+$/) {$initial =~ s:(SH)[A-Z0-9]+:$1:; $final =~ s:SH([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^ZH[A-Z0-9]+$/) {$initial =~ s:(ZH)[A-Z0-9]+:$1:; $final =~ s:ZH([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^B[A-Z0-9]+$/) {$initial =~ s:(B)[A-Z0-9]+:$1:; $final =~ s:B([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^C[A-Z0-9]+$/) {$initial =~ s:(C)[A-Z0-9]+:$1:; $final =~ s:C([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^D[A-Z0-9]+$/) {$initial =~ s:(D)[A-Z0-9]+:$1:; $final =~ s:D([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^F[A-Z0-9]+$/) {$initial =~ s:(F)[A-Z0-9]+:$1:; $final =~ s:F([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^G[A-Z0-9]+$/) {$initial =~ s:(G)[A-Z0-9]+:$1:; $final =~ s:G([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^H[A-Z0-9]+$/) {$initial =~ s:(H)[A-Z0-9]+:$1:; $final =~ s:H([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^J[A-Z0-9]+$/) {$initial =~ s:(J)[A-Z0-9]+:$1:; $final =~ s:J([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^K[A-Z0-9]+$/) {$initial =~ s:(K)[A-Z0-9]+:$1:; $final =~ s:K([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^L[A-Z0-9]+$/) {$initial =~ s:(L)[A-Z0-9]+:$1:; $final =~ s:L([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^M[A-Z0-9]+$/) {$initial =~ s:(M)[A-Z0-9]+:$1:; $final =~ s:M([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^N[A-Z0-9]+$/) {$initial =~ s:(N)[A-Z0-9]+:$1:; $final =~ s:N([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^P[A-Z0-9]+$/) {$initial =~ s:(P)[A-Z0-9]+:$1:; $final =~ s:P([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^Q[A-Z0-9]+$/) {$initial =~ s:(Q)[A-Z0-9]+:$1:; $final =~ s:Q([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^R[A-Z0-9]+$/) {$initial =~ s:(R)[A-Z0-9]+:$1:; $final =~ s:R([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^S[A-Z0-9]+$/) {$initial =~ s:(S)[A-Z0-9]+:$1:; $final =~ s:S([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^T[A-Z0-9]+$/) {$initial =~ s:(T)[A-Z0-9]+:$1:; $final =~ s:T([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^W[A-Z0-9]+$/) {$initial =~ s:(W)[A-Z0-9]+:$1:; $final =~ s:W([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^X[A-Z0-9]+$/) {$initial =~ s:(X)[A-Z0-9]+:$1:; $final =~ s:X([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^Y[A-Z0-9]+$/) {$initial =~ s:(Y)[A-Z0-9]+:$1:; $final =~ 
s:Y([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^Z[A-Z0-9]+$/) {$initial =~ s:(Z)[A-Z0-9]+:$1:; $final =~ s:Z([A-Z0-9]+):$1:;} - if ($initial ne $A[$i]) { - $tone = $final; - $final =~ s:([A-Z]+)[0-9]:$1:; - $tone =~ s:[A-Z]+([0-9]):$1:; - if (!(exists $py2ph{$initial}) or !(exists $py2ph{$final})) { die "$0: no entry found for ", $A[$i], " ", $initial, " ", $final;} - push(@entry, @{$py2ph{$initial}}); - @tmp = @{$py2ph{$final}}; - for($j = 0; $j < @tmp ; $j++) {$tmp[$j] = $tmp[$j].$tone;} - push(@entry, @tmp); - } - else { - $tone = $A[$i]; - $A[$i] =~ s:([A-Z]+)[0-9]:$1:; - $tone =~ s:[A-Z]+([0-9]):$1:; - if (!(exists $py2ph{$A[$i]})) { die "$0: no entry found for ", $A[$i];} - @tmp = @{$py2ph{$A[$i]}}; - for($j = 0; $j < @tmp ; $j++) {$tmp[$j] = $tmp[$j].$tone;} - push(@entry, @tmp); - } - } - print "@entry"; - print "\n"; -} diff --git a/kaldi/local/prepare_extended_lang.sh b/kaldi/local/prepare_extended_lang.sh deleted file mode 100755 index 824654c..0000000 --- a/kaldi/local/prepare_extended_lang.sh +++ /dev/null @@ -1,165 +0,0 @@ -#!/bin/bash -# Copyright 2018 Xiaohui Zhang - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script adds word-position-dependent phones and constructs a host of other -# derived files, that go in data/lang/. - -# Begin configuration section. -prep_lang_opts= -stage=0 -word_list= # if a word list (mapping words from the srcdict to IDs) is provided, -# we'll make sure the IDs of these words are kept as before. -# end configuration sections - -echo "$0 $@" # Print the command line for logging - -. utils/parse_options.sh - -if [ $# -ne 7 ]; then - echo "usage: utils/prepare_extended_lang.sh <dict-src-dir> <oov-dict-entry> <extra-lexicon>" - echo " <phone-symbol-table> <ext-dict-dir> <tmp-dir> <ext-lang-dir>" - echo "e.g.: utils/prepare_extended_lang.sh data/local/dict '<UNK>' lexicon_extra.txt" - echo "data/lang/phones.txt data/local/dict_ext data/local/lang_ext data/lang_ext" - echo "The goal is to extend the lexicon from <dict-src-dir> with extra lexical entries from " - echo "<extra-lexicon>, putting the extended lexicon into <ext-dict-dir>, and then build" - echo "a valid lang dir <ext-lang-dir>. This is useful when we want to extend the vocab" - echo "at test time." - echo "<dict-src-dir> must be a valid dictionary dir and <oov-dict-entry> is the oov word " - echo "(see utils/prepare_lang.sh for details). A phone symbol table from a previously built " - echo "lang dir is required, for validating provided lexical entries." - echo "options: " - echo " --prep-lang-opts STRING # options to pass to utils/prepare_lang.sh" - echo " --word-list <word-list-file> # default: \"\"; if not empty, re-order the " - echo " # words in the generated words.txt so that the" - echo " # words from the provided list have their ids" - echo " # kept unchanged." - exit 1; -fi - -srcdict=$1 -oov_word=$2 -extra_lexicon=$3 -phone_symbol_table=$4 -extdict=$5 # extended dict dir -tmpdir=$6 -extlang=$7 # extended lang dir - -mkdir -p $extlang $tmpdir - -[ -f path.sh ] && . ./path.sh - -! utils/validate_dict_dir.pl $srcdict && \ - echo "*Error validating directory $srcdict*" && exit 1; - -if [[ !
-f $srcdict/lexicon.txt ]]; then - echo "**Creating $dir/lexicon.txt from $dir/lexiconp.txt" - perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdict/lexiconp.txt \ - > $srcdict/lexicon.txt || exit 1; -fi - -if [[ ! -f $srcdict/lexiconp.txt ]]; then - echo "**Creating $srcdict/lexiconp.txt from $srcdict/lexicon.txt" - perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdict/lexicon.txt > $srcdict/lexiconp.txt || exit 1; -fi - -# Checks if the phone sets match. -echo "$(basename $0): Validating the source lexicon" -cat $srcdict/lexicon.txt | awk -v f=$phone_symbol_table ' -BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }} -{ for (x = 2; x <= NF; ++x) { - if (!($x in phones)) { - print "The source lexicon contains a phone not in the phones.txt: "$x; - print "You must provide a phones.txt from the lang built with the source lexicon."; - exit 1; - } -}}' || exit 1; - -echo "$(basename $0): Validating the extra lexicon" -cat $extra_lexicon | awk -v f=$phone_symbol_table ' -BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }} -{ for (x = 2; x <= NF; ++x) { if (!($x in phones)) { - print "The extra lexicon contains a phone not in the phone symbol table: "$x; exit 1; } - } -}' || exit 1; - -if [ $stage -le 0 ]; then - # Genearte the extended dict dir - echo "$(basename $0): Creating the extended lexicon $extdict/lexicon.txt" - [ -d $extdict ] && rm -r $extdict 2>/dev/null - cp -R $srcdict $extdict 2>/dev/null - - # Reformat the source lexicon - perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$srcdict/lexiconp.txt | awk '{ gsub(/\t/, " "); print }' \ - >$tmpdir/lexicon.txt || exit 1; - - # Filter lexical entries which are already in the source lexicon - awk '{ gsub(/\t/, " "); print }' $extra_lexicon | sort -u | \ - awk 'NR==FNR{a[$0]=1;next} {if (!($0 in a)) print $0 }' $tmpdir/lexicon.txt - \ - > $extdict/lexicon_extra.txt || exit 1; - - echo "$(basename $0): Creating $extdict/lexiconp.txt from $srcdict/lexiconp.txt and $extdict/lexicon_extra.txt" - perl -ape 's/(\S+\s+)(.+)/${1}1 $2/;' < $extdict/lexicon_extra.txt | \ - cat $srcdict/lexiconp.txt - | awk '{ gsub(/\t/, " "); print }' | \ - sort -u -k1,1 -k2g,2 -k3 > $extdict/lexiconp.txt || exit 1; - - perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$extdict/lexiconp.txt >$extdict/lexicon.txt || exit 1; - - # Create lexicon_silprobs.txt - silprob=false - [ -f $srcdict/lexiconp_silprob.txt ] && silprob=true - if "$silprob"; then - echo "$(basename $0): Creating $extdict/lexiconp_silprob.txt from $srcdict/lexiconp_silprob.txt" - # Here we assume no acoustic evidence for the extra word-pron pairs. - # So we assign silprob1 = overall_silprob, silprob2 = silprob3 = 1.00 - overall_silprob=`awk '{if ($1=="overall") print $2}' $srcdict/silprob.txt` - awk -v overall=$overall_silprob '{ - printf("%s %d %.1f %.2f %.2f",$1, 1, overall, 1.00, 1.00); - for(n=2;n<=NF;n++) printf " "$n; printf("\n"); - }' $extdict/lexicon_extra.txt | cat $srcdict/lexiconp_silprob.txt - | \ - sort -k1,1 -k2g,2 -k6 \ - > $extdict/lexiconp_silprob.txt || exit 1; - fi - - if ! utils/validate_dict_dir.pl $extdict >&/dev/null; then - utils/validate_dict_dir.pl $extdict # show the output. - echo "$(basename $0): Validation failed on the extended dict" - exit 1; - fi -fi - -if [ $stage -le 1 ]; then - echo "$(basename $0): Preparing the extended lang dir." 
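
Stage 0 above reduces to a simple merge: drop extra entries whose word-pronunciation pair already exists in the base lexicon, then combine the two lists and sort. A rough Python sketch of that step, assuming plain "word phone1 ... phoneN" entries and fixing all pron-probs at 1.0 for brevity (function and file names are illustrative, not from the script):

    def merge_lexicons(base_path, extra_path, out_path):
        def load(path):
            # Normalize whitespace so identical entries compare equal,
            # mirroring the gsub(/\t/, " ") calls in the awk pipeline.
            with open(path) as f:
                return [" ".join(line.split()) for line in f if line.strip()]

        base = load(base_path)
        base_set = set(base)
        # Keep only extra entries not already in the base lexicon.
        extra = [e for e in load(extra_path) if e not in base_set]

        with open(out_path, "w") as out:
            for entry in sorted(set(base + extra)):
                word, *phones = entry.split()
                out.write(f"{word} 1.0 {' '.join(phones)}\n")
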
- [ -d $extlang ] && rm -r $extlang 2>/dev/null - utils/prepare_lang.sh $prep_lang_opts $extdict \ - $oov_word $tmpdir $extlang || exit 1; - - # If a word list is provided, make sure the word-ids of these words are kept unchanged - # in the extended word list. - if [ -f $word_list ]; then - # First, make sure there's no OOV in the provided word-list. - if [ `awk -v s=$extlang/words.txt 'BEGIN{ while((getline < s) > 0) { vocab[$1] = 1;}} \ - {if (!($1 in vocab)) print $0}' $word_list | wc -l` -gt 0 ]; then - echo "$(basename $0): The provided word list contains words out of the extended vocab." - exit 1; - fi - awk -v s=$word_list -v oov=$oov_word -v boost=$oov_unigram_prob -v prob=$oov_prob \ - 'BEGIN{ while((getline < s) > 0) { vocab[$1] = 1; n+=1; print $0}} \ - { if (!($1 in vocab)) {print $1" "n; n+=1;}}' $extlang/words.txt > $extlang/words.txt.$$ - mv $extlang/words.txt.$$ $extlang/words.txt - fi -fi - -exit 0; diff --git a/kaldi/local/prepare_for_eer.py b/kaldi/local/prepare_for_eer.py deleted file mode 100755 index 6bfa04e..0000000 --- a/kaldi/local/prepare_for_eer.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright 2015 David Snyder -# Apache 2.0. -# -# Copied from egs/sre10/v1/local/prepare_for_eer.py (commit 9cb4c4c2fb0223ee90c38d98af11305074eb7ef8) -# -# Given a trials and scores file, this script -# prepares input for the binary compute-eer. -import sys -trials = open(sys.argv[1], 'r').readlines() -scores = open(sys.argv[2], 'r').readlines() -spkrutt2target = {} -for line in trials: - spkr, utt, target = line.strip().split() - spkrutt2target[spkr+utt]=target -for line in scores: - spkr, utt, score = line.strip().split() - print(score, spkrutt2target[spkr+utt]) diff --git a/kaldi/local/prepare_lang.sh b/kaldi/local/prepare_lang.sh deleted file mode 100755 index fa5ff78..0000000 --- a/kaldi/local/prepare_lang.sh +++ /dev/null @@ -1,522 +0,0 @@ -#!/bin/bash -# Copyright 2012-2013 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal -# 2014 Guoguo Chen -# 2015 Hainan Xu -# 2016 FAU Erlangen (Author: Axel Horndasch) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script prepares a directory such as data/lang/, in the standard format, -# given a source directory containing a dictionary lexicon.txt in a form like: -# word phone1 phone2 ... phoneN -# per line (alternate prons would be separate lines), or a dictionary with probabilities -# called lexiconp.txt in a form: -# word pron-prob phone1 phone2 ... phoneN -# (with 0.0 < pron-prob <= 1.0); note: if lexiconp.txt exists, we use it even if -# lexicon.txt exists. -# and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt -# and extra_questions.txt -# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and -# non-silence phones respectively (where silence includes various kinds of -# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the -# "real" phones.) 
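
As a quick illustration of the two lexicon formats described above, here are Python equivalents of the perl one-liners this script uses to convert between them; this sketch is simplified to single-space separators (the script itself inserts a tab after the pron-prob):

    def lexicon_to_lexiconp(line):
        # "word phone1 ... phoneN" -> "word 1.0 phone1 ... phoneN"
        word, *phones = line.split()
        return f"{word} 1.0 {' '.join(phones)}"

    def lexiconp_to_lexicon(line):
        # "word pron-prob phone1 ... phoneN" -> "word phone1 ... phoneN"
        word, _prob, *phones = line.split()
        return f"{word} {' '.join(phones)}"

    assert lexicon_to_lexiconp("abandon AH0 B AE1 N D AH0 N") == "abandon 1.0 AH0 B AE1 N D AH0 N"
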
-# In each line of those files is a list of phones, and the phones on each line -# are assumed to correspond to the same "base phone", i.e. they will be -# different stress or tone variations of the same basic phone. -# The file "optional_silence.txt" contains just a single phone (typically SIL) -# which is used for optional silence in the lexicon. -# extra_questions.txt might be empty; typically will consist of lists of phones, -# all members of each list with the same stress or tone; and also possibly a -# list for the silence phones. This will augment the automatically generated -# questions (note: the automatically generated ones will treat all the -# stress/tone versions of a phone the same, so will not "get to ask" about -# stress or tone). -# - -# This script adds word-position-dependent phones and constructs a host of other -# derived files, that go in data/lang/. - -# Begin configuration section. -num_sil_states=5 -num_nonsil_states=3 -position_dependent_phones=true -# position_dependent_phones is false also when position dependent phones and word_boundary.txt -# have been generated by another source -share_silence_phones=false # if true, then share pdfs of different silence - # phones together. -sil_prob=0.5 -unk_fst= # if you want to model the unknown-word (<oov-dict-entry>) - # with a phone-level LM as created by make_unk_lm.sh, - # provide the text-form FST via this flag, e.g. <work-dir>/unk_fst.txt - # where <work-dir> was the 2nd argument of make_unk_lm.sh. -phone_symbol_table= # if set, use a specified phones.txt file. -extra_word_disambig_syms= # if set, add disambiguation symbols from this file (one per line) - # to phones/disambig.txt, phones/wdisambig.txt and words.txt -num_extra_phone_disambig_syms=1 # Standard one phone disambiguation symbol is used for optional silence. - # Increasing this number does not harm, but is only useful if you later - # want to introduce these labels to L_disambig.fst -# end configuration sections - -echo "$0 $@" # Print the command line for logging - -. utils/parse_options.sh - -if [ $# -ne 4 ]; then - echo "usage: utils/prepare_lang.sh <dict-src-dir> <oov-dict-entry> <tmp-dir> <lang-dir>" - echo "e.g.: utils/prepare_lang.sh data/local/dict \"<SPOKEN_NOISE>\" data/local/lang data/lang" - echo "<dict-src-dir> should contain the following files:" - echo " extra_questions.txt lexicon.txt nonsilence_phones.txt optional_silence.txt silence_phones.txt" - echo "See http://kaldi-asr.org/doc/data_prep.html#data_prep_lang_creating for more info." - echo "options: " - echo " --num-sil-states <number of states> # default: 5, #states in silence models." - echo " --num-nonsil-states <number of states> # default: 3, #states in non-silence models." - echo " --position-dependent-phones (true|false) # default: true; if true, use _B, _E, _S & _I" - echo " # markers on phones to indicate word-internal positions. " - echo " --share-silence-phones (true|false) # default: false; if true, share pdfs of " - echo " # all silence phones. " - echo " --sil-prob <probability of silence> # default: 0.5 [must have 0 <= silprob < 1]" - echo " --phone-symbol-table <filename> # default: \"\"; if not empty, use the provided " - echo " # phones.txt as phone symbol table. This is useful " - echo " # if you use a new dictionary for the existing setup." - echo " --unk-fst <text-fst> # default: none. e.g. exp/make_unk_lm/unk_fst.txt." - echo " # This is for if you want to model the unknown word" - echo " # via a phone-level LM rather than a special phone" - echo " # (this should be more useful for test-time than train-time)."
- echo " --extra-word-disambig-syms <filename> # default: \"\"; if not empty, add disambiguation symbols" - echo " # from this file (one per line) to phones/disambig.txt," - echo " # phones/wdisambig.txt and words.txt" - exit 1; -fi - -srcdir=$1 -oov_word=$2 -tmpdir=$3 -dir=$4 -mkdir -p $dir $tmpdir $dir/phones - -silprob=false -[ -f $srcdir/lexiconp_silprob.txt ] && silprob=true - -[ -f path.sh ] && . ./path.sh - -! utils/validate_dict_dir.pl $srcdir && \ - echo "*Error validating directory $srcdir*" && exit 1; - -if [[ ! -f $srcdir/lexicon.txt ]]; then - echo "**Creating $srcdir/lexicon.txt from $srcdir/lexiconp.txt" - perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdir/lexiconp.txt > $srcdir/lexicon.txt || exit 1; -fi -if [[ ! -f $srcdir/lexiconp.txt ]]; then - echo "**Creating $srcdir/lexiconp.txt from $srcdir/lexicon.txt" - perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $srcdir/lexiconp.txt || exit 1; -fi - -if [ ! -z "$unk_fst" ] && [ ! -f "$unk_fst" ]; then - echo "$0: expected --unk-fst $unk_fst to exist as a file" - exit 1 -fi - -if ! utils/validate_dict_dir.pl $srcdir >&/dev/null; then - utils/validate_dict_dir.pl $srcdir # show the output. - echo "Validation failed (second time)" - exit 1; -fi - -# phones.txt file provided, we will do some sanity check here. -if [[ ! -z $phone_symbol_table ]]; then - # Checks if we have position dependent phones - n1=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sort -u | wc -l` - n2=`cat $phone_symbol_table | grep -v -E "^#[0-9]+$" | cut -d' ' -f1 | sed 's/_[BIES]$//g' | sort -u | wc -l` - $position_dependent_phones && [ $n1 -eq $n2 ] &&\ - echo "$0: Position dependent phones requested, but not in provided phone symbols" && exit 1; - ! $position_dependent_phones && [ $n1 -ne $n2 ] &&\ - echo "$0: Position dependent phones not requested, but appear in the provided phones.txt" && exit 1; - - # Checks if the phone sets match. - cat $srcdir/{,non}silence_phones.txt | awk -v f=$phone_symbol_table ' - BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }} - { for (x = 1; x <= NF; ++x) { if (!($x in phones)) { - print "Phone appears in the lexicon but not in the provided phones.txt: "$x; exit 1; }}}' || exit 1; -fi - -# In case there are extra word-level disambiguation symbols we need -# to make sure that all symbols in the provided file are valid. -if [ ! -z "$extra_word_disambig_syms" ]; then - if ! utils/lang/validate_disambig_sym_file.pl --allow-numeric "false" $extra_word_disambig_syms; then - echo "$0: Validation of disambiguation file \"$extra_word_disambig_syms\" failed." - exit 1; - fi -fi - -if $position_dependent_phones; then - # Create $tmpdir/lexiconp.txt from $srcdir/lexiconp.txt (or - # $tmpdir/lexiconp_silprob.txt from $srcdir/lexiconp_silprob.txt) by - # adding the markers _B, _E, _S, _I depending on word position. - # In this recipe, these markers apply to silence also. - # Do this starting from lexiconp.txt only.
- if "$silprob"; then - perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A; - $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die; - if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } - else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B "; - for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ - < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt - else - perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die; - if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B "; - for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ - < $srcdir/lexiconp.txt > $tmpdir/lexiconp.txt || exit 1; - fi - - # create $tmpdir/phone_map.txt - # this has the format (on each line) - # ... - # where the versions depend on the position of the phone within a word. - # For instance, we'd have: - # AA AA_B AA_E AA_I AA_S - # for (B)egin, (E)nd, (I)nternal and (S)ingleton - # and in the case of silence - # SIL SIL SIL_B SIL_E SIL_I SIL_S - # [because SIL on its own is one of the variants; this is for when it doesn't - # occur inside a word but as an option in the lexicon.] - - # This phone map expands the phone lists into all the word-position-dependent - # versions of the phone lists. - cat <(set -f; for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ - <(set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ - > $tmpdir/phone_map.txt -else - if "$silprob"; then - cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt - else - cp $srcdir/lexiconp.txt $tmpdir/lexiconp.txt - fi - - cat $srcdir/silence_phones.txt $srcdir/nonsilence_phones.txt | \ - awk '{for(n=1;n<=NF;n++) print $n; }' > $tmpdir/phones - paste -d' ' $tmpdir/phones $tmpdir/phones > $tmpdir/phone_map.txt -fi - -mkdir -p $dir/phones # various sets of phones... - -# Sets of phones for use in clustering, and making monophone systems. - -if $share_silence_phones; then - # build a roots file that will force all the silence phones to share the - # same pdf's. [three distinct states, only the transitions will differ.] - # 'shared'/'not-shared' means, do we share the 3 states of the HMM - # in the same tree-root? - # Sharing across models(phones) is achieved by writing several phones - # into one line of roots.txt (shared/not-shared doesn't affect this). - # 'not-shared not-split' means we have separate tree roots for the 3 states, - # but we never split the tree so they remain stumps, - # so all phones in the line correspond to the same model. - - cat $srcdir/silence_phones.txt | awk '{printf("%s ", $0); } END{printf("\n");}' | cat - $srcdir/nonsilence_phones.txt | \ - utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt - cat $dir/phones/sets.txt | \ - awk '{if(NR==1) print "not-shared", "not-split", $0; else print "shared", "split", $0;}' > $dir/phones/roots.txt -else - # different silence phones will have different GMMs. [note: here, all "shared split" means - # is that we may have one GMM for all the states, or we can split on states. because they're - # context-independent phones, they don't see the context.] 
- cat $srcdir/{,non}silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt - cat $dir/phones/sets.txt | awk '{print "shared", "split", $0;}' > $dir/phones/roots.txt -fi - -cat $srcdir/silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \ - awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt -cat $srcdir/nonsilence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \ - awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt -cp $srcdir/optional_silence.txt $dir/phones/optional_silence.txt -cp $dir/phones/silence.txt $dir/phones/context_indep.txt - -# if extra_questions.txt is empty, it's OK. -cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_map.txt \ - >$dir/phones/extra_questions.txt - -# Want extra questions about the word-start/word-end stuff. Make it separate for -# silence and non-silence. Probably doesn't matter, as silence will rarely -# be inside a word. -if $position_dependent_phones; then - for suffix in _B _E _I _S; do - (set -f; for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt - done - for suffix in "" _B _E _I _S; do - (set -f; for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt - done -fi - -# add_lex_disambig.pl is responsible for adding disambiguation symbols to -# the lexicon, for telling us how many disambiguation symbols it used, -# and and also for modifying the unknown-word's pronunciation (if the -# --unk-fst was provided) to the sequence "#1 #2 #3", and reserving those -# disambig symbols for that purpose. -# The #2 will later be replaced with the actual unk model. The reason -# for the #1 and the #3 is for disambiguation and also to keep the -# FST compact. If we didn't have the #1, we might have a different copy of -# the unk-model FST, or at least some of its arcs, for each start-state from -# which an transition comes (instead of per end-state, which is more compact); -# and adding the #3 prevents us from potentially having 2 copies of the unk-model -# FST due to the optional-silence [the last phone of any word gets 2 arcs]. -if [ ! -z "$unk_fst" ]; then # if the --unk-fst option was provided... - if "$silprob"; then - utils/lang/internal/modify_unk_pron.py $tmpdir/lexiconp_silprob.txt "$oov_word" || exit 1 - else - utils/lang/internal/modify_unk_pron.py $tmpdir/lexiconp.txt "$oov_word" || exit 1 - fi - unk_opt="--first-allowed-disambig 4" -else - unk_opt= -fi - -if "$silprob"; then - ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs --sil-probs $tmpdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob_disambig.txt) -else - ndisambig=$(utils/add_lex_disambig.pl $unk_opt --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt) -fi -ndisambig=$[$ndisambig+$num_extra_phone_disambig_syms]; # add (at least) one disambig symbol for silence in lexicon FST. -echo $ndisambig > $tmpdir/lex_ndisambig - -# Format of lexiconp_disambig.txt: -# !SIL 1.0 SIL_S -# 1.0 SPN_S #1 -# 1.0 SPN_S #2 -# 1.0 NSN_S -# !EXCLAMATION-POINT 1.0 EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt - -# In case there are extra word-level disambiguation symbols they also -# need to be added to the list of phone-level disambiguation symbols. -if [ ! -z "$extra_word_disambig_syms" ]; then - # We expect a file containing valid word-level disambiguation symbols. 
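
(The cat command just below appends those extra symbols to phones/disambig.txt.) For intuition about the phone-level disambiguation symbols discussed above: add_lex_disambig.pl appends #1, #2, ... to pronunciations that are repeated, or are prefixes of other pronunciations, so that the lexicon FST stays determinizable. A simplified Python sketch covering only the repeated-pronunciation case (the real script also handles prefixes, pron-probs and sil-probs):

    from collections import Counter

    def add_disambig(lexicon):
        # lexicon: list of (word, pronunciation-tuple) pairs.
        counts = Counter(pron for _, pron in lexicon)
        seen = Counter()
        out = []
        for word, pron in lexicon:
            if counts[pron] > 1:
                seen[pron] += 1
                out.append((word, pron + ("#%d" % seen[pron],)))
            else:
                out.append((word, pron))
        return out

    lex = [("read", ("R", "EH1", "D")), ("red", ("R", "EH1", "D"))]
    assert add_disambig(lex)[1][1][-1] == "#2"
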
- cat $extra_word_disambig_syms | awk '{ print $1 }' >> $dir/phones/disambig.txt -fi - -# Create phone symbol table. -if [[ ! -z $phone_symbol_table ]]; then - start_symbol=`grep \#0 $phone_symbol_table | awk '{print $2}'` - echo "<eps>" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table ' - BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\ - cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt -else - echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \ - awk '{n=NR-1; print $1, n;}' > $dir/phones.txt -fi - -# Create a file that describes the word-boundary information for -# each phone. 5 categories. -if $position_dependent_phones; then - cat $dir/phones/{silence,nonsilence}.txt | \ - awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; } - /_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; } - {print $1, "nonword";} ' > $dir/phones/word_boundary.txt -else - # word_boundary.txt might have been generated by another source - [ -f $srcdir/word_boundary.txt ] && cp $srcdir/word_boundary.txt $dir/phones/word_boundary.txt -fi - -# Create word symbol table. -# <s> and </s> are only needed due to the need to rescore lattices with -# ConstArpaLm format language model. They do not normally appear in G.fst or -# L.fst. - -if "$silprob"; then - # remove the silprob - cat $tmpdir/lexiconp_silprob.txt |\ - awk '{ - for(i=1; i<=NF; i++) { - if(i!=3 && i!=4 && i!=5) printf("%s\t", $i); if(i==NF) print ""; - } - }' > $tmpdir/lexiconp.txt -fi - -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' - BEGIN { - print "<eps> 0"; - } - { - if ($1 == "<s>") { - print "<s> is in the vocabulary!" | "cat 1>&2" - exit 1; - } - if ($1 == "</s>") { - print "</s> is in the vocabulary!" | "cat 1>&2" - exit 1; - } - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf("<s> %d\n", NR+2); - printf("</s> %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# In case there are extra word-level disambiguation symbols they also -# need to be added to words.txt -if [ ! -z "$extra_word_disambig_syms" ]; then - # Since words.txt already exists, we need to extract the current word count. - word_count=`tail -n 1 $dir/words.txt | awk '{ print $2 }'` - - # We expect a file containing valid word-level disambiguation symbols. - # The list of symbols is attached to the current words.txt (including - # a numeric identifier for each symbol). - cat $extra_word_disambig_syms | \ - awk -v WC=$word_count '{ printf("%s %d\n", $1, ++WC); }' >> $dir/words.txt || exit 1; -fi - -# format of $dir/words.txt: -#<eps> 0 -#!EXCLAMATION-POINT 1 -#!SIL 2 -#"CLOSE-QUOTE 3 -#... - -silphone=`cat $srcdir/optional_silence.txt` || exit 1; -[ -z "$silphone" ] && \ - ( echo "You have no optional-silence phone; it is required in the current scripts" - echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \ - exit 1; - -# create $dir/phones/align_lexicon.{txt,int}. -# This is the method we use for lattice word alignment if we are not -# using word-position-dependent phones. - -# First remove pron-probs from the lexicon. -perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt - -# Note: here, $silphone will have no suffix e.g. _S because it occurs as optional-silence, -# and is not part of a word. -[ !
-z "$silphone" ] && echo " $silphone" >> $tmpdir/align_lexicon.txt - -cat $tmpdir/align_lexicon.txt | \ - perl -ane '@A = split; print $A[0], " ", join(" ", @A), "\n";' | sort | uniq > $dir/phones/align_lexicon.txt - -# create phones/align_lexicon.int -cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \ - utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int - -# Create the basic L.fst without disambiguation symbols, for use -# in training. - -if $silprob; then - # Add silence probabilities (modlels the prob. of silence before and after each - # word). On some setups this helps a bit. See utils/dict_dir_add_pronprobs.sh - # and where it's called in the example scripts (run.sh). - utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob.txt $srcdir/silprob.txt $silphone "" | \ - fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; -else - utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp.txt $sil_prob $silphone | \ - fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; -fi - -# The file oov.txt contains a word that we will map any OOVs to during -# training. -echo "$oov_word" > $dir/oov.txt || exit 1; -cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1; -# integer version of oov symbol, used in some scripts. - - -# the file wdisambig.txt contains a (line-by-line) list of the text-form of the -# disambiguation symbols that are used in the grammar and passed through by the -# lexicon. At this stage it's hardcoded as '#0', but we're laying the groundwork -# for more generality (which probably would be added by another script). -# wdisambig_words.int contains the corresponding list interpreted by the -# symbol table words.txt, and wdisambig_phones.int contains the corresponding -# list interpreted by the symbol table phones.txt. -echo '#0' >$dir/phones/wdisambig.txt - -# In case there are extra word-level disambiguation symbols they need -# to be added to the existing word-level disambiguation symbols file. -if [ ! -z "$extra_word_disambig_syms" ]; then - # We expect a file containing valid word-level disambiguation symbols. - # The regular expression for awk is just a paranoia filter (e.g. for empty lines). - cat $extra_word_disambig_syms | awk '{ print $1 }' >> $dir/phones/wdisambig.txt -fi - -utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int -utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int - -# Create these lists of phones in colon-separated integer list form too, -# for purposes of being given to programs as command-line options. 
-for f in silence nonsilence optional_silence disambig context_indep; do - utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int - utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \ - awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1; -done - -for x in sets extra_questions; do - utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1; -done - -utils/sym2int.pl -f 3- $dir/phones.txt <$dir/phones/roots.txt \ - > $dir/phones/roots.int || exit 1; - -if [ -f $dir/phones/word_boundary.txt ]; then - utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary.txt \ - > $dir/phones/word_boundary.int || exit 1; -fi - -silphonelist=`cat $dir/phones/silence.csl` -nonsilphonelist=`cat $dir/phones/nonsilence.csl` - -# Note: it's OK, after generating the 'lang' directory, to overwrite the topo file -# with another one of your choice if the 'topo' file you want can't be generated by -# utils/gen_topo.pl. We do this in the 'chain' recipes. Of course, the 'topo' file -# should cover all the phones. Try running utils/validate_lang.pl to check that -# everything is OK after modifying the topo file. -utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$dir/topo - - -# Create the lexicon FST with disambiguation symbols, and put it in lang_test. -# There is an extra step where we create a loop to "pass through" the -# disambiguation symbols from G.fst. - -if $silprob; then - utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt $silphone '#'$ndisambig | \ - fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ - fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; -else - utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob $silphone '#'$ndisambig | \ - fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ - fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; -fi - - -if [ ! -z "$unk_fst" ]; then - utils/lang/internal/apply_unk_lm.sh $unk_fst $dir || exit 1 - - if ! $position_dependent_phones; then - echo "$0: warning: you are using the --unk-lm option and setting --position-dependent-phones false." - echo " ... this will make it impossible to properly work out the word boundaries after" - echo " ... decoding; quite a few scripts will not work as a result, and many scoring scripts" - echo " ... will die." - sleep 4 - fi -fi - -echo "$(basename $0): validating output directory" -! 
utils/validate_lang.pl $dir && echo "$(basename $0): error validating output" && exit 1; - -exit 0; diff --git a/kaldi/local/prepare_online_nnet_dist_build.sh b/kaldi/local/prepare_online_nnet_dist_build.sh deleted file mode 100755 index adc2cef..0000000 --- a/kaldi/local/prepare_online_nnet_dist_build.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash - -# Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti) -# Guoguo Chen -# Apache 2.0 -# Script to prepare the distribution from the online-nnet build - -other_files= #other files to be included in the build -other_dirs= -conf_files="ivector_extractor.conf mfcc.conf online_cmvn.conf online_nnet2_decoding.conf splice.conf" -ivec_extractor_files="final.dubm final.ie final.mat global_cmvn.stats online_cmvn.conf splice_opts" - -echo "$0 $@" # Print the command line for logging -[ -f path.sh ] && . ./path.sh; -. parse_options.sh || exit 1; - -if [ $# -ne 3 ]; then - echo "Usage: $0 " - echo "e.g.: $0 data/lang exp/nnet2_online/nnet_ms_a_online tedlium.tgz" - exit 1; -fi - -lang=$1 -modeldir=$2 -tgzfile=$3 - -for f in $lang/phones.txt $other_files; do - [ ! -f $f ] && echo "$0: no such file $f" && exit 1; -done - -build_files= -for d in $modeldir/conf $modeldir/ivector_extractor; do - [ ! -d $d ] && echo "$0: no such directory $d" && exit 1; -done - -for f in $ivec_extractor_files; do - f=$modeldir/ivector_extractor/$f - [ ! -f $f ] && echo "$0: no such file $f" && exit 1; - build_files="$build_files $f" -done - -# Makes a copy of the original config files, as we will change the absolute path -# to relative. -rm -rf $modeldir/conf_abs_path -mkdir -p $modeldir/conf_abs_path -cp -r $modeldir/conf/* $modeldir/conf_abs_path - -for f in $conf_files; do - [ ! -f $modeldir/conf/$f ] && \ - echo "$0: no such file $modeldir/conf/$f" && exit 1; - # Changes absolute path to relative path. The path entries in the config file - # are generated by scripts and it is safe to assume that they have structure: - # variable=path - cat $modeldir/conf_abs_path/$f | perl -e ' - use File::Spec; - while() { - chomp; - @col = split("=", $_); - if (@col == 2 && (-f $col[1])) { - $col[1] = File::Spec->abs2rel($col[1]); - print "$col[0]=$col[1]\n"; - } else { - print "$_\n"; - } - } - ' > $modeldir/conf/$f - build_files="$build_files $modeldir/conf/$f" -done - -tar -hczvf $tgzfile $lang $build_files $other_files $other_dirs \ - $modeldir/final.mdl $modeldir/tree >/dev/null - -# Changes back to absolute path. -rm -rf $modeldir/conf -mv $modeldir/conf_abs_path $modeldir/conf diff --git a/kaldi/local/queue.pl b/kaldi/local/queue.pl deleted file mode 100755 index e14af5e..0000000 --- a/kaldi/local/queue.pl +++ /dev/null @@ -1,624 +0,0 @@ -#!/usr/bin/env perl -use strict; -use warnings; - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). -# 2014 Vimal Manohar (Johns Hopkins University) -# Apache 2.0. - -use File::Basename; -use Cwd; -use Getopt::Long; - -# queue.pl has the same functionality as run.pl, except that -# it runs the job in question on the queue (Sun GridEngine). -# This version of queue.pl uses the task array functionality -# of the grid engine. Note: it's different from the queue.pl -# in the s4 and earlier scripts. - -# The script now supports configuring the queue system using a config file -# (default in conf/queue.conf; but can be passed specified with --config option) -# and a set of command line options. -# The current script handles: -# 1) Normal configuration arguments -# For e.g. 
a command line option of "--gpu 1" could be converted into the option -# "-q g.q -l gpu=1" to qsub. How the CLI option is handled is determined by a -# line in the config file like -# gpu=* -q g.q -l gpu=$0 -# $0 here in the line is replaced with the argument read from the CLI and the -# resulting string is passed to qsub. -# 2) Special arguments to options such as -# gpu=0 -# If --gpu 0 is given in the command line, then no special "-q" is given. -# 3) Default argument -# default gpu=0 -# If --gpu option is not passed in the command line, then the script behaves as -# if --gpu 0 was passed since 0 is specified as the default argument for that -# option -# 4) Arbitrary options and arguments. -# Any command line option starting with '--' and its argument would be handled -# as long as its defined in the config file. -# 5) Default behavior -# If the config file that is passed using is not readable, then the script -# behaves as if the queue has the following config file: -# $ cat conf/queue.conf -# # Default configuration -# command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -# option mem=* -l mem_free=$0,ram_free=$0 -# option mem=0 # Do not add anything to qsub_opts -# option num_threads=* -pe smp $0 -# option num_threads=1 # Do not add anything to qsub_opts -# option max_jobs_run=* -tc $0 -# default gpu=0 -# option gpu=0 -q all.q -# option gpu=* -l gpu=$0 -q g.q - -my $qsub_opts = ""; -my $sync = 0; -my $num_threads = 1; -my $gpu = 0; - -my $config = "conf/queue.conf"; - -my %cli_options = (); - -my $jobname; -my $jobstart; -my $jobend; -my $array_job = 0; -my $sge_job_id; - -sub print_usage() { - print STDERR - "Usage: queue.pl [options] [JOB=1:n] log-file command-line arguments...\n" . - "e.g.: queue.pl foo.log echo baz\n" . - " (which will echo \"baz\", with stdout and stderr directed to foo.log)\n" . - "or: queue.pl -q all.q\@xyz foo.log echo bar \| sed s/bar/baz/ \n" . - " (which is an example of using a pipe; you can provide other escaped bash constructs)\n" . - "or: queue.pl -q all.q\@qyz JOB=1:10 foo.JOB.log echo JOB \n" . - " (which illustrates the mechanism to submit parallel jobs; note, you can use \n" . - " another string other than JOB)\n" . - "Note: if you pass the \"-sync y\" option to qsub, this script will take note\n" . - "and change its behavior. Otherwise it uses qstat to work out when the job finished\n" . - "Options:\n" . - " --config (default: $config)\n" . - " --mem (e.g. --mem 2G, --mem 500M, \n" . - " also support K and numbers mean bytes)\n" . - " --num-threads (default: $num_threads)\n" . - " --max-jobs-run \n" . - " --gpu <0|1> (default: $gpu)\n"; - exit 1; -} - -sub caught_signal { - if ( defined $sge_job_id ) { # Signal trapped after submitting jobs - my $signal = $!; - system ("qdel $sge_job_id"); - print STDERR "Caught a signal: $signal , deleting SGE task: $sge_job_id and exiting\n"; - exit(2); - } -} - -if (@ARGV < 2) { - print_usage(); -} - -for (my $x = 1; $x <= 2; $x++) { # This for-loop is to - # allow the JOB=1:n option to be interleaved with the - # options to qsub. - while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { - my $switch = shift @ARGV; - - if ($switch eq "-V") { - $qsub_opts .= "-V "; - } else { - my $argument = shift @ARGV; - if ($argument =~ m/^--/) { - print STDERR "WARNING: suspicious argument '$argument' to $switch; starts with '-'\n"; - } - if ($switch eq "-sync" && $argument =~ m/^[yY]/) { - $sync = 1; - $qsub_opts .= "$switch $argument "; - } elsif ($switch eq "-pe") { # e.g. 
-pe smp 5 - my $argument2 = shift @ARGV; - $qsub_opts .= "$switch $argument $argument2 "; - $num_threads = $argument2; - } elsif ($switch =~ m/^--/) { # Config options - # Convert CLI option to variable name - # by removing '--' from the switch and replacing any - # '-' with a '_' - $switch =~ s/^--//; - $switch =~ s/-/_/g; - $cli_options{$switch} = $argument; - } else { # Other qsub options - passed as is - $qsub_opts .= "$switch $argument "; - } - } - } - if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20 - $array_job = 1; - $jobname = $1; - $jobstart = $2; - $jobend = $3; - shift; - if ($jobstart > $jobend) { - die "queue.pl: invalid job range $ARGV[0]"; - } - if ($jobstart <= 0) { - die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is a GridEngine limitation)."; - } - } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. - $array_job = 1; - $jobname = $1; - $jobstart = $2; - $jobend = $2; - shift; - } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { - print STDERR "queue.pl: Warning: suspicious first argument to queue.pl: $ARGV[0]\n"; - } -} - -if (@ARGV < 2) { - print_usage(); -} - -if (exists $cli_options{"config"}) { - $config = $cli_options{"config"}; -} - -my $default_config_file = <<'EOF'; -# Default configuration -command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -option mem=* -l mem_free=$0,ram_free=$0 -option mem=0 # Do not add anything to qsub_opts -option num_threads=* -pe smp $0 -option num_threads=1 # Do not add anything to qsub_opts -option max_jobs_run=* -tc $0 -default gpu=0 -option gpu=0 -option gpu=* -l gpu=$0 -q g.q -EOF - -# Here the configuration options specified by the user on the command line -# (e.g. --mem 2G) are converted to options to the qsub system as defined in -# the config file. (e.g. if the config file has the line -# "option mem=* -l ram_free=$0,mem_free=$0" -# and the user has specified '--mem 2G' on the command line, the options -# passed to queue system would be "-l ram_free=2G,mem_free=2G -# A more detailed description of the ways the options would be handled is at -# the top of this file. - -$SIG{INT} = \&caught_signal; -$SIG{TERM} = \&caught_signal; - -my $opened_config_file = 1; - -open CONFIG, "<$config" or $opened_config_file = 0; - -my %cli_config_options = (); -my %cli_default_options = (); - -if ($opened_config_file == 0 && exists($cli_options{"config"})) { - print STDERR "Could not open config file $config\n"; - exit(1); -} elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) { - # Open the default config file instead - open (CONFIG, "echo '$default_config_file' |") or die "Unable to open pipe\n"; - $config = "Default config"; -} - -my $qsub_cmd = ""; -my $read_command = 0; - -while() { - chomp; - my $line = $_; - $_ =~ s/\s*#.*//g; - if ($_ eq "") { next; } - if ($_ =~ /^command (.+)/) { - $read_command = 1; - $qsub_cmd = $1 . " "; - } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { - # Config option that needs replacement with parameter value read from CLI - # e.g.: option mem=* -l mem_free=$0,ram_free=$0 - my $option = $1; # mem - my $arg= $2; # -l mem_free=$0,ram_free=$0 - if ($arg !~ m:\$0:) { - die "Unable to parse line '$line' in config file ($config)\n"; - } - if (exists $cli_options{$option}) { - # Replace $0 with the argument read from command line. - # e.g. 
"-l mem_free=$0,ram_free=$0" -> "-l mem_free=2G,ram_free=2G" - $arg =~ s/\$0/$cli_options{$option}/g; - $cli_config_options{$option} = $arg; - } - } elsif ($_ =~ m/^option ([^=]+)=(\S+)\s?(.*)$/) { - # Config option that does not need replacement - # e.g. option gpu=0 -q all.q - my $option = $1; # gpu - my $value = $2; # 0 - my $arg = $3; # -q all.q - if (exists $cli_options{$option}) { - $cli_default_options{($option,$value)} = $arg; - } - } elsif ($_ =~ m/^default (\S+)=(\S+)/) { - # Default options. Used for setting default values to options i.e. when - # the user does not specify the option on the command line - # e.g. default gpu=0 - my $option = $1; # gpu - my $value = $2; # 0 - if (!exists $cli_options{$option}) { - # If the user has specified this option on the command line, then we - # don't have to do anything - $cli_options{$option} = $value; - } - } else { - print STDERR "queue.pl: unable to parse line '$line' in config file ($config)\n"; - exit(1); - } -} - -close(CONFIG); - -if ($read_command != 1) { - print STDERR "queue.pl: config file ($config) does not contain the line \"command .*\"\n"; - exit(1); -} - -for my $option (keys %cli_options) { - if ($option eq "config") { next; } - if ($option eq "max_jobs_run" && $array_job != 1) { next; } - my $value = $cli_options{$option}; - - if (exists $cli_default_options{($option,$value)}) { - $qsub_opts .= "$cli_default_options{($option,$value)} "; - } elsif (exists $cli_config_options{$option}) { - $qsub_opts .= "$cli_config_options{$option} "; - } else { - if ($opened_config_file == 0) { $config = "default config file"; } - die "queue.pl: Command line option $option not described in $config (or value '$value' not allowed)\n"; - } -} - -my $cwd = getcwd(); -my $logfile = shift @ARGV; - -if ($array_job == 1 && $logfile !~ m/$jobname/ - && $jobend > $jobstart) { - print STDERR "queue.pl: you are trying to run a parallel job but " - . "you are putting the output into just one log file ($logfile)\n"; - exit(1); -} - -# -# Work out the command; quote escaping is done here. -# Note: the rules for escaping stuff are worked out pretty -# arbitrarily, based on what we want it to do. Some things that -# we pass as arguments to queue.pl, such as "|", we want to be -# interpreted by bash, so we don't escape them. Other things, -# such as archive specifiers like 'ark:gunzip -c foo.gz|', we want -# to be passed, in quotes, to the Kaldi program. Our heuristic -# is that stuff with spaces in should be quoted. This doesn't -# always work. -# -my $cmd = ""; - -foreach my $x (@ARGV) { - if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take - # as-is. - elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single - else { $cmd .= "\"$x\" "; } # else use double. -} - -# -# Work out the location of the script file, and open it for writing. -# -my $dir = dirname($logfile); -my $base = basename($logfile); -my $qdir = "$dir/q"; -$qdir =~ s:/(log|LOG)/*q:/q:; # If qdir ends in .../log/q, make it just .../q. -my $queue_logfile = "$qdir/$base"; - -if (!-d $dir) { system "mkdir -p $dir 2>/dev/null"; } # another job may be doing this... -if (!-d $dir) { die "Cannot make the directory $dir\n"; } -# make a directory called "q", -# where we will put the log created by qsub... normally this doesn't contain -# anything interesting, evertyhing goes to $logfile. -# in $qdir/sync we'll put the done.* files... we try to keep this -# directory small because it's transmitted over NFS many times. -if (! 
-d "$qdir/sync") { - system "mkdir -p $qdir/sync 2>/dev/null"; - sleep(5); ## This is to fix an issue we encountered in denominator lattice creation, - ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been - ## created and the job immediately ran, it would die with an error because nfs - ## had not yet synced. I'm also decreasing the acdirmin and acdirmax in our - ## NFS settings to something like 5 seconds. -} - -my $queue_array_opt = ""; -if ($array_job == 1) { # It's an array job. - $queue_array_opt = "-t $jobstart:$jobend"; - $logfile =~ s/$jobname/\$SGE_TASK_ID/g; # This variable will get - # replaced by qsub, in each job, with the job-id. - $cmd =~ s/$jobname/\$\{SGE_TASK_ID\}/g; # same for the command... - $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory - # is for the queue to put its log, and this doesn't need the task array subscript - # so we remove it. -} - -# queue_scriptfile is as $queue_logfile [e.g. dir/q/foo.log] but -# with the suffix .sh. -my $queue_scriptfile = $queue_logfile; -($queue_scriptfile =~ s/\.[a-zA-Z]{1,5}$/.sh/) || ($queue_scriptfile .= ".sh"); -if ($queue_scriptfile !~ m:^/:) { - $queue_scriptfile = $cwd . "/" . $queue_scriptfile; # just in case. -} - -# We'll write to the standard input of "qsub" (the file-handle Q), -# the job that we want it to execute. -# Also keep our current PATH around, just in case there was something -# in it that we need (although we also source ./path.sh) - -my $syncfile = "$qdir/sync/done.$$"; - -unlink($queue_logfile, $syncfile); -# -# Write to the script file, and then close it. -# -open(Q, ">$queue_scriptfile") || die "Failed to write to $queue_scriptfile"; - -print Q "#!/bin/bash\n"; -print Q "cd $cwd\n"; -print Q ". ./path.sh\n"; -print Q "( echo '#' Running on \`hostname\`\n"; -print Q " echo '#' Started at \`date\`\n"; -print Q " echo -n '# '; cat <$logfile\n"; -print Q "time1=\`date +\"%s\"\`\n"; -print Q " ( $cmd ) 2>>$logfile >>$logfile\n"; -print Q "ret=\$?\n"; -print Q "time2=\`date +\"%s\"\`\n"; -print Q "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=$num_threads >>$logfile\n"; -print Q "echo '#' Finished at \`date\` with status \$ret >>$logfile\n"; -print Q "[ \$ret -eq 137 ] && exit 100;\n"; # If process was killed (e.g. oom) it will exit with status 137; - # let the script return with status 100 which will put it to E state; more easily rerunnable. -if ($array_job == 0) { # not an array job - print Q "touch $syncfile\n"; # so we know it's done. -} else { - print Q "touch $syncfile.\$SGE_TASK_ID\n"; # touch a bunch of sync-files. -} -print Q "exit \$[\$ret ? 1 : 0]\n"; # avoid status 100 which grid-engine -print Q "## submitted with:\n"; # treats specially. -$qsub_cmd .= "-o $queue_logfile $qsub_opts $queue_array_opt $queue_scriptfile >>$queue_logfile 2>&1"; -print Q "# $qsub_cmd\n"; -if (!close(Q)) { # close was not successful... || die "Could not close script file $shfile"; - die "Failed to close the script file (full disk?)"; -} -chmod 0755, $queue_scriptfile; - -# This block submits the job to the queue. 
-for (my $try = 1; $try < 5; $try++) { - my $ret = system ($qsub_cmd); - if ($ret != 0) { - if ($sync && $ret == 256) { # this is the exit status when a job failed (bad exit status) - if (defined $jobname) { - $logfile =~ s/\$SGE_TASK_ID/*/g; - } - print STDERR "queue.pl: job writing to $logfile failed\n"; - exit(1); - } else { - print STDERR "queue.pl: Error submitting jobs to queue (return status was $ret)\n"; - print STDERR "queue log file is $queue_logfile, command was $qsub_cmd\n"; - my $err = `tail $queue_logfile`; - print STDERR "Output of qsub was: $err\n"; - if ($err =~ m/gdi request/ || $err =~ m/qmaster/) { - # When we get queue connectivity problems we usually see a message like: - # Unable to run job: failed receiving gdi request response for mid=1 (got - # syncron message receive timeout error).. - my $waitfor = 20; - print STDERR "queue.pl: It looks like the queue master may be inaccessible. " . - " Trying again after $waitfor seconts\n"; - sleep($waitfor); - # ... and continue throught the loop. - } else { - exit(1); - } - } - } else { - last; # break from the loop. - } -} - -if (! $sync) { # We're not submitting with -sync y, so we - # need to wait for the jobs to finish. We wait for the - # sync-files we "touched" in the script to exist. - my @syncfiles = (); - if (!defined $jobname) { # not an array job. - push @syncfiles, $syncfile; - } else { - for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - push @syncfiles, "$syncfile.$jobid"; - } - } - # We will need the sge_job_id, to check that job still exists - { # This block extracts the numeric SGE job-id from the log file in q/. - # It may be used later to query 'qstat' about the job. - open(L, "<$queue_logfile") || die "Error opening log file $queue_logfile"; - undef $sge_job_id; - while () { - if (m/Your job\S* (\d+)[. ].+ has been submitted/) { - if (defined $sge_job_id) { - die "Error: your job was submitted more than once (see $queue_logfile)"; - } else { - $sge_job_id = $1; - } - } - } - close(L); - if (!defined $sge_job_id) { - die "Error: log file $queue_logfile does not specify the SGE job-id."; - } - } - my $check_sge_job_ctr=1; - - my $wait = 0.1; - my $counter = 0; - foreach my $f (@syncfiles) { - # wait for the jobs to finish one by one. - while (! -f $f) { - sleep($wait); - $wait *= 1.2; - if ($wait > 3.0) { - $wait = 3.0; # never wait more than 3 seconds. - # the following (.kick) commands are basically workarounds for NFS bugs. - if (rand() < 0.25) { # don't do this every time... - if (rand() > 0.5) { - system("touch $qdir/sync/.kick"); - } else { - unlink("$qdir/sync/.kick"); - } - } - if ($counter++ % 10 == 0) { - # This seems to kick NFS in the teeth to cause it to refresh the - # directory. I've seen cases where it would indefinitely fail to get - # updated, even though the file exists on the server. - # Only do this every 10 waits (every 30 seconds) though, or if there - # are many jobs waiting they can overwhelm the file server. - system("ls $qdir/sync >/dev/null"); - } - } - - # The purpose of the next block is so that queue.pl can exit if the job - # was killed without terminating. It's a bit complicated because (a) we - # don't want to overload the qmaster by querying it too frequently), and - # (b) sometimes the qmaster is unreachable or temporarily down, and we - # don't want this to necessarily kill the job. 
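
The sync-file wait loop above amounts to polling with a capped exponential backoff, plus periodic directory listings to defeat NFS attribute caching. Schematically, in Python (a minimal sketch; the .kick touch/unlink dance is reduced to a comment):

    import os
    import time

    def wait_for_syncfile(path, sync_dir):
        wait, counter = 0.1, 0
        while not os.path.exists(path):
            time.sleep(wait)
            wait = min(wait * 1.2, 3.0)  # back off, but never wait more than 3 s
            if wait >= 3.0:
                # Occasionally touching/removing a .kick file and listing the
                # directory nudges NFS into refreshing its cached view.
                if counter % 10 == 0:
                    os.listdir(sync_dir)
                counter += 1
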
- if (($check_sge_job_ctr < 100 && ($check_sge_job_ctr++ % 10) == 0) || - ($check_sge_job_ctr >= 100 && ($check_sge_job_ctr++ % 50) == 0)) { - # Don't run qstat too often, avoid stress on SGE; the if-condition above - # is designed to check every 10 waits at first, and eventually every 50 - # waits. - if ( -f $f ) { next; } #syncfile appeared: OK. - my $output = `qstat -j $sge_job_id 2>&1`; - my $ret = $?; - if ($ret >> 8 == 1 && $output !~ m/qmaster/ && - $output !~ m/gdi request/) { - # Don't consider immediately missing job as error, first wait some - # time to make sure it is not just delayed creation of the syncfile. - - sleep(3); - # Sometimes NFS gets confused and thinks it's transmitted the directory - # but it hasn't, due to timestamp issues. Changing something in the - # directory will usually fix that. - system("touch $qdir/sync/.kick"); - unlink("$qdir/sync/.kick"); - if ( -f $f ) { next; } #syncfile appeared, ok - sleep(7); - system("touch $qdir/sync/.kick"); - sleep(1); - unlink("qdir/sync/.kick"); - if ( -f $f ) { next; } #syncfile appeared, ok - sleep(60); - system("touch $qdir/sync/.kick"); - sleep(1); - unlink("$qdir/sync/.kick"); - if ( -f $f ) { next; } #syncfile appeared, ok - $f =~ m/\.(\d+)$/ || die "Bad sync-file name $f"; - my $job_id = $1; - if (defined $jobname) { - $logfile =~ s/\$SGE_TASK_ID/$job_id/g; - } - my $last_line = `tail -n 1 $logfile`; - if ($last_line =~ m/status 0$/ && (-M $logfile) < 0) { - # if the last line of $logfile ended with "status 0" and - # $logfile is newer than this program [(-M $logfile) gives the - # time elapsed between file modification and the start of this - # program], then we assume the program really finished OK, - # and maybe something is up with the file system. - print STDERR "**queue.pl: syncfile $f was not created but job seems\n" . - "**to have finished OK. Probably your file-system has problems.\n" . - "**This is just a warning.\n"; - last; - } else { - chop $last_line; - print STDERR "queue.pl: Error, unfinished job no " . - "longer exists, log is in $logfile, last line is '$last_line', " . - "syncfile is $f, return status of qstat was $ret\n" . - "Possible reasons: a) Exceeded time limit? -> Use more jobs!" . - " b) Shutdown/Frozen machine? -> Run again! Qmaster output " . - "was: $output\n"; - exit(1); - } - } elsif ($ret != 0) { - print STDERR "queue.pl: Warning: qstat command returned status $ret (qstat -j $sge_job_id,$!)\n"; - print STDERR "queue.pl: output was: $output"; - } - } - } - } - unlink(@syncfiles); -} - -# OK, at this point we are synced; we know the job is done. -# But we don't know about its exit status. We'll look at $logfile for this. -# First work out an array @logfiles of file-locations we need to -# read (just one, unless it's an array job). -my @logfiles = (); -if (!defined $jobname) { # not an array job. - push @logfiles, $logfile; -} else { - for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - my $l = $logfile; - $l =~ s/\$SGE_TASK_ID/$jobid/g; - push @logfiles, $l; - } -} - -my $num_failed = 0; -my $status = 1; -foreach my $l (@logfiles) { - my @wait_times = (0.1, 0.2, 0.2, 0.3, 0.5, 0.5, 1.0, 2.0, 5.0, 5.0, 5.0, 10.0, 25.0); - for (my $iter = 0; $iter <= @wait_times; $iter++) { - my $line = `tail -10 $l 2>/dev/null`; # Note: although this line should be the last - # line of the file, I've seen cases where it was not quite the last line because - # of delayed output by the process that was running, or processes it had called. - # so tail -10 gives it a little leeway. 
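The status check that follows reads the tail of each log and extracts the "with status N" line that the wrapper script appended, retrying on a fixed schedule before giving up. In Python the same check might look like this (a sketch; names are illustrative):

import re, subprocess, time
from typing import Optional

WAITS = [0.1, 0.2, 0.2, 0.3, 0.5, 0.5, 1.0, 2.0, 5.0, 5.0, 5.0, 10.0, 25.0]

def read_status(logfile) -> Optional[int]:
    # Look for "with status N" near the end of the log, retrying to allow
    # for delayed output; None means the queue or filesystem misbehaved.
    for i in range(len(WAITS) + 1):
        tail = subprocess.run(["tail", "-10", logfile], capture_output=True,
                              text=True).stdout
        m = re.search(r"with status (\d+)", tail)
        if m:
            return int(m.group(1))
        if i < len(WAITS):
            time.sleep(WAITS[i])
    return None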
- if ($line =~ m/with status (\d+)/) { - $status = $1; - last; - } else { - if ($iter < @wait_times) { - sleep($wait_times[$iter]); - } else { - if (! -f $l) { - print STDERR "Log-file $l does not exist.\n"; - } else { - print STDERR "The last line of log-file $l does not seem to indicate the " - . "return status as expected\n"; - } - exit(1); # Something went wrong with the queue, or the - # machine it was running on, probably. - } - } - } - # OK, now we have $status, which is the return-status of - # the command in the job. - if ($status != 0) { $num_failed++; } -} -if ($num_failed == 0) { exit(0); } -else { # we failed. - if (@logfiles == 1) { - if (defined $jobname) { $logfile =~ s/\$SGE_TASK_ID/$jobstart/g; } - print STDERR "queue.pl: job failed with status $status, log is in $logfile\n"; - if ($logfile =~ m/JOB/) { - print STDERR "queue.pl: probably you forgot to put JOB=1:\$nj in your script.\n"; - } - } else { - if (defined $jobname) { $logfile =~ s/\$SGE_TASK_ID/*/g; } - my $numjobs = 1 + $jobend - $jobstart; - print STDERR "queue.pl: $num_failed / $numjobs failed, log is in $logfile\n"; - } - exit(1); -} diff --git a/kaldi/local/remove_data_links.sh b/kaldi/local/remove_data_links.sh deleted file mode 100755 index 8ec68f9..0000000 --- a/kaldi/local/remove_data_links.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -# This program searches within a directory for soft links that -# appear to be created by 'create_data_link.pl' to a 'storage/' subdirectory, -# and it removes both the soft links and the things they point to. -# for instance, if you have a soft link -# foo/egs/1.1.egs -> storage/2/1.1.egs -# it will remove both foo/egs/storage/2/1.1.egs, and foo/egs/1.1.egs. - -ret=0 - -dry_run=false - -if [ "$1" == "--dry-run" ]; then - dry_run=true - shift -fi - -if [ $# == 0 ]; then - echo "Usage: $0 [--dry-run] " - echo "e.g.: $0 exp/nnet4a/egs/" - echo " Removes from any subdirectories of the command-line arguments, soft links that " - echo " appear to have been created by utils/create_data_link.pl, as well as the things" - echo " that those soft links point to. Will typically be called on a directory prior" - echo " to 'rm -r' on that directory, to ensure that data that was distributed on other" - echo " volumes also gets deleted." - echo " With --dry-run, just prints what it would do." -fi - -for dir in $*; do - if [ ! -d $dir ]; then - echo "$0: not a directory: $dir" - ret=1 - else - for subdir in $(find $dir -type d); do - if [ -d $subdir/storage ]; then - for x in $(ls $subdir); do - f=$subdir/$x - if [ -L $f ] && [[ $(readlink $f) == storage/* ]]; then - target=$subdir/$(readlink $f) - if $dry_run; then - echo rm $f $target - else - rm $f $target - fi - fi - done - fi - done - fi -done - -exit $ret diff --git a/kaldi/local/remove_oovs.pl b/kaldi/local/remove_oovs.pl deleted file mode 100755 index 532d7f2..0000000 --- a/kaldi/local/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. 
-# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/kaldi/local/retry.pl b/kaldi/local/retry.pl deleted file mode 100755 index a039d6f..0000000 --- a/kaldi/local/retry.pl +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env perl -use strict; -use warnings; - -# Copyright 2018 Johns Hopkins University (Author: Daniel Povey). -# Apache 2.0. - -use File::Basename; -use Cwd; -use Getopt::Long; - - -# retry.pl is a wrapper for queue.pl. It can be used to retry jobs that failed, -# e.g. if your command line was "queue.pl [args]", you can replace that -# with "retry.pl queue.pl [args]" and it will retry jobs that failed. - - -my $num_tries = 2; - -sub print_usage() { - print STDERR - "Usage: retry.pl \n" . - " e.g.: retry.pl [options] queue.pl foo.log do_something\n" . - "This will retry jobs that failed (only once)\n" . - "Options:\n" . - " --num-tries # default: 2\n"; - exit 1; -} - -if ($ARGV[0] eq "--num-tries") { - shift; - $num_tries = $ARGV[0] + 0; - if ($num_tries < 1) { - die "$0: invalid option --num-tries $ARGV[0]"; - } - shift; -} - -if (@ARGV < 3) { - print_usage(); -} - - -sub get_log_file { - my $n; - # First just look for the first command-line arg that ends in ".log". If that - # exists, it's almost certainly the log file. - for ($n = 1; $n < @ARGV; $n++) { - if ($ARGV[$n] =~ m/\.log$/) { - return $ARGV[$n]; - } - } - for ($n = 1; $n < @ARGV; $n++) { - # If this arg isn't of the form "-some-option', and isn't of the form - # "JOB=1:10", and the previous arg wasn't of the form "-some-option", and this - # isn't just a number (note: the 'not-a-number' things is mostly to exclude - # things like the 5 in "-pe smp 5" which is an older but still-supported - # option to queue.pl)... then assume it's a log file. - if ($ARGV[$n] !~ m/^-=/ && $ARGV[$n] !~ m/=/ && $ARGV[$n] !~ m/^\d+$/ && - $ARGV[$n-1] !~ m/^-/) { - return $ARGV[$n]; - } - } - print STDERR "$0: failed to parse log-file name from args:" . join(" ", @ARGV); - exit(1); -} - - -my $log_file = get_log_file(); -my $return_status; - -for (my $n = 1; $n <= $num_tries; $n++) { - system(@ARGV); - $return_status = $?; - if ($return_status == 0) { - exit(0); # The command succeeded. We return success. - } elsif ($return_status != 256) { - # The command did not "die normally". When queue.pl and similar scripts - # detect a normal error, they exit(1), which becomes a status of 256 - # in perl's $? variable. - # See http://perldoc.perl.org/perlvar.html#%24CHILD_ERROR for more info. - # An example of an abnormal death that would cause us to want to exit - # immediately, is when the user does ctrl-c or KILLs the script, - # which gets caught by 'caught_signal' in queue.pl and causes that program - # to return with exit status 2. 
- exit(1); - } - - - if ($n < $num_tries) { - if (! -f $log_file) { - # $log_file doesn't exist as a file. Maybe it was an array job. - # This script doesn't yet support array jobs. We just give up. - # Later on we might want to figure out which array jobs failed - # and have to be rerun, but for now we just die. - print STDERR "$0: job failed and log file $log_file does not exist (array job?).\n"; - exit($return_status) - } else { - rename($log_file, $log_file . ".bak"); - print STDERR "$0: job failed; renaming log file to ${log_file}.bak and rerunning\n"; - } - } -} - -print STDERR "$0: job failed $num_tries times; log is in $log_file\n"; -exit(1); diff --git a/kaldi/local/reverse_arpa.py b/kaldi/local/reverse_arpa.py deleted file mode 100755 index 5437aec..0000000 --- a/kaldi/local/reverse_arpa.py +++ /dev/null @@ -1,188 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright 2012 Mirko Hannemann BUT, mirko.hannemann@gmail.com - -import sys -import codecs # for UTF-8/unicode - -if len(sys.argv) != 2: - print 'usage: reverse_arpa arpa.in' - sys.exit() -arpaname = sys.argv[1] - -#\data\ -#ngram 1=4 -#ngram 2=2 -#ngram 3=2 -# -#\1-grams: -#-5.234679 a -3.3 -#-3.456783 b -#0.0000000 -2.5 -#-4.333333 -# -#\2-grams: -#-1.45678 a b -3.23 -#-1.30490 a -4.2 -# -#\3-grams: -#-0.34958 a b -#-0.23940 a b -#\end\ - -# read language model in ARPA format -try: - file = codecs.open(arpaname, "r", "utf-8") -except IOError: - print 'file not found: ' + arpaname - sys.exit() - -text=file.readline() -while (text and text[:6] != "\\data\\"): text=file.readline() -if not text: - print "invalid ARPA file" - sys.exit() -#print text, -while (text and text[:5] != "ngram"): text=file.readline() - -# get ngram counts -cngrams=[] -n=0 -while (text and text[:5] == "ngram"): - ind = text.split("=") - counts = int(ind[1].strip()) - r = ind[0].split() - read_n = int(r[1].strip()) - if read_n != n+1: - print "invalid ARPA file:", text - sys.exit() - n = read_n - cngrams.append(counts) - #print text, - text=file.readline() - -# read all n-grams order by order -sentprob = 0.0 # sentence begin unigram -ngrams=[] -inf=float("inf") -for n in range(1,len(cngrams)+1): # unigrams, bigrams, trigrams - while (text and "-grams:" not in text): text=file.readline() - if n != int(text[1]): - print "invalid ARPA file:", text - sys.exit() - #print text,cngrams[n-1] - this_ngrams={} # stores all read ngrams - for ng in range(cngrams[n-1]): - while (text and len(text.split())<2): - text=file.readline() - if (not text) or ((len(text.split())==1) and (("-grams:" in text) or (text[:5] == "\\end\\"))): break - if (not text) or ((len(text.split())==1) and (("-grams:" in text) or (text[:5] == "\\end\\"))): - break # to deal with incorrect ARPA files - entry = text.split() - prob = float(entry[0]) - if len(entry)>n+1: - back = float(entry[-1]) - words = entry[1:n+1] - else: - back = 0.0 - words = entry[1:] - ngram = " ".join(words) - if (n==1) and words[0]=="": - sentprob = prob - prob = 0.0 - this_ngrams[ngram] = (prob,back) - #print prob,ngram.encode("utf-8"),back - - for x in range(n-1,0,-1): - # add all missing backoff ngrams for reversed lm - l_ngram = " ".join(words[:x]) # shortened ngram - r_ngram = " ".join(words[1:1+x]) # shortened ngram with offset one - if l_ngram not in ngrams[x-1]: # create missing ngram - ngrams[x-1][l_ngram] = (0.0,inf) - #print ngram, "create 0.0", l_ngram, "inf" - if r_ngram not in ngrams[x-1]: # create missing ngram - ngrams[x-1][r_ngram] = (0.0,inf) - #print ngram, "create 0.0", r_ngram, 
"inf",x,n,h_ngram - - # add all missing backoff ngrams for forward lm - h_ngram = " ".join(words[n-x:]) # shortened history - if h_ngram not in ngrams[x-1]: # create missing ngram - ngrams[x-1][h_ngram] = (0.0,inf) - #print "create inf", h_ngram, "0.0" - text=file.readline() - if (not text) or ((len(text.split())==1) and (("-grams:" in text) or (text[:5] == "\\end\\"))): break - ngrams.append(this_ngrams) - -while (text and text[:5] != "\\end\\"): text=file.readline() -if not text: - print "invalid ARPA file" - sys.exit() -file.close() -#print text, - -#fourgram "maxent" model (b(ABCD)=0): -#p(A)+b(A) A 0 -#p(AB)+b(AB)-b(A)-p(B) AB 0 -#p(ABC)+b(ABC)-b(AB)-p(BC) ABC 0 -#p(ABCD)+b(ABCD)-b(ABC)-p(BCD) ABCD 0 - -#fourgram reverse ARPA model (b(ABCD)=0): -#p(A)+b(A) A 0 -#p(AB)+b(AB)-p(B)+p(A) BA 0 -#p(ABC)+b(ABC)-p(BC)+p(AB)-p(B)+p(A) CBA 0 -#p(ABCD)+b(ABCD)-p(BCD)+p(ABC)-p(BC)+p(AB)-p(B)+p(A) DCBA 0 - -# compute new reversed ARPA model -print "\\data\\" -for n in range(1,len(cngrams)+1): # unigrams, bigrams, trigrams - print "ngram "+str(n)+"="+str(len(ngrams[n-1].keys())) -offset = 0.0 -for n in range(1,len(cngrams)+1): # unigrams, bigrams, trigrams - print "\\"+str(n)+"-grams:" - keys = ngrams[n-1].keys() - keys.sort() - for ngram in keys: - prob = ngrams[n-1][ngram] - # reverse word order - words = ngram.split() - rstr = " ".join(reversed(words)) - # swap and - rev_ngram = rstr.replace("","").replace("","").replace("","") - - revprob = prob[0] - if (prob[1] != inf): # only backoff weights from not newly created ngrams - revprob = revprob + prob[1] - #print prob[0],prob[1] - # sum all missing terms in decreasing ngram order - for x in range(n-1,0,-1): - l_ngram = " ".join(words[:x]) # shortened ngram - if l_ngram not in ngrams[x-1]: - sys.stderr.write(rev_ngram+": not found "+l_ngram+"\n") - p_l = ngrams[x-1][l_ngram][0] - #print p_l,l_ngram - revprob = revprob + p_l - - r_ngram = " ".join(words[1:1+x]) # shortened ngram with offset one - if r_ngram not in ngrams[x-1]: - sys.stderr.write(rev_ngram+": not found "+r_ngram+"\n") - p_r = ngrams[x-1][r_ngram][0] - #print -p_r,r_ngram - revprob = revprob - p_r - - if n != len(cngrams): #not highest order - back = 0.0 - if rev_ngram[:3] == "": # special handling since arpa2fst ignores weight - if n == 1: - offset = revprob # remember weight - revprob = sentprob # apply weight from forward model - back = offset - elif n == 2: - revprob = revprob + offset # add weight to bigrams starting with - if (prob[1] != inf): # only backoff weights from not newly created ngrams - print revprob,rev_ngram.encode("utf-8"),back - else: - print revprob,rev_ngram.encode("utf-8"),"-100000.0" - else: # highest order - no backoff weights - if (n==2) and (rev_ngram[:3] == ""): revprob = revprob + offset - print revprob,rev_ngram.encode("utf-8") -print "\\end\\" diff --git a/kaldi/local/rnnlm_compute_scores.sh b/kaldi/local/rnnlm_compute_scores.sh deleted file mode 100755 index 0d0eb60..0000000 --- a/kaldi/local/rnnlm_compute_scores.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -# Compute scores from RNNLM. This script takes a directory -# $dir (e.g. dir=local/rnnlm/rnnlm.voc30.hl30 ), -# where it expects the files: -# rnnlm wordlist.rnn unk.probs, -# and also an input file location where it can get the sentences to score, and -# an output file location to put the scores (negated logprobs) for each -# sentence. 
This script uses the Kaldi-style "archive" format, so the input and -# output files will have a first field that corresponds to some kind of -# utterance-id or, in practice, utterance-id-1, utterance-id-2, etc., for the -# N-best list. -# -# Here, "wordlist.rnn" is the set of words, like a vocabulary, -# that the RNN was trained on (note, it won't include or ), -# plus which is a kind of class where we put low-frequency -# words; unk.probs gives the probs for words given this class, and it -# has, on each line, "word prob". - -rnnlm_ver=rnnlm-0.3e -ensure_normalized_probs=false # if true then we add the neccesary options to - # normalize the probabilities of RNNLM - # e.g. when using faster-rnnlm in the nce mode - -. ./path.sh || exit 1; -. utils/parse_options.sh - -rnnlm=$KALDI_ROOT/tools/$rnnlm_ver/rnnlm - -[ ! -f $rnnlm ] && echo No such program $rnnlm && exit 1; - -if [ $# != 4 ]; then - echo "Usage: rnnlm_compute_scores.sh " - exit 1; -fi - -dir=$1 -tempdir=$2 -text_in=$3 -scores_out=$4 - -for x in rnnlm wordlist.rnn unk.probs; do - if [ ! -f $dir/$x ]; then - echo "rnnlm_compute_scores.sh: expected file $dir/$x to exist." - exit 1; - fi -done - -mkdir -p $tempdir -cat $text_in | awk '{for (x=2;x<=NF;x++) {printf("%s ", $x)} printf("\n");}' >$tempdir/text -cat $text_in | awk '{print $1}' > $tempdir/ids # e.g. utterance ids. -cat $tempdir/text | awk -v voc=$dir/wordlist.rnn -v unk=$dir/unk.probs \ - -v logprobs=$tempdir/loglikes.oov \ - 'BEGIN{ while((getline0) { invoc[$1]=1; } while ((getline0){ unkprob[$1]=$2;} } - { logprob=0; - if (NF==0) { printf ""; logprob = log(1.0e-07); - print "Warning: empty sequence." | "cat 1>&2"; } - for (x=1;x<=NF;x++) { w=$x; - if (invoc[w]) { printf("%s ",w); } else { - printf(" "); - if (unkprob[w] != 0) { logprob += log(unkprob[w]); } - else { print "Warning: unknown word ", w | "cat 1>&2"; logprob += log(1.0e-07); }}} - printf("\n"); print logprob > logprobs } ' > $tempdir/text.nounk - -# OK, now we compute the scores on the text with OOVs replaced -# with - -if [ $rnnlm_ver == "faster-rnnlm" ]; then - extra_options= - if [ "$ensure_normalized_probs" = true ]; then - extra_options="--nce-accurate-test 1" - fi - $rnnlm $extra_options -independent -rnnlm $dir/rnnlm -test $tempdir/text.nounk -nbest -debug 0 | \ - awk '{print $1*log(10);}' > $tempdir/loglikes.rnn -else - # add the utterance_id as required by Mikolove's rnnlm - paste $tempdir/ids $tempdir/text.nounk > $tempdir/id_text.nounk - - $rnnlm -independent -rnnlm $dir/rnnlm -test $tempdir/id_text.nounk -nbest -debug 0 | \ - awk '{print $1*log(10);}' > $tempdir/loglikes.rnn -fi - -[ `cat $tempdir/loglikes.rnn | wc -l` -ne `cat $tempdir/loglikes.oov | wc -l` ] && \ - echo "rnnlm rescoring failed" && exit 1; - -paste $tempdir/loglikes.rnn $tempdir/loglikes.oov | awk '{print -($1+$2);}' >$tempdir/scores - -# scores out, with utterance-ids. -paste $tempdir/ids $tempdir/scores > $scores_out - diff --git a/kaldi/local/run.pl b/kaldi/local/run.pl deleted file mode 100755 index f23bb8d..0000000 --- a/kaldi/local/run.pl +++ /dev/null @@ -1,282 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter - -# In general, doing -# run.pl some.log a b c is like running the command a b c in -# the bash shell, and putting the standard error and output into some.log. -# To run parallel jobs (backgrounded on the host machine), you can do (e.g.) -# run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB -# and putting it in some.JOB.log, for each one. 
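The JOB=1:4 mechanism described above (and restated in the note that follows) is plain string substitution applied once per job index before each child is forked. A hedged Python sketch:

import re

def expand_jobs(spec, logfile, cmd):
    # "JOB=1:4" -> substitute the identifier into the log name and command
    # for each job index, as run.pl does before forking each child.
    name, start, end = re.match(r"^(\w+)=(\d+):(\d+)$", spec).groups()
    for jobid in range(int(start), int(end) + 1):
        yield logfile.replace(name, str(jobid)), cmd.replace(name, str(jobid))

# list(expand_jobs("JOB=1:4", "some.JOB.log", "a b c JOB"))[0]
#   -> ("some.1.log", "a b c 1")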
[Note: JOB can be any identifier]. -# If any of the jobs fails, this script will fail. - -# A typical example is: -# run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz -# and run.pl will run something like: -# ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log -# -# Basically it takes the command-line arguments, quotes them -# as necessary to preserve spaces, and evaluates them with bash. -# In addition it puts the command line at the top of the log, and -# the start and end times of the command at the beginning and end. -# The reason why this is useful is so that we can create a different -# version of this program that uses a queueing system instead. - -# use Data::Dumper; - -@ARGV < 2 && die "usage: run.pl log-file command-line arguments..."; - - -$max_jobs_run = -1; -$jobstart = 1; -$jobend = 1; -$ignored_opts = ""; # These will be ignored. - -# First parse an option like JOB=1:4, and any -# options that would normally be given to -# queue.pl, which we will just discard. - -for (my $x = 1; $x <= 2; $x++) { # This for-loop is to - # allow the JOB=1:n option to be interleaved with the - # options to qsub. - while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { - # parse any options that would normally go to qsub, but which will be ignored here. - my $switch = shift @ARGV; - if ($switch eq "-V") { - $ignored_opts .= "-V "; - } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") { - # we do support the option --max-jobs-run n, and its GridEngine form -tc n. - $max_jobs_run = shift @ARGV; - if (! ($max_jobs_run > 0)) { - die "run.pl: invalid option --max-jobs-run $max_jobs_run"; - } - } else { - my $argument = shift @ARGV; - if ($argument =~ m/^--/) { - print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'\n"; - } - if ($switch eq "-sync" && $argument =~ m/^[yY]/) { - $ignored_opts .= "-sync "; # Note: in the - # corresponding code in queue.pl it says instead, just "$sync = 1;". - } elsif ($switch eq "-pe") { # e.g. -pe smp 5 - my $argument2 = shift @ARGV; - $ignored_opts .= "$switch $argument $argument2 "; - } elsif ($switch eq "--gpu") { - $using_gpu = $argument; - } else { - # Ignore option. - $ignored_opts .= "$switch $argument "; - } - } - } - if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20 - $jobname = $1; - $jobstart = $2; - $jobend = $3; - shift; - if ($jobstart > $jobend) { - die "run.pl: invalid job range $ARGV[0]"; - } - if ($jobstart <= 0) { - die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility)."; - } - } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. - $jobname = $1; - $jobstart = $2; - $jobend = $2; - shift; - } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { - print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n"; - } -} - -# Users found this message confusing so we are removing it. -# if ($ignored_opts ne "") { -# print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n"; -# } - -if ($max_jobs_run == -1) { # If --max-jobs-run option not set, - # then work out the number of processors if possible, - # and set it based on that. - $max_jobs_run = 0; - if ($using_gpu) { - if (open(P, "nvidia-smi -L |")) { - $max_jobs_run++ while (

<P>);
-      close(P);
-    }
-    if ($max_jobs_run == 0) {
-      $max_jobs_run = 1;
-      print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}\n";
-    }
-  } elsif (open(P, "</proc/cpuinfo")) { # Linux
-    while (<P>) { if (m/^processor/) { $max_jobs_run++; } }
-    if ($max_jobs_run == 0) {
-      print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n";
-      $max_jobs_run = 10; # reasonable default.
-    }
-    close(P);
-  } elsif (open(P, "sysctl -a |")) { # BSD/Darwin
-    while (<P>

) { - if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4 - $max_jobs_run = $1; - last; - } - } - close(P); - if ($max_jobs_run == 0) { - print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n"; - $max_jobs_run = 10; # reasonable default. - } - } else { - # allow at most 32 jobs at once, on non-UNIX systems; change this code - # if you need to change this default. - $max_jobs_run = 32; - } - # The just-computed value of $max_jobs_run is just the number of processors - # (or our best guess); and if it happens that the number of jobs we need to - # run is just slightly above $max_jobs_run, it will make sense to increase - # $max_jobs_run to equal the number of jobs, so we don't have a small number - # of leftover jobs. - $num_jobs = $jobend - $jobstart + 1; - if (!$using_gpu && - $num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) { - $max_jobs_run = $num_jobs; - } -} - -$logfile = shift @ARGV; - -if (defined $jobname && $logfile !~ m/$jobname/ && - $jobend > $jobstart) { - print STDERR "run.pl: you are trying to run a parallel job but " - . "you are putting the output into just one log file ($logfile)\n"; - exit(1); -} - -$cmd = ""; - -foreach $x (@ARGV) { - if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } - elsif ($x =~ m:\":) { $cmd .= "'$x' "; } - else { $cmd .= "\"$x\" "; } -} - -#$Data::Dumper::Indent=0; -$ret = 0; -$numfail = 0; -%active_pids=(); - -use POSIX ":sys_wait_h"; -for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { - if (scalar(keys %active_pids) >= $max_jobs_run) { - - # Lets wait for a change in any child's status - # Then we have to work out which child finished - $r = waitpid(-1, 0); - $code = $?; - if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen. - if ( defined $active_pids{$r} ) { - $jid=$active_pids{$r}; - $fail[$jid]=$code; - if ($code !=0) { $numfail++;} - delete $active_pids{$r}; - # print STDERR "Finished: $r/$jid " . Dumper(\%active_pids) . "\n"; - } else { - die "run.pl: Cannot find the PID of the chold process that just finished."; - } - - # In theory we could do a non-blocking waitpid over all jobs running just - # to find out if only one or more jobs finished during the previous waitpid() - # However, we just omit this and will reap the next one in the next pass - # through the for(;;) cycle - } - $childpid = fork(); - if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; } - if ($childpid == 0) { # We're in the child... this branch - # executes the job and returns (possibly with an error status). - if (defined $jobname) { - $cmd =~ s/$jobname/$jobid/g; - $logfile =~ s/$jobname/$jobid/g; - } - system("mkdir -p `dirname $logfile` 2>/dev/null"); - open(F, ">$logfile") || die "run.pl: Error opening log file $logfile"; - print F "# " . $cmd . "\n"; - print F "# Started at " . `date`; - $starttime = `date +'%s'`; - print F "#\n"; - close(F); - - # Pipe into bash.. make sure we're not using any other shell. - open(B, "|bash") || die "run.pl: Error opening shell command"; - print B "( " . $cmd . ") 2>>$logfile >> $logfile"; - close(B); # If there was an error, exit status is in $? - $ret = $?; - - $lowbits = $ret & 127; - $highbits = $ret >> 8; - if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" } - else { $return_str = "code $highbits"; } - - $endtime = `date +'%s'`; - open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)"; - $enddate = `date`; - chop $enddate; - print F "# Accounting: time=" . 
($endtime - $starttime) . " threads=1\n";
-    print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n";
-    close(F);
-    exit($ret == 0 ? 0 : 1);
-  } else {
-    $pid[$jobid] = $childpid;
-    $active_pids{$childpid} = $jobid;
-    # print STDERR "Queued: " . Dumper(\%active_pids) . "\n";
-  }
-}
-
-# Now we have submitted all the jobs, let's wait until all the jobs finish
-foreach $child (keys %active_pids) {
-  $jobid=$active_pids{$child};
-  $r = waitpid($pid[$jobid], 0);
-  $code = $?;
-  if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
-  if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # Completed successfully
-}
-
-# Some sanity checks:
-# The $fail array should not contain undefined codes
-# The number of non-zeros in that array should be equal to $numfail
-# We cannot do foreach() here, as the JOB ids do not necessarily start by zero
-$failed_jids=0;
-for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
-  $job_return = $fail[$jobid];
-  if (not defined $job_return ) {
-    # print Dumper(\@fail);
-    die "run.pl: Sanity check failed: we have indication that some jobs are running " .
-      "even after we waited for all jobs to finish";
-  }
-  if ($job_return != 0 ){ $failed_jids++;}
-}
-if ($failed_jids != $numfail) {
-  die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)."
-}
-if ($numfail > 0) { $ret = 1; }
-
-if ($ret != 0) {
-  $njobs = $jobend - $jobstart + 1;
-  if ($njobs == 1) {
-    if (defined $jobname) {
-      $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
-                                         # that job.
-    }
-    print STDERR "run.pl: job failed, log is in $logfile\n";
-    if ($logfile =~ m/JOB/) {
-      print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.";
-    }
-  }
-  else {
-    $logfile =~ s/$jobname/*/g;
-    print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
-  }
-}
-
-exit ($ret);
diff --git a/kaldi/local/s2eps.pl b/kaldi/local/s2eps.pl
deleted file mode 100755
index ffeeb8e..0000000
--- a/kaldi/local/s2eps.pl
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABILITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-# This script replaces <s> and </s> with <eps> (on both input and output sides),
-# for the G.fst acceptor.
-
-while(<>){
-  @A = split(" ", $_);
-  if ( @A >= 4 ) {
-    if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; }
-    if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; }
-  }
-  print join("\t", @A) . "\n";
-}
diff --git a/kaldi/local/segmentation.pl b/kaldi/local/segmentation.pl
deleted file mode 100755
index 41d90f4..0000000
--- a/kaldi/local/segmentation.pl
+++ /dev/null
@@ -1,402 +0,0 @@
-#!/usr/bin/env perl
-use warnings; #sed replacement for -w perl parameter
-# Copyright 2013 Johns Hopkins University (Author: Daniel Povey)
-# Apache 2.0.
-
-# This program is for segmentation of data, e.g. long telephone conversations,
-# into short chunks. The input (stdin) should be a sequence of lines like
-# sw0-20348-A 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 ... 2 2 0 0 0
-# where there is a number for each frame and the numbers mean 0 for silence, 1
-# for noise, laughter and other nonspeech events, and 2 for speech. This will
-# typically be derived from some kind of fast recognition (see
-# ../steps/resegment_data.sh), followed by ali-to-phones --per-frame=true and
-# then mapping phones to these classes 0, 1 and 2.
-#
-# The algorithm is as follows:
-# (1) Find contiguous sequences of classes 1 or 2 (i.e. speech and/or noise), with e.g.
-#   "1 1 1 2 2" counted as a single contiguous sequence. Each such sequence is an
-#   initial segment.
-# (2) While the proportion of silence in the segments is less than $silence_proportion,
-#   add a single silence frame to the left and right of each segment, as long
-#   as this does not take us past the ends of the file or into another segment.
-#   At this point, do not merge segments.
-# (3) Merging segments:
-#   Get a list of all boundaries between segments that ended up touching each other
-#   during phase 2. Sort them according to the number of silence frames at the boundary,
-#   with those with the least silence to be processed first. Go through the boundaries
-#   in order, merging each pair of segments, as long as doing so does not create
-#   a segment larger than $max_segment_length.
-# (4) Splitting excessively long segments:
-#   For all segments that are longer than $hard_max_segment_length, split them equally
-#   into the smallest number of pieces such that the pieces will be no longer than
-#   $hard_max_segment_length. Print a warning.
-# (5) Removing any segments that contain no speech (i.e. removing segments that have
-#   only silence and noise).
-#
-# By default, the utterance-ids will be of the form <recording-id>-<start-time>-<end-time>,
-# where <start-time> and <end-time> are measured in multiples of 0.01 seconds, using
-# fixed-width integers with enough digits to print out all the segments (the number of
-# digits being decided per line of the input). For instance, if the input recording-id was
-# sw0-20348-A, an example line of the "segments-file" output would be:
-# sw0-20348-A-00124-00298 sw0-20348-A 1.24 2.98
-# (interpreted as <utterance-id> <recording-id> <segment-start> <segment-end>)
-# and the number of digits has to be that large because the same recording has
-# a segment something like
-# sw0-20348-A-13491-13606 sw0-20348-A 134.91 136.06
-# The "_" and "-" in the output are separately configurable by means of the
-# --first-separator and --second-separator options. However, generally speaking,
-# it is safer to use "-" than, say, "_", because "-" appears very early in the
-# ASCII table, and using it as the separator will tend to ensure that when
-# you sort the utterances and the recording-ids they will sort the same way.
-# This matters because recording-ids will often equal speaker-ids, and Kaldi scripts
-# require that the utterance-ids and speaker-ids sort in the "same order".
-
-use Getopt::Long;
-
-$silence_proportion = 0.2;  # The amount of silence at the sides of segments is
-                            # tuned to give this proportion of silence.
-$frame_shift = 0.01;  # Affects the interpretation of the options such as max_segment_length,
-                      # and the seconds in the "segments" file.
-$max_segment_length = 15.0;  # Maximum segment length while we are merging segments...
-                             # it will not allow merging segments to make segments longer than this.
-$hard_max_segment_length = 30.0;  # A hard maximum on the segment length; it will
-                                  # break segments to get below this, even if there is
-                                  # no silence, and print a warning.
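The fixed-width naming scheme described in the header above condenses to a few lines; a Python sketch matching the sw0-20348-A example (names are illustrative, not part of segmentation.pl):

def utt_id(recording_id, start_s, end_s, max_end_s, sep1="-", sep2="-"):
    # Times go to hundredths of a second, zero-padded to a width chosen from
    # the largest end time on the line, so lexical and numeric order agree.
    digits = len(str(int(round(max_end_s * 100))))
    fmt = "{:0%dd}" % digits
    return (recording_id + sep1 + fmt.format(int(round(start_s * 100)))
            + sep2 + fmt.format(int(round(end_s * 100))))

# utt_id("sw0-20348-A", 1.24, 2.98, 136.06) -> "sw0-20348-A-00124-00298"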
-$first_separator = "-"; # separator between recording-id and start-time, in utterance-id. -$second_separator = "-"; # separator between start-time and end-time, in utterance-id. -$remove_noise_only_segments = "true"; # boolean option; if true, - # remove segments that have no speech. - - -GetOptions('silence-proportion:f' => \$silence_proportion, - 'frame-shift:f' => \$frame_shift, - 'max-segment-length:f' => \$max_segment_length, - 'hard-max-segment-length:f' => \$hard_max_segment_length, - 'first-separator:s' => \$first_separator, - 'second-separator:s' => \$second_separator, - 'remove-noise-only-segments:s' => \$remove_noise_only_segments); - -if (@ARGV != 0) { - print STDERR "$0:\n" . - "Usage: segmentation.pl [options] < per-frame-archive > segments-file\n" . - "This program is called from steps/resegment_data.sh. Please see\n" . - "the extensive comment in the source. Options:\n" . - "--silence-proportion (default: $silence_proportion)\n" . - "--frame-shift (default: $frame_shift, in seconds)\n" . - "--max-segment-length (default: $max_segment_length, in seconds)\n" . - "--hard-max-segment-length (default: $hard_max_segment_length, in seconds)\n" . - "--first-separator (default: $first_separator), affects utterance-ids\n" . - "--second-separator (default: $second_separator), affects utterance-ids\n" . - "--remove-noise-only-segments (default: true)\n"; - exit 1; -} - -($silence_proportion > 0.01 && $silence_proportion < 0.99) || - die "Invalid silence-proportion value '$silence_proportion'"; -($frame_shift > 0.0001 && $frame_shift <= 1.0) || - die "Very strange frame-shift value '$frame_shift'"; -($max_segment_length > 1.0 && $max_segment_length < 100.0) || - die "Very strange max-segment-length value '$max_segment_length'"; -($hard_max_segment_length > 4.0 && $hard_max_segment_length < 500.0) || - die "Very strange hard-max-segment-length value '$hard_max_segment_length'"; -($hard_max_segment_length >= $max_segment_length) || - die "hard-max-segment-length may not be less than max-segment-length"; -($remove_noise_only_segments eq 'false' || - $remove_noise_only_segments eq 'true') || - die "Option --remove-noise-only-segments takes args true or false"; - - -sub get_initial_segments { - # This operates on the global arrays @A, @S and @N. It sets the elements of - # @S to 1 if start of segment, and @E to 1 if end of segment, end of segment - # being defined as one past the last frame in the segment. - - for (my $n = 0; $n < $N; $n++) { - if ($A[$n] == 0) { - if ($n > 0 && $A[$n-1] != 0) { - $E[$n] = 1; - } - } else { - if ($n == 0 || $A[$n-1] == 0) { - $S[$n] = 1; - } - } - } - if ($A[$N-1] != 0) { # Handle the special case - $E[$N] = 1; # where the last frame of the file is silence or noise. - } -} - - -sub set_silence_proportion { - $num_nonsil_frames = 0; - # Get number of frames that are inside segments. Initially, this will - # all be non-silence. - $in_segment = 0; - - my @active_frames = (); # active_frames are segment start/end frames. - for (my $n = 0; $n <= $N; $n++) { - if ($n < $N && $S[$n] == 1) { - $in_segment == 0 || die; - $in_segment = 1; - push @active_frames, $n; - } - if ($E[$n] == 1) { - $in_segment == 1 || die; - $in_segment = 0; - push @active_frames, $n; - } - if ($n < $N) { - ($in_segment == ($A[$n] != 0 ? 1 : 0)) || die; # Just a check. - if ($in_segment) { $num_nonsil_frames++; } - } - } - $in_segment == 0 || die; # should not be still in a segment after file-end. 
- if ($num_nonsil_frames == 0) { - print STDERR "$0: warning: no segments found for recording $recording_id\n"; - return; - } - #(target-segment-frames - num-nonsil-frames) / target-segment-frames = sil-proportion - # -> target-segment-frames = (num-nonsil-frames) / (1 - sil-proportion). - my $target_segment_frames = int($num_nonsil_frames / (1.0 - $silence_proportion)); - my $num_segment_frames = $num_nonsil_frames; - while ($num_segment_frames < $target_segment_frames) { - $changed = 0; - for (my $i = 0; $i < @active_frames; $i++) { - my $n = $active_frames[$i]; - if ($E[$n] == 1 && $n < $N && $S[$n] != 1) { - # shift the end of this segment one frame to the right. - $E[$n] = 0; - $E[$n+1] = 1; - $active_frames[$i] = $n + 1; - $num_segment_frames++; - $changed = 1; - } - if ($n < $N && $S[$n] == 1 && $n > 0 && $E[$n] != 1) { - # shift the start of this segment one frame to the left - $S[$n] = 0; - $S[$n-1] = 1; - $active_frames[$i] = $n - 1; - $num_segment_frames++; - $changed = 1; - } - if ($num_segment_frames == $target_segment_frames) { - last; - } - } - if ($changed == 0) { last; } # avoid an infinite loop. - } - if ($num_segment_frames < $target_segment_frames) { - my $proportion = - ($num_segment_frames - $num_nonsil_frames) / $num_segment_frames; - print STDERR "$0: warning: for recording $recording_id, only got a proportion " . - "$proportion of silence frames, versus target $silence_proportion\n"; - } -} - -sub merge_segments() { - my @boundaries = (); - my @num_silence_phones = (); # for any index into @S where there - # is a boundary between contiguous segments - # (i.e. an index which is both a segment-start - # and segment-end index), the number of silence - # frames at that boundary (i.e. at the end of the - # previous segment and the beginning of the next - # one. - for ($n = 0; $n < $N; $n++) { - if ($S[$n] == 1 && $E[$n] == 1) { - push @boundaries, $n; - my $num_sil = 0; - my $p; - # note: here we can count the silence phones without regard to the - # segment boundaries, since we'll hit nonsilence before we get to - # the end/beginning of these segments. - for ($p = $n; $p < $N; $p++) { - if ($A[$p] == 0) { $num_sil++; } - else { last; } - } - for ($p = $n - 1; $p >= 0; $p--) { - if ($A[$p] == 0) { $num_sil++; } - else { last; } - } - $num_silence_phones[$n] = $p; - } - } - - # Sort on increasing number of silence-phones, so we join the segments with - # the smallest amount of silence at the boundary first. - my @sorted_boundaries = - sort { $num_silence_phones[$a] <=> $num_silence_phones[$b] } @boundaries; - - foreach $n (@sorted_boundaries) { - # Join the segments only if the length of the resulting segment would - # be no more than $max_segment_length. - ($S[$n] == 1 && $E[$n] == 1) || die; - my $num_frames = 2; # total number of frames in the two segments we'll be merging.. - # start the count from 2 because the loops below do not - # count the 1st frame of the segment to the right and - # the last frame of the segment to the left. - my $p; - for ($p = $n + 1; $p <= @A && $E[$p] == 0; $p++) { - $num_frames++; - } - $E[$p] == 1 || die; - for ($p = $n - 1; $p >= 0 && $S[$p] == 0; $p--) { - $num_frames++; - } - $S[$p] == 1 || die; - if ($num_frames * $frame_shift <= $max_segment_length) { - # Join this pair of segments. - $S[$n] = 0; - $E[$n] = 0; - } - } -} - -sub split_long_segments { - for (my $n = 0; $n < @A; $n++) { - if ($S[$n] == 1) { # segment starts here... 
- my $p; - for ($p = $n + 1; $p <= @A; $p++) { - if ($E[$p] == 1) { last; } - } - ($E[$p] == 1) || die; - my $segment_length = $p - $n; - my $max_frames = int($hard_max_segment_length / $frame_shift); - if ($segment_length > $max_frames) { - # The segment is too long, we need to split it. First work out - # how many pieces to split it into. - # We divide and round up to nearest larger int. - my $num_pieces = int(($segment_length / $max_frames) + 0.99999); - my $segment_length_in_seconds = $segment_length * $frame_shift; - print STDERR "$0: warning: for recording $recording_id, splitting segment of " . - "length $segment_length_in_seconds seconds into $num_pieces pieces " . - "(--hard-max-segment-length $hard_max_segment_length)\n"; - my $frames_per_piece = int($segment_length / $num_pieces); - my $i; - for ($i = 1; $i < $num_pieces; $i++) { - my $q = $n + $i * $frames_per_piece; - # Insert a segment boundary at frame $q. - $S[$q] = 1; - $E[$q] = 1; - } - } - if ($p - 1 > $n) { - $n = $p - 1; # avoids some redundant work. - } - } - } -} - -sub remove_noise_only_segments { - for (my $n = 0; $n < $N; $n++) { - if ($S[$n] == 1) { # segment starts here... - my $p; - my $saw_speech = 0; - for ($p = $n; $p <= $N; $p++) { - if ($E[$p] == 1 && $p != $n) { last; } - if ($A[$p] == 2) { $saw_speech = 1; } - } - $E[$p] == 1 || die; - if (! $saw_speech) { # There was no speech in this segment, so remove it. - $S[$n] = 0; - $E[$p] = 0; - } - if ($p - 1 > $n) { - $n = $p - 1; # Avoid some redundant work. - } - } - } -} - -sub print_segments { - # We also do some sanity checking here. - my @segments = (); # each element will be a string start-time:end-time, in frames. - - $N == @S || die; # check array size. - ($N+1) == @E || die; # check array size. - - my $max_end_time = 0; - - for (my $n = 0; $n < $N; $n++) { - if ($E[$n] == 1 && $S[$n] != 1) { - die "Ending segment before starting it: n=$n.\n"; - } - if ($S[$n]) { - my $p; - for ($p = $n + 1; $p < $N && $E[$p] != 1; $p++) { - $S[$p] && die; # should not start a segment again, before ending it. - } - $E[$p] == 1 || die; - push @segments, "$n:$p"; # push the start/end times. - $max_end_time = $p; - if ($p < $N && $S[$p] == 1) { $n = $p - 1; } - else { $n = $p; } - # note: we increment $n again before the next loop instance. - } - } - - if (@segments == 0) { - print STDERR "$0: warning: no segments for recording $recording_id\n"; - } - - # we'll be printing the times out in hundredths of a second (regardless of the - # value of $frame_shift), and first need to know how many digits we need (we'll be - # printing with "%05d" or similar, for zero-padding. - $max_end_time_hundredths_second = int(100.0 * $frame_shift * $max_end_time); - $num_digits = 1; - my $i = 1; - while ($i < $max_end_time_hundredths_second) { - $i *= 10; - $num_digits++; - } - $format_str = "%0${num_digits}d"; # e.g. "%05d" - - foreach $s (@segments) { - my ($start,$end) = split(":", $s); - ($end > $start) || die; - my $start_seconds = sprintf("%.2f", $frame_shift * $start); - my $end_seconds = sprintf("%.2f", $frame_shift * $end); - my $start_str = sprintf($format_str, $start_seconds * 100); - my $end_str = sprintf($format_str, $end_seconds * 100); - my $utterance_id = "${recording_id}${first_separator}${start_str}${second_separator}${end_str}"; - print "$utterance_id $recording_id $start_seconds $end_seconds\n"; # <-- Here is where the output happens. - } -} - - - -while () { - @A = split; # split line on whitespace. 
- if (@A <= 1) { - print STDERR "$0: warning: invalid input line $_"; - next; - } - $recording_id = shift @A; # e.g. sw0-12430 - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ($a != 0 && $a != 1 && $a != 2) { - die "Invalid value $a: expecting 0, 1 or 2. Line is: $_"; - } - $A[$n] = 0 + $a; # cast to integer, might be a bit faster. - } - # The array @S will contain 1 if a segment starts there and 0 - # otherwise. The array @E will contain 1 if a segment ends there - # and 0 otherwise. - $N = @A; # number of elements in @A. Used globally. - @S = (0) x $N; # 0 repeated $N times. - @E = (0) x ($N + 1); # 0 repeated $N+1 times (one more since if the last frame is - # in a segment, the end-marker goes one past that, at index $N.) - - get_initial_segments(); # stage (1) in the comment above. - set_silence_proportion(); # stage (2) - merge_segments(); # stage (3) - split_long_segments(); # stage (4) - if ($remove_noise_only_segments eq 'true') { - remove_noise_only_segments(); # stage (5) - } - print_segments(); -} - diff --git a/kaldi/local/show_lattice.sh b/kaldi/local/show_lattice.sh deleted file mode 100755 index d8f8d71..0000000 --- a/kaldi/local/show_lattice.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -format=pdf # pdf svg -mode=save # display save -lm_scale=0.0 -acoustic_scale=0.0 -#end of config - -. utils/parse_options.sh - -if [ $# != 3 ]; then - echo "usage: $0 [--mode display|save] [--format pdf|svg] " - echo "e.g.: $0 utt-0001 \"test/lat.*.gz\" tri1/graph/words.txt" - exit 1; -fi - -. ./path.sh - -uttid=$1 -lat=$2 -words=$3 - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); # trap "rm -r $tmpdir" EXIT # cleanup - -gunzip -c $lat | lattice-to-fst --lm-scale=$lm_scale --acoustic-scale=$acoustic_scale ark:- "scp,p:echo $uttid $tmpdir/$uttid.fst|" || exit 1; -! [ -s $tmpdir/$uttid.fst ] && \ - echo "Failed to extract lattice for utterance $uttid (not present?)" && exit 1; -fstdraw --portrait=true --osymbols=$words $tmpdir/$uttid.fst | dot -T${format} > $tmpdir/$uttid.${format} - -if [ "$(uname)" == "Darwin" ]; then - doc_open=open -elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then - doc_open=xdg-open -elif [ $mode == "display" ] ; then - echo "Can not automaticaly open file on your operating system" - mode=save -fi - -[ $mode == "display" ] && $doc_open $tmpdir/$uttid.${format} -[[ $mode == "display" && $? -ne 0 ]] && echo "Failed to open ${format} format." && mode=save -[ $mode == "save" ] && echo "Saving to $uttid.${format}" && cp $tmpdir/$uttid.${format} . - -exit 0 diff --git a/kaldi/local/shuffle_list.pl b/kaldi/local/shuffle_list.pl deleted file mode 100755 index 83bccff..0000000 --- a/kaldi/local/shuffle_list.pl +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. 
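shuffle_list.pl, whose body follows, is a seeded decorate-sort-undecorate shuffle: each line is paired with a random key, and sorting on the key permutes the lines. A Python equivalent (a sketch, not part of the script):

import random

def shuffle_lines(lines, seed=0):
    # Pair each line with a random key and sort on the key; seeding (like
    # srand below) keeps the output reproducible across runs.
    rng = random.Random(seed)
    return [line for _, line in sorted((rng.random(), line) for line in lines)]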
- - -if ($ARGV[0] eq "--srand") { - $n = $ARGV[1]; - $n =~ m/\d+/ || die "Bad argument to --srand option: \"$n\""; - srand($ARGV[1]); - shift; - shift; -} else { - srand(0); # Gives inconsistent behavior if we don't seed. -} - -if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) { # >1 args, or an option we - # don't understand. - print "Usage: shuffle_list.pl [--srand N] [input file] > output\n"; - print "randomizes the order of lines of input.\n"; - exit(1); -} - -@lines; -while (<>) { - push @lines, [ (rand(), $_)] ; -} - -@lines = sort { $a->[0] cmp $b->[0] } @lines; -foreach $l (@lines) { - print $l->[1]; -} diff --git a/kaldi/local/slurm.pl b/kaldi/local/slurm.pl deleted file mode 100755 index cfa634a..0000000 --- a/kaldi/local/slurm.pl +++ /dev/null @@ -1,627 +0,0 @@ -#!/usr/bin/env perl -use strict; -use warnings; - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). -# 2014 Vimal Manohar (Johns Hopkins University) -# 2015 Johns Hopkins University (Yenda Trmal >) -# Apache 2.0. - -use File::Basename; -use Cwd; -use Getopt::Long; - -# slurm.pl was created from the queue.pl -# queue.pl has the same functionality as run.pl, except that -# it runs the job in question on the queue (Sun GridEngine). -# This version of queue.pl uses the task array functionality -# of the grid engine. Note: it's different from the queue.pl -# in the s4 and earlier scripts. - -# The script now supports configuring the queue system using a config file -# (default in conf/queue.conf; but can be passed specified with --config option) -# and a set of command line options. -# The current script handles: -# 1) Normal configuration arguments -# For e.g. a command line option of "--gpu 1" could be converted into the option -# "-q g.q -l gpu=1" to qsub. How the CLI option is handled is determined by a -# line in the config file like -# gpu=* -q g.q -l gpu=$0 -# $0 here in the line is replaced with the argument read from the CLI and the -# resulting string is passed to qsub. -# 2) Special arguments to options such as -# gpu=0 -# If --gpu 0 is given in the command line, then no special "-q" is given. -# 3) Default argument -# default gpu=0 -# If --gpu option is not passed in the command line, then the script behaves as -# if --gpu 0 was passed since 0 is specified as the default argument for that -# option -# 4) Arbitrary options and arguments. -# Any command line option starting with '--' and its argument would be handled -# as long as its defined in the config file. -# 5) Default behavior -# If the config file that is passed using is not readable, then the script -# behaves as if the queue has the following config file: -# $ cat conf/queue.conf -# # Default configuration -# command sbatch --export=PATH -S /bin/bash -j y -l arch=*64* -# option mem=* --mem-per-cpu $0 -# option mem=0 # Do not add anything to qsub_opts -# option num_threads=* --cpus-per-task $0 -# option num_threads=1 # Do not add anything to qsub_opts -# option max_jobs_run=* -tc $0 -# default gpu=0 -# option gpu=0 -p shared -# option gpu=* -p gpu #this has to be figured out - -#print STDERR "$0 " . join(" ", @ARGV) . "\n"; - -my $qsub_opts = ""; -my $sync = 0; -my $num_threads = 1; -my $max_jobs_run; -my $gpu = 0; - -my $config = "conf/slurm.conf"; - -my %cli_options = (); - -my $jobname; -my $jobstart; -my $jobend; - -my $array_job = 0; - -sub print_usage() { - print STDERR - "Usage: $0 [options] [JOB=1:n] log-file command-line arguments...\n" . - "e.g.: $0 foo.log echo baz\n" . 
- " (which will echo \"baz\", with stdout and stderr directed to foo.log)\n" . - "or: $0 -q all.q\@xyz foo.log echo bar \| sed s/bar/baz/ \n" . - " (which is an example of using a pipe; you can provide other escaped bash constructs)\n" . - "or: $0 -q all.q\@qyz JOB=1:10 foo.JOB.log echo JOB \n" . - " (which illustrates the mechanism to submit parallel jobs; note, you can use \n" . - " another string other than JOB)\n" . - "Note: if you pass the \"-sync y\" option to qsub, this script will take note\n" . - "and change its behavior. Otherwise it uses squeue to work out when the job finished\n" . - "Options:\n" . - " --config (default: $config)\n" . - " --mem (e.g. --mem 2G, --mem 500M, \n" . - " also support K and numbers mean bytes)\n" . - " --num-threads (default: $num_threads)\n" . - " --max-jobs-run \n" . - " --gpu <0|1> (default: $gpu)\n"; - exit 1; -} - -sub exec_command { - # Execute command and return a tuple of stdout and exit code - my $command = join ' ', @_; - # To get the actual exit value, shift right by eight bits. - ($_ = `$command 2>&1`, $? >> 8); -} - -if (@ARGV < 2) { - print_usage(); -} - -for (my $x = 1; $x <= 3; $x++) { # This for-loop is to - # allow the JOB=1:n option to be interleaved with the - # options to qsub. - while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { - my $switch = shift @ARGV; - - if ($switch eq "-V") { - $qsub_opts .= "-V "; - } else { - my $argument = shift @ARGV; - if ($argument =~ m/^--/) { - print STDERR "WARNING: suspicious argument '$argument' to $switch; starts with '-'\n"; - } - if ($switch eq "-sync" && $argument =~ m/^[yY]/) { - $sync = 1; - $qsub_opts .= "$switch $argument "; - } elsif ($switch eq "-pe") { # e.g. -pe smp 5 - my $argument2 = shift @ARGV; - $qsub_opts .= "$switch $argument $argument2 "; - $num_threads = $argument2; - } elsif ($switch =~ m/^--/) { # Config options - # Convert CLI option to variable name - # by removing '--' from the switch and replacing any - # '-' with a '_' - $switch =~ s/^--//; - $switch =~ s/-/_/g; - $cli_options{$switch} = $argument; - } else { # Other qsub options - passed as is - $qsub_opts .= "$switch $argument "; - } - } - } - if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20 - $array_job = 1; - $jobname = $1; - $jobstart = $2; - $jobend = $3; - shift; - if ($jobstart > $jobend) { - die "$0: invalid job range $ARGV[0]"; - } - if ($jobstart <= 0) { - die "$0: invalid job range $ARGV[0], start must be strictly positive (this is a GridEngine limitation)."; - } - } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. - $array_job = 1; - $jobname = $1; - $jobstart = $2; - $jobend = $2; - shift; - } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { - print STDERR "Warning: suspicious first argument to $0: $ARGV[0]\n"; - } -} - -if (@ARGV < 2) { - print_usage(); -} - -if (exists $cli_options{"config"}) { - $config = $cli_options{"config"}; -} - -my $default_config_file = <<'EOF'; -# Default configuration -command sbatch --export=PATH --ntasks-per-node=1 -option time=* --time $0 -option mem=* --mem-per-cpu $0 -option mem=0 # Do not add anything to qsub_opts -option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 -option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts -default gpu=0 -option gpu=0 -p shared -option gpu=* -p gpu --gres=gpu:$0 --time 4:0:0 # this has to be figured out -# note: the --max-jobs-run option is supported as a special case -# by slurm.pl and you don't have to handle it in the config file. 
-EOF - -# Here the configuration options specified by the user on the command line -# (e.g. --mem 2G) are converted to options to the qsub system as defined in -# the config file. (e.g. if the config file has the line -# "option mem=* -l ram_free=$0,mem_free=$0" -# and the user has specified '--mem 2G' on the command line, the options -# passed to queue system would be "-l ram_free=2G,mem_free=2G -# A more detailed description of the ways the options would be handled is at -# the top of this file. - -my $opened_config_file = 1; - -open CONFIG, "<$config" or $opened_config_file = 0; - -my %cli_config_options = (); -my %cli_default_options = (); - -if ($opened_config_file == 0 && exists($cli_options{"config"})) { - print STDERR "Could not open config file $config\n"; - exit(1); -} elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) { - # Open the default config file instead - open (CONFIG, "echo '$default_config_file' |") or die "Unable to open pipe\n"; - $config = "Default config"; -} - -my $qsub_cmd = ""; -my $read_command = 0; - -while() { - chomp; - my $line = $_; - $_ =~ s/\s*#.*//g; - if ($_ eq "") { next; } - if ($_ =~ /^command (.+)/) { - $read_command = 1; - $qsub_cmd = $1 . " "; - } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { - # Config option that needs replacement with parameter value read from CLI - # e.g.: option mem=* -l mem_free=$0,ram_free=$0 - my $option = $1; # mem - my $arg= $2; # -l mem_free=$0,ram_free=$0 - if ($arg !~ m:\$0:) { - print STDERR "Warning: the line '$line' in config file ($config) does not substitution variable \$0\n"; - } - if (exists $cli_options{$option}) { - # Replace $0 with the argument read from command line. - # e.g. "-l mem_free=$0,ram_free=$0" -> "-l mem_free=2G,ram_free=2G" - $arg =~ s/\$0/$cli_options{$option}/g; - $cli_config_options{$option} = $arg; - } - } elsif ($_ =~ m/^option ([^=]+)=(\S+)\s?(.*)$/) { - # Config option that does not need replacement - # e.g. option gpu=0 -q all.q - my $option = $1; # gpu - my $value = $2; # 0 - my $arg = $3; # -q all.q - if (exists $cli_options{$option}) { - $cli_default_options{($option,$value)} = $arg; - } - } elsif ($_ =~ m/^default (\S+)=(\S+)/) { - # Default options. Used for setting default values to options i.e. when - # the user does not specify the option on the command line - # e.g. default gpu=0 - my $option = $1; # gpu - my $value = $2; # 0 - if (!exists $cli_options{$option}) { - # If the user has specified this option on the command line, then we - # don't have to do anything - $cli_options{$option} = $value; - } - } else { - print STDERR "$0: unable to parse line '$line' in config file ($config)\n"; - exit(1); - } -} - -close(CONFIG); - -if ($read_command != 1) { - print STDERR "$0: config file ($config) does not contain the line \"command .*\"\n"; - exit(1); -} - -for my $option (keys %cli_options) { - if ($option eq "config") { next; } - - my $value = $cli_options{$option}; - - if ($option eq "max_jobs_run") { - if ($array_job != 1) { - print STDERR "Ignoring $option since this is not an array task."; - } else { - $max_jobs_run = $value; - } - } elsif (exists $cli_default_options{($option,$value)}) { - $qsub_opts .= "$cli_default_options{($option,$value)} "; - } elsif (exists $cli_config_options{$option}) { - $qsub_opts .= "$cli_config_options{$option} "; - } elsif (exists $cli_default_options{($option,"*")}) { - $qsub_opts .= $cli_default_options{($option,"*")} . 
" "; - } else { - if ($opened_config_file == 0) { - $config = "default config file"; - } - die "$0: Command line option $option not described in $config (or value '$value' not allowed)\n"; - } -} - -my $cwd = getcwd(); -my $logfile = shift @ARGV; - -if ($array_job == 1 && $logfile !~ m/$jobname/ - && $jobend > $jobstart) { - print STDERR "$0: you are trying to run a parallel job but " - . "you are putting the output into just one log file ($logfile)\n"; - exit(1); -} - -# -# Work out the command; quote escaping is done here. -# Note: the rules for escaping stuff are worked out pretty -# arbitrarily, based on what we want it to do. Some things that -# we pass as arguments to $0, such as "|", we want to be -# interpreted by bash, so we don't escape them. Other things, -# such as archive specifiers like 'ark:gunzip -c foo.gz|', we want -# to be passed, in quotes, to the Kaldi program. Our heuristic -# is that stuff with spaces in should be quoted. This doesn't -# always work. -# -my $cmd = ""; - -foreach my $x (@ARGV) { - if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take - # as-is. - elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single - else { $cmd .= "\"$x\" "; } # else use double. -} - -# -# Work out the location of the script file, and open it for writing. -# -my $dir = dirname($logfile); -my $base = basename($logfile); -my $qdir = "$dir/q"; -$qdir =~ s:/(log|LOG)/*q:/q:; # If qdir ends in .../log/q, make it just .../q. -my $queue_logfile = "$qdir/$base"; - -if (!-d $dir) { system "mkdir -p $dir 2>/dev/null"; } # another job may be doing this... -if (!-d $dir) { die "Cannot make the directory $dir\n"; } -# make a directory called "q", -# where we will put the log created by qsub... normally this doesn't contain -# anything interesting, evertyhing goes to $logfile. -if (! -d "$qdir") { - system "mkdir $qdir 2>/dev/null"; - sleep(5); ## This is to fix an issue we encountered in denominator lattice creation, - ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been - ## created and the job immediately ran, it would die with an error because nfs - ## had not yet synced. I'm also decreasing the acdirmin and acdirmax in our - ## NFS settings to something like 5 seconds. -} - -my $queue_array_opt = ""; -if ($array_job == 1) { # It's an array job. - if ($max_jobs_run) { - $queue_array_opt = "--array ${jobstart}-${jobend}%${max_jobs_run}"; - } else { - $queue_array_opt = "--array ${jobstart}-${jobend}"; - } - $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g; # This variable will get - # replaced by qsub, in each job, with the job-id. - $cmd =~ s/$jobname/\$\{SLURM_ARRAY_TASK_ID\}/g; # same for the command... - $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory - # is for the queue to put its log, and this doesn't need the task array subscript - # so we remove it. -} - -# queue_scriptfile is as $queue_logfile [e.g. dir/q/foo.log] but -# with the suffix .sh. -my $queue_scriptfile = $queue_logfile; -($queue_scriptfile =~ s/\.[a-zA-Z]{1,5}$/.sh/) || ($queue_scriptfile .= ".sh"); -if ($queue_scriptfile !~ m:^/:) { - $queue_scriptfile = $cwd . "/" . $queue_scriptfile; # just in case. -} - -# We'll write to the standard input of "qsub" (the file-handle Q), -# the job that we want it to execute. 
-# Also keep our current PATH around, just in case there was something -# in it that we need (although we also source ./path.sh) - -my $syncfile = "$qdir/done.$$"; - -system("rm $queue_logfile $syncfile 2>/dev/null"); -# -# Write to the script file, and then close it. -# -open(Q, ">$queue_scriptfile") || die "Failed to write to $queue_scriptfile"; - -print Q "#!/bin/bash\n"; -print Q "cd $cwd\n"; -print Q ". ./path.sh\n"; -print Q "( echo '#' Running on \`hostname\`\n"; -print Q " echo '#' Started at \`date\`\n"; -print Q " set | grep SLURM | while read line; do echo \"# \$line\"; done\n"; -print Q " echo -n '# '; cat <$logfile\n"; -print Q "if [ \"\$CUDA_VISIBLE_DEVICES\" == \"NoDevFiles\" ]; then\n"; -print Q " ( echo CUDA_VISIBLE_DEVICES set to NoDevFiles, unsetting it... \n"; -print Q " )>>$logfile\n"; -print Q " unset CUDA_VISIBLE_DEVICES\n"; -print Q "fi\n"; -print Q "time1=\`date +\"%s\"\`\n"; -print Q " ( $cmd ) &>>$logfile\n"; -print Q "ret=\$?\n"; -print Q "sync || true\n"; -print Q "time2=\`date +\"%s\"\`\n"; -print Q "echo '#' Accounting: begin_time=\$time1 >>$logfile\n"; -print Q "echo '#' Accounting: end_time=\$time2 >>$logfile\n"; -print Q "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=$num_threads >>$logfile\n"; -print Q "echo '#' Finished at \`date\` with status \$ret >>$logfile\n"; -print Q "[ \$ret -eq 137 ] && exit 100;\n"; # If process was killed (e.g. oom) it will exit with status 137; - # let the script return with status 100 which will put it to E state; more easily rerunnable. -if ($array_job == 0) { # not an array job - print Q "touch $syncfile\n"; # so we know it's done. -} else { - print Q "touch $syncfile.\$SLURM_ARRAY_TASK_ID\n"; # touch a bunch of sync-files. -} -print Q "exit \$[\$ret ? 1 : 0]\n"; # avoid status 100 which grid-engine -print Q "## submitted with:\n"; # treats specially. -$qsub_cmd .= " $qsub_opts --open-mode=append -e ${queue_logfile} -o ${queue_logfile} $queue_array_opt $queue_scriptfile >>$queue_logfile 2>&1"; -print Q "# $qsub_cmd\n"; -if (!close(Q)) { # close was not successful... || die "Could not close script file $shfile"; - die "Failed to close the script file (full disk?)"; -} - -my $ret = system ($qsub_cmd); -if ($ret != 0) { - if ($sync && $ret == 256) { # this is the exit status when a job failed (bad exit status) - if (defined $jobname) { $logfile =~ s/\$SLURM_ARRAY_TASK_ID/*/g; } - print STDERR "$0: job writing to $logfile failed\n"; - } else { - print STDERR "$0: error submitting jobs to queue (return status was $ret)\n"; - print STDERR "queue log file is $queue_logfile, command was $qsub_cmd\n"; - print STDERR `tail $queue_logfile`; - } - exit(1); -} - -my $sge_job_id; -if (! $sync) { # We're not submitting with -sync y, so we - # need to wait for the jobs to finish. We wait for the - # sync-files we "touched" in the script to exist. - my @syncfiles = (); - if (!defined $jobname) { # not an array job. 
- push @syncfiles, $syncfile; - } else { - for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - push @syncfiles, "$syncfile.$jobid"; - } - } - # We will need the sge_job_id, to check that job still exists - { # Get the SLURM job-id from the log file in q/ - open(L, "<$queue_logfile") || die "Error opening log file $queue_logfile"; - undef $sge_job_id; - while () { - if (m/Submitted batch job (\d+)/) { - if (defined $sge_job_id) { - die "Error: your job was submitted more than once (see $queue_logfile)"; - } else { - $sge_job_id = $1; - } - } - } - close(L); - if (!defined $sge_job_id) { - die "Error: log file $queue_logfile does not specify the SLURM job-id."; - } - } - my $check_sge_job_ctr=1; - # - my $wait = 0.1; - my $counter = 0; - foreach my $f (@syncfiles) { - # wait for them to finish one by one. - while (! -f $f) { - sleep($wait); - $wait *= 1.2; - if ($wait > 3.0) { - $wait = 3.0; # never wait more than 3 seconds. - # the following (.kick) commands are basically workarounds for NFS bugs. - if (rand() < 0.25) { # don't do this every time... - if (rand() > 0.5) { - system("touch $qdir/.kick 2>/dev/null"); - } else { - system("rm $qdir/.kick 2>/dev/null"); - } - } - if ($counter++ % 10 == 0) { - # This seems to kick NFS in the teeth to cause it to refresh the - # directory. I've seen cases where it would indefinitely fail to get - # updated, even though the file exists on the server. - # Only do this every 10 waits (every 30 seconds) though, or if there - # are many jobs waiting they can overwhelm the file server. - system("ls $qdir >/dev/null"); - } - } - - # Check that the job exists in SLURM. Job can be killed if duration - # exceeds some hard limit, or in case of a machine shutdown. - if (($check_sge_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on SGE. - if ( -f $f ) { next; }; #syncfile appeared: OK. - # system(...) : To get the actual exit value, shift $ret right by eight bits. - my ($squeue_output, $squeue_status) = exec_command("squeue -j $sge_job_id"); - if ($squeue_status == 1) { - # Don't consider immediately missing job as error, first wait some - sleep(4); - ($squeue_output, $squeue_status) = exec_command("squeue -j $sge_job_id"); - } - if ($squeue_status == 1) { - # time to make sure it is not just delayed creation of the syncfile. - - # Don't consider immediately missing job as error, first wait some - # time to make sure it is not just delayed creation of the syncfile. - sleep(4); - # Sometimes NFS gets confused and thinks it's transmitted the directory - # but it hasn't, due to timestamp issues. Changing something in the - # directory will usually fix that. 
- system("touch $qdir/.kick"); - system("rm $qdir/.kick 2>/dev/null"); - if ( -f $f ) { next; } #syncfile appeared, ok - sleep(7); - system("touch $qdir/.kick"); - sleep(1); - system("rm $qdir/.kick 2>/dev/null"); - if ( -f $f ) { next; } #syncfile appeared, ok - sleep(60); - system("touch $qdir/.kick"); - sleep(1); - system("rm $qdir/.kick 2>/dev/null"); - if ( -f $f ) { next; } #syncfile appeared, ok - $f =~ m/\.(\d+)$/ || die "Bad sync-file name $f"; - my $job_id = $1; - if (defined $jobname) { - $logfile =~ s/\$SLURM_ARRAY_TASK_ID/$job_id/g; - } - my $last_line = `tail -n 1 $logfile`; - if ($last_line =~ m/status 0$/ && (-M $logfile) < 0) { - # if the last line of $logfile ended with "status 0" and - # $logfile is newer than this program [(-M $logfile) gives the - # time elapsed between file modification and the start of this - # program], then we assume the program really finished OK, - # and maybe something is up with the file system. - print STDERR "**$0: syncfile $f was not created but job seems\n" . - "**to have finished OK. Probably your file-system has problems.\n" . - "**This is just a warning.\n"; - last; - } else { - chop $last_line; - print STDERR "$0: Error: Job $sge_job_id seems to no longer exists:\n" . - "'squeue -j $sge_job_id' returned error code $squeue_status and said:\n" . - " $squeue_output\n" . - "Syncfile $f does not exist, meaning that the job did not finish.\n" . - "Log is in $logfile. Last line '$last_line' does not end in 'status 0'.\n" . - "Possible reasons:\n" . - " a) Exceeded time limit? -> Use more jobs!\n" . - " b) Shutdown/Frozen machine? -> Run again! squeue:\n"; - system("squeue -j $sge_job_id"); - exit(1); - } - } elsif ($ret != 0) { - print STDERR "$0: Warning: squeue command returned status $ret (squeue -j $sge_job_id,$!)\n"; - } - } - } - } - my $all_syncfiles = join(" ", @syncfiles); - system("rm $all_syncfiles 2>/dev/null"); -} - -# OK, at this point we are synced; we know the job is done. -# But we don't know about its exit status. We'll look at $logfile for this. -# First work out an array @logfiles of file-locations we need to -# read (just one, unless it's an array job). -my @logfiles = (); -if (!defined $jobname) { # not an array job. - push @logfiles, $logfile; -} else { - for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - my $l = $logfile; - $l =~ s/\$SLURM_ARRAY_TASK_ID/$jobid/g; - push @logfiles, $l; - } -} - -my $num_failed = 0; -my $status = 1; -foreach my $l (@logfiles) { - my @wait_times = (0.1, 0.2, 0.2, 0.3, 0.5, 0.5, 1.0, 2.0, 5.0, 5.0, 5.0, 10.0, 25.0); - for (my $iter = 0; $iter <= @wait_times; $iter++) { - my $line = `tail -10 $l 2>/dev/null`; # Note: although this line should be the last - # line of the file, I've seen cases where it was not quite the last line because - # of delayed output by the process that was running, or processes it had called. - # so tail -10 gives it a little leeway. - if ($line =~ m/with status (\d+)/) { - $status = $1; - last; - } else { - if ($iter < @wait_times) { - sleep($wait_times[$iter]); - } else { - if (! -f $l) { - print STDERR "Log-file $l does not exist.\n"; - } else { - print STDERR "The last line of log-file $l does not seem to indicate the " - . "return status as expected\n"; - } - exit(1); # Something went wrong with the queue, or the - # machine it was running on, probably. - } - } - } - # OK, now we have $status, which is the return-status of - # the command in the job. - if ($status != 0) { $num_failed++; } -} -if ($num_failed == 0) { exit(0); } -else { # we failed. 
- if (@logfiles == 1) {
- if (defined $jobname) { $logfile =~ s/\$SLURM_ARRAY_TASK_ID/$jobstart/g; }
- print STDERR "$0: job failed with status $status, log is in $logfile\n";
- if ($logfile =~ m/JOB/) {
- print STDERR "$0: probably you forgot to put JOB=1:\$nj in your script.\n";
- }
- } else {
- if (defined $jobname) { $logfile =~ s/\$SLURM_ARRAY_TASK_ID/*/g; }
- my $numjobs = 1 + $jobend - $jobstart;
- print STDERR "$0: $num_failed / $numjobs failed, log is in $logfile\n";
- }
- exit(1);
-}
diff --git a/kaldi/local/spk2utt_to_utt2spk.pl b/kaldi/local/spk2utt_to_utt2spk.pl
deleted file mode 100755
index 23992f2..0000000
--- a/kaldi/local/spk2utt_to_utt2spk.pl
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2011 Microsoft Corporation

-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-while(<>){
- @A = split(" ", $_);
- @A > 1 || die "Invalid line in spk2utt file: $_";
- $s = shift @A;
- foreach $u ( @A ) {
- print "$u $s\n";
- }
-}
-
-
diff --git a/kaldi/local/split_data.sh b/kaldi/local/split_data.sh
deleted file mode 100755
index bc5894e..0000000
--- a/kaldi/local/split_data.sh
+++ /dev/null
@@ -1,160 +0,0 @@
-#!/bin/bash
-# Copyright 2010-2013 Microsoft Corporation
-# Johns Hopkins University (Author: Daniel Povey)

-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-split_per_spk=true
-if [ "$1" == "--per-utt" ]; then
- split_per_spk=false
- shift
-fi
-
-if [ $# != 2 ]; then
- echo "Usage: $0 [--per-utt] <data-dir> <num-to-split>"
- echo "E.g.: $0 data/train 50"
- echo "It creates its output in e.g. data/train/split50/{1,2,3,...50}, or if the "
- echo "--per-utt option was given, in e.g. data/train/split50utt/{1,2,3,...50}."
- echo ""
- echo "This script will not split the data-dir if it detects that the output is newer than the input."
- echo "By default it splits per speaker (so each speaker is in only one split dir),"
- echo "but with the --per-utt option it will ignore the speaker information while splitting."
- exit 1
-fi
-
-data=$1
-numsplit=$2
-
-if ! [ "$numsplit" -gt 0 ]; then
- echo "Invalid num-split argument $numsplit";
- exit 1;
-fi
-
-if $split_per_spk; then
- warning_opt=
-else
- # suppress warnings from filter_scps.pl about 'some input lines were output
- # to multiple files'.
- warning_opt="--no-warn" -fi - -n=0; -feats="" -wavs="" -utt2spks="" -texts="" - -nu=`cat $data/utt2spk | wc -l` -nf=`cat $data/feats.scp 2>/dev/null | wc -l` -nt=`cat $data/text 2>/dev/null | wc -l` # take it as zero if no such file -if [ -f $data/feats.scp ] && [ $nu -ne $nf ]; then - echo "** split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); you can " - echo "** use utils/fix_data_dir.sh $data to fix this." -fi -if [ -f $data/text ] && [ $nu -ne $nt ]; then - echo "** split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); you can " - echo "** use utils/fix_data_dir.sh to fix this." -fi - - -if $split_per_spk; then - utt2spk_opt="--utt2spk=$data/utt2spk" - utt="" -else - utt2spk_opt= - utt="utt" -fi - -s1=$data/split${numsplit}${utt}/1 -if [ ! -d $s1 ]; then - need_to_split=true -else - need_to_split=false - for f in utt2spk spk2utt spk2warp feats.scp text wav.scp cmvn.scp spk2gender \ - vad.scp segments reco2file_and_channel utt2lang; do - if [[ -f $data/$f && ( ! -f $s1/$f || $s1/$f -ot $data/$f ) ]]; then - need_to_split=true - fi - done -fi - -if ! $need_to_split; then - exit 0; -fi - -utt2spks=$(for n in `seq $numsplit`; do echo $data/split${numsplit}${utt}/$n/utt2spk; done) - -directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}${utt}/$n; done) - -# if this mkdir fails due to argument-list being too long, iterate. -if ! mkdir -p $directories >&/dev/null; then - for n in `seq $numsplit`; do - mkdir -p $data/split${numsplit}${utt}/$n - done -fi - -# If lockfile is not installed, just don't lock it. It's not a big deal. -which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock -trap 'rm -f $data/.split_lock' EXIT HUP INT PIPE TERM - -utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1 - -for n in `seq $numsplit`; do - dsn=$data/split${numsplit}${utt}/$n - utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt || exit 1; -done - -maybe_wav_scp= -if [ ! -f $data/segments ]; then - maybe_wav_scp=wav.scp # If there is no segments file, then wav file is - # indexed per utt. -fi - -# split some things that are indexed by utterance. -for f in feats.scp text vad.scp utt2lang $maybe_wav_scp utt2dur utt2num_frames; do - if [ -f $data/$f ]; then - utils/filter_scps.pl JOB=1:$numsplit \ - $data/split${numsplit}${utt}/JOB/utt2spk $data/$f $data/split${numsplit}${utt}/JOB/$f || exit 1; - fi -done - -# split some things that are indexed by speaker -for f in spk2gender spk2warp cmvn.scp; do - if [ -f $data/$f ]; then - utils/filter_scps.pl $warning_opt JOB=1:$numsplit \ - $data/split${numsplit}${utt}/JOB/spk2utt $data/$f $data/split${numsplit}${utt}/JOB/$f || exit 1; - fi -done - -if [ -f $data/segments ]; then - utils/filter_scps.pl JOB=1:$numsplit \ - $data/split${numsplit}${utt}/JOB/utt2spk $data/segments $data/split${numsplit}${utt}/JOB/segments || exit 1 - for n in `seq $numsplit`; do - dsn=$data/split${numsplit}${utt}/$n - awk '{print $2;}' $dsn/segments | sort | uniq > $dsn/tmp.reco # recording-ids. 
- done - if [ -f $data/reco2file_and_channel ]; then - utils/filter_scps.pl $warning_opt JOB=1:$numsplit \ - $data/split${numsplit}${utt}/JOB/tmp.reco $data/reco2file_and_channel \ - $data/split${numsplit}${utt}/JOB/reco2file_and_channel || exit 1 - fi - if [ -f $data/wav.scp ]; then - utils/filter_scps.pl $warning_opt JOB=1:$numsplit \ - $data/split${numsplit}${utt}/JOB/tmp.reco $data/wav.scp \ - $data/split${numsplit}${utt}/JOB/wav.scp || exit 1 - fi - for f in $data/split${numsplit}${utt}/*/tmp.reco; do rm $f; done -fi - -exit 0 diff --git a/kaldi/local/split_scp.pl b/kaldi/local/split_scp.pl deleted file mode 100755 index 994c62e..0000000 --- a/kaldi/local/split_scp.pl +++ /dev/null @@ -1,225 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - - -# This program splits up any kind of .scp or archive-type file. -# If there is no utt2spk option it will work on any text file and -# will split it up with an approximately equal number of lines in -# each but. -# With the --utt2spk option it will work on anything that has the -# utterance-id as the first entry on each line; the utt2spk file is -# of the form "utterance speaker" (on each line). -# It splits it into equal size chunks as far as it can. If you use the utt2spk -# option it will make sure these chunks coincide with speaker boundaries. In -# this case, if there are more chunks than speakers (and in some other -# circumstances), some of the resulting chunks will be empty and it will print -# an error message and exit with nonzero status. -# You will normally call this like: -# split_scp.pl scp scp.1 scp.2 scp.3 ... -# or -# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ... -# Note that you can use this script to split the utt2spk file itself, -# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ... - -# You can also call the scripts like: -# split_scp.pl -j 3 0 scp scp.0 -# [note: with this option, it assumes zero-based indexing of the split parts, -# i.e. the second number must be 0 <= n < num-jobs.] - -$num_jobs = 0; -$job_id = 0; -$utt2spk_file = ""; - -for ($x = 1; $x <= 2 && @ARGV > 0; $x++) { - if ($ARGV[0] eq "-j") { - shift @ARGV; - $num_jobs = shift @ARGV; - $job_id = shift @ARGV; - if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) { - die "Invalid num-jobs and job-id: $num_jobs and $job_id"; - } - } - if ($ARGV[0] =~ "--utt2spk=(.+)") { - $utt2spk_file=$1; - shift; - } -} - -if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) { - die "Usage: split_scp.pl [--utt2spk=] in.scp out1.scp out2.scp ... \n" . - " or: split_scp.pl -j num-jobs job-id [--utt2spk=] in.scp [out.scp]\n" . - " ... 
where 0 <= job-id < num-jobs."; -} - -$error = 0; -$inscp = shift @ARGV; -if ($num_jobs == 0) { # without -j option - @OUTPUTS = @ARGV; -} else { - for ($j = 0; $j < $num_jobs; $j++) { - if ($j == $job_id) { - if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; } - else { push @OUTPUTS, "-"; } - } else { - push @OUTPUTS, "/dev/null"; - } - } -} - -if ($utt2spk_file ne "") { # We have the --utt2spk option... - open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file"; - while() { - @A = split; - @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file"; - ($u,$s) = @A; - $utt2spk{$u} = $s; - } - open(I, "<$inscp") || die "Opening input scp file $inscp"; - @spkrs = (); - while() { - @A = split; - if(@A == 0) { die "Empty or space-only line in scp file $inscp"; } - $u = $A[0]; - $s = $utt2spk{$u}; - if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; } - if(!defined $spk_count{$s}) { - push @spkrs, $s; - $spk_count{$s} = 0; - $spk_data{$s} = []; # ref to new empty array. - } - $spk_count{$s}++; - push @{$spk_data{$s}}, $_; - } - # Now split as equally as possible .. - # First allocate spks to files by allocating an approximately - # equal number of speakers. - $numspks = @spkrs; # number of speakers. - $numscps = @OUTPUTS; # number of output files. - if ($numspks < $numscps) { - die "Refusing to split data because number of speakers $numspks is less " . - "than the number of output .scp files $numscps"; - } - for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - $scparray[$scpidx] = []; # [] is array reference. - } - for ($spkidx = 0; $spkidx < $numspks; $spkidx++) { - $scpidx = int(($spkidx*$numscps) / $numspks); - $spk = $spkrs[$spkidx]; - push @{$scparray[$scpidx]}, $spk; - $scpcount[$scpidx] += $spk_count{$spk}; - } - - # Now will try to reassign beginning + ending speakers - # to different scp's and see if it gets more balanced. - # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2. - # We can show that if considering changing just 2 scp's, we minimize - # this by minimizing the squared difference in sizes. This is - # equivalent to minimizing the absolute difference in sizes. This - # shows this method is bound to converge. - - $changed = 1; - while($changed) { - $changed = 0; - for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - # First try to reassign ending spk of this scp. - if($scpidx < $numscps-1) { - $sz = @{$scparray[$scpidx]}; - if($sz > 0) { - $spk = $scparray[$scpidx]->[$sz-1]; - $count = $spk_count{$spk}; - $nutt1 = $scpcount[$scpidx]; - $nutt2 = $scpcount[$scpidx+1]; - if( abs( ($nutt2+$count) - ($nutt1-$count)) - < abs($nutt2 - $nutt1)) { # Would decrease - # size-diff by reassigning spk... - $scpcount[$scpidx+1] += $count; - $scpcount[$scpidx] -= $count; - pop @{$scparray[$scpidx]}; - unshift @{$scparray[$scpidx+1]}, $spk; - $changed = 1; - } - } - } - if($scpidx > 0 && @{$scparray[$scpidx]} > 0) { - $spk = $scparray[$scpidx]->[0]; - $count = $spk_count{$spk}; - $nutt1 = $scpcount[$scpidx-1]; - $nutt2 = $scpcount[$scpidx]; - if( abs( ($nutt2-$count) - ($nutt1+$count)) - < abs($nutt2 - $nutt1)) { # Would decrease - # size-diff by reassigning spk... - $scpcount[$scpidx-1] += $count; - $scpcount[$scpidx] -= $count; - shift @{$scparray[$scpidx]}; - push @{$scparray[$scpidx-1]}, $spk; - $changed = 1; - } - } - } - } - # Now print out the files... 
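- # For example (illustrative counts): speakers with 10, 10, 1, 1 and 1
- # utterances, split two ways, start out as chunks of size 21 and 2; the
- # boundary reassignment above converges to sizes 10 and 13.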
- for($scpidx = 0; $scpidx < $numscps; $scpidx++) { - $scpfn = $OUTPUTS[$scpidx]; - open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing."; - $count = 0; - if(@{$scparray[$scpidx]} == 0) { - print STDERR "Error: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n"; - $error = 1; - } else { - foreach $spk ( @{$scparray[$scpidx]} ) { - print F @{$spk_data{$spk}}; - $count += $spk_count{$spk}; - } - if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; } - } - close(F); - } -} else { - # This block is the "normal" case where there is no --utt2spk - # option and we just break into equal size chunks. - - open(I, "<$inscp") || die "Opening input scp file $inscp"; - - $numscps = @OUTPUTS; # size of array. - @F = (); - while() { - push @F, $_; - } - $numlines = @F; - if($numlines == 0) { - print STDERR "split_scp.pl: error: empty input scp file $inscp , "; - $error = 1; - } - $linesperscp = int( $numlines / $numscps); # the "whole part".. - $linesperscp >= 1 || die "You are splitting into too many pieces! [reduce \$nj]"; - $remainder = $numlines - ($linesperscp * $numscps); - ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder"; - # [just doing int() rounds down]. - $n = 0; - for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) { - $scpfile = $OUTPUTS[$scpidx]; - open(O, ">$scpfile") || die "Opening output scp file $scpfile"; - for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) { - print O $F[$n++]; - } - close(O) || die "Closing scp file $scpfile"; - } - $n == $numlines || die "split_scp.pl: code error., $n != $numlines"; -} - -exit ($error ? 1 : 0); diff --git a/kaldi/local/ssh.pl b/kaldi/local/ssh.pl deleted file mode 100755 index 5d3e3e4..0000000 --- a/kaldi/local/ssh.pl +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter - -use Cwd; -use File::Basename; - -# This program is like run.pl except rather than just running on a local -# machine, it can be configured to run on remote machines via ssh. -# It requires that you have set up passwordless access to those machines, -# and that Kaldi is running from a location that is accessible via the -# same path on those machines (presumably via an NFS mount). -# -# It looks for a file .queue/machines that should have, on each line, the name -# of a machine that you can ssh to (which may include this machine). It doesn't -# have to be a fully qualified name. -# -# Later we may extend this so that on each line of .queue/machines you -# can specify various resources that each machine has, such as how -# many slots and how much memory, and make it wait if machines are -# busy. But for now it simply ssh's to a machine from those in the list. - -# The command-line interface of this program is the same as run.pl; -# see run.pl for more information about the usage. - - -@ARGV < 2 && die "usage: ssh.pl log-file command-line arguments..."; - -$jobstart = 1; -$jobend = 1; -$qsub_opts=""; # These will be ignored. - -# First parse an option like JOB=1:4, and any -# options that would normally be given to -# ssh.pl, which we will just discard. - -if (@ARGV > 0) { - while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { # parse any options - # that would normally go to qsub, but which will be ignored here. 
- $switch = shift @ARGV; - if ($switch eq "-V") { - $qsub_opts .= "-V "; - } else { - $option = shift @ARGV; - if ($switch eq "-sync" && $option =~ m/^[yY]/) { - $qsub_opts .= "-sync "; # Note: in the - # corresponding code in queue.pl it says instead, just "$sync = 1;". - } - $qsub_opts .= "$switch $option "; - if ($switch eq "-pe") { # e.g. -pe smp 5 - $option2 = shift @ARGV; - $qsub_opts .= "$option2 "; - } - } - } - if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:10 - $jobname = $1; - $jobstart = $2; - $jobend = $3; - shift; - if ($jobstart > $jobend) { - die "run.pl: invalid job range $ARGV[0]"; - } - if ($jobstart <= 0) { - die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility)"; - } - } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1. - $jobname = $1; - $jobstart = $2; - $jobend = $2; - shift; - } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) { - print STDERR "Warning: suspicious first argument to run.pl: $ARGV[0]\n"; - } -} - -if ($qsub_opts ne "") { - print STDERR "Warning: ssh.pl ignoring options \"$qsub_opts\"\n"; -} - -{ # Read .queue/machines - if (!open(Q, "<.queue/machines")) { - print STDERR "ssh.pl: expected the file .queue/machines to exist.\n"; - exit(1); - } - @machines = (); - while () { - chop; - if ($_ ne "") { - @A = split; - if (@A != 1) { - die "ssh.pl: bad line '$_' in .queue/machines."; - } - if ($A[0] !~ m/^[a-z0-9\.\-]+/) { - die "ssh.pl: invalid machine name '$A[0]'"; - } - push @machines, $A[0]; - } - } - if (@machines == 0) { die "ssh.pl: no machines listed in .queue/machines"; } -} - -$logfile = shift @ARGV; - -if (defined $jobname && $logfile !~ m/$jobname/ && - $jobend > $jobstart) { - print STDERR "ssh.pl: you are trying to run a parallel job but " - . "you are putting the output into just one log file ($logfile)\n"; - exit(1); -} - -{ - $offset = 0; # $offset will be an offset added to any index from the job-id - # specified if the user does JOB=1:10. The main point of this is - # that there are instances where a script will manually submit a - # number of jobs to the queue, e.g. with log files foo.1.log, - # foo.2.log and so on, and we don't want all of these to go - # to the first machine. - @A = split(".", basename($logfile)); - # if $logfile looks like foo.9.log, add 9 to $offset. - foreach $a (@A) { if ($a =~ m/^\d+$/) { $offset += $a; } } -} - -$cmd = ""; - -foreach $x (@ARGV) { - if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } - elsif ($x =~ m:\":) { $cmd .= "'$x' "; } - else { $cmd .= "\"$x\" "; } -} - - -for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { - $childpid = fork(); - if (!defined $childpid) { die "Error forking in ssh.pl (writing to $logfile)"; } - if ($childpid == 0) { - # We're in the child... this branch executes the job and returns (possibly - # with an error status). - if (defined $jobname) { - $cmd =~ s/$jobname/$jobid/g; - $logfile =~ s/$jobname/$jobid/g; - } - { # work out the machine to ssh to. - $local_offset = $offset + $jobid - 1; # subtract 1 since jobs never start - # from 0; we'd like the first job - # to normally run on the first - # machine. - $num_machines = scalar @machines; - # in the next line, the "+ $num_machines" is in case $local_offset is - # negative, to ensure the modulus is calculated in the mathematical way, not - # in the C way where (negative number % positive number) is negative. 
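- # e.g. if $local_offset were -1 with 3 machines, (-1 + 3) % 3 = 2, picking
- # the last machine, whereas C-style arithmetic would give -1 % 3 = -1.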
- $machines_index = ($local_offset + $num_machines) % $num_machines; - $machine = $machines[$machines_index]; - } - if (!open(S, "|ssh $machine bash")) { - print STDERR "ssh.pl failed to ssh to $machine"; - exit(1); # exits from the forked process within ssh.pl. - } - $cwd = getcwd(); - $logdir = dirname($logfile); - # Below, we're printing into ssh which has opened a bash session; these are - # bash commands. - print S "set -e\n"; # if any of the later commands fails, we want it to exit. - print S "cd $cwd\n"; - print S ". ./path.sh\n"; - print S "mkdir -p $logdir\n"; - print S "time1=\`date +\"%s\"\`\n"; - print S "( echo '#' Running on \`hostname\`\n"; - print S " echo '#' Started at \`date\`\n"; - print S " echo -n '# '; cat <$logfile\n"; - print S "set +e\n"; # we don't want bash to exit if the next line fails. - # in the next line, || true means allow this one to fail and not have bash exit immediately. - print S " ( $cmd ) 2>>$logfile >>$logfile\n"; - print S "ret=\$?\n"; - print S "set -e\n"; # back into mode where it will exit on error. - print S "time2=\`date +\"%s\"\`\n"; - print S "echo '#' Accounting: time=\$((\$time2-\$time1)) threads=1 >>$logfile\n"; - print S "echo '#' Finished at \`date\` with status \$ret >>$logfile\n"; - print S "exit \$ret"; # return with the status the command exited with. - $ret = close(S); - $ssh_return_status = $?; - # see http://perldoc.perl.org/functions/close.html for explanation of return - # status of close() and the variables it sets. - if (! $ret && $! != 0) { die "ssh.pl: unexpected problem ssh'ing to machine $machine"; } - if ($ssh_return_status != 0) { exit(1); } # exit with error status from this forked process. - else { exit(0); } # else exit with non-error status. - } -} - -$ret = 0; -$numfail = 0; -for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) { - $r = wait(); - if ($r == -1) { die "Error waiting for child process"; } # should never happen. - if ($? != 0) { $numfail++; $ret = 1; } # The child process failed. -} - -if ($ret != 0) { - $njobs = $jobend - $jobstart + 1; - if ($njobs == 1) { - if (defined $jobname) { - $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with - # that job. - } - print STDERR "ssh.pl: job failed, log is in $logfile\n"; - if ($logfile =~ m/JOB/) { - print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script."; - } - } - else { - $logfile =~ s/$jobname/*/g; - print STDERR "ssh.pl: $numfail / $njobs failed, log is in $logfile\n"; - } -} - - -exit ($ret); diff --git a/kaldi/local/subset_data_dir.sh b/kaldi/local/subset_data_dir.sh deleted file mode 100755 index ba52d14..0000000 --- a/kaldi/local/subset_data_dir.sh +++ /dev/null @@ -1,194 +0,0 @@ -#!/bin/bash -# Copyright 2010-2011 Microsoft Corporation -# 2012-2013 Johns Hopkins University (Author: Daniel Povey) -# Apache 2.0 - - -# This script operates on a data directory, such as in data/train/. -# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data -# for what these directories contain. - -# This script creates a subset of that data, consisting of some specified -# number of utterances. (The selected utterances are distributed evenly -# throughout the file, by the program ./subset_scp.pl). - -# There are six options, none compatible with any other. - -# If you give the --per-spk option, it will attempt to select the supplied -# number of utterances for each speaker (typically you would supply a much -# smaller number in this case). 
-
-# If you give the --speakers option, it selects a subset of n randomly
-# selected speakers.
-
-# If you give the --shortest option, it will give you the n shortest utterances.
-
-# If you give the --first option, it will just give you the n first utterances.
-
-# If you give the --last option, it will just give you the n last utterances.
-
-# If you give the --spk-list or --utt-list option, it reads the
-# speakers/utterances to keep from <speaker-list-file>/<utt-list-file> (note,
-# in this case there is no positional parameter; see usage message.)
-
-
-shortest=false
-perspk=false
-first_opt=""
-speakers=false
-spk_list_specified=false
-utt_list_specified=false
-
-if [ "$1" == "--per-spk" ]; then
- perspk=true;
- shift;
-elif [ "$1" == "--shortest" ]; then
- shortest=true;
- shift;
-elif [ "$1" == "--first" ]; then
- first_opt="--first";
- shift;
-elif [ "$1" == "--speakers" ]; then
- speakers=true
- shift;
-elif [ "$1" == "--last" ]; then
- first_opt="--last";
- shift;
-elif [ "$1" == "--spk-list" ]; then
- spk_list_specified=true
- shift;
-elif [ "$1" == "--utt-list" ]; then
- utt_list_specified=true
- shift;
-fi
-
-
-
-
-if [ $# != 3 ]; then
- echo "Usage: "
- echo " subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>"
- echo " subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>"
- echo " subset_data_dir.sh [--utt-list <utt-list-file>] <srcdir> <destdir>"
- echo "By default, randomly selects <num-utt> utterances from the data directory."
- echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances"
- echo "With --per-spk, selects <num-utt> utterances per speaker, if available."
- echo "With --first, selects the first <num-utt> utterances"
- echo "With --last, selects the last <num-utt> utterances"
- echo "With --shortest, selects the <num-utt> shortest utterances."
- echo "With --spk-list, reads the speakers to keep from <speaker-list-file>"
- exit 1;
-fi
-
-if $spk_list_specified; then
- spk_list=$1
- srcdir=$2
- destdir=$3
-elif $utt_list_specified; then
- utt_list=$1
- srcdir=$2
- destdir=$3
-else
- srcdir=$1
- numutt=$2
- destdir=$3
-fi
-
-
-export LC_ALL=C
-
-if [ ! -f $srcdir/utt2spk ]; then
- echo "subset_data_dir.sh: no such file $srcdir/utt2spk"
- exit 1;
-fi
-
-function do_filtering {
- # assumes the utt2spk and spk2utt files already exist.
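- # (filter_scp.pl, as used below, keeps only the stdin lines whose first
- # field, the utt-id or spk-id, also appears in the id-list file given as
- # its argument, so each indexed file is restricted to the selected subset.)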
- [ -f $srcdir/feats.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp - [ -f $srcdir/vad.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/vad.scp >$destdir/vad.scp - [ -f $srcdir/utt2lang ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2lang >$destdir/utt2lang - [ -f $srcdir/utt2dur ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2dur >$destdir/utt2dur - [ -f $srcdir/utt2num_frames ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2num_frames >$destdir/utt2num_frames - [ -f $srcdir/utt2uniq ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2uniq >$destdir/utt2uniq - [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp - [ -f $srcdir/spk2warp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2warp >$destdir/spk2warp - [ -f $srcdir/utt2warp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/utt2warp >$destdir/utt2warp - [ -f $srcdir/text ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text - [ -f $srcdir/spk2gender ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender - [ -f $srcdir/cmvn.scp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp - if [ -f $srcdir/segments ]; then - utils/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments - awk '{print $2;}' $destdir/segments | sort | uniq > $destdir/reco # recordings. - # The next line would override the command above for wav.scp, which would be incorrect. - [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp - [ -f $srcdir/reco2file_and_channel ] && \ - utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel - - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f $srcdir/stm ] && utils/filter_scp.pl $destdir/reco < $srcdir/stm > $destdir/stm - - rm $destdir/reco - else - awk '{print $1;}' $destdir/wav.scp | sort | uniq > $destdir/reco - [ -f $srcdir/reco2file_and_channel ] && \ - utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel - - rm $destdir/reco - fi - srcutts=`cat $srcdir/utt2spk | wc -l` - destutts=`cat $destdir/utt2spk | wc -l` - echo "$0: reducing #utt from $srcutts to $destutts" -} - - -if $spk_list_specified; then - mkdir -p $destdir - utils/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1; - utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1; - do_filtering; # bash function. - exit 0; -elif $utt_list_specified; then - mkdir -p $destdir - utils/filter_scp.pl "$utt_list" $srcdir/utt2spk > $destdir/utt2spk || exit 1; - utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt || exit 1; - do_filtering; # bash function. - exit 0; -elif $speakers; then - mkdir -p $destdir - utils/shuffle_list.pl < $srcdir/spk2utt | awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | \ - sort > $destdir/spk2utt - utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk - do_filtering; # bash function. - exit 0; -elif $perspk; then - mkdir -p $destdir - awk '{ n='$numutt'; printf("%s ",$1); skip=1; while(n*(skip+1) <= NF-1) { skip++; } - for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); } - printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt - utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk - do_filtering; # bash function. 
- exit 0;
-else
- if [ $numutt -gt `cat $srcdir/utt2spk | wc -l` ]; then
- echo "subset_data_dir.sh: cannot subset to more utterances than you originally had."
- exit 1;
- fi
- mkdir -p $destdir || exit 1;
-
- ## scripting note: $shortest evaluates to true or false
- ## so this becomes the command true or false.
- if $shortest; then
- # select the n shortest utterances.
- . ./path.sh
- [ ! -f $srcdir/feats.scp ] && echo "$0: you selected --shortest but no feats.scp exist." && exit 1;
- feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1;
- sort -n -k2 $destdir/tmp.len | awk '{print $1}' | head -$numutt >$destdir/tmp.uttlist
- utils/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk
- rm $destdir/tmp.uttlist $destdir/tmp.len
- else
- utils/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1;
- fi
- utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt
- do_filtering;
- exit 0;
-fi
diff --git a/kaldi/local/subset_data_dir_tr_cv.sh b/kaldi/local/subset_data_dir_tr_cv.sh
deleted file mode 100755
index d8694bd..0000000
--- a/kaldi/local/subset_data_dir_tr_cv.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-#
-# Copyright 2017 Brno University of Technology (Author: Karel Vesely);
-# Apache 2.0

-# This script splits a 'data' directory into two parts:
-# - training set with 90% of speakers
-# - held-out set with 10% of speakers (cv)
-# (to be used in frame cross-entropy training of 'nnet1' models),

-# The script also accepts a list of held-out set speakers by '--cv-spk-list'
-# (with perturbed data, we pass the list of speakers externally).
-# The remaining set of speakers is the training set.

-cv_spk_percent=10
-cv_spk_list= # To be used with perturbed data,
-seed=777
-cv_utt_percent= # ignored (compatibility),
-. utils/parse_options.sh

-if [ $# != 3 ]; then
- echo "Usage: $0 [opts] <src-data> <trn-data> <cv-data>"
- echo " --cv-spk-percent N (default 10)"
- echo " --cv-spk-list <file> (a pre-defined list with cv speakers)"
- exit 1;
-fi

-set -euo pipefail

-src_data=$1
-trn_data=$2
-cv_data=$3

-[ ! -r $src_data/spk2utt ] && echo "Missing '$src_data/spk2utt'. Error!" && exit 1

-tmp=$(mktemp -d /tmp/${USER}_XXXXX)

-if [ -z "$cv_spk_list" ]; then
- # Select 'cv_spk_percent' speakers randomly,
- cat $src_data/spk2utt | awk '{ print $1; }' | utils/shuffle_list.pl --srand $seed >$tmp/speakers
- n_spk=$(wc -l <$tmp/speakers)
- n_spk_cv=$(perl -e "print int($cv_spk_percent * $n_spk / 100); ")
- #
- head -n $n_spk_cv $tmp/speakers >$tmp/speakers_cv
- tail -n+$((n_spk_cv+1)) $tmp/speakers >$tmp/speakers_trn
-else
- # Use pre-defined list of speakers,
- cp $cv_spk_list $tmp/speakers_cv
- join -v2 <(sort $cv_spk_list) <(awk '{ print $1; }' <$src_data/spk2utt | sort) >$tmp/speakers_trn
-fi

-# Sanity checks,
-n_spk=$(wc -l <$src_data/spk2utt)
-echo "Speakers, src=$n_spk, trn=$(wc -l <$tmp/speakers_trn), cv=$(wc -l <$tmp/speakers_cv)"
-overlap=$(join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | wc -l)
-[ $overlap != 0 ] && \
- echo "WARNING, speaker overlap detected!" && \
- join <(sort $tmp/speakers_trn) <(sort $tmp/speakers_cv) | head && \
- echo '...'
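-
-# For example (illustrative paths):
-# utils/subset_data_dir_tr_cv.sh --cv-spk-percent 10 data/train data/train_tr90 data/train_cv10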
- -# Create new data dirs, -utils/data/subset_data_dir.sh --spk-list $tmp/speakers_trn $src_data $trn_data -utils/data/subset_data_dir.sh --spk-list $tmp/speakers_cv $src_data $cv_data - diff --git a/kaldi/local/subset_scp.pl b/kaldi/local/subset_scp.pl deleted file mode 100755 index 11fddc0..0000000 --- a/kaldi/local/subset_scp.pl +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This program selects a subset of N elements in the scp. - -# By default, it selects them evenly from throughout the scp, in order to avoid -# selecting too many from the same speaker. It prints them on the standard -# output. -# With the option --first, it just selects the N first utterances. -# With the option --last, it just selects the N last utterances. - -# Last modified by JHU & HKUST @2013 - - -$quiet = 0; -$first = 0; -$last = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--quiet") { - shift; - $quiet = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--first") { - shift; - $first = 1; -} -if (@ARGV > 0 && $ARGV[0] eq "--last") { - shift; - $last = 1; -} - -if(@ARGV < 2 ) { - die "Usage: subset_scp.pl [--quiet][--first|--last] N in.scp\n" . - " --quiet causes it to not die if N < num lines in scp.\n" . - " --first and --last make it equivalent to head or tail.\n" . - "See also: filter_scp.pl\n"; -} - -$N = shift @ARGV; -if($N == 0) { - die "First command-line parameter to subset_scp.pl must be an integer, got \"$N\""; -} -$inscp = shift @ARGV; -open(I, "<$inscp") || die "Opening input scp file $inscp"; - -@F = (); -while() { - push @F, $_; -} -$numlines = @F; -if($N > $numlines) { - if ($quiet) { - $N = $numlines; - } else { - die "You requested from subset_scp.pl more elements than available: $N > $numlines"; - } -} - -sub select_n { - my ($start,$end,$num_needed) = @_; - my $diff = $end - $start; - if ($num_needed > $diff) { - die "select_n: code error"; - } - if ($diff == 1 ) { - if ($num_needed > 0) { - print $F[$start]; - } - } else { - my $halfdiff = int($diff/2); - my $halfneeded = int($num_needed/2); - select_n($start, $start+$halfdiff, $halfneeded); - select_n($start+$halfdiff, $end, $num_needed - $halfneeded); - } -} - -if ( ! $first && ! $last) { - if ($N > 0) { - select_n(0, $numlines, $N); - } -} else { - if ($first) { # --first option: same as head. - for ($n = 0; $n < $N; $n++) { - print $F[$n]; - } - } else { # --last option: same as tail. - for ($n = @F - $N; $n < @F; $n++) { - print $F[$n]; - } - } -} diff --git a/kaldi/local/summarize_logs.pl b/kaldi/local/summarize_logs.pl deleted file mode 100755 index 9b8a145..0000000 --- a/kaldi/local/summarize_logs.pl +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 
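-
-# Summarizes the WARNING and ERROR lines, and the "# Accounting:" entries,
-# found in the .log files under each log-directory given on the command line.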
-
-#scalar(@ARGV) >= 1 && print STDERR "Usage: summarize_logs.pl <log-dir> ...\n" && exit 1;
-
-sub split_hundreds { # split list of filenames into groups of 100.
- my $names = shift @_;
- my @A = split(" ", $names);
- my @ans = ();
- while (@A > 0) {
- my $group = "";
- for ($x = 0; $x < 100 && @A>0; $x++) {
- $fname = pop @A;
- $group .= "$fname ";
- }
- push @ans, $group;
- }
- return @ans;
-}
-
-sub parse_accounting_entry {
- $entry= shift @_;
-
- @elems = split " ", $entry;
-
- $time=undef;
- $threads=undef;
- foreach $elem (@elems) {
- if ( $elem=~ m/time=(\d+)/ ) {
- $elem =~ s/time=(\d+)/$1/;
- $time = $elem;
- } elsif ( $elem=~ m/threads=(\d+)/ ) {
- $elem =~ s/threads=(\d+)/$1/g;
- $threads = $elem;
- } else {
- die "Unknown entry \"$elem\" when parsing \"$entry\" \n";
- }
- }
-
- if (defined($time) and defined($threads) ) {
- return ($time, $threads);
- } else {
- die "The accounting entry \"$entry\" did not contain all necessary attributes";
- }
-}
-
-foreach $dir (@ARGV) {
-
- print "$dir\n";
-
- ! -d $dir && print STDERR "summarize_logs.pl: no such directory $dir\n";
-
- $dir =~ s:/$::; # Remove trailing slash.
-
-
- # Group the files into categories where all have the same base-name.
- foreach $f (glob ("$dir/*.log")) {
- $f_category = $f;
- # do next expression twice; s///g doesn't work as they overlap.
- $f_category =~ s:\.\d+\.(?!\d+):.*.:;
- #$f_category =~ s:\.\d+\.:.*.:;
- $fmap{$f_category} .= " $f";
- }
-}
-
-foreach $c (sort (keys %fmap) ) {
- $n = 0;
- foreach $fgroup (split_hundreds($fmap{$c})) {
- $n += `grep -w WARNING $fgroup | wc -l`;
- }
- if ($n != 0) {
- print "$n warnings in $c\n"
- }
-}
-foreach $c (sort (keys %fmap)) {
- $n = 0;
- foreach $fgroup (split_hundreds($fmap{$c})) {
- $n += `grep -w ERROR $fgroup | wc -l`;
- }
- if ($n != 0) {
- print "$n errors in $c\n"
- }
-}
-
-$supertotal_cpu_time=0.0;
-$supertotal_clock_time=0.0;
-$supertotal_threads=0.0;
-
-foreach $c (sort (keys %fmap)) {
- $n = 0;
-
- $total_cpu_time=0.0;
- $total_clock_time=0.0;
- $total_threads=0.0;
- foreach $fgroup (split_hundreds($fmap{$c})) {
- $lines=`grep -a "# Accounting: " $fgroup |sed 's/.* Accounting: *//g'`;
-
- #print $lines ."\n";
-
- @entries = split "\n", $lines;
-
- foreach $line (@entries) {
- ($time, $threads) = parse_accounting_entry($line);
-
- $total_cpu_time += $time * $threads;
- $total_threads += $threads;
- if ( $time > $total_clock_time ) {
- $total_clock_time = $time;
- }
- }
- }
- print "total_cpu_time=$total_cpu_time clock_time=$total_clock_time total_threads=$total_threads group=$c\n";
-
- $supertotal_cpu_time += $total_cpu_time;
- $supertotal_clock_time += $total_clock_time;
- $supertotal_threads += $total_threads;
-}
-print "total_cpu_time=$supertotal_cpu_time clock_time=$supertotal_clock_time total_threads=$supertotal_threads group=all\n";
-
diff --git a/kaldi/local/summarize_warnings.pl b/kaldi/local/summarize_warnings.pl
deleted file mode 100755
index c094a1d..0000000
--- a/kaldi/local/summarize_warnings.pl
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/usr/bin/env perl

-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey). Apache 2.0.

- @ARGV != 1 && print STDERR "Usage: summarize_warnings.pl <log-dir>\n" && exit 1;

-$dir = $ARGV[0];

-! -d $dir && print STDERR "summarize_warnings.pl: no such directory $dir\n" && exit 1;

-$dir =~ s:/$::; # Remove trailing slash.


-# Group the files into categories where all have the same base-name.
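-# (e.g. align.1.log, align.2.log, ... all map to the category align.*.log;
-# the numeric job index is replaced by "*".)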
-foreach $f (glob ("$dir/*.log")) { - $f_category = $f; - # do next expression twice; s///g doesn't work as they overlap. - $f_category =~ s:\.\d+\.:.*.:; - $f_category =~ s:\.\d+\.:.*.:; - $fmap{$f_category} .= " $f"; -} - -sub split_hundreds { # split list of filenames into groups of 100. - my $names = shift @_; - my @A = split(" ", $names); - my @ans = (); - while (@A > 0) { - my $group = ""; - for ($x = 0; $x < 100 && @A>0; $x++) { - $fname = pop @A; - $group .= "$fname "; - } - push @ans, $group; - } - return @ans; -} - -foreach $c (keys %fmap) { - $n = 0; - foreach $fgroup (split_hundreds($fmap{$c})) { - $n += `grep -w WARNING $fgroup | wc -l`; - } - if ($n != 0) { - print "$n warnings in $c\n" - } -} diff --git a/kaldi/local/sym2int.pl b/kaldi/local/sym2int.pl deleted file mode 100755 index 592145c..0000000 --- a/kaldi/local/sym2int.pl +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -$ignore_oov = 0; - -for($x = 0; $x < 2; $x++) { - if ($ARGV[0] eq "--map-oov") { - shift @ARGV; - $map_oov = shift @ARGV; - if ($map_oov eq "-f" || $map_oov =~ m/words\.txt$/ || $map_oov eq "") { - # disallow '-f', the empty string and anything ending in words.txt as the - # OOV symbol because these are likely command-line errors. - die "the --map-oov option requires an argument"; - } - } - if ($ARGV[0] eq "-f") { - shift @ARGV; - $field_spec = shift @ARGV; - if ($field_spec =~ m/^\d+$/) { - $field_begin = $field_spec - 1; $field_end = $field_spec - 1; - } - if ($field_spec =~ m/^(\d*)[-:](\d*)/) { # accept e.g. 1:10 as a courtesty (properly, 1-10) - if ($1 ne "") { - $field_begin = $1 - 1; # Change to zero-based indexing. - } - if ($2 ne "") { - $field_end = $2 - 1; # Change to zero-based indexing. - } - } - if (!defined $field_begin && !defined $field_end) { - die "Bad argument to -f option: $field_spec"; - } - } -} - -$symtab = shift @ARGV; -if (!defined $symtab) { - print STDERR "Usage: sym2int.pl [options] symtab [input transcriptions] > output transcriptions\n" . - "options: [--map-oov ] [-f ]\n" . 
- "note: can look like 4-5, or 4-, or 5-, or 1.\n"; -} -open(F, "<$symtab") || die "Error opening symbol table file $symtab"; -while() { - @A = split(" ", $_); - @A == 2 || die "bad line in symbol table file: $_"; - $sym2int{$A[0]} = $A[1] + 0; -} - -if (defined $map_oov && $map_oov !~ m/^\d+$/) { # not numeric-> look it up - if (!defined $sym2int{$map_oov}) { die "OOV symbol $map_oov not defined."; } - $map_oov = $sym2int{$map_oov}; -} - -$num_warning = 0; -$max_warning = 20; - -while (<>) { - @A = split(" ", $_); - @B = (); - for ($n = 0; $n < @A; $n++) { - $a = $A[$n]; - if ( (!defined $field_begin || $n >= $field_begin) - && (!defined $field_end || $n <= $field_end)) { - $i = $sym2int{$a}; - if (!defined ($i)) { - if (defined $map_oov) { - if ($num_warning++ < $max_warning) { - print STDERR "sym2int.pl: replacing $a with $map_oov\n"; - if ($num_warning == $max_warning) { - print STDERR "sym2int.pl: not warning for OOVs any more times\n"; - } - } - $i = $map_oov; - } else { - $pos = $n+1; - die "sym2int.pl: undefined symbol $a (in position $pos)\n"; - } - } - $a = $i; - } - push @B, $a; - } - print join(" ", @B); - print "\n"; -} -if ($num_warning > 0) { - print STDERR "** Replaced $num_warning instances of OOVs with $map_oov\n"; -} - -exit(0); diff --git a/kaldi/local/utt2spk_to_spk2utt.pl b/kaldi/local/utt2spk_to_spk2utt.pl deleted file mode 100755 index 6e0e438..0000000 --- a/kaldi/local/utt2spk_to_spk2utt.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# converts an utt2spk file to a spk2utt file. -# Takes input from the stdin or from a file argument; -# output goes to the standard out. - -if ( @ARGV > 1 ) { - die "Usage: utt2spk_to_spk2utt.pl [ utt2spk ] > spk2utt"; -} - -while(<>){ - @A = split(" ", $_); - @A == 2 || die "Invalid line in utt2spk file: $_"; - ($u,$s) = @A; - if(!$seen_spk{$s}) { - $seen_spk{$s} = 1; - push @spklist, $s; - } - push (@{$spk_hash{$s}}, "$u"); -} -foreach $s (@spklist) { - $l = join(' ',@{$spk_hash{$s}}); - print "$s $l\n"; -} diff --git a/kaldi/local/validate_data_dir.sh b/kaldi/local/validate_data_dir.sh deleted file mode 100755 index 453ad69..0000000 --- a/kaldi/local/validate_data_dir.sh +++ /dev/null @@ -1,363 +0,0 @@ -#!/bin/bash - - -no_feats=false -no_wav=false -no_text=false -no_spk_sort=false - -for x in `seq 4`; do - if [ "$1" == "--no-feats" ]; then - no_feats=true - shift; - fi - if [ "$1" == "--no-text" ]; then - no_text=true - shift; - fi - if [ "$1" == "--no-wav" ]; then - no_wav=true - shift; - fi - if [ "$1" == "--no-spk-sort" ]; then - no_spk_sort=true - shift; - fi -done - -if [ $# -ne 1 ]; then - echo "Usage: $0 [--no-feats] [--no-text] [--no-wav] [--no-spk-sort] " - echo "The --no-xxx options mean that the script does not require " - echo "xxx.scp to be present, but it will check it if it is present." 
- echo "--no-spk-sort means that the script does not require the utt2spk to be " - echo "sorted by the speaker-id in addition to being sorted by utterance-id." - echo "By default, utt2spk is expected to be sorted by both, which can be " - echo "achieved by making the speaker-id prefixes of the utterance-ids" - echo "e.g.: $0 data/train" - exit 1; -fi - -data=$1 - -if [ ! -d $data ]; then - echo "$0: no such directory $data" - exit 1; -fi - -for f in spk2utt utt2spk; do - if [ ! -f $data/$f ]; then - echo "$0: no such file $f" - exit 1; - fi - if [ ! -s $data/$f ]; then - echo "$0: empty file $f" - exit 1; - fi -done - -! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ - echo "$0: $data/utt2spk has wrong format." && exit; - -ns=$(wc -l < $data/spk2utt) -if [ "$ns" == 1 ]; then - echo "$0: WARNING: you have only one speaker. This probably a bad idea." - echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" - echo " for more information." -fi - - -tmpdir=$(mktemp -d /tmp/kaldi.XXXX); -trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM - -export LC_ALL=C - -function check_sorted_and_uniq { - ! awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ - echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; -} - -function partial_diff { - diff $1 $2 | head -n 6 - echo "..." - diff $1 $2 | tail -n 6 - n1=`cat $1 | wc -l` - n2=`cat $2 | wc -l` - echo "[Lengths are $1=$n1 versus $2=$n2]" -} - -check_sorted_and_uniq $data/utt2spk - -if ! $no_spk_sort; then - ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ - echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ - echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; -fi - -check_sorted_and_uniq $data/spk2utt - -! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ - <(utils/spk2utt_to_utt2spk.pl $data/spk2utt) && \ - echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; - -cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts - -if [ ! -f $data/text ] && ! $no_text; then - echo "$0: no such file $data/text (if this is by design, specify --no-text)" - exit 1; -fi - -num_utts=`cat $tmpdir/utts | wc -l` -if [ -f $data/text ]; then - utils/validate_text.pl $data/text || exit 1; - check_sorted_and_uniq $data/text - text_len=`cat $data/text | wc -l` - illegal_sym_list=" #0" - for x in $illegal_sym_list; do - if grep -w "$x" $data/text > /dev/null; then - echo "$0: Error: in $data, text contains illegal symbol $x" - exit 1; - fi - done - awk '{print $1}' < $data/text > $tmpdir/utts.txt - if ! cmp -s $tmpdir/utts{,.txt}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.txt} - exit 1; - fi -fi - -if [ -f $data/segments ] && [ ! -f $data/wav.scp ]; then - echo "$0: in directory $data, segments file exists but no wav.scp" - exit 1; -fi - - -if [ ! -f $data/wav.scp ] && ! $no_wav; then - echo "$0: no such file $data/wav.scp (if this is by design, specify --no-wav)" - exit 1; -fi - -if [ -f $data/wav.scp ]; then - check_sorted_and_uniq $data/wav.scp - - if grep -E -q '^\S+\s+~' $data/wav.scp; then - # note: it's not a good idea to have any kind of tilde in wav.scp, even if - # part of a command, as it would cause compatibility problems if run by - # other users, but this used to be not checked for so we let it slide unless - # it's something of the form "foo ~/foo.wav" (i.e. 
a plain file name) which - # would definitely cause problems as the fopen system call does not do - # tilde expansion. - echo "$0: Please do not use tilde (~) in your wav.scp." - exit 1; - fi - - if [ -f $data/segments ]; then - - check_sorted_and_uniq $data/segments - # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. - ! cat $data/segments | \ - awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ - echo "$0: badly formatted segments file" && exit 1; - - segments_len=`cat $data/segments | wc -l` - if [ -f $data/text ]; then - ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ - echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts" && \ - exit 1 - fi - - cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings - awk '{print $1}' $data/wav.scp > $tmpdir/recordings.wav - if ! cmp -s $tmpdir/recordings{,.wav}; then - echo "$0: Error: in $data, recording-ids extracted from segments and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.wav} - exit 1; - fi - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc - if ! cmp -s $tmpdir/recordings{,.r2fc}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.r2fc} - exit 1; - fi - fi - else - # No segments file -> assume wav.scp indexed by utterance. - cat $data/wav.scp | awk '{print $1}' > $tmpdir/utts.wav - if ! cmp -s $tmpdir/utts{,.wav}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and wav.scp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.wav} - exit 1; - fi - - if [ -f $data/reco2file_and_channel ]; then - # this file is needed only for ctm scoring; it's indexed by recording-id. - check_sorted_and_uniq $data/reco2file_and_channel - ! cat $data/reco2file_and_channel | \ - awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { - if ( NF == 3 && $3 == "1" ) { - warning_issued = 1; - } else { - print "Bad line ", $0; exit 1; - } - } - } - END { - if (warning_issued == 1) { - print "The channel should be marked as A or B, not 1! You should change it ASAP! " - } - }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; - cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc - if ! cmp -s $tmpdir/utts{,.r2fc}; then - echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.r2fc} - exit 1; - fi - fi - fi -fi - -if [ ! -f $data/feats.scp ] && ! 
$no_feats; then - echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" - exit 1; -fi - -if [ -f $data/feats.scp ]; then - check_sorted_and_uniq $data/feats.scp - cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats - if ! cmp -s $tmpdir/utts{,.feats}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.feats} - exit 1; - fi -fi - - -if [ -f $data/cmvn.scp ]; then - check_sorted_and_uniq $data/cmvn.scp - cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.cmvn}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.cmvn} - exit 1; - fi -fi - -if [ -f $data/spk2gender ]; then - check_sorted_and_uniq $data/spk2gender - ! cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ - echo "$0: Mal-formed spk2gender file" && exit 1; - cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2gender}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2gender} - exit 1; - fi -fi - -if [ -f $data/spk2warp ]; then - check_sorted_and_uniq $data/spk2warp - ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed spk2warp file" && exit 1; - cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp - cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - if ! cmp -s $tmpdir/speakers{,.spk2warp}; then - echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/speakers{,.spk2warp} - exit 1; - fi -fi - -if [ -f $data/utt2warp ]; then - check_sorted_and_uniq $data/utt2warp - ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ - echo "$0: Mal-formed utt2warp file" && exit 1; - cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp - cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts - if ! cmp -s $tmpdir/utts{,.utt2warp}; then - echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2warp} - exit 1; - fi -fi - -# check some optionally-required things -for f in vad.scp utt2lang utt2uniq; do - if [ -f $data/$f ]; then - check_sorted_and_uniq $data/$f - if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ - <( awk '{print $1}' $data/$f ); then - echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" - exit 1; - fi - fi -done - - -if [ -f $data/utt2dur ]; then - check_sorted_and_uniq $data/utt2dur - cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur - if ! 
cmp -s $tmpdir/utts{,.utt2dur}; then - echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/utts{,.utt2dur} - exit 1; - fi - cat $data/utt2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -if [ -f $data/reco2dur ]; then - check_sorted_and_uniq $data/reco2dur - cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur - if [ -f $tmpdir/recordings ]; then - if ! cmp -s $tmpdir/recordings{,.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/recordings{,.reco2dur} - exit 1; - fi - else - if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then - echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" - echo "$0: differ, partial diff is:" - partial_diff $tmpdir/{utts,recordings.reco2dur} - exit 1; - fi - fi - cat $data/reco2dur | \ - awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 -fi - - -echo "$0: Successfully validated data-directory $data" diff --git a/kaldi/local/validate_dict_dir.pl b/kaldi/local/validate_dict_dir.pl deleted file mode 100755 index 981dc00..0000000 --- a/kaldi/local/validate_dict_dir.pl +++ /dev/null @@ -1,508 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. -# Copyright 2012 Guoguo Chen -# 2015 Daniel Povey -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for data/local/dict - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The current line (nr. 
$i) contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - - -if(@ARGV != 1) { - die "Usage: validate_dict_dir.pl \n" . - "e.g.: validate_dict_dir.pl data/local/dict\n"; -} - -$dict = shift @ARGV; -$dict =~ s:/$::; - -$exit = 0; -$success = 1; # this is re-set each time we read a file. - -sub set_to_fail { $exit = 1; $success = 0; } - -# Checking silence_phones.txt ------------------------------- -print "Checking $dict/silence_phones.txt ...\n"; -if(-z "$dict/silence_phones.txt") {print "--> ERROR: $dict/silence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(S, "<$dict/silence_phones.txt")) {print "--> ERROR: fail to open $dict/silence_phones.txt\n"; exit 1;} -$idx = 1; -%silence = (); -$crlf = 1; - -print "--> reading $dict/silence_phones.txt\n"; -check_allowed_whitespace(\*S) || set_to_fail(); -while() { - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/silence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/silence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($silence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/silence_phones.txt (line $idx)\n"; - } else { - $silence{$p} = 1; - } - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
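
For concreteness, the naming rule that the comment above motivates, and that the script enforces immediately below, behaves as follows on a few hypothetical phone names. This is an illustrative sketch, not part of the deleted script:

foreach my $p ("SIL", "#1", "AH_B") {
    # "#..." collides with disambiguation symbols; "_B/_E/_I/_S" suffixes
    # collide with word-position-dependent phone markers.
    my $bad = ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq "");
    print "$p: ", ($bad ? "rejected" : "ok"), "\n";   # SIL ok; #1 and AH_B rejected
}
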
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(S); -$success == 0 || print "--> $dict/silence_phones.txt is OK\n"; -print "\n"; - -# Checking optional_silence.txt ------------------------------- -print "Checking $dict/optional_silence.txt ...\n"; -if(-z "$dict/optional_silence.txt") {print "--> ERROR: $dict/optional_silence.txt is empty or not exists\n"; exit 1;} -if(!open(OS, "<$dict/optional_silence.txt")) {print "--> ERROR: fail to open $dict/optional_silence.txt\n"; exit 1;} -$idx = 1; -$success = 1; -$crlf = 1; -print "--> reading $dict/optional_silence.txt\n"; -check_allowed_whitespace(\*OS) or exit 1; -while() { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - set_to_fail(); print "--> ERROR: only 1 phone expected in $dict/optional_silence.txt\n"; - } elsif (!$silence{$col[0]}) { - set_to_fail(); print "--> ERROR: phone $col[0] not found in $dict/silence_phones.txt\n"; - } - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/optional_silence.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> $dict/optional_silence.txt is OK\n"; -print "\n"; - -# Checking nonsilence_phones.txt ------------------------------- -print "Checking $dict/nonsilence_phones.txt ...\n"; -if(-z "$dict/nonsilence_phones.txt") {print "--> ERROR: $dict/nonsilence_phones.txt is empty or not exists\n"; exit 1;} -if(!open(NS, "<$dict/nonsilence_phones.txt")) {print "--> ERROR: fail to open $dict/nonsilence_phones.txt\n"; exit 1;} -$idx = 1; -%nonsilence = (); -$success = 1; -$crlf = 1; -print "--> reading $dict/nonsilence_phones.txt\n"; -check_allowed_whitespace(\*NS) or set_to_fail(); -while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/nonsilence_phones.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/nonsilence_phones.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); - print "--> ERROR: empty line in $dict/nonsilence_phones.txt (line $idx)\n"; - } - foreach(0 .. @col-1) { - my $p = $col[$_]; - if($nonsilence{$p}) { - set_to_fail(); print "--> ERROR: phone \"$p\" duplicates in $dict/nonsilence_phones.txt (line $idx)\n"; - } else { - $nonsilence{$p} = 1; - } - # phones that start with the pound sign/hash may be mistaken for - # disambiguation symbols; phones ending in _B, _E, _S or _I will cause - # problems with word-position-dependent systems, and is obviously - # confusable with epsilon. 
- if ($p =~ m/^#/ || $p =~ m/_[BESI]$/ || $p eq ""){ - set_to_fail(); - print "--> ERROR: phone \"$p\" has disallowed written form\n"; - } - } - $idx ++; -} -close(NS); -$success == 0 || print "--> $dict/nonsilence_phones.txt is OK\n"; -print "\n"; - -# Checking disjoint ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach(keys %$a) { - if(exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -print "Checking disjoint: silence_phones.txt, nonsilence_phones.txt\n"; -@itset = intersect(\%silence, \%nonsilence); -if(@itset == 0) {print "--> disjoint property is OK.\n";} -else {set_to_fail(); print "--> ERROR: silence_phones.txt and nonsilence_phones.txt has overlap: "; foreach(@itset) {print "$_ ";} print "\n";} -print "\n"; - - -sub check_lexicon { - my ($lex, $num_prob_cols, $num_skipped_cols) = @_; - print "Checking $lex\n"; - !open(L, "<$lex") && print "--> ERROR: fail to open $lex\n" && set_to_fail(); - my %seen_line = {}; - $idx = 1; $success = 1; $crlf = 1; - print "--> reading $lex\n"; - check_allowed_whitespace(\*L) or set_to_fail(); - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $lex contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (defined $seen_line{$_}) { - print "--> ERROR: line '$_' of $lex is repeated\n"; - set_to_fail(); - } - $seen_line{$_} = 1; - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $lex does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - $word = shift @col; - if (!defined $word) { - print "--> ERROR: empty lexicon line in $lex\n"; set_to_fail(); - } - if ($word eq "" || $word eq "" || $word eq "" || $word eq "#0") { - print "--> ERROR: lexicon.txt contains forbidden word $word\n"; - set_to_fail(); - } - for ($n = 0; $n < $num_prob_cols; $n++) { - $prob = shift @col; - if (!($prob > 0.0 && $prob <= 1.0)) { - print "--> ERROR: bad pron-prob in lexicon-line '$_', in $lex\n"; - set_to_fail(); - } - } - for ($n = 0; $n < $num_skipped_cols; $n++) { shift @col; } - if (@col == 0) { - print "--> ERROR: lexicon.txt contains word $word with empty "; - print "pronunciation.\n"; - set_to_fail(); - } - foreach (0 .. @col-1) { - if (!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt "; - print "(line $idx)\n"; - set_to_fail(); - } - } - $idx ++; - } - close(L); - $success == 0 || print "--> $lex is OK\n"; - print "\n"; -} - -if (-f "$dict/lexicon.txt") { check_lexicon("$dict/lexicon.txt", 0, 0); } -if (-f "$dict/lexiconp.txt") { check_lexicon("$dict/lexiconp.txt", 1, 0); } -if (-f "$dict/lexiconp_silprob.txt") { - # If $dict/lexiconp_silprob.txt exists, we expect $dict/silprob.txt to also - # exist. 
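
The ($num_prob_cols, $num_skipped_cols) pairs passed to check_lexicon above encode the column layout of each lexicon variant. Illustrative lines follow (the word, probabilities and phones are hypothetical), with a minimal sketch of how a lexiconp.txt line decomposes; the silprob expectation noted in the comment above is then checked right after this:

# lexicon.txt          (0 prob cols, 0 skipped):  ABLE ey b ah l
# lexiconp.txt         (1 prob col,  0 skipped):  ABLE 1.0 ey b ah l
# lexiconp_silprob.txt (2 prob cols, 2 skipped):  ABLE 1.0 0.5 1.2 0.8 ey b ah l
my @col  = split(" ", "ABLE 1.0 ey b ah l");
my $word = shift @col;
my $prob = shift @col;            # each pron-prob must lie in (0.0, 1.0]
die "bad pron-prob" unless $prob > 0.0 && $prob <= 1.0;
print "$word -> @col\n";          # what remains is the pronunciation
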
- check_lexicon("$dict/lexiconp_silprob.txt", 2, 2); - if (-f "$dict/silprob.txt") { - !open(SP, "<$dict/silprob.txt") && - print "--> ERROR: fail to open $dict/silprob.txt\n" && set_to_fail(); - $crlf = 1; - while () { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/silprob.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - chomp; my @col = split; - @col != 2 && die "--> ERROR: bad line \"$_\"\n" && set_to_fail(); - if ($col[0] eq "" || $col[0] eq "overall") { - if (!($col[1] > 0.0 && $col[1] <= 1.0)) { - set_to_fail(); - print "--> ERROR: bad probability in $dir/silprob.txt \"$_\"\n"; - } - } elsif ($col[0] eq "_s" || $col[0] eq "_n") { - if ($col[1] <= 0.0) { - set_to_fail(); - print "--> ERROR: bad correction term in $dir/silprob.txt \"$_\"\n"; - } - } else { - print "--> ERROR: unexpected line in $dir/silprob.txt \"$_\"\n"; - set_to_fail(); - } - } - close(SP); - } else { - set_to_fail(); - print "--> ERROR: expecting $dict/silprob.txt to exist\n"; - } -} - -if (!(-f "$dict/lexicon.txt" || -f "$dict/lexiconp.txt")) { - print "--> ERROR: neither lexicon.txt or lexiconp.txt exist in directory $dir\n"; - set_to_fail(); -} - -sub check_lexicon_pair { - my ($lex1, $num_prob_cols1, $num_skipped_cols1, - $lex2, $num_prob_cols2, $num_skipped_cols2) = @_; - # We have checked individual lexicons already. - open(L1, "<$lex1"); open(L2, "<$lex2"); - print "Checking lexicon pair $lex1 and $lex2\n"; - my $line_num = 0; - while() { - $line_num++; - @A = split; - $line_B = ; - if (!defined $line_B) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); last; - } - @B = split(" ", $line_B); - # Check if the word matches. - if ($A[0] ne $B[0]) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - shift @A; shift @B; - for ($n = 0; $n < $num_prob_cols1 + $num_skipped_cols1; $n ++) { shift @A; } - for ($n = 0; $n < $num_prob_cols2 + $num_skipped_cols2; $n ++) { shift @B; } - # Check if the pronunciation matches - if (join(" ", @A) ne join(" ", @B)) { - print "--> ERROR: $lex1 and $lex2 mismatch at line $line_num. sorting?\n"; - set_to_fail(); last; - } - } - $line_B = ; - if (defined $line_B && $exit == 0) { - print "--> ERROR: $lex1 and $lex2 have different number of lines.\n"; - set_to_fail(); - } - $success == 0 || print "--> lexicon pair $lex1 and $lex2 match\n\n"; -} - -# If more than one lexicon exist, we have to check if they correspond to each -# other. It could be that the user overwrote one and we need to regenerate the -# other, but we do not know which is which. -if ( -f "$dict/lexicon.txt" && -f "$dict/lexiconp.txt") { - check_lexicon_pair("$dict/lexicon.txt", 0, 0, "$dict/lexiconp.txt", 1, 0); -} -if ( -f "$dict/lexiconp.txt" && -f "$dict/lexiconp_silprob.txt") { - check_lexicon_pair("$dict/lexiconp.txt", 1, 0, - "$dict/lexiconp_silprob.txt", 2, 2); -} - -# Checking extra_questions.txt ------------------------------- -%distinguished = (); # Keep track of all phone-pairs including nonsilence that - # are distinguished (split apart) by extra_questions.txt, - # as $distinguished{$p1,$p2} = 1. This will be used to - # make sure that we don't have pairs of phones on the same - # line in nonsilence_phones.txt that can never be - # distinguished from each other by questions. 
(If any two - # phones appear on the same line in nonsilence_phones.txt, - # they share a tree root, and since the automatic - # question-building treats all phones that appear on the - # same line of nonsilence_phones.txt as being in the same - # group, we can never distinguish them without resorting to - # questions in extra_questions.txt. -print "Checking $dict/extra_questions.txt ...\n"; -if (-s "$dict/extra_questions.txt") { - if (!open(EX, "<$dict/extra_questions.txt")) { - set_to_fail(); print "--> ERROR: fail to open $dict/extra_questions.txt\n"; - } - $idx = 1; - $success = 1; - $crlf = 1; - print "--> reading $dict/extra_questions.txt\n"; - check_allowed_whitespace(\*EX) or set_to_fail(); - while() { - if ($crlf == 1 && m/\r/) { - print "--> ERROR: $dict/extra_questions.txt contains Carriage Return (^M) characters.\n"; - set_to_fail(); - $crlf = 0; - } - if (! s/\n$//) { - print "--> ERROR: last line '$_' of $dict/extra_questions.txt does not end in newline.\n"; - set_to_fail(); - } - my @col = split(" ", $_); - if (@col == 0) { - set_to_fail(); print "--> ERROR: empty line in $dict/extra_questions.txt\n"; - } - foreach (0 .. @col-1) { - if(!$silence{@col[$_]} and !$nonsilence{@col[$_]}) { - set_to_fail(); print "--> ERROR: phone \"@col[$_]\" is not in {, non}silence.txt (line $idx, block ", $_+1, ")\n"; - } - $idx ++; - } - %col_hash = (); - foreach $p (@col) { $col_hash{$p} = 1; } - foreach $p1 (@col) { - # Update %distinguished hash. - foreach $p2 (keys %nonsilence) { - if (!defined $col_hash{$p2}) { # for each p1 in this question and p2 not - # in this question (and in nonsilence - # phones)... mark p1,p2 as being split apart - $distinguished{$p1,$p2} = 1; - $distinguished{$p2,$p1} = 1; - } - } - } - } - close(EX); - $success == 0 || print "--> $dict/extra_questions.txt is OK\n"; -} else { print "--> $dict/extra_questions.txt is empty (this is OK)\n";} - - -# check nonsilence_phones.txt again for phone-pairs that are never -# distnguishable. (note: this situation is normal and expected for silence -# phones, so we don't check it.) -if(!open(NS, "<$dict/nonsilence_phones.txt")) { - print "--> ERROR: fail to open $dict/nonsilence_phones.txt the second time\n"; exit 1; -} - -$num_warn_nosplit = 0; -$num_warn_nosplit_limit = 10; -while() { - my @col = split(" ", $_); - foreach $p1 (@col) { - foreach $p2 (@col) { - if ($p1 ne $p2 && ! $distinguished{$p1,$p2}) { - set_to_fail(); - if ($num_warn_nosplit <= $num_warn_nosplit_limit) { - print "--> ERROR: phones $p1 and $p2 share a tree root but can never be distinguished by extra_questions.txt.\n"; - } - if ($num_warn_nosplit == $num_warn_nosplit_limit) { - print "... Not warning any more times about this issue.\n"; - } - if ($num_warn_nosplit == 0) { - print " (note: we started checking for this only recently. You can still build a system but\n"; - print " phones $p1 and $p2 will be acoustically indistinguishable).\n"; - } - $num_warn_nosplit++; - } - } - } -} - - -if ($exit == 1) { - print "--> ERROR validating dictionary directory $dict (see detailed error "; - print "messages above)\n\n"; - exit 1; -} else { - print "--> SUCCESS [validating dictionary directory $dict]\n\n"; -} - -exit 0; diff --git a/kaldi/local/validate_lang.pl b/kaldi/local/validate_lang.pl deleted file mode 100755 index 2501d25..0000000 --- a/kaldi/local/validate_lang.pl +++ /dev/null @@ -1,997 +0,0 @@ -#!/usr/bin/env perl - -# Apache 2.0. 
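
Before the validate_lang.pl listing continues: the %distinguished bookkeeping in the extra_questions check of validate_dict_dir.pl above is easiest to see on a toy case (phone names hypothetical). A question containing two of three root-sharing phones can never separate those two from each other:

my %nonsilence  = map { $_ => 1 } qw(a a_1 a_2);  # one nonsilence_phones.txt line
my %in_question = map { $_ => 1 } qw(a a_1);      # one extra_questions.txt line
my %distinguished;
foreach my $p1 (keys %in_question) {
    foreach my $p2 (keys %nonsilence) {
        next if $in_question{$p2};       # every p2 outside the question is
        $distinguished{$p1,$p2} = 1;     # split apart from every p1 inside it
        $distinguished{$p2,$p1} = 1;
    }
}
# a/a_2 and a_1/a_2 end up distinguished, but a/a_1 never does,
# so the validator reports the pair (a, a_1) as an error.
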
-# Copyright 2012 Guoguo Chen -# 2014 Neil Nelson -# 2017 Johns Hopkins University (Jan "Yenda" Trmal ) -# -# Validation script for data/lang - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actually encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting looking right). -sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contain unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The current line (nr. $i) contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. 
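
The encoding probe in get_utf8_or_bytestream above rests on a single Encode idiom: decode() with FB_CROAK dies on malformed input, and eval{} turns that into undef, the signal for falling back to raw bytes. A self-contained sketch (the sample bytes are made up); check_allowed_whitespace, defined next, wraps this probe with the whitespace test:

use Encode qw(decode);
foreach my $bytes ("caf\xc3\xa9", "caf\xe9") {   # valid UTF-8, then Latin-1
    my $decoded = eval { decode("UTF-8", $bytes, Encode::FB_CROAK) };
    print defined($decoded) ? "utf-8 ok\n" : "not utf-8, keeping raw bytes\n";
}
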
-sub check_allowed_whitespace { - my $file = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - print "--> text seems to be UTF-8 or ASCII, checking whitespaces\n"; - if ($has_invalid_whitespaces) { - print "--> ERROR: the text containes disallowed UTF-8 whitespace character(s)\n"; - return 0; - } else { - print "--> text contains only allowed whitespaces\n"; - } - } else { - print "--> text doesn't seem to be UTF-8 or ASCII, won't check whitespaces\n"; - } - return 1; -} - -$skip_det_check = 0; -$skip_disambig_check = 0; - -if (@ARGV > 0 && $ARGV[0] eq "--skip-determinization-check") { - $skip_det_check = 1; - shift @ARGV; -} - -if (@ARGV > 0 && $ARGV[0] eq "--skip-disambig-check") { - $skip_disambig_check = 1; - shift @ARGV; -} - -if (@ARGV != 1) { - print "Usage: $0 [options] \n"; - print "e.g.: $0 data/lang\n"; - print "Options:\n"; - print " --skip-determinization-check (this flag causes it to skip a time consuming check).\n"; - print " --skip-disambig-check (this flag causes it to skip a disambig check in phone bigram models).\n"; - exit(1); -} - -print "$0 " . join(" ", @ARGV) . "\n"; - -$lang = shift @ARGV; -$exit = 0; -$warning = 0; -# Checking phones.txt ------------------------------- -print "Checking $lang/phones.txt ...\n"; -if (-z "$lang/phones.txt") { - print "--> ERROR: $lang/phones.txt is empty or does not exist\n"; exit 1; -} -if (!open(P, "<$lang/phones.txt")) { - print "--> ERROR: fail to open $lang/phones.txt\n"; exit 1; -} -$idx = 1; -%psymtab = (); -check_allowed_whitespace(\*P) or exit 1; -while (
<P>
) { - chomp; - my @col = split(" ", $_); - if (@col != 2) { - print "--> ERROR: expect 2 columns in $lang/phones.txt (break at line $idx)\n"; exit 1; - } - my $phone = shift @col; - my $id = shift @col; - $psymtab{$phone} = $id; - $idx ++; -} -close(P); -%pint2sym = (); -foreach (keys %psymtab) { - if ($pint2sym{$psymtab{$_}}) { - print "--> ERROR: ID \"$psymtab{$_}\" duplicates\n"; exit 1; - } else { - $pint2sym{$psymtab{$_}} = $_; - } -} -print "--> $lang/phones.txt is OK\n"; -print "\n"; - -# Check word.txt ------------------------------- -print "Checking words.txt: #0 ...\n"; -if (-z "$lang/words.txt") { - print "--> ERROR: $lang/words.txt is empty or does not exist\n"; exit 1; -} -if (!open(W, "<$lang/words.txt")) { - print "--> ERROR: fail to open $lang/words.txt\n"; exit 1; -} -$idx = 1; -%wsymtab = (); -check_allowed_whitespace(\*W) or exit 1; -while () { - chomp; - my @col = split(" ", $_); - if (@col != 2) { - print "--> ERROR: expect 2 columns in $lang/words.txt (line $idx)\n"; exit 1; - } - $word = shift @col; - $id = shift @col; - $wsymtab{$word} = $id; - $idx ++; -} -close(W); -%wint2sym = (); -foreach (keys %wsymtab) { - if ($wint2sym{$wsymtab{$_}}) { - print "--> ERROR: ID \"$wsymtab{$_}\" duplicates\n"; exit 1; - } else { - $wint2sym{$wsymtab{$_}} = $_; - } -} -print "--> $lang/words.txt is OK\n"; -print "\n"; - -# Checking phones/* ------------------------------- -sub check_txt_int_csl { - my ($cat, $symtab) = @_; - print "Checking $cat.\{txt, int, csl\} ...\n"; - if (!open(TXT, "<$cat.txt")) { - $exit = 1; return print "--> ERROR: fail to open $cat.txt\n"; - } - if (!open(INT, "<$cat.int")) { - $exit = 1; return print "--> ERROR: fail to open $cat.int\n"; - } - if (!open(CSL, "<$cat.csl")) { - $exit = 1; return print "--> ERROR: fail to open $cat.csl\n"; - } - if (-z "$cat.txt") { - $warning = 1; print "--> WARNING: $cat.txt is empty\n"; - } - if (-z "$cat.int") { - $warning = 1; print "--> WARNING: $cat.int is empty\n"; - } - if (-z "$cat.csl") { - $warning = 1; print "--> WARNING: $cat.csl is empty\n"; - } - - $idx1 = 1; - check_allowed_whitespace(\*TXT) or $exit = 1; - while () { - chomp; - my @col = split(" ", $_); - if (@col != 1) { - $exit = 1; return print "--> ERROR: expect 1 column in $cat.txt (break at line $idx1)\n"; - } - $entry[$idx1] = shift @col; - $idx1 ++; - } - close(TXT); $idx1 --; - print "--> $idx1 entry/entries in $cat.txt\n"; - - $idx2 = 1; - while () { - chomp; - my @col = split(" ", $_); - if (@col != 1) { - $exit = 1; return print "--> ERROR: expect 1 column in $cat.int (break at line $idx2)\n"; - } - if ($symtab->{$entry[$idx2]} ne shift @col) { - $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n"; - } - $idx2 ++; - } - close(INT); $idx2 --; - if ($idx1 != $idx2) { - $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n"; - } - print "--> $cat.int corresponds to $cat.txt\n"; - - $num_lines = 0; - while () { - chomp; - my @col = split(":", $_); - $num_lines++; - if (@col != $idx1) { - $exit = 1; return print "--> ERROR: expect $idx1 block/blocks in $cat.csl (break at line $idx3)\n"; - } - foreach (1 .. 
$idx1) { - if ($symtab->{$entry[$_]} ne @col[$_-1]) { - $exit = 1; return print "--> ERROR: $cat.csl doesn't correspond to $cat.txt (break at line $idx3, block $_)\n"; - } - } - } - close(CSL); - if ($idx1 != 0) { # nonempty .txt,.int files - if ($num_lines != 1) { - $exit = 1; - return print "--> ERROR: expect 1 line in $cat.csl\n"; - } - } else { - if ($num_lines != 1 && $num_lines != 0) { - $exit = 1; - return print "--> ERROR: expect 0 or 1 line in $cat.csl, since empty .txt,int\n"; - } - } - print "--> $cat.csl corresponds to $cat.txt\n"; - - return print "--> $cat.\{txt, int, csl\} are OK\n"; -} - -sub check_txt_int { - my ($cat, $symtab, $sym_check) = @_; - print "Checking $cat.\{txt, int\} ...\n"; - if (-z "$cat.txt") { - $exit = 1; return print "--> ERROR: $cat.txt is empty or does not exist\n"; - } - if (-z "$cat.int") { - $exit = 1; return print "--> ERROR: $cat.int is empty or does not exist\n"; - } - if (!open(TXT, "<$cat.txt")) { - $exit = 1; return print "--> ERROR: fail to open $cat.txt\n"; - } - if (!open(INT, "<$cat.int")) { - $exit = 1; return print "--> ERROR: fail to open $cat.int\n"; - } - - $idx1 = 1; - check_allowed_whitespace(\*TXT) or $exit = 1; - while () { - chomp; - s/^(shared|not-shared) (split|not-split) //g; - s/ nonword$//g; - s/ begin$//g; - s/ end$//g; - s/ internal$//g; - s/ singleton$//g; - $entry[$idx1] = $_; - $idx1 ++; - } - close(TXT); $idx1 --; - print "--> $idx1 entry/entries in $cat.txt\n"; - - my %used_syms = (); - $idx2 = 1; - while () { - chomp; - s/^(shared|not-shared) (split|not-split) //g; - s/ nonword$//g; - s/ begin$//g; - s/ end$//g; - s/ internal$//g; - s/ singleton$//g; - my @col = split(" ", $_); - @set = split(" ", $entry[$idx2]); - if (@set != @col) { - $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2)\n"; - } - foreach (0 .. 
@set-1) { - if ($symtab->{@set[$_]} ne @col[$_]) { - $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line $idx2, block " ,$_+1, ")\n"; - } - if ($sym_check && defined $used_syms{@set[$_]}) { - $exit = 1; return print "--> ERROR: $cat.txt and $cat.int contain duplicate symbols (break at line $idx2, block " ,$_+1, ")\n"; - } - $used_syms{@set[$_]} = 1; - } - $idx2 ++; - } - close(INT); $idx2 --; - if ($idx1 != $idx2) { - $exit = 1; return print "--> ERROR: $cat.int doesn't correspond to $cat.txt (break at line ", $idx2+1, ")\n"; - } - print "--> $cat.int corresponds to $cat.txt\n"; - - if ($sym_check) { - while ( my ($key, $value) = each(%silence) ) { - if (!defined $used_syms{$key}) { - $exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all silence phones\n"; - } - } - while ( my ($key, $value) = each(%nonsilence) ) { - if (!defined $used_syms{$key}) { - $exit = 1; return print "--> ERROR: $cat.txt and $cat.int do not contain all non-silence phones\n"; - } - } - } - - return print "--> $cat.\{txt, int\} are OK\n"; -} - -# Check disjoint and summation ------------------------------- -sub intersect { - my ($a, $b) = @_; - @itset = (); - %itset = (); - foreach (keys %$a) { - if (exists $b->{$_} and !$itset{$_}) { - push(@itset, $_); - $itset{$_} = 1; - } - } - return @itset; -} - -sub check_disjoint { - print "Checking disjoint: silence.txt, nonsilence.txt, disambig.txt ...\n"; - if (!open(S, "<$lang/phones/silence.txt")) { - $exit = 1; return print "--> ERROR: fail to open $lang/phones/silence.txt\n"; - } - if (!open(N, "<$lang/phones/nonsilence.txt")) { - $exit = 1; return print "--> ERROR: fail to open $lang/phones/nonsilence.txt\n"; - } - if (!$skip_disambig_check && !open(D, "<$lang/phones/disambig.txt")) { - $exit = 1; return print "--> ERROR: fail to open $lang/phones/disambig.txt\n"; - } - - $idx = 1; - while () { - chomp; - my @col = split(" ", $_); - $phone = shift @col; - if ($silence{$phone}) { - $exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/silence.txt (line $idx)\n"; - } - $silence{$phone} = 1; - push(@silence, $phone); - $idx ++; - } - close(S); - - $idx = 1; - while () { - chomp; - my @col = split(" ", $_); - $phone = shift @col; - if ($nonsilence{$phone}) { - $exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/nonsilence.txt (line $idx)\n"; - } - $nonsilence{$phone} = 1; - push(@nonsilence, $phone); - $idx ++; - } - close(N); - - $idx = 1; - while () { - chomp; - my @col = split(" ", $_); - $phone = shift @col; - if ($disambig{$phone}) { - $exit = 1; print "--> ERROR: phone \"$phone\" duplicates in $lang/phones/disambig.txt (line $idx)\n"; - } - $disambig{$phone} = 1; - $idx ++; - } - close(D); - - my @itsect1 = intersect(\%silence, \%nonsilence); - my @itsect2 = intersect(\%silence, \%disambig); - my @itsect3 = intersect(\%disambig, \%nonsilence); - - $success = 1; - if (@itsect1 != 0) { - $success = 0; - $exit = 1; print "--> ERROR: silence.txt and nonsilence.txt have intersection -- "; - foreach (@itsect1) { - print $_, " "; - } - print "\n"; - } else { - print "--> silence.txt and nonsilence.txt are disjoint\n"; - } - - if (@itsect2 != 0) { - $success = 0; - $exit = 1; print "--> ERROR: silence.txt and disambig.txt have intersection -- "; - foreach (@itsect2) { - print $_, " "; - } - print "\n"; - } else { - print "--> silence.txt and disambig.txt are disjoint\n"; - } - - if (@itsect3 != 0) { - $success = 0; - $exit = 1; print "--> ERROR: disambig.txt and 
nonsilence.txt have intersection -- "; - foreach (@itsect1) { - print $_, " "; - } - print "\n"; - } else { - print "--> disambig.txt and nonsilence.txt are disjoint\n"; - } - - $success == 0 || print "--> disjoint property is OK\n"; - return; -} - -sub check_summation { - print "Checking sumation: silence.txt, nonsilence.txt, disambig.txt ...\n"; - if (scalar(keys %silence) == 0) { - $exit = 1; return print "--> ERROR: $lang/phones/silence.txt is empty or does not exist\n"; - } - if (scalar(keys %nonsilence) == 0) { - $exit = 1; return print "--> ERROR: $lang/phones/nonsilence.txt is empty or does not exist\n"; - } - if (!$skip_disambig_check && scalar(keys %disambig) == 0) { - $warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n"; - } - - %sum = (%silence, %nonsilence, %disambig); - $sum{""} = 1; - - my @itset = intersect(\%sum, \%psymtab); - my @key1 = keys %sum; - my @key2 = keys %psymtab; - my %itset = (); foreach(@itset) {$itset{$_} = 1;} - if (@itset < @key1) { - $exit = 1; print "--> ERROR: phones in silence.txt, nonsilence.txt, disambig.txt but not in phones.txt -- "; - foreach (@key1) { - if (!$itset{$_}) { - print "$_ "; - } - } - print "\n"; - } - - if (@itset < @key2) { - $exit = 1; print "--> ERROR: phones in phones.txt but not in silence.txt, nonsilence.txt, disambig.txt -- "; - foreach (@key2) { - if (!$itset{$_}) { - print "$_ "; - } - } - print "\n"; - } - - if (@itset == @key1 and @itset == @key2) { - print "--> summation property is OK\n"; - } - return; -} - -%silence = (); -@silence = (); -%nonsilence = (); -@nonsilence = (); -%disambig = (); -check_disjoint; print "\n"; -check_summation; print "\n"; - -@list1 = ("context_indep", "nonsilence", "silence", "optional_silence"); -@list2 = ("roots", "sets"); -if (!$skip_disambig_check) { - push(@list1, "disambig"); -} -foreach (@list1) { - check_txt_int_csl("$lang/phones/$_", \%psymtab); print "\n"; -} -foreach (@list2) { - check_txt_int("$lang/phones/$_", \%psymtab, 1); print "\n"; -} -if ((-s "$lang/phones/extra_questions.txt") || (-s "$lang/phones/extra_questions.int")) { - check_txt_int("$lang/phones/extra_questions", \%psymtab, 0); print "\n"; -} else { - print "Checking $lang/phones/extra_questions.\{txt, int\} ...\n"; - if (!((-f "$lang/phones/extra_questions.txt") && (-f "$lang/phones/extra_questions.int"))) { - print "--> ERROR: $lang/phones/extra_questions.\{txt, int\} do not exist (they may be empty, but should be present)\n\n"; - $exit = 1; - } -} -if (-e "$lang/phones/word_boundary.txt") { - check_txt_int("$lang/phones/word_boundary", \%psymtab, 0); print "\n"; -} - -# Checking optional_silence.txt ------------------------------- -print "Checking optional_silence.txt ...\n"; -$idx = 1; -$success = 1; -if (-z "$lang/phones/optional_silence.txt") { - $exit = 1; $success = 0; print "--> ERROR: $lang/phones/optional_silence.txt is empty or does not exist\n"; -} -if (!open(OS, "<$lang/phones/optional_silence.txt")) { - $exit = 1; $success = 0; print "--> ERROR: fail to open $lang/phones/optional_silence.txt\n"; -} -print "--> reading $lang/phones/optional_silence.txt\n"; -while () { - chomp; - my @col = split(" ", $_); - if ($idx > 1 or @col > 1) { - $exit = 1; print "--> ERROR: only 1 phone expected in $lang/phones/optional_silence.txt\n"; $success = 0; - } elsif (!$silence{$col[0]}) { - $exit = 1; print "--> ERROR: phone $col[0] not found in $lang/phones/silence_phones.txt\n"; $success = 0; - } - $idx ++; -} -close(OS); -$success == 0 || print "--> 
$lang/phones/optional_silence.txt is OK\n"; -print "\n"; - -if (!$skip_disambig_check) { - # Check disambiguation symbols ------------------------------- - print "Checking disambiguation symbols: #0 and #1\n"; - if (scalar(keys %disambig) == 0) { - $warning = 1; print "--> WARNING: $lang/phones/disambig.txt is empty or does not exist\n"; - } - if (exists $disambig{"#0"} and exists $disambig{"#1"}) { - print "--> $lang/phones/disambig.txt has \"#0\" and \"#1\"\n"; - print "--> $lang/phones/disambig.txt is OK\n\n"; - } else { - print "--> WARNING: $lang/phones/disambig.txt doesn't have \"#0\" or \"#1\";\n"; - print "--> this would not be OK with a conventional ARPA-type language\n"; - print "--> model or a conventional lexicon (L.fst)\n"; - $warning = 1; - } -} - - -# Check topo ------------------------------- -print "Checking topo ...\n"; -if (-z "$lang/topo") { - $exit = 1; print "--> ERROR: $lang/topo is empty or does not exist\n"; -} -if (!open(T, "<$lang/topo")) { - $exit = 1; print "--> ERROR: fail to open $lang/topo\n"; -} else { - $topo_ok = 1; - $idx = 1; - %phones_in_topo_int_hash = ( ); - %phones_in_topo_hash = ( ); - while () { - chomp; - next if (m/^<.*>[ ]*$/); - foreach $i (split(" ", $_)) { - if (defined $phones_in_topo_int_hash{$i}) { - $topo_ok = 0; - $exit = 1; print "--> ERROR: $lang/topo has phone $i twice\n"; - } - if (!defined $pint2sym{$i}) { - $topo_ok = 0; - $exit = 1; print "--> ERROR: $lang/topo has phone $i which is not in phones.txt\n"; - } - $phones_in_topo_int_hash{$i} = 1; - $phones_in_topo_hash{$pint2sym{$i}} = 1; - } - } - close(T); - $phones_that_should_be_in_topo_hash = {}; - foreach $p (@silence, @nonsilence) { $phones_that_should_be_in_topo_hash{$p} = 1; } - foreach $p (keys %phones_that_should_be_in_topo_hash) { - if ( ! defined $phones_in_topo_hash{$p}) { - $topo_ok = 0; - $i = $pint2sym{$p}; - $exit = 1; print "--> ERROR: $lang/topo does not cover phone $p (label = $i)\n"; - } - } - foreach $i (keys %phones_in_topo_int_hash) { - $p = $pint2sym{$i}; - if ( ! 
defined $phones_that_should_be_in_topo_hash{$p}) { - $topo_ok = 0; - $exit = 1; print "--> ERROR: $lang/topo covers phone $p (label = $i) which is not a real phone\n"; - } - } - if ($topo_ok) { - "--> $lang/topo is OK\n"; - } - print "\n"; -} - -# Check word_boundary ------------------------------- -$nonword = ""; -$begin = ""; -$end = ""; -$internal = ""; -$singleton = ""; -if (-s "$lang/phones/word_boundary.txt") { - print "Checking word_boundary.txt: silence.txt, nonsilence.txt, disambig.txt ...\n"; - if (!open (W, "<$lang/phones/word_boundary.txt")) { - $exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.txt\n"; - } - $idx = 1; - %wb = (); - while () { - chomp; - my @col; - if (m/^.*nonword$/ ) { - s/ nonword//g; @col = split(" ", $_); if (@col == 1) {$nonword .= "$col[0] ";} - } - if (m/^.*begin$/ ) { - s/ begin$//g; @col = split(" ", $_); if (@col == 1) {$begin .= "$col[0] ";} - } - if (m/^.*end$/ ) { - s/ end$//g; @col = split(" ", $_); if (@col == 1) {$end .= "$col[0] ";} - } - if (m/^.*internal$/ ) { - s/ internal$//g; @col = split(" ", $_); if (@col == 1) {$internal .= "$col[0] ";} - } - if (m/^.*singleton$/) { - s/ singleton$//g; @col = split(" ", $_); if (@col == 1) {$singleton .= "$col[0] ";} - } - if (@col != 1) { - $exit = 1; print "--> ERROR: expect 1 column in $lang/phones/word_boundary.txt (line $idx)\n"; - } - $wb{shift @col} = 1; - $idx ++; - } - close(W); - - @itset = intersect(\%disambig, \%wb); - $success1 = 1; - if (@itset != 0) { - $success1 = 0; - $exit = 1; print "--> ERROR: $lang/phones/word_boundary.txt has disambiguation symbols -- "; - foreach (@itset) { - print "$_ "; - } - print "\n"; - } - $success1 == 0 || print "--> $lang/phones/word_boundary.txt doesn't include disambiguation symbols\n"; - - %sum = (%silence, %nonsilence); - @itset = intersect(\%sum, \%wb); - %itset = (); foreach(@itset) {$itset{$_} = 1;} - $success2 = 1; - if (@itset < scalar(keys %sum)) { - $success2 = 0; - $exit = 1; print "--> ERROR: phones in nonsilence.txt and silence.txt but not in word_boundary.txt -- "; - foreach (keys %sum) { - if (!$itset{$_}) { - print "$_ "; - } - } - print "\n"; - } - if (@itset < scalar(keys %wb)) { - $success2 = 0; - $exit = 1; print "--> ERROR: phones in word_boundary.txt but not in nonsilence.txt or silence.txt -- "; - foreach (keys %wb) { - if (!$itset{$_}) { - print "$_ "; - } - } - print "\n"; - } - $success2 == 0 || print "--> $lang/phones/word_boundary.txt is the union of nonsilence.txt and silence.txt\n"; - $success1 != 1 or $success2 != 1 || print "--> $lang/phones/word_boundary.txt is OK\n"; - print "\n"; -} - - - -{ - print "Checking word-level disambiguation symbols...\n"; - # This block checks that one of the two following conditions hold: - # (1) for lang diretories prepared by older versions of prepare_lang.sh: - # The symbol '#0' should appear in words.txt and phones.txt, and should - # or (2): the files wdisambig.txt, wdisambig_phones.int and wdisambig_words.int - # exist, and have the expected properties (see below for details). - - # note, %wdisambig_words_hash hashes from the integer word-id of word-level - # disambiguation symbols, to 1 if the word is a disambig symbol. - - if (! 
-e "$lang/phones/wdisambig.txt") { - print "--> no $lang/phones/wdisambig.txt (older prepare_lang.sh)\n"; - if (exists $wsymtab{"#0"}) { - print "--> $lang/words.txt has \"#0\"\n"; - $wdisambig_words_hash{$wsymtab{"#0"}} = 1; - } else { - print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n"; - print "--> (if you are using ARPA-type language models, you will normally\n"; - print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; - } - } else { - print "--> $lang/phones/wdisambig.txt exists (newer prepare_lang.sh)\n"; - if (!open(T, "<$lang/phones/wdisambig.txt")) { - print "--> ERROR: fail to open $lang/phones/wdisambig.txt\n"; $exit = 1; return; - } - chomp(my @wdisambig = ); - close(T); - if (!open(W, "<$lang/phones/wdisambig_words.int")) { - print "--> ERROR: fail to open $lang/phones/wdisambig_words.int\n"; $exit = 1; return; - } - chomp(my @wdisambig_words = ); - close(W); - if (!open(P, "<$lang/phones/wdisambig_phones.int")) { - print "--> ERROR: fail to open $lang/phones/wdisambig_phones.int\n"; $exit = 1; return; - } - chomp(my @wdisambig_phones =
<P>
); - close(P); - my $len = @wdisambig, $len2; - if (($len2 = @wdisambig_words) != $len) { - print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths"; - $exit = 1; return; - } - if (($len2 = @wdisambig_phones) != $len) { - print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths"; - $exit = 1; return; - } - for (my $i = 0; $i < $len; $i++) { - if ($wsymtab{$wdisambig[$i]} ne $wdisambig_words[$i]) { - my $ii = $i + 1; - print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int mismatch\n"; - $exit = 1; return; - } - } - for (my $i = 0; $i < $len; $i++) { - if ($psymtab{$wdisambig[$i]} ne $wdisambig_phones[$i]) { - my $ii = $i + 1; - print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int mismatch\n"; - $exit = 1; return; - } - } - foreach my $i ( @wdisambig_words ) { - $wdisambig_words_hash{$i} = 1; - } - } -} - - -if (-s "$lang/phones/word_boundary.int") { - print "Checking word_boundary.int and disambig.int\n"; - if (!open (W, "<$lang/phones/word_boundary.int")) { - $exit = 1; print "--> ERROR: fail to open $lang/phones/word_boundary.int\n"; - } - while () { - @A = split; - if (@A != 2) { - $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/word_boundary.int\n"; - } - $wbtype{$A[0]} = $A[1]; - } - close(W); - if (!open (D, "<$lang/phones/disambig.int")) { - $exit = 1; print "--> ERROR: fail to open $lang/phones/disambig.int\n"; - } - while () { - @A = split; - if (@A != 1) { - $exit = 1; print "--> ERROR: bad line $_ in $lang/phones/disambig.int\n"; - } - $is_disambig{$A[0]} = 1; - } - - $text = `. ./path.sh`; - if ($text ne "") { - print "*** This script cannot continue because your path.sh or bash profile prints something: $text" . - "*** Please fix that and try again.\n"; - exit(1); - } - - foreach $fst ("L.fst", "L_disambig.fst") { - $wlen = int(rand(100)) + 1; - print "--> generating a $wlen word sequence\n"; - $wordseq = ""; - $sid = 0; - $wordseq_syms = ""; - foreach (1 .. $wlen) { - $id = int(rand(scalar(keys %wint2sym))); - # exclude disambiguation symbols, BOS and EOS and epsilon from the word - # sequence. - while (defined $wdisambig_words_hash{$id} or - $wint2sym{$id} eq "" or $wint2sym{$id} eq "" or $id == 0) { - $id = int(rand(scalar(keys %wint2sym))); - } - $wordseq_syms = $wordseq_syms . $wint2sym{$id} . " "; - $wordseq = $wordseq . "$sid ". ($sid + 1) . " $id $id 0\n"; - $sid ++; - } - $wordseq = $wordseq . "$sid 0"; - $phoneseq = `. ./path.sh; echo \"$wordseq" | fstcompile | fstcompose $lang/$fst - | fstproject | fstrandgen | fstrmepsilon | fsttopsort | fstprint | awk '{if (NF > 2) {print \$3}}';`; - $transition = { }; # empty assoc. array of allowed transitions between phone types. 1 means we count a word, - # 0 means transition is allowed. bos and eos are added as extra symbols here. - foreach $x ("bos", "nonword", "end", "singleton") { - $transition{$x, "nonword"} = 0; - $transition{$x, "begin"} = 1; - $transition{$x, "singleton"} = 1; - $transition{$x, "eos"} = 0; - } - $transition{"begin", "end"} = 0; - $transition{"begin", "internal"} = 0; - $transition{"internal", "internal"} = 0; - $transition{"internal", "end"} = 0; - - $cur_state = "bos"; - $num_words = 0; - foreach $phone (split (" ", "$phoneseq <>")) { - # Note: now that we support unk-LMs (see the --unk-fst option to - # prepare_lang.sh), the regular L.fst may contain some disambiguation - # symbols. - if (! 
defined $is_disambig{$phone}) { - if ($phone eq "<>") { - $state = "eos"; - } elsif ($phone == 0) { - $exit = 1; print "--> ERROR: unexpected phone sequence=$phoneseq, wordseq=$wordseq\n"; last; - } else { - $state = $wbtype{$phone}; - } - if (!defined $state) { - $exit = 1; print "--> ERROR: phone $phone is not specified in $lang/phones/word_boundary.int\n"; - last; - } elsif (!defined $transition{$cur_state, $state}) { - $exit = 1; print "--> ERROR: transition from state $cur_state to $state indicates error in word_boundary.int or L.fst\n"; - last; - } else { - $num_words += $transition{$cur_state, $state}; - $cur_state = $state; - } - } - } - if (!$exit) { - if ($num_words != $wlen) { - $phoneseq_syms = ""; - foreach my $id (split(" ", $phoneseq)) { $phoneseq_syms = $phoneseq_syms . " " . $pint2sym{$id}; } - $exit = 1; print "--> ERROR: number of reconstructed words $num_words does not match real number of words $wlen; indicates problem in $fst or word_boundary.int. phoneseq = $phoneseq_syms, wordseq = $wordseq_syms\n"; - } else { - print "--> resulting phone sequence from $fst corresponds to the word sequence\n"; - print "--> $fst is OK\n"; - } - } - } - print "\n"; -} - -# Check oov ------------------------------- -check_txt_int("$lang/oov", \%wsymtab, 0); print "\n"; - -# Check if L.fst is olabel sorted. -if (-e "$lang/L.fst") { - $cmd = "fstinfo $lang/L.fst | grep -E 'output label sorted.*y' > /dev/null"; - $res = system(". ./path.sh; $cmd"); - if ($res == 0) { - print "--> $lang/L.fst is olabel sorted\n"; - } else { - print "--> ERROR: $lang/L.fst is not olabel sorted\n"; - $exit = 1; - } -} - -# Check if L_disambig.fst is olabel sorted. -if (-e "$lang/L_disambig.fst") { - $cmd = "fstinfo $lang/L_disambig.fst | grep -E 'output label sorted.*y' > /dev/null"; - $res = system(". ./path.sh; $cmd"); - if ($res == 0) { - print "--> $lang/L_disambig.fst is olabel sorted\n"; - } else { - print "--> ERROR: $lang/L_disambig.fst is not olabel sorted\n"; - $exit = 1; - } -} - -if (-e "$lang/G.fst") { - # Check that G.fst is ilabel sorted and nonempty. - $text = `. ./path.sh; fstinfo $lang/G.fst`; - if ($? != 0) { - print "--> ERROR: fstinfo failed on $lang/G.fst\n"; - $exit = 1; - } - if ($text =~ m/input label sorted\s+y/) { - print "--> $lang/G.fst is ilabel sorted\n"; - } else { - print "--> ERROR: $lang/G.fst is not ilabel sorted\n"; - $exit = 1; - } - if ($text =~ m/# of states\s+(\d+)/) { - $num_states = $1; - if ($num_states == 0) { - print "--> ERROR: $lang/G.fst is empty\n"; - $exit = 1; - } else { - print "--> $lang/G.fst has $num_states states\n"; - } - } - - # Check that G.fst is determinizable. - if (!$skip_det_check) { - # Check determinizability of G.fst - # fstdeterminizestar is much faster, and a more relevant test as it's what - # we do in the actual graph creation recipe. - if (-e "$lang/G.fst") { - $cmd = "fstdeterminizestar $lang/G.fst /dev/null"; - $res = system(". ./path.sh; $cmd"); - if ($res == 0) { - print "--> $lang/G.fst is determinizable\n"; - } else { - print "--> ERROR: fail to determinize $lang/G.fst\n"; - $exit = 1; - } - } - } - - # Check that G.fst does not have cycles with only disambiguation symbols or - # epsilons on the input, or the forbidden symbols and (and a few - # related checks - - if (-e "$lang/G.fst") { - system("utils/lang/check_g_properties.pl $lang"); - if ($? 
!= 0) { - print "--> ERROR: failure running check_g_properties.pl\n"; - $exit = 1; - } else { - print("--> utils/lang/check_g_properties.pl succeeded.\n"); - } - } -} - - -if (!$skip_det_check) { - if (-e "$lang/G.fst" && -e "$lang/L_disambig.fst") { - print "--> Testing determinizability of L_disambig . G\n"; - $output = `. ./path.sh; fsttablecompose $lang/L_disambig.fst $lang/G.fst | fstdeterminizestar | fstinfo 2>&1 `; - if ($output =~ m/# of states\s*[1-9]/) { - print "--> L_disambig . G is determinizable\n"; - } else { - print "--> ERROR: fail to determinize L_disambig . G. Output is:\n"; - print "$output\n"; - $exit = 1; - } - } -} - -if ($exit == 1) { - print "--> ERROR (see error messages above)\n"; exit 1; -} else { - if ($warning == 1) { - print "--> WARNING (check output above for warnings)\n"; exit 0; - } else { - print "--> SUCCESS [validating lang directory $lang]\n"; exit 0; - } -} diff --git a/kaldi/local/validate_text.pl b/kaldi/local/validate_text.pl deleted file mode 100755 index 172396c..0000000 --- a/kaldi/local/validate_text.pl +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env perl -# -#=============================================================================== -# Copyright 2017 Johns Hopkins University (author: Yenda Trmal) -# Johns Hopkins University (author: Daniel Povey) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# validation script for data/<dataset>/text -# to be called (preferably) from utils/validate_data_dir.sh -use strict; -use warnings; -use utf8; -use Fcntl qw< SEEK_SET >; - -# this function reads the opened file (supplied as a first -# parameter) into an array of lines. For each -# line, it tests whether it's a valid utf-8 compatible -# line. If all lines are valid utf-8, it returns the lines -# decoded as utf-8, otherwise it assumes the file's encoding -# is one of those 1-byte encodings, such as ISO-8859-x -# or Windows CP-X. -# Please recall we do not really care about -# the actual encoding, we just need to -# make sure the length of the (decoded) string -# is correct (to make the output formatting look right).
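The same fallback can be sketched in Python for readers less at home in Perl; this is an illustrative translation of the helper below, not code from the patch (the path argument stands in for the Perl file handle):

    def get_utf8_or_bytestream(path):
        # Try to decode every line as UTF-8; after the first failure, fall
        # back to returning the raw bytes so the caller can treat the file
        # as a 1-byte encoding (only consistent string lengths matter here).
        is_utf_compatible = True
        raw_lines, unicode_lines = [], []
        with open(path, 'rb') as f:
            for raw in f:
                raw_lines.append(raw)
                if is_utf_compatible:
                    try:
                        unicode_lines.append(raw.decode('utf-8'))
                    except UnicodeDecodeError:
                        is_utf_compatible = False
        return (True, unicode_lines) if is_utf_compatible else (False, raw_lines)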
-sub get_utf8_or_bytestream { - use Encode qw(decode encode); - my $is_utf_compatible = 1; - my @unicode_lines; - my @raw_lines; - my $raw_text; - my $lineno = 0; - my $file = shift; - - while (<$file>) { - $raw_text = $_; - last unless $raw_text; - if ($is_utf_compatible) { - my $decoded_text = eval { decode("UTF-8", $raw_text, Encode::FB_CROAK) } ; - $is_utf_compatible = $is_utf_compatible && defined($decoded_text); - push @unicode_lines, $decoded_text; - } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; - ; - } - push @raw_lines, $raw_text; - $lineno += 1; - } - - if (!$is_utf_compatible) { - return (0, @raw_lines); - } else { - return (1, @unicode_lines); - } -} - -# check if the given unicode string contains unicode whitespaces -# other than the usual four: TAB, LF, CR and SPACE -sub validate_utf8_whitespaces { - my $unicode_lines = shift; - use feature 'unicode_strings'; - for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { - my $current_line = $unicode_lines->[$i]; - my @A = split(" ", $current_line); - my $utt_id = $A[0]; - # we replace TAB, LF, CR, and SPACE - # this is to simplify the test - if ($current_line =~ /\x{000d}/) { - print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; - return 1; - } - $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; - if ($current_line =~/\s/) { - print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; - return 1; - } - } - return 0; -} - -# checks if the text in the file (supplied as the argument) is utf-8 compatible -# if yes, checks if it contains only allowed whitespaces. If no, then does not -# do anything. The function seeks to the original position in the file after -# reading the text. -sub check_allowed_whitespace { - my $file = shift; - my $filename = shift; - my $pos = tell($file); - (my $is_utf, my @lines) = get_utf8_or_bytestream($file); - seek($file, $pos, SEEK_SET); - if ($is_utf) { - my $has_invalid_whitespaces = validate_utf8_whitespaces(\@lines); - if ($has_invalid_whitespaces) { - print STDERR "$0: ERROR: text file '$filename' contains disallowed UTF-8 whitespace character(s)\n"; - return 0; - } - } - return 1; -} - -if(@ARGV != 1) { - die "Usage: validate_text.pl <text-file>\n" .
- "e.g.: validate_text.pl data/train/text\n"; -} - -my $text = shift @ARGV; - -if (-z "$text") { - print STDERR "$0: ERROR: file '$text' is empty or does not exist\n"; - exit 1; -} - -if(!open(FILE, "<$text")) { - print STDERR "$0: ERROR: failed to open $text\n"; - exit 1; -} - -check_allowed_whitespace(\*FILE, $text) or exit 1; -close(FILE); diff --git a/kaldi/local/visualize_spk_emb.py b/kaldi/local/visualize_spk_emb.py deleted file mode 100755 index 5722d0c..0000000 --- a/kaldi/local/visualize_spk_emb.py +++ /dev/null @@ -1,66 +0,0 @@ -from sklearn.manifold import TSNE -from kaldi_io import read_vec_flt_scp -import sys -import numpy as np -import pandas as pd -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D -import seaborn as sns - -# example usage -# python scripts/visualize_spk_emb.py spk_embs_2/vctk_spk_resnet_mfcc_3-8_200_32_mean_lde_sqr_asoftmax_m2.scp 108 output.png -# reference: https://towardsdatascience.com/visualising-high-dimensional-datasets-using-pca-and-t-sne-in-python-8ef87e7915b - -tsne = TSNE(n_components=2, verbose=1) -X, y = [], [] -index = 0 -for key,vec in read_vec_flt_scp(sys.argv[1]): - X.append(vec) - y.append(key) - #y.append(index) - index += 1 -X, y = np.array(X), np.array(y) - -X_emb = tsne.fit_transform(X) # tsne transformed - -# For reproducability of the results -np.random.seed(42) -N = int(sys.argv[2]) -rndperm = np.random.permutation(X_emb.shape[0]) -X_emb, y = X_emb[rndperm[:N]], y[rndperm[:N]] - -feat_cols = [ 'pixel'+str(i) for i in range(X_emb.shape[1]) ] -df = pd.DataFrame(X_emb,columns=feat_cols) -df['y'] = y -df['label'] = df['y'].apply(lambda i: str(i)) -df['tsne-1'] = X_emb[:,0] -df['tsne-2'] = X_emb[:,1] -#df['tsne-3'] = X_emb[:,2] - -## 2D plot -plt.figure(figsize=(16,10)) -sns_plt = sns.scatterplot( - x="tsne-1", y="tsne-2", - hue="y", - palette=sns.color_palette("hls", N), - data=df, - legend=False, # “brief”, “full” - alpha=0.5 -) -sns_plt.figure.savefig(sys.argv[3]) - -## 3D plot -#ax = plt.figure(figsize=(16,10)).gca(projection='3d') -#ax.scatter( -# xs=df["tsne-1"], -# ys=df["tsne-2"], -# zs=df["tsne-3"], -# c=df["y"], -# cmap='tab10' -#) -#ax.set_xlabel('tsne-one') -#ax.set_ylabel('tsne-two') -#ax.set_zlabel('tsne-three') -#ax.figure.savefig(sys.argv[3]) diff --git a/kaldi/local/visualize_trait_emb.py b/kaldi/local/visualize_trait_emb.py deleted file mode 100755 index bcee79e..0000000 --- a/kaldi/local/visualize_trait_emb.py +++ /dev/null @@ -1,99 +0,0 @@ -from sklearn.manifold import TSNE -from kaldi_io import read_vec_flt_scp -import sys -import numpy as np -import pandas as pd -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D -import seaborn as sns - -# example usage -# python local/visualize_trait_emb.py age/accent/gender exp/vctk_lde/resnet_mfcc_3-8_200_32_mean_lde_sqr_asoftmax_m2/lde.scp 43873 output.png -# reference: https://towardsdatascience.com/visualising-high-dimensional-datasets-using-pca-and-t-sne-in-python-8ef87e7915b -# speaker-info.txt -# ID AGE GENDER ACCENTS REGION -# 225 23 F English Southern England -# 226 22 M English Surrey -# 227 38 M English Cumbria -# 228 22 F English Southern England -# 229 23 F English Southern England -# 230 22 F English Stockton-on-tees -# 231 23 F English Southern England -# 232 23 M English Southern England - -#speaker_info = '/export/c01/jlai/nii/spk_enc/Erica_VCTK_processed/vctk-speaker-info.txt' -speaker_info = 
'/data/sls/scratch/clai24/data/Erica_VCTK_processed/vctk-speaker-info.txt' - -with open(speaker_info, 'r') as f: - context = f.readlines() -context = [x.strip() for x in context][1:] -spk2trait = {} -for i in context: - spk = i.split()[0] - if spk != 's5': # add prefix 'p' - spk = 'p' + spk - if sys.argv[1] == 'age': - trait = int(i.split()[1]) - elif sys.argv[1] == 'gender': - trait = i.split()[2] - elif sys.argv[1] == 'accent': - trait = i.split()[3] - spk2trait[spk] = trait -print('speaker to trait is %s' % spk2trait) - -tsne = TSNE(n_components=2, verbose=1) -X, y = [], [] -index = 0 -for key,vec in read_vec_flt_scp(sys.argv[2]): - X.append(vec) - spk = key.split('-')[0] - y.append(spk2trait[spk]) - #print(vec.shape) - #y.append(index) - index += 1 -X, y = np.array(X), np.array(y) -print(len(y)) -print(np.unique(y)) -X_emb = tsne.fit_transform(X) # tsne transformed - -# For reproducability of the results -np.random.seed(42) -N = int(sys.argv[3]) -rndperm = np.random.permutation(X_emb.shape[0]) -X_emb, y = X_emb[rndperm[:N]], y[rndperm[:N]] - -feat_cols = [ 'pixel'+str(i) for i in range(X_emb.shape[1]) ] -df = pd.DataFrame(X_emb,columns=feat_cols) -df['y'] = y -df['label'] = df['y'].apply(lambda i: str(i)) -df['tsne-1'] = X_emb[:,0] -df['tsne-2'] = X_emb[:,1] -#df['tsne-3'] = X_emb[:,2] - -## 2D plot -plt.figure(figsize=(16,10)) -sns_plt = sns.scatterplot( - x="tsne-1", y="tsne-2", - hue="y", - palette=sns.color_palette("hls", len(np.unique(y))), - data=df, - legend='brief', # “brief”, “full” - alpha=0.5 -) -sns_plt.figure.savefig(sys.argv[4]) - -## 3D plot -#ax = plt.figure(figsize=(16,10)).gca(projection='3d') -#ax.scatter( -# xs=df["tsne-1"], -# ys=df["tsne-2"], -# zs=df["tsne-3"], -# c=df["y"], -# cmap='tab10' -#) -#ax.set_xlabel('tsne-one') -#ax.set_ylabel('tsne-two') -#ax.set_zlabel('tsne-three') -#ax.figure.savefig(sys.argv[3]) diff --git a/kaldi/local/visualize_utt_emb.py b/kaldi/local/visualize_utt_emb.py deleted file mode 100755 index 869d489..0000000 --- a/kaldi/local/visualize_utt_emb.py +++ /dev/null @@ -1,69 +0,0 @@ -from sklearn.manifold import TSNE -from kaldi_io import read_vec_flt_scp -import sys -import numpy as np -import pandas as pd -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as plt -from mpl_toolkits.mplot3d import Axes3D -import seaborn as sns - -# example usage -# python local/visualize_utt_emb.py exp/vctk_lde/resnet_mfcc_3-8_200_32_mean_lde_sqr_asoftmax_m2/lde.scp 43873 output.png -# reference: https://towardsdatascience.com/visualising-high-dimensional-datasets-using-pca-and-t-sne-in-python-8ef87e7915b - -tsne = TSNE(n_components=2, verbose=1) -X, y = [], [] -index = 0 -for key,vec in read_vec_flt_scp(sys.argv[1]): - X.append(vec) - y.append(key.split('-')[0]) - #print(vec.shape) - #y.append(index) - index += 1 -X, y = np.array(X), np.array(y) -print(y) -print(np.unique(y)) - -X_emb = tsne.fit_transform(X) # tsne transformed - -# For reproducability of the results -np.random.seed(42) -N = int(sys.argv[2]) -rndperm = np.random.permutation(X_emb.shape[0]) -X_emb, y = X_emb[rndperm[:N]], y[rndperm[:N]] - -feat_cols = [ 'pixel'+str(i) for i in range(X_emb.shape[1]) ] -df = pd.DataFrame(X_emb,columns=feat_cols) -df['y'] = y -df['label'] = df['y'].apply(lambda i: str(i)) -df['tsne-1'] = X_emb[:,0] -df['tsne-2'] = X_emb[:,1] -#df['tsne-3'] = X_emb[:,2] - -## 2D plot -plt.figure(figsize=(16,10)) -sns_plt = sns.scatterplot( - x="tsne-1", y="tsne-2", - hue="y", - palette=sns.color_palette("hls", len(np.unique(y))), - data=df, - legend='brief', # 
“brief”, “full” - alpha=0.5 -) -sns_plt.figure.savefig(sys.argv[3]) - -## 3D plot -#ax = plt.figure(figsize=(16,10)).gca(projection='3d') -#ax.scatter( -# xs=df["tsne-1"], -# ys=df["tsne-2"], -# zs=df["tsne-3"], -# c=df["y"], -# cmap='tab10' -#) -#ax.set_xlabel('tsne-one') -#ax.set_ylabel('tsne-two') -#ax.set_zlabel('tsne-three') -#ax.figure.savefig(sys.argv[3]) diff --git a/kaldi/local/write_kwslist.pl b/kaldi/local/write_kwslist.pl deleted file mode 100755 index fa54b9f..0000000 --- a/kaldi/local/write_kwslist.pl +++ /dev/null @@ -1,345 +0,0 @@ -#!/usr/bin/env perl - -# Copyright 2012 Johns Hopkins University (Author: Guoguo Chen) -# Apache 2.0. -# -use strict; -use warnings; -use Getopt::Long; - -my $Usage = <<EOU; -Usage: utils/write_kwslist.pl [options] <raw_result_file|-> <kwslist_out|-> - e.g.: utils/write_kwslist.pl --flen=0.01 --duration=1000 --segments=data/eval/segments - --normalize=true --map-utter=data/kws/utter_map raw_results kwslist.xml - -Allowed options: - --beta : Beta value when computing ATWV (float, default = 999.9) - --digits : How many digits should the score use (int, default = "infinite") - --duptime : Tolerance for duplicates (float, default = 0.5) - --duration : Duration of all audio, you must set this (float, default = 999.9) - --ecf-filename : ECF file name (string, default = "") - --flen : Frame length (float, default = 0.01) - --index-size : Size of index (float, default = 0) - --kwlist-filename : Kwlist.xml file name (string, default = "") - --language : Language type (string, default = "cantonese") - --map-utter : Map utterance for evaluation (string, default = "") - --normalize : Normalize scores or not (boolean, default = false) - --Ntrue-scale : Keyword independent scale factor for Ntrue (float, default = 1.0) - --remove-dup : Remove duplicates (boolean, default = false) - --remove-NO : Remove the "NO" decision instances (boolean, default = false) - --segments : Segments file from Kaldi (string, default = "") - --system-id : System ID (string, default = "") - --verbose : Verbose level (higher --> more kws section) (integer, default = 0) - --YES-cutoff : Only keep "\$YES-cutoff" yeses for each kw (int, default = -1) - --nbest : Output up to nbest hits into the kwlist (int, default = -1) - -EOU - -my $segment = ""; -my $flen = 0.01; -my $beta = 999.9; -my $duration = 999.9; -my $language = "cantonese"; -my $ecf_filename = ""; -my $index_size = 0; -my $system_id = ""; -my $normalize = "false"; -my $map_utter = ""; -my $Ntrue_scale = 1.0; -my $digits = 0; -my $kwlist_filename = ""; -my $verbose = 0; -my $duptime = 0.5; -my $remove_dup = "false"; -my $remove_NO = "false"; -my $YES_cutoff = -1; -my $nbest_max = -1; -GetOptions('segments=s' => \$segment, - 'flen=f' => \$flen, - 'beta=f' => \$beta, - 'duration=f' => \$duration, - 'language=s' => \$language, - 'ecf-filename=s' => \$ecf_filename, - 'index-size=f' => \$index_size, - 'system-id=s' => \$system_id, - 'normalize=s' => \$normalize, - 'map-utter=s' => \$map_utter, - 'Ntrue-scale=f' => \$Ntrue_scale, - 'digits=i' => \$digits, - 'kwlist-filename=s' => \$kwlist_filename, - 'verbose=i' => \$verbose, - 'duptime=f' => \$duptime, - 'remove-dup=s' => \$remove_dup, - 'YES-cutoff=i' => \$YES_cutoff, - 'remove-NO=s' => \$remove_NO, - 'nbest=i' => \$nbest_max) or die "Cannot continue\n"; - -($normalize eq "true" || $normalize eq "false") || die "$0: Bad value for option --normalize\n"; -($remove_dup eq "true" || $remove_dup eq "false") || die "$0: Bad value for option --remove-dup\n"; -($remove_NO eq "true" || $remove_NO eq "false") || die "$0: Bad value for option --remove-NO\n"; - -if
($segment) { - open(SEG, "<$segment") || die "$0: Fail to open segment file $segment\n"; -} - -if ($map_utter) { - open(UTT, "<$map_utter") || die "$0: Fail to open utterance table $map_utter\n"; -} - -if (@ARGV != 2) { - die $Usage; -} - -# Get parameters -my $filein = shift @ARGV; -my $fileout = shift @ARGV; - -# Get input source -my $source = ""; -if ($filein eq "-") { - $source = "STDIN"; -} else { - open(I, "<$filein") || die "$0: Fail to open input file $filein\n"; - $source = "I"; -} - -# Get symbol table and start time -my %tbeg; -if ($segment) { - while (<SEG>) { - chomp; - my @col = split(" ", $_); - @col == 4 || die "$0: Bad number of columns in $segment \"$_\"\n"; - $tbeg{$col[0]} = $col[2]; - } -} - -# Get utterance mapper -my %utter_mapper; -if ($map_utter) { - while (<UTT>) { - chomp; - my @col = split(" ", $_); - @col == 2 || die "$0: Bad number of columns in $map_utter \"$_\"\n"; - $utter_mapper{$col[0]} = $col[1]; - } -} - -# Function for printing Kwslist.xml -sub PrintKwslist { - my ($info, $KWS) = @_; - - my $kwslist = ""; - - # Start printing - $kwslist .= "<kwslist kwlist_filename=\"$info->[0]\" language=\"$info->[1]\" system_id=\"$info->[2]\">\n"; - my $prev_kw = ""; - my $nbest = $nbest_max; - foreach my $kwentry (@{$KWS}) { - if (($prev_kw eq $kwentry->[0]) && ($nbest le 0) && ($nbest_max gt 0)) { - next; - } - if ($prev_kw ne $kwentry->[0]) { - if ($prev_kw ne "") {$kwslist .= "  </detected_kwlist>\n";} - $kwslist .= "  <detected_kwlist kwid=\"$kwentry->[0]\" search_time=\"1\" oov_count=\"0\">\n"; - $prev_kw = $kwentry->[0]; - $nbest = $nbest_max; - } - $nbest -= 1 if $nbest_max gt 0; - my $score = sprintf("%g", $kwentry->[5]); - $kwslist .= "    <kw file=\"$kwentry->[1]\" channel=\"$kwentry->[2]\" tbeg=\"$kwentry->[3]\" dur=\"$kwentry->[4]\" score=\"$score\" decision=\"$kwentry->[6]\""; - if (defined($kwentry->[7])) {$kwslist .= " threshold=\"$kwentry->[7]\"";} - if (defined($kwentry->[8])) {$kwslist .= " raw_score=\"$kwentry->[8]\"";} - $kwslist .= "/>\n"; - } - if ($prev_kw ne "") {$kwslist .= "  </detected_kwlist>\n";} - $kwslist .= "</kwslist>\n"; - - return $kwslist; -} - -# Function for sorting -sub KwslistOutputSort { - if ($a->[0] ne $b->[0]) { - if ($a->[0] =~ m/[0-9]+$/ && $b->[0] =~ m/[0-9]+$/) { - ($a->[0] =~ /([0-9]*)$/)[0] <=> ($b->[0] =~ /([0-9]*)$/)[0] - } else { - $a->[0] cmp $b->[0]; - } - } elsif ($a->[5] ne $b->[5]) { - $b->[5] <=> $a->[5]; - } else { - $a->[1] cmp $b->[1]; - } -} -sub KwslistDupSort { - my ($a, $b, $duptime) = @_; - if ($a->[0] ne $b->[0]) { - $a->[0] cmp $b->[0]; - } elsif ($a->[1] ne $b->[1]) { - $a->[1] cmp $b->[1]; - } elsif ($a->[2] ne $b->[2]) { - $a->[2] cmp $b->[2]; - } elsif (abs($a->[3]-$b->[3]) >= $duptime){ - $a->[3] <=> $b->[3]; - } elsif ($a->[5] ne $b->[5]) { - $b->[5] <=> $a->[5]; - } else { - $b->[4] <=> $a->[4]; - } -} - -# Processing -my @KWS; -while (<$source>) { - chomp; - my @col = split(" ", $_); - @col == 5 || die "$0: Bad number of columns in raw results \"$_\"\n"; - my $kwid = shift @col; - my $utter = $col[0]; - my $start = sprintf("%.2f", $col[1]*$flen); - my $dur = sprintf("%.2f", $col[2]*$flen-$start); - my $score = exp(-$col[3]); - - if ($segment) { - $start = sprintf("%.2f", $start+$tbeg{$utter}); - } - if ($map_utter) { - my $utter_x = $utter_mapper{$utter}; - die "Unmapped utterance $utter\n" unless $utter_x; - $utter = $utter_x; - } - - push(@KWS, [$kwid, $utter, 1, $start, $dur, $score, ""]); -} - -my %Ntrue = (); -foreach my $kwentry (@KWS) { - if (!defined($Ntrue{$kwentry->[0]})) { - $Ntrue{$kwentry->[0]} = 0.0; - } - $Ntrue{$kwentry->[0]} += $kwentry->[5]; -} - -# Scale the Ntrue -my %threshold; -foreach my $key (keys %Ntrue) { -
$Ntrue{$key} *= $Ntrue_scale; - $threshold{$key} = $Ntrue{$key}/($duration/$beta+($beta-1)/$beta*$Ntrue{$key}); -} - -# Removing duplicates -if ($remove_dup eq "true") { - my @tmp = sort {KwslistDupSort($a, $b, $duptime)} @KWS; - @KWS = (); - if (@tmp >= 1) {push(@KWS, $tmp[0])}; - for (my $i = 1; $i < scalar(@tmp); $i ++) { - my $prev = $KWS[-1]; - my $curr = $tmp[$i]; - if ((abs($prev->[3]-$curr->[3]) < $duptime ) && - ($prev->[2] eq $curr->[2]) && - ($prev->[1] eq $curr->[1]) && - ($prev->[0] eq $curr->[0])) { - next; - } else { - push(@KWS, $curr); - } - } -} - -my $format_string = "%g"; -if ($digits gt 0 ) { - $format_string = "%." . $digits ."f"; -} - -my @info = ($kwlist_filename, $language, $system_id); -my %YES_count; -foreach my $kwentry (@KWS) { - my $threshold = $threshold{$kwentry->[0]}; - if ($kwentry->[5] > $threshold) { - $kwentry->[6] = "YES"; - if (defined($YES_count{$kwentry->[0]})) { - $YES_count{$kwentry->[0]} ++; - } else { - $YES_count{$kwentry->[0]} = 1; - } - } else { - $kwentry->[6] = "NO"; - if (!defined($YES_count{$kwentry->[0]})) { - $YES_count{$kwentry->[0]} = 0; - } - } - if ($verbose > 0) { - push(@{$kwentry}, sprintf("%g", $threshold)); - } - if ($normalize eq "true") { - if ($verbose > 0) { - push(@{$kwentry}, $kwentry->[5]); - } - my $numerator = (1-$threshold)*$kwentry->[5]; - my $denominator = (1-$threshold)*$kwentry->[5]+(1-$kwentry->[5])*$threshold; - if ($denominator != 0) { - $kwentry->[5] = sprintf($format_string, $numerator/$denominator); - } else { - $kwentry->[5] = sprintf($format_string, $kwentry->[5]); - } - } else { - $kwentry->[5] = sprintf($format_string, $kwentry->[5]); - } -} - -# Output sorting -my @tmp = sort KwslistOutputSort @KWS; - -# Process the YES-cutoff. Note that you don't need this for the normal cases where -# hits and false alarms are balanced -if ($YES_cutoff != -1) { - my $count = 1; - for (my $i = 1; $i < scalar(@tmp); $i ++) { - if ($tmp[$i]->[0] ne $tmp[$i-1]->[0]) { - $count = 1; - next; - } - if ($YES_count{$tmp[$i]->[0]} > $YES_cutoff*2) { - $tmp[$i]->[6] = "NO"; - $tmp[$i]->[5] = 0; - next; - } - if (($count == $YES_cutoff) && ($tmp[$i]->[6] eq "YES")) { - $tmp[$i]->[6] = "NO"; - $tmp[$i]->[5] = 0; - next; - } - if ($tmp[$i]->[6] eq "YES") { - $count ++; - } - } -} - -# Process the remove-NO decision -if ($remove_NO eq "true") { - my @KWS = @tmp; - @tmp = (); - for (my $i = 0; $i < scalar(@KWS); $i ++) { - if ($KWS[$i]->[6] eq "YES") { - push(@tmp, $KWS[$i]); - } - } -} - -# Printing -my $kwslist = PrintKwslist(\@info, \@tmp); - -if ($segment) {close(SEG);} -if ($map_utter) {close(UTT);} -if ($filein ne "-") {close(I);} -if ($fileout eq "-") { - print $kwslist; -} else { - open(O, ">$fileout") || die "$0: Fail to open output file $fileout\n"; - print O $kwslist; - close(O); -} diff --git a/kaldi/path.sh b/kaldi/path.sh deleted file mode 100755 index 648348b..0000000 --- a/kaldi/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export KALDI_ROOT=/work2/home/ing2/theo/kaldi - -export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH - -[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 -. $KALDI_ROOT/tools/config/common_path.sh - -export LC_ALL=C diff --git a/kaldi/run.sh b/kaldi/run.sh deleted file mode 100755 index 366cc75..0000000 --- a/kaldi/run.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/bin/bash - -. ./cmd.sh -. 
./path.sh -set -e - -voxceleb1_trials=data/voxceleb1_test/trials # created by make_voxceleb1.pl -voxceleb1_root=/diskssd1/ing2/datasets/VoxCeleb1 -voxceleb2_root=/diskssd1/ing2/datasets/VoxCeleb2 - -model_config_path=$1 -expname=test_training -stage=3 - -# Stage 0: Prepare train and test data directories -# Train+CV = VoxCeleb2 dev set -# Test = VoxCeleb1 test set -if [ $stage -le 0 ]; then - echo "=== Stage 0: Prepare train and test data directories ===" - - log=exp/make_voxceleb - - $train_cmd $log/make_voxceleb2_dev.log local/make_voxceleb2.pl $voxceleb2_root dev data/train - $train_cmd $log/make_voxceleb1.log local/make_voxceleb1.pl $voxceleb1_root data - - echo -e "\n" -fi - -# Stage 1: Generate features from data -if [ $stage -le 1 ]; then - echo "=== Stage 1: Generate features from data ===" - - log=exp/encode - - #$train_cmd $log/encode.log - python ../cache_features.py data/train - #$train_cmd $log/encode.log - python ../cache_features.py data/voxceleb1_test - - echo -e "\n" -fi - -# Stage 2: Train feature extractor neural network -if [ $stage -le 2 ]; then - echo "=== Stage 2: Train feature extractor neural network ===" - - #expdir=exp/training/$expname/ - #mkdir -p $expdir - - export CUDA_VISIBLE_DEVICES=0,1 - - #$train_cmd $expdir/train.log - python ../train.py $model_config_path - - echo -e "\n" -fi - -backend_log=exp/backend/$expname/ -mkdir -p $backend_log - -# Stage 3: Extract speaker embeddings -if [ $stage -le 3 ]; then - echo "=== Stage 3: Extract speaker embeddings ===" - - #expdir=exp/decode/$expname/ - #mkdir -p $expdir - - export CUDA_VISIBLE_DEVICES=0 - - #$train_cmd $expdir/decode.log - python ../extract_embeddings.py \ - data/voxceleb1_test/feats.scp \ - $backend_log/test.ark \ - $model_config_path - - echo -e "\n" -fi - -# Stage 4: Score model on test set -if [ $stage -le 4 ]; then - echo "=== Stage 4: Score model on test set ===" - - python ../kaldi_evaluate.py \ - $voxceleb1_trials \ - $backend_log/test.scp \ - $backend_log/scores_voxceleb1_test - - echo -e "\n" -fi - -# Stage 5: Show evaluation metrics (EER, minDCF) -if [ $stage -le 5 ]; then - echo "=== Stage 5: Show evaluation metrics ===" - - eer=`compute-eer <(python local/prepare_for_eer.py $voxceleb1_trials $backend_log/scores_voxceleb1_test) 2> /dev/null` - mindcf1=`python local/compute_min_dcf.py --p-target 0.01 $backend_log/scores_voxceleb1_test $voxceleb1_trials 2> /dev/null` - mindcf2=`python local/compute_min_dcf.py --p-target 0.001 $backend_log/scores_voxceleb1_test $voxceleb1_trials 2> /dev/null` - - echo "EER: $eer%" - echo "minDCF(p-target=0.01): $mindcf1" - echo "minDCF(p-target=0.001): $mindcf2" -fi diff --git a/kaldi/steps b/kaldi/steps deleted file mode 120000 index 3b8efed..0000000 --- a/kaldi/steps +++ /dev/null @@ -1 +0,0 @@ -../../kaldi/egs/wsj/s5/steps \ No newline at end of file diff --git a/kaldi/utils b/kaldi/utils deleted file mode 120000 index 8aebc12..0000000 --- a/kaldi/utils +++ /dev/null @@ -1 +0,0 @@ -../../kaldi/egs/wsj/s5/utils \ No newline at end of file diff --git a/kaldi_evaluate.py b/kaldi_evaluate.py deleted file mode 100644 index a0cee21..0000000 --- a/kaldi_evaluate.py +++ /dev/null @@ -1,30 +0,0 @@ -import argparse -import numpy as np -import kaldiio -from tqdm import tqdm - -from scipy.spatial.distance import cosine - -def extract_embeddings(trials_path, embeddings_path, scores_path): - out = open(scores_path, "w") - d = kaldiio.load_scp(embeddings_path) - for line in tqdm(open(trials_path)): - info = line.rstrip().split(' ') - a = d[info[0]] - b = d[info[1]] - 
target = (info[2] == 'target') - - # Compute cosine distance - dist = 1 - cosine(a, b) - - # Write to output file - out.write(info[0] + ' ' + info[1] + ' ' + str(dist) + '\n') - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('trials_path', help='Path to test set trials list.') - parser.add_argument('embeddings_path', help='Path to scp file containing embeddings for speakers in the test set.') - parser.add_argument('scores_path', help='Path to output file containing scores.') - args = parser.parse_args() - - extract_embeddings(args.trials_path, args.embeddings_path, args.scores_path) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index fcf0a6d..3013bcd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,8 @@ numpy -h5py tensorflow tensorflow-addons sklearn soundfile -librosa -audiomentations tqdm -kaldiio torch torchaudio \ No newline at end of file diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..4244e5e --- /dev/null +++ b/run.sh @@ -0,0 +1,6 @@ +model_config_path=$1 + +export CUDA_VISIBLE_DEVICES=0,1 +python train.py $model_config_path + +python evaluate.py $model_config_path \ No newline at end of file diff --git a/setup.py b/setup.py index b3718a2..aef7762 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,6 @@ 'sslforslr.models.encoders', 'sslforslr.models.cpc', 'sslforslr.models.lim', - 'sslforslr.models.multitask', - 'sslforslr.models.wav2vec2', - 'sslforslr.models.vqwav2vec'], + 'sslforslr.models.simclr', + 'sslforslr.models.moco'], ) \ No newline at end of file diff --git a/sslforslr/dataset/AudioAugmentationGenerator.py b/sslforslr/dataset/AudioAugmentationGenerator.py deleted file mode 100644 index ff80fe7..0000000 --- a/sslforslr/dataset/AudioAugmentationGenerator.py +++ /dev/null @@ -1,57 +0,0 @@ -from tensorflow.keras.utils import Sequence -from audiomentations import Compose -from audiomentations import AddImpulseResponse -from audiomentations import AddBackgroundNoise -from audiomentations import FrequencyMask -from audiomentations import TimeMask -from audiomentations import ClippingDistortion - -class AudioAugmentationGenerator(Sequence): - ''' - Keras generator which adds audio augmentation transformations - to an existing generator. 
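- Each entry of config is a dict with a 'type' key ('add_ir', 'add_noise', - 'frequency_mask', 'time_mask' or 'clipping'), an optional probability 'p' - and, for the impulse-response and noise transforms, a 'path' to the audio files.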
- ''' - - def __init__(self, gen, config, sample_frequency): - self.gen = gen - self.config = config - self.sample_frequency = sample_frequency - self.transforms = self.init_transforms() - - def init_transforms(self): - transforms = [] - for transform in self.config: - transform_type = transform['type'] - probability = transform.get('p', 0.0) - path = transform.get('path', "") - - if transform_type == 'add_ir': - transforms.append(AddImpulseResponse(p=probability, - ir_path=path, - leave_length_unchanged=True)) - elif transform_type == 'add_noise': - transforms.append(AddBackgroundNoise(p=probability, - sounds_path=path)) - elif transform_type == 'frequency_mask': - transforms.append(FrequencyMask(p=probability)) - elif transform_type == 'time_mask': - transforms.append(TimeMask(p=probability)) - elif transform_type == 'clipping': - transforms.append(ClippingDistortion(p=probability)) - - return Compose(transforms) - - def __len__(self): - return len(self.gen) - - def __getitem__(self, i): - X, y = self.gen[i] - batch_size = X.shape[0] - frame_length = X.shape[1] - - for i in range(batch_size): - data = self.transforms(X[i].flatten(), - sample_rate=self.sample_frequency) - X[i] = data.reshape((frame_length, 1)) - - return X, y \ No newline at end of file diff --git a/sslforslr/dataset/AudioDatasetGenerator.py b/sslforslr/dataset/AudioDatasetGenerator.py deleted file mode 100644 index a1b5f1c..0000000 --- a/sslforslr/dataset/AudioDatasetGenerator.py +++ /dev/null @@ -1,68 +0,0 @@ -import numpy as np -import math -import h5py -from tensorflow.keras.utils import Sequence - -class AudioDatasetGenerator(Sequence): - ''' - Keras generator to use with an existing cache file - created with AudioDatasetLoader. - ''' - - def __init__(self, - cache_path, - batch_size, - frame_length, - subset='train', - indices=None, - pick_random=False): - cache = h5py.File(cache_path, 'r') - self.X = cache[subset + '_x'] - self.y = cache[subset + '_y'] - - self.indices = np.arange(len(self.y)) if indices is None else indices - - self.batch_size = batch_size - self.frame_length = frame_length - self.pick_random = pick_random - - def __len__(self): - return math.ceil(len(self.indices) / self.batch_size) - - def get_random_frame(self, curr_batch_size, signals): - X_batch = np.empty((curr_batch_size, self.frame_length, 1)) - - for j in range(curr_batch_size): - signal = signals[j] - idx = np.random.randint(len(signal) - self.frame_length + 1) - X_batch[j, :, 0] = signal[idx:idx+self.frame_length] - - return X_batch - - def __getitem__(self, i): - curr_batch_size = self.batch_size - - # Last batch may have fewer samples - is_last_batch = i == self.__len__() - 1 - remaining_samples = len(self.indices) % self.batch_size - if is_last_batch and remaining_samples != 0: - curr_batch_size = remaining_samples - - # Shuffling a h5py dataset directly is not possible and indices - # must be in increasing order. 
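- # We therefore sort each batch's indices before reading from the HDF5 - # dataset; as a result the within-batch sample order may differ from - # the order in self.indices.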
- idx = self.indices[i*self.batch_size:i*self.batch_size+curr_batch_size] - idx = np.sort(idx) - - X_batch = self.X[idx] - y_batch = self.y[idx] - - if self.pick_random: - return self.get_random_frame(curr_batch_size, X_batch), y_batch - - # Expected output shape is (batch_size, frame_length, 1) - X_batch = np.expand_dims(X_batch, axis=-1) - return X_batch, y_batch - - def on_epoch_end(self): - # Randomize samples manually after each epoch - np.random.shuffle(self.indices) \ No newline at end of file diff --git a/sslforslr/dataset/AudioDatasetLoader.py b/sslforslr/dataset/AudioDatasetLoader.py deleted file mode 100644 index 91e7375..0000000 --- a/sslforslr/dataset/AudioDatasetLoader.py +++ /dev/null @@ -1,271 +0,0 @@ -import numpy as np -import math -from sklearn.model_selection import train_test_split -import soundfile as sf -import glob -import h5py -from tqdm import tqdm - -from .AudioDatasetGenerator import AudioDatasetGenerator - -def get_frames(filename, frames_config): - length = frames_config['length'] - with sf.SoundFile(filename, 'r') as f: - signal_length = f.frames - - if signal_length < length: - return [] - - if frames_config['pick'] == 'random': - return [] - - elif frames_config['pick'] == 'sequence': - stride = frames_config['stride'] - count = frames_config['count'] - - # Determine frames indexes - num_frames = 1 + math.floor((signal_length - length) / stride) - num_frames = min(num_frames, count) - indexes = np.arange(0, num_frames * stride, stride) - - return indexes - - raise Exception('AudioDatasetLoader: frames picking method not handled') - -def scan_librispeech(paths, limits_config, frames_config): - nb_speakers = 0 - filenames = [] - speakers = [] - - limit_speakers = limits_config.get('speakers', -1) - limit_utterances = limits_config.get('utterances_per_speaker', -1) - - # Scan datasets - for dataset_id in range(len(paths)): - speaker_dirs = glob.glob(paths[dataset_id]) - - if (len(speaker_dirs) == 0): - raise Exception('AudioDatasetLoader: no data found in %s' % paths[dataset_id]) - - for speaker_id in range(len(speaker_dirs)): - if nb_speakers == limit_speakers: - break - - nb_speaker_utterances = 0 - chapter_dirs = glob.glob(speaker_dirs[speaker_id] + '/*') - - for chapter_id in range(len(chapter_dirs)): - files = glob.glob(chapter_dirs[chapter_id] + '/*.flac') - - for file in files: - if nb_speaker_utterances == limit_utterances: - break - - frames = get_frames(file, frames_config) - for frame in frames: - filenames.append([file, frame]) - speakers.append(nb_speakers) - - if len(frames) != 0: - nb_speaker_utterances += 1 - - nb_speakers += 1 - - return filenames, speakers - -def scan_voxlingua107(paths, limits_config, frames_config): - nb_languages = 0 - filenames = [] - languages = [] - - limit_languages = limits_config.get('languages', -1) - limit_utterances = limits_config.get('utterances_per_language', -1) - - for dataset_id in range(len(paths)): - language_dirs = glob.glob(paths[dataset_id]) - - if (len(language_dirs) == 0): - raise Exception('AudioDatasetLoader: no data found in %s' % paths[dataset_id]) - - for language_id in range(len(language_dirs)): - if nb_languages == limit_languages: - break - - nb_language_utterances = 0 - files = glob.glob(language_dirs[language_id] + '/*.wav') - - for file in files: - if nb_language_utterances == limit_utterances: - break - - frames = get_frames(file, frames_config) - for frame in frames: - filenames.append([file, frame]) - languages.append(nb_languages) - - if len(frames) != 0: - nb_language_utterances += 1 - 
- nb_languages += 1 - - return filenames, languages - -class AudioDatasetLoader: - ''' - Class to create a cache file from LibriSpeech and VoxLingua107 datasets. - Features: - - sample audio frames randomly during training - - sample audio frames sequentially - - split train set to create val and test sets - - create val and test sets from different directories - - load only a specific amount of utterances, speakers, ... - ''' - - def __init__(self, seed, config): - np.random.seed(seed) - - self.type = config.get('type', 'LibriSpeech') - self.train_paths = config.get('train_paths', []) - self.val_paths = config.get('val_paths', []) - self.test_paths = config.get('test_paths', []) - self.val_ratio = config.get('val_ratio', None) - self.test_ratio = config.get('test_ratio', None) - self.frames = config['frames'] - self.limits = config.get('limits', {}) - - def create_cache(self, name, cache, paths): - if self.type == 'LibriSpeech': - filenames, labels = scan_librispeech(paths, self.limits, self.frames) - elif self.type == 'VoxLingua107': - filenames, labels = scan_voxlingua107(paths, self.limits, self.frames) - - nb_samples = len(filenames) - frame_length = self.frames['length'] - - # Create h5py dataset - if self.frames['pick'] == 'random': - # Picking frames randomly online implies storing - # frames of different length - dt = h5py.vlen_dtype(np.float64) - X = cache.create_dataset(name + '_x', (nb_samples,), dtype=dt) - else: - X = cache.create_dataset(name + '_x', (nb_samples, frame_length)) - y = cache.create_dataset(name + '_y', (nb_samples)) - - if nb_samples == 0: - return 0 - - for i in tqdm(range(nb_samples)): - filename, frame = filenames[i] - label = labels[i] - data, fs = sf.read(filename) - - if self.frames['pick'] == 'sequence': - data = data[frame:frame+frame_length] - - # Normalize input signal - max_value = np.max(np.abs(data)) - data = data / max_value if max_value != 0 else np.zeros_like(data) - - X[i] = data - y[i] = label - - return len(np.unique(labels)) - - def create_gens(self, cache_path, batch_size): - random = (self.frames['pick'] == 'random') - frame_length = self.frames['length'] - - train = AudioDatasetGenerator(cache_path, - batch_size, - frame_length, - subset='train', - pick_random=random) - - val = AudioDatasetGenerator(cache_path, - batch_size, - frame_length, - subset='val', - pick_random=random) - - test = AudioDatasetGenerator(cache_path, - batch_size, - frame_length, - subset='test', - pick_random=random) - - return [train, val, test] - - def create_gens_with_ratio(self, cache_path, nb_train_samples, batch_size): - indices = np.arange(nb_train_samples) - indices_train = [] - indices_val = [] - indices_test = [] - - if self.val_ratio > 0.0 and self.test_ratio > 0.0: - ratio = self.val_ratio + self.test_ratio - indices_train, indices_test = train_test_split(indices, test_size=ratio) - ratio = self.test_ratio / (self.test_ratio + self.val_ratio) - indices_val, indices_test = train_test_split(indices_test, test_size=ratio) - elif self.val_ratio > 0.0: - indices_train, indices_val = train_test_split(indices, test_size=self.val_ratio) - else: # self.test_ratio > 0.0 - indices_train, indices_test = train_test_split(indices, test_size=self.test_ratio) - - # Handle train ratio - train_ratio = self.limits.get('train_ratio', 1.0) - idx = np.random.choice(len(indices_train), - int(train_ratio * len(indices_train)), - replace=False) - indices_train = indices_train[idx] - - random = (self.frames['pick'] == 'random') - frame_length = self.frames['length'] - - train = 
AudioDatasetGenerator(cache_path, - batch_size, - frame_length, - indices=indices_train, - pick_random=random) - - val = AudioDatasetGenerator(cache_path, - batch_size, - frame_length, - indices=indices_val, - pick_random=random) - - test = AudioDatasetGenerator(cache_path, - batch_size, - frame_length, - indices=indices_test, - pick_random=random) - - return [train, val, test] - - def load(self, batch_size, checkpoint_dir): - # Create cache during first use - cache_path = checkpoint_dir + '/AudioDatasetLoader_cache.h5' - cache = h5py.File(cache_path, 'a') - if len(cache) == 0: - print('==== AudioDatasetLoader') - print('Creating dataset cache...') - nb_categories = self.create_cache('train', cache, self.train_paths) - nb_categories += self.create_cache('val', cache, self.val_paths) - nb_categories += self.create_cache('test', cache, self.test_paths) - print('====') - - cache.attrs.create('nb_categories', nb_categories) - - nb_categories = cache.attrs['nb_categories'] - nb_train_samples = len(cache['train_y']) - cache.close() - - # Create Keras generators - if self.val_ratio is not None and self.test_ratio is not None: - return self.create_gens_with_ratio(cache_path, - nb_train_samples, - batch_size), nb_categories - elif self.val_ratio is not None or self.test_ratio is not None: - raise Exception('AudioDatasetLoader: you must specify both val_ratio and test_ratio') - - return self.create_gens(cache_path,batch_size), nb_categories \ No newline at end of file diff --git a/sslforslr/dataset/KaldiDatasetLoader.py b/sslforslr/dataset/KaldiDatasetLoader.py index 4de6e63..12169c2 100644 --- a/sslforslr/dataset/KaldiDatasetLoader.py +++ b/sslforslr/dataset/KaldiDatasetLoader.py @@ -1,26 +1,16 @@ import numpy as np from tensorflow.keras.utils import Sequence -import kaldiio import soundfile as sf from sklearn.model_selection import train_test_split import torch import torchaudio -def extract_mfcc(audio): - mfcc = torchaudio.compliance.kaldi.mfcc(torch.from_numpy(audio.T), - num_ceps=30, - num_mel_bins=30) - mfcc = torchaudio.transforms.SlidingWindowCmn(norm_vars=False)(mfcc) - return mfcc.numpy() - class KaldiDatasetGenerator(Sequence): - def __init__(self, batch_size, frames_config, rxfiles, labels, indices): + def __init__(self, batch_size, frame_length, files, indices): self.batch_size = batch_size - self.frame_length = frames_config['length'] - self.extract_mfcc = frames_config.get('extract_mfcc', False) - self.rxfiles = rxfiles - self.labels = labels + self.frame_length = frame_length + self.files = files self.indices = indices def __len__(self): @@ -33,19 +23,15 @@ def __getitem__(self, i): for j in range(self.batch_size): index = self.indices[i * self.batch_size + j] - sample, sr = sf.read(self.rxfiles[index]) + sample, sr = sf.read(self.files[index]) data = sample.reshape((len(sample), 1)) - label = self.labels[index] - - if self.extract_mfcc: - data = extract_mfcc(sample) assert len(data) >= self.frame_length offset = np.random.randint(0, len(data) - self.frame_length + 1) data = data[offset:offset+self.frame_length] X.append(data) - y.append(label) + y.append(0) return np.array(X), np.array(y) @@ -53,56 +39,28 @@ class KaldiDatasetLoader: def __init__(self, seed, config): self.config = config - self.create_utt2spkid() - - # Create a list for rxfiles and labels of each utterance - self.rxfiles = [] - self.labels = [] - for line in open(self.config['scp']): - # Parse scp line - line_parts = line.rstrip().split() - utt = line_parts[0] - rxfile = ' '.join(line_parts[1:]) - - 
self.rxfiles.append(rxfile) - self.labels.append(self.utt2spkid[utt]) - - def create_utt2spkid(self): - # Associate each utterance to a unique speaker id (starting from 0) - self.utt2spkid = {} - speaker_ids = {} - current_speaker_id = 0 - for line in open(self.config['utt2spk']): - utt, label = line.rstrip().split() - if label in speaker_ids: - label = speaker_ids[label] - else: - speaker_ids[label] = current_speaker_id - label = current_speaker_id - current_speaker_id += 1 - self.utt2spkid[utt] = label - self.nb_categories = current_speaker_id + self.files = [] + for line in open(self.config['train']): + _, file = line.rstrip().split() + self.files.append(file) - def load(self, batch_size, checkpoint_dir): - frames_config = self.config['frames'] + def load(self, batch_size): val_ratio = self.config.get('val_ratio', 0.1) # Create list of indices to shuffle easily during training max_samples = self.config.get('max_samples', None) - nb_samples = max_samples if max_samples else len(self.labels) + nb_samples = max_samples if max_samples else len(self.files) indices = np.arange(nb_samples) indices_train, indices_val = train_test_split(indices, test_size=val_ratio) train = KaldiDatasetGenerator(batch_size, - frames_config, - self.rxfiles, - self.labels, + self.config['frame_length'], + self.files, indices_train) val = KaldiDatasetGenerator(batch_size, - frames_config, - self.rxfiles, - self.labels, + self.config['frame_length'], + self.files, indices_val) - return [train, val], self.nb_categories + return (train, val) diff --git a/sslforslr/models/multitask/MultiTask.py b/sslforslr/models/multitask/MultiTask.py deleted file mode 100644 index cea5967..0000000 --- a/sslforslr/models/multitask/MultiTask.py +++ /dev/null @@ -1,413 +0,0 @@ -import librosa -import numpy as np -import tensorflow as tf -from tensorflow.keras import Model -from tensorflow.keras.layers import Layer -from tensorflow.keras.layers import PReLU -from tensorflow.keras.layers import Conv1D -from tensorflow.keras.layers import Conv1DTranspose -from tensorflow.keras.utils import Sequence -from tensorflow.keras.losses import MeanAbsoluteError, MeanSquaredError -from tensorflow.keras import regularizers - -from sslforslr.models.cpc import CPCModel, cpc_loss -from sslforslr.models.lim import LIMModel, lim_loss - -class MultiTaskModel(Model): - ''' - Keras model combining different self-supervised workers - similarly to PASE and PASE+. - - "Multi-task self-supervised learning for Robust Speech Recognition" - Mirco Ravanelli et al. 
- https://arxiv.org/pdf/2001.09239.pdf - ''' - - def __init__(self, encoder, in_shape, modules): - super(MultiTaskModel, self).__init__() - - self.encoder = encoder - self.in_shape = in_shape - self.modules = self.create_modules(modules) - - def create_modules(self, modules_config): - modules = {} - - for module in modules_config: - module_type = module['type'] - loss_scaler = module.get('loss_scaler', 1.0) - weight_regularizer = module.get('weight_regularizer', 0.0) - - encoder_output_shape = self.encoder.compute_output_shape(self.in_shape) - nb_timesteps = encoder_output_shape[0] - encoded_dim = encoder_output_shape[1] - - if module_type == 'CPC': - nb_timesteps_to_predict = module['nb_timesteps_to_predict'] - bidirectional = module.get('bidirectional', False) - module_model = CPCModel(self.encoder, - encoded_dim, - nb_timesteps, - nb_timesteps_to_predict, - bidirectional, - weight_regularizer) - modules[module_type] = CPCWorker(module_model, loss_scaler) - elif module_type == 'LIM': - loss_fn = module['loss_fn'] - context_length = module.get('context_length', 1) - module_model = LIMModel(self.encoder, - nb_timesteps, - loss_fn, - context_length, - weight_regularizer) - modules[module_type] = LIMWorker(module_model, loss_scaler) - elif module_type == 'Waveform': - modules[module_type] = WaveformWorker(weight_regularizer, loss_scaler) - elif module_type == 'MFCC': - modules[module_type] = MFCCWorker(weight_regularizer, loss_scaler) - elif module_type == 'LPS': - modules[module_type] = LPSWorker(weight_regularizer, loss_scaler) - - return modules - - def add_targets_to_gen(self, gen): - return WorkerTargetsGenerator(gen, self.modules) - - def compile(self, optimizer, **kwargs): - super(MultiTaskModel, self).compile(**kwargs) - self.optimizer = optimizer - - def call(self, X): - return self.encoder(X) - - def train_step(self, data): - X, Y = data - total_loss = 0 - losses = {name:0 for name in self.modules.keys()} - trainable_params = [] - - with tf.GradientTape() as tape: - X_encoded = self.encoder(X, training=True) - - for module_type, model in self.modules.items(): - # Handle module CPC bidirectional - if module_type == 'CPC' and model.cpc.bidirectional: - X_r = tf.reverse(X, axis=[1]) - X_encoded_r = self.encoder(X_r, training=True) - Y_pred = model((X_encoded, X_encoded_r), training=True) - else: - Y_pred = model(X_encoded, training=True) - - Y_target = Y.get(module_type, None) - - loss = model.compute_loss(Y_target, Y_pred) - - total_loss += loss - losses[module_type] += loss - trainable_params += model.trainable_weights - - trainable_params += self.encoder.trainable_weights - grads = tape.gradient(total_loss, trainable_params) - self.optimizer.apply_gradients(zip(grads, trainable_params)) - - losses['loss'] = total_loss - return losses - - def test_step(self, data): - X, Y = data - total_loss = 0 - losses = {name:0 for name in self.modules.keys()} - - X_encoded = self.encoder(X, training=False) - - for module_type, model in self.modules.items(): - # Handle module CPC bidirectional - if module_type == 'CPC' and model.cpc.bidirectional: - X_r = tf.reverse(X, axis=[1]) - X_encoded_r = self.encoder(X_r, training=False) - Y_pred = model((X_encoded, X_encoded_r), training=False) - else: - Y_pred = model(X_encoded, training=False) - - Y_target = Y.get(module_type, None) - - loss = model.compute_loss(Y_target, Y_pred) - - total_loss += loss - losses[module_type] += loss - - losses['loss'] = total_loss - return losses - - -class WorkerTargetsGenerator(Sequence): - - def __init__(self, gen, 
modules): - self.gen = gen - self.modules = modules - - def __len__(self): - return len(self.gen) - - def __getitem__(self, batch_id): - X, _ = self.gen[batch_id] - Y = {} - - for module_type, model in self.modules.items(): - if module_type in ['Waveform', 'MFCC', 'LPS']: - Y[module_type] = model.get_target(X) - - return X, Y - - -class LPSWorker(Model): - - def __init__(self, - weight_regularizer, - loss_scaler, - fft_length=2048, - hop_length=160): - super(LPSWorker, self).__init__() - - self.reg = regularizers.l2(weight_regularizer) - self.loss_scaler = loss_scaler - - self.fft_length = fft_length - self.hop_length = hop_length - self.nb_outputs = fft_length // 2 + 1 - - self.conv1 = Conv1D(filters=256, - kernel_size=1, - padding='same', - kernel_regularizer=self.reg, - bias_regularizer=self.reg) - - # PReLU shared_axes option implies that one parameter - # per channel will be learned. - self.activation1 = PReLU(shared_axes=[1]) - - self.last_conv = Conv1D(filters=self.nb_outputs, - kernel_size=1, - padding='same', - kernel_regularizer=self.reg, - bias_regularizer=self.reg) - - def call(self, X): - X = self.conv1(X) - X = self.activation1(X) - X = self.last_conv(X) - return X - - def get_target(self, X): - frame_length = X.shape[1] - - Y = tf.signal.stft(np.squeeze(X, axis=-1), - frame_length=self.hop_length, - frame_step=self.hop_length, - fft_length=self.fft_length) - Y = tf.math.abs(Y) - Y = 10 * tf.experimental.numpy.log10(Y ** 2) - return Y - - def compute_loss(self, Y, Y_pred): - return MeanSquaredError()(Y, Y_pred) * self.loss_scaler - - -class MFCCWorker(Model): - - def __init__(self, - weight_regularizer, - loss_scaler, - sample_frequency=16000, - nb_coefficients=20, - hop_length=160): - super(MFCCWorker, self).__init__() - - self.reg = regularizers.l2(weight_regularizer) - self.loss_scaler = loss_scaler - - self.sample_frequency = sample_frequency - self.nb_coefficients = nb_coefficients - self.hop_length = hop_length - - self.conv1 = Conv1D(filters=256, - kernel_size=1, - padding='same', - kernel_regularizer=self.reg, - bias_regularizer=self.reg) - - # PReLU shared_axes option implies that one parameter - # per channel will be learned. 
- self.activation1 = PReLU(shared_axes=[1]) - - self.last_conv = Conv1D(filters=nb_coefficients, - kernel_size=1, - padding='same', - kernel_regularizer=self.reg, - bias_regularizer=self.reg) - - def call(self, X): - X = self.conv1(X) - X = self.activation1(X) - X = self.last_conv(X) - return X - - def get_target(self, X): - max_frames = X.shape[1] // self.hop_length - res = np.empty((X.shape[0], max_frames, self.nb_coefficients)) - - for i in range(X.shape[0]): - mfcc = librosa.feature.mfcc(X[i].flatten(), - sr=self.sample_frequency, - hop_length=self.hop_length).T - res[i] = mfcc[:max_frames, :] - - return res - - def compute_loss(self, Y, Y_pred): - return MeanSquaredError()(Y, Y_pred) * self.loss_scaler - - -class WaveformWorkerBlock(Layer): - - def __init__(self, filters, kernel_size, stride, reg, **kwargs): - super(WaveformWorkerBlock, self).__init__(**kwargs) - - self.conv = Conv1DTranspose(filters=filters, - kernel_size=kernel_size, - strides=stride, - kernel_regularizer=reg, - bias_regularizer=reg) - # self.normalization = BatchNormalization(center=False, scale=False) - # self.activation = PReLU(shared_axes=[1]) - - def call(self, X): - X = self.conv(X) - # X = self.normalization(X) - # X = self.activation(X) - return X - - -class WaveformWorker(Model): - - def __init__(self, weight_regularizer, loss_scaler): - super(WaveformWorker, self).__init__() - - self.reg = regularizers.l2(weight_regularizer) - self.loss_scaler = loss_scaler - - self.nb_filters = [256, 256, 128, 128, 128, 64] - self.kernel_sizes = [2, 2, 2, 2, 2, 5] - self.strides = [2, 2, 2, 2, 2, 5] - - self.blocks = [] - for i, (f, w, s) in enumerate(zip(self.nb_filters, - self.kernel_sizes, - self.strides)): - self.blocks.append(WaveformWorkerBlock(f, w, s, self.reg)) - - self.conv1 = Conv1D(filters=64, - kernel_size=1, - padding='same', - kernel_regularizer=self.reg, - bias_regularizer=self.reg) - self.activation1 = PReLU(shared_axes=[1]) - - self.last_conv = Conv1D(filters=1, - kernel_size=1, - kernel_regularizer=self.reg, - bias_regularizer=self.reg) - - def call(self, X): - for block in self.blocks: - X = block(X) - X = self.conv1(X) - X = self.activation1(X) - X = self.last_conv(X) - return X - - def get_target(self, X): - return X - - def compute_loss(self, Y, Y_pred): - return MeanAbsoluteError()(Y, Y_pred) * self.loss_scaler - - -class CPCWorker(Model): - - def __init__(self, - cpc, - loss_scaler): - super(CPCWorker, self).__init__() - - self.cpc = cpc - self.loss_scaler = loss_scaler - - def call(self, X_encoded): - if self.cpc.bidirectional: - X_encoded, X_encoded_r = X_encoded[0], X_encoded[1] - - # X_encoded = audio sequence in correct order - X_past_encoded = X_encoded[:, 0:self.cpc.nb_timesteps_for_context, ...] - X_future_encoded = X_encoded[:, self.cpc.nb_timesteps_for_context:, ...] - X_past_context = self.cpc.ar1(X_past_encoded, training=True) - predictions = self.cpc.predictor1(X_past_context, training=True) - - if not self.cpc.bidirectional: - return predictions, X_future_encoded - - # X_encoded_r = audio sequence in reversed order - X_past_encoded_r = X_encoded_r[:, 0:self.cpc.nb_timesteps_for_context, ...] - X_future_encoded_r = X_encoded_r[:, self.cpc.nb_timesteps_for_context:, ...] 
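- # A second autoregressive model (ar2) and its own predictor handle the - # time-reversed stream; compute_loss then averages the forward and - # backward CPC losses.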
- X_past_context_r = self.cpc.ar2(X_past_encoded_r, training=True) - predictions_r = self.cpc.predictor2(X_past_context_r, training=True) - - return predictions, X_future_encoded, predictions_r, X_future_encoded_r - - def compute_loss(self, Y, Y_pred): - # Y is empty and Y_pred contains tensors computed during last call - if self.cpc.bidirectional: - predictions, X_future_encoded, predictions_r, X_future_encoded_r = Y_pred - else: - predictions, X_future_encoded = Y_pred - - loss, _ = cpc_loss(self.cpc.nb_timesteps_to_predict, - predictions, - X_future_encoded) - - if self.cpc.bidirectional: - loss2, _ = cpc_loss(self.cpc.nb_timesteps_to_predict, - predictions_r, - X_future_encoded_r) - loss = (loss + loss2) / 2.0 - - return loss * self.loss_scaler - - -class LIMWorker(Model): - - def __init__(self, - lim, - loss_scaler): - super(LIMWorker, self).__init__() - - self.lim = lim - self.loss_scaler = loss_scaler - - def call(self, X_encoded): - C1, C2, CR = self.lim.extract_chunks(X_encoded) - - C1_and_C2 = tf.concat([C1, C2], axis=1) - C1_and_CR = tf.concat([C1, CR], axis=1) - - pos = self.lim.discriminator(C1_and_C2, training=True) - neg = self.lim.discriminator(C1_and_CR, training=True) - - return pos, neg - - def compute_loss(self, Y, Y_pred): - # Y is empty and Y_pred contains tensors computed during last call - pos, neg = Y_pred - - loss, _ = lim_loss(self.lim.loss_fn, pos, neg) - return loss * self.loss_scaler \ No newline at end of file diff --git a/sslforslr/models/multitask/__init__.py b/sslforslr/models/multitask/__init__.py deleted file mode 100644 index 8c379ce..0000000 --- a/sslforslr/models/multitask/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .MultiTask import MultiTaskModel \ No newline at end of file diff --git a/sslforslr/models/vqwav2vec/VQWav2Vec.py b/sslforslr/models/vqwav2vec/VQWav2Vec.py deleted file mode 100644 index e8710c2..0000000 --- a/sslforslr/models/vqwav2vec/VQWav2Vec.py +++ /dev/null @@ -1,266 +0,0 @@ -import numpy as np -import tensorflow as tf -from tensorflow.keras import Model -from tensorflow.keras.layers import Dense, Conv1D, LayerNormalization, Dropout -from tensorflow_addons.layers import GELU -from tensorflow.keras import regularizers -from tensorflow.keras import losses - -from .VQWav2VecConfig import VQWav2VecConfig -from sslforslr.modules import TransformerEncoder, VectorQuantizer - -class VQWav2VecModel(Model): - ''' - vq-wav2vec implemented as a Keras model. - - It combines the principle of CPC and a quantization module. 
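- Quantized past timesteps are masked and summarized by a transformer into - a context vector, from which the future quantized timesteps are predicted - with a contrastive loss plus diversity and feature penalties.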
- - "vq-wav2vec: Self-Supervised Learning of Discrete Speech Representations" - Alexei Baevski, Steffen Schneider, Michael Auli - https://arxiv.org/pdf/1910.05453.pdf - ''' - - def __init__(self, config: VQWav2VecConfig): - super().__init__() - - self.config = config - - self.encoder = VQWav2VecEncoder(config.encoder_conv_layers) - self.quantizer = VectorQuantizer(input_dim=config.encoder_dim, - dim=config.quantizer_dim, - nb_groups=config.quantizer_nb_groups, - nb_vars=config.quantizer_nb_vars, - temp=config.quantizer_temp) - self.transformer = TransformerEncoder(config) - self.predictor = Predictor(config.quantizer_dim, - config.nb_timesteps_to_predict) - self.layer_norm = LayerNormalization() - self.dropout = Dropout(config.dropout) - self.proj_before_transformer = Dense(config.transformer_dim) - - self.mask_weights = self.add_weight( - name='mask_weights', - shape=(self.config.transformer_dim,), - initializer=tf.keras.initializers.RandomUniform(minval=0, maxval=1), - trainable=True - ) - - def compile(self, optimizer, **kwargs): - super().compile(**kwargs) - self.optimizer = optimizer - - def call(self, X): - Z = self.encoder(X, training=False) - Z = self.layer_norm(Z) - Z = self.dropout(Z) - - Q, _ = self.quantizer(Z, training=False) - Q.set_shape((Q.shape[0], - Q.shape[1], - self.config.quantizer_dim)) - - Q = self.proj_before_transformer(Q) - C = self.transformer(Q, training=False) - C = C[:, -1, :] # Keep only last timestep - - return C - - def get_mask_indices(self, Z): - B, T, F = Z.numpy().shape - - num_mask = int( - (self.config.mask_prob * T) - / float(self.config.mask_length) - + np.random.rand() - ) - - indices = [] - for i in range(B): - mask_idx = np.random.choice(T - self.config.mask_length, - num_mask, - replace=False) - mask_idx = np.asarray( - [ - mask_idx[j] + offset - for j in range(num_mask) - for offset in range(self.config.mask_length) - ] - ) - mask_idx = np.unique(mask_idx[mask_idx < T]) - - # FIXME: better vectorization - for j in mask_idx: - indices.append([i, j]) - - return tf.convert_to_tensor(indices) - - @tf.function - def apply_mask(self, Z): - mask_indices = tf.py_function(func=self.get_mask_indices, - inp=[Z], - Tout=tf.int32) - nb_masked_timesteps = tf.shape(mask_indices)[0] - mask_updates = tf.repeat(self.mask_weights, [nb_masked_timesteps]) - mask_updates = tf.reshape(mask_updates, (nb_masked_timesteps, -1)) - Z = tf.tensor_scatter_nd_update(Z, mask_indices, mask_updates) - return Z - - @tf.function - def compute_loss(self, predictions, Q_future, diversity_loss, features_loss): - # preds shape: (B, nb_timesteps_to_predict, quantizer_dim) - # Q_future shape: (B, nb_timesteps_to_predict, quantizer_dim) - - batch_size = tf.shape(predictions)[0] - - losses = tf.zeros((batch_size)) - - for t in range(self.config.nb_timesteps_to_predict): - dot = tf.linalg.matmul(Q_future[:, t, :], - predictions[:, t, :], - transpose_b=True) - - # Determine loss - log_softmax_dot = tf.nn.log_softmax(dot, axis=0) - diag = tf.linalg.tensor_diag_part(log_softmax_dot) - losses += diag - - losses /= tf.cast(self.config.nb_timesteps_to_predict, dtype=tf.float32) - - # Determine accuracy - softmax_dot = tf.nn.softmax(dot, axis=0) - pred_indices = tf.math.argmax(softmax_dot, axis=0, output_type=tf.int32) - preds_acc = tf.math.equal(pred_indices, tf.range(0, batch_size)) - accuracies = tf.math.count_nonzero(preds_acc, dtype=tf.int32) / batch_size - - # Compute the average loss and accuracy across all batches - loss = tf.math.reduce_mean(losses) - accuracy = 
tf.math.reduce_mean(accuracies) - - # Add additional losses: features penalty, codebook penalty - d_loss = self.config.diversity_loss_weight * diversity_loss - f_loss = self.config.features_loss_weight * features_loss - - return -loss + d_loss + f_loss, accuracy - - def train_step(self, data): - X, _ = data # Discard Y provided by the dataset generator - - with tf.GradientTape() as tape: - # X shape: (B, T, 1) - - Z = self.encoder(X, training=True) - # Z shape: (B, T, encoded_dim) - - features_loss = tf.math.reduce_mean(tf.math.pow(Z, 2)) - - Z = self.layer_norm(Z) - Z = self.dropout(Z) - - Q, diversity_loss = self.quantizer(Z, training=True) - Q.set_shape((Q.shape[0], - Q.shape[1], - self.config.quantizer_dim)) - # Q shape: (B, T, quantizer_dim) - - # Split past and future timesteps - Q_past = Q[:, 0:self.config.nb_timesteps_for_context, ...] - Q_future = Q[:, self.config.nb_timesteps_for_context:, ...] - - # Apply mask on Q_past and determine context C from past timesteps - Q_past = self.proj_before_transformer(Q_past) - Q_past = self.apply_mask(Q_past) - C = self.transformer(Q_past, training=True) - C = C[:, -1, :] # Keep only last timestep - # C shape: (B, transformer_dim) - - # Compute predictions with C - preds = self.predictor(C, training=True) - # preds shape: (B, nb_timesteps_to_predict, quantizer_dim) - - # Contrastive loss between predictions and Q_future - loss, accuracy = self.compute_loss(preds, - Q_future, - diversity_loss, - features_loss) - - trainable_params = self.trainable_weights - trainable_params += self.encoder.trainable_weights - trainable_params += self.transformer.trainable_weights - trainable_params += self.predictor.trainable_weights - trainable_params += self.quantizer.trainable_weights - grads = tape.gradient(loss, trainable_params) - self.optimizer.apply_gradients(zip(grads, trainable_params)) - - return { 'loss': loss, 'accuracy': accuracy } - - def test_step(self, data): - X, _ = data # Discard Y provided by the dataset generator - - Z = self.encoder(X, training=True) - - features_loss = tf.math.reduce_mean(tf.math.pow(Z, 2)) - - Z = self.layer_norm(Z) - Z = self.dropout(Z) - - Q, diversity_loss = self.quantizer(Z, training=False) - Q.set_shape((Q.shape[0], - Q.shape[1], - self.config.quantizer_dim)) - - # Split past and future timesteps - Q_past = Q[:, 0:self.config.nb_timesteps_for_context, ...] - Q_future = Q[:, self.config.nb_timesteps_for_context:, ...] 
-
-        # Apply mask on Q_past and determine context C from past timesteps
-        Q_past = self.proj_before_transformer(Q_past)
-        Q_past = self.apply_mask(Q_past)
-        C = self.transformer(Q_past, training=False)
-        C = C[:, -1, :] # Keep only last timestep
-
-        # Compute predictions with C
-        preds = self.predictor(C, training=False)
-
-        # Contrastive loss between predictions and Q_future
-        loss, accuracy = self.compute_loss(preds,
-                                           Q_future,
-                                           diversity_loss,
-                                           features_loss)
-
-        return { 'loss': loss, 'accuracy': accuracy }
-
-
-class VQWav2VecEncoder(Model):
-
-    def __init__(self, config):
-        super().__init__()
-
-        conv_layers = eval(config)
-
-        self.layers_ = []
-        for dim, size, stride in conv_layers:
-            self.layers_.append(Conv1D(dim, size, strides=stride, padding='same'))
-            self.layers_.append(LayerNormalization())
-            self.layers_.append(GELU())
-
-    def call(self, X):
-        for layer in self.layers_:
-            X = layer(X)
-        return X
-
-
-class Predictor(Model):
-
-    def __init__(self, encoded_dim, nb_timesteps_to_predict):
-        super(Predictor, self).__init__()
-
-        self.layers_ = []
-        for i in range(nb_timesteps_to_predict):
-            self.layers_.append(Dense(units=encoded_dim))
-
-    def call(self, context):
-        predictions = []
-        for layer in self.layers_:
-            predictions.append(layer(context))
-
-        return tf.stack(predictions, axis=1)
\ No newline at end of file
diff --git a/sslforslr/models/vqwav2vec/VQWav2VecConfig.py b/sslforslr/models/vqwav2vec/VQWav2VecConfig.py
deleted file mode 100644
index e741c9b..0000000
--- a/sslforslr/models/vqwav2vec/VQWav2VecConfig.py
+++ /dev/null
@@ -1,98 +0,0 @@
-from dataclasses import dataclass, field
-from typing import List, Tuple
-
-@dataclass
-class VQWav2VecConfig:
-    nb_timesteps_to_predict: int = field(
-        default=12,
-        metadata={
-            "help": "Number of timesteps to predict in contrastive loss"
-        }
-    )
-
-    nb_timesteps_for_context: int = field(
-        default=128-12,
-        metadata={
-            "help": "Number of timesteps used for context in contrastive loss"
-        }
-    )
-
-    encoder_conv_layers: str = field(
-        default="[(512, 10, 5)] + [(512, 8, 4)] + [(512, 4, 2)] * 3",
-        metadata={
-            "help": "Hyperparameters of encoder conv layers. "
-                    "Format: [(dim, kernel_size, stride), ...]"}
-    )
-
-    encoder_dim: int = field(
-        default=512,
-        metadata={
-            "help": "Dimension of encoder embeddings"}
-    )
-
-    transformer_dim: int = field(
-        default=768,
-        metadata={"help": "Input dimension for transformer"}
-    )
-
-    transformer_layers: int = field(
-        default=12,
-        metadata={"help": "Number of layers for transformer"}
-    )
-
-    transformer_attention_heads: int = field(
-        default=12,
-        metadata={"help": "Number of attention heads for transformer"}
-    )
-
-    transformer_ffn_dim: int = field(
-        default=3072, metadata={"help": "Embedding dimension for FFN in transformer"}
-    )
-
-    quantizer_dim: int = field(
-        default=256,
-        metadata={"help": "Dimension of codebooks vectors"}
-    )
-
-    quantizer_nb_groups: int = field(
-        default=2,
-        metadata={"help": "Number of codebooks groups"}
-    )
-
-    quantizer_nb_vars: int = field(
-        default=320,
-        metadata={"help": "Number of codebooks vars (per group)"}
-    )
-
-    quantizer_temp: Tuple[float, float, float] = field(
-        default=(2, 0.5, 0.999995),
-        metadata={
-            "help": "Temperature for Gumble softmax during vector quantization. "
-                    "Format: (start, end, decay)"
-        },
-    )
-
-    dropout: float = field(
-        default=0.1,
-        metadata={"help": "Dropout rate applied to features after encoder"}
-    )
-
-    mask_length: int = field(
-        default=10,
-        metadata={"help": "Number of timesteps to mask"}
-    )
-
-    mask_prob: float = field(
-        default=0.65,
-        metadata={"help": "Probability of masking a timestep"}
-    )
-
-    diversity_loss_weight: float = field(
-        default=0.1,
-        metadata={"help": "Weight of diversity loss used on quantizer"}
-    )
-
-    features_loss_weight: float = field(
-        default=10,
-        metadata={"help": "Weight of features penalty loss"}
-    )
\ No newline at end of file
diff --git a/sslforslr/models/vqwav2vec/__init__.py b/sslforslr/models/vqwav2vec/__init__.py
deleted file mode 100644
index 65e3e03..0000000
--- a/sslforslr/models/vqwav2vec/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .VQWav2Vec import VQWav2VecModel
-from .VQWav2VecConfig import VQWav2VecConfig
\ No newline at end of file
diff --git a/sslforslr/models/wav2vec2/Wav2Vec2.py b/sslforslr/models/wav2vec2/Wav2Vec2.py
deleted file mode 100644
index c13cf6a..0000000
--- a/sslforslr/models/wav2vec2/Wav2Vec2.py
+++ /dev/null
@@ -1,254 +0,0 @@
-import numpy as np
-import tensorflow as tf
-from tensorflow.keras import Model
-from tensorflow.keras.layers import Dense, Conv1D, LayerNormalization, Dropout
-from tensorflow_addons.layers import GELU
-from tensorflow.keras import regularizers
-from tensorflow.keras import losses
-
-from .Wav2Vec2Config import Wav2Vec2Config
-from sslforslr.modules import TransformerEncoder, VectorQuantizer
-
-class Wav2Vec2Model(Model):
-    '''
-    wav2vec 2.0 implemented as a Keras model.
-
-    "wav2vec 2.0: A Framework for Self-Supervised Learning
-    of Speech Representations"
-    Alexei Baevski et al.
-    https://arxiv.org/pdf/2006.11477.pdf
-    '''
-
-    def __init__(self, config: Wav2Vec2Config):
-        super().__init__()
-
-        self.config = config
-
-        self.encoder = Wav2Vec2Encoder(config.encoder_conv_layers)
-        self.quantizer = VectorQuantizer(input_dim=config.encoder_dim,
-                                         dim=config.quantizer_dim,
-                                         nb_groups=config.quantizer_nb_groups,
-                                         nb_vars=config.quantizer_nb_vars,
-                                         temp=config.quantizer_temp)
-        self.transformer = TransformerEncoder(config)
-        self.layer_norm = LayerNormalization()
-        self.dropout = Dropout(config.dropout)
-        self.proj_Z = Dense(config.transformer_dim)
-        self.proj_Q = Dense(config.quantizer_dim)
-        self.proj_C = Dense(config.quantizer_dim)
-
-        self.mask_weights = self.add_weight(
-            name='mask_weights',
-            shape=(self.config.transformer_dim,),
-            initializer=tf.keras.initializers.RandomUniform(minval=0, maxval=1),
-            trainable=True
-        )
-
-    def compile(self, optimizer, **kwargs):
-        super().compile(**kwargs)
-        self.optimizer = optimizer
-
-    def call(self, X):
-        Z = self.encoder(X, training=False)
-        Z = self.layer_norm(Z)
-        Z = self.proj_Z(Z)
-        Z = self.dropout(Z)
-        C = self.transformer(Z, training=False)
-        # C shape: (B, T, transformer_dim)
-        return C
-
-    @tf.function
-    def sample_negatives(self, Q):
-        # Q shape: (B, T, self.config.quantizer_dim)
-
-        B = tf.shape(Q)[0]
-        T = tf.shape(Q)[1]
-        F = tf.shape(Q)[2]
-
-        nb_negatives = self.config.nb_negatives
-
-        shift_utterances = tf.range(B)
-        shift_utterances = tf.roll(shift_utterances, shift=-1, axis=0)
-        shift_utterances = tf.repeat(shift_utterances, T * nb_negatives) * T
-        shift_utterances = tf.reshape(shift_utterances, (B, -1))
-
-        idxs = tf.random.uniform(shape=[B, nb_negatives * T],
-                                 minval=0,
-                                 maxval=T,
-                                 dtype=tf.int32)
-
-        idxs = idxs + shift_utterances
-        idxs = tf.reshape(idxs, [-1])
-
-        Q = tf.reshape(Q, (B * T, F))
-
-        Q_negs = tf.gather(Q, idxs)
-        Q_negs = tf.reshape(Q_negs, (B, T, nb_negatives, F))
-        Q_negs = tf.transpose(Q_negs, perm=[2, 0, 1, 3])
-        return Q_negs
-
-    def get_mask_indices(self, Z):
-        B, T, F = Z.numpy().shape
-
-        num_mask = int(
-            (self.config.mask_prob * T)
-            / float(self.config.mask_length)
-            + np.random.rand()
-        )
-
-        indices = []
-        for i in range(B):
-            mask_idx = np.random.choice(T - self.config.mask_length,
-                                        num_mask,
-                                        replace=False)
-            mask_idx = np.asarray(
-                [
-                    mask_idx[j] + offset
-                    for j in range(num_mask)
-                    for offset in range(self.config.mask_length)
-                ]
-            )
-            mask_idx = np.unique(mask_idx[mask_idx < T])
-
-            # FIXME: better vectorization
-            for j in mask_idx:
-                indices.append([i, j])
-
-        return tf.convert_to_tensor(indices)
-
-    @tf.function
-    def apply_mask(self, Z):
-        mask_indices = tf.py_function(func=self.get_mask_indices,
-                                      inp=[Z],
-                                      Tout=tf.int32)
-        nb_masked_timesteps = tf.shape(mask_indices)[0]
-        mask_updates = tf.repeat(self.mask_weights, [nb_masked_timesteps])
-        mask_updates = tf.reshape(mask_updates, (nb_masked_timesteps, -1))
-        Z = tf.tensor_scatter_nd_update(Z, mask_indices, mask_updates)
-        return Z
-
-    @tf.function
-    def compute_loss(self, C, Q, Q_negs, diversity_loss, features_loss):
-        # Q shape: (B, T, F)
-        # Q_negs shape: (nb_negatives, B, T, F)
-
-        B = tf.shape(Q)[0]
-        T = tf.shape(Q)[1]
-
-        Q = tf.expand_dims(Q, axis=0)
-        targets = tf.concat([Q, Q_negs], axis=0)
-
-        dist = losses.CosineSimilarity(axis=-1,
-                                       reduction=losses.Reduction.NONE)(C, targets)
-        dist = dist / self.config.cos_dist_temp
-        # dist shape: (nb_negatives + 1, B, T)
-        dist = tf.reshape(dist, (B * T, -1))
-
-        loss = tf.nn.log_softmax(dist, axis=-1)
-        loss = loss[:, 0] # Keep first column as it represents Q positive
-        loss = tf.math.reduce_mean(loss)
-
-        # Add additional losses: features penalty, codebook penalty
-        d_loss = self.config.diversity_loss_weight * diversity_loss
-        f_loss = self.config.features_loss_weight * features_loss
-
-        return -loss + d_loss + f_loss
-
-    def train_step(self, data):
-        X, _ = data # Discard Y provided by the dataset generator
-
-        with tf.GradientTape() as tape:
-            # X shape: (B, T, 1)
-
-            Z = self.encoder(X, training=True)
-            # Z shape: (B, T, encoded_dim)
-
-            features_loss = tf.math.reduce_mean(tf.math.pow(Z, 2))
-
-            Z = self.layer_norm(Z)
-            Z_unmasked = tf.identity(Z)
-
-            Z = self.proj_Z(Z)
-            Z = self.dropout(Z)
-
-            Z_unmasked = self.dropout(Z_unmasked)
-
-            Z = self.apply_mask(Z)
-
-            C = self.transformer(Z, training=True)
-            # C shape: (B, T, transformer_dim)
-
-            Q, diversity_loss = self.quantizer(Z_unmasked, training=True)
-            # Q shape: (B, T, quantizer_dim)
-
-            # When creating next Dense layer a static shape is required (quantizer_dim)
-            Q.set_shape((Q.shape[0], Q.shape[1], self.config.quantizer_dim))
-            Q = self.proj_Q(Q)
-            # Q shape: (B, T, quantizer_dim)
-
-            Q_negs = self.sample_negatives(Q)
-
-            C = self.proj_C(C)
-
-            loss = self.compute_loss(C, Q, Q_negs, diversity_loss, features_loss)
-
-        trainable_params = self.trainable_weights
-        trainable_params += self.encoder.trainable_weights
-        trainable_params += self.transformer.trainable_weights
-        trainable_params += self.quantizer.trainable_weights
-        grads = tape.gradient(loss, trainable_params)
-        self.optimizer.apply_gradients(zip(grads, trainable_params))
-
-        return { 'loss': loss }
-
-    def test_step(self, data):
-        X, _ = data # Discard Y provided by the dataset generator
-
-        Z = self.encoder(X, training=False)
-
-        features_loss = tf.math.reduce_mean(tf.math.pow(Z, 2))
-
-        Z = self.layer_norm(Z)
-        Z_unmasked = tf.identity(Z)
-
-        Z = self.proj_Z(Z)
-        Z = self.dropout(Z)
-
-        Z_unmasked = self.dropout(Z_unmasked)
-
-        Z = self.apply_mask(Z)
-
-        C = self.transformer(Z, training=False)
-
-        Q, diversity_loss = self.quantizer(Z_unmasked, training=False)
-
-        # When creating next Dense layer a static shape is required (quantizer_dim)
-        Q.set_shape((Q.shape[0], Q.shape[1], self.config.quantizer_dim))
-        Q = self.proj_Q(Q)
-
-        Q_negs = self.sample_negatives(Q)
-
-        C = self.proj_C(C)
-
-        loss = self.compute_loss(C, Q, Q_negs, diversity_loss, features_loss)
-
-        return { 'loss': loss }
-
-
-class Wav2Vec2Encoder(Model):
-
-    def __init__(self, config):
-        super().__init__()
-
-        conv_layers = eval(config)
-
-        self.layers_ = []
-        for dim, size, stride in conv_layers:
-            self.layers_.append(Conv1D(dim, size, strides=stride, padding='same'))
-            self.layers_.append(LayerNormalization())
-            self.layers_.append(GELU())
-
-    def call(self, X):
-        for layer in self.layers_:
-            X = layer(X)
-        return X
\ No newline at end of file
diff --git a/sslforslr/models/wav2vec2/Wav2Vec2Config.py b/sslforslr/models/wav2vec2/Wav2Vec2Config.py
deleted file mode 100644
index 7770463..0000000
--- a/sslforslr/models/wav2vec2/Wav2Vec2Config.py
+++ /dev/null
@@ -1,94 +0,0 @@
-from dataclasses import dataclass, field
-from typing import List, Tuple
-
-@dataclass
-class Wav2Vec2Config:
-    encoder_conv_layers: str = field(
-        default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512, 2, 2)] + [(512, 2, 2)]",
-        metadata={
-            "help": "Hyperparameters of encoder conv layers. "
-                    "Format: [(dim, kernel_size, stride), ...]"}
-    )
-
-    encoder_dim: int = field(
-        default=512,
-        metadata={
-            "help": "Dimension of encoder embeddings"}
-    )
-
-    transformer_dim: int = field(
-        default=768,
-        metadata={"help": "Input dimension for transformer"}
-    )
-
-    transformer_layers: int = field(
-        default=12,
-        metadata={"help": "Number of layers for transformer"}
-    )
-
-    transformer_attention_heads: int = field(
-        default=12,
-        metadata={"help": "Number of attention heads for transformer"}
-    )
-
-    transformer_ffn_dim: int = field(
-        default=3072, metadata={"help": "Embedding dimension for FFN in transformer"}
-    )
-
-    quantizer_dim: int = field(
-        default=256,
-        metadata={"help": "Dimension of codebooks vectors"}
-    )
-
-    quantizer_nb_groups: int = field(
-        default=2,
-        metadata={"help": "Number of codebooks groups"}
-    )
-
-    quantizer_nb_vars: int = field(
-        default=320,
-        metadata={"help": "Number of codebooks vars (per group)"}
-    )
-
-    quantizer_temp: Tuple[float, float, float] = field(
-        default=(2, 0.5, 0.999995),
-        metadata={
-            "help": "Temperature for Gumble softmax during vector quantization. "
-                    "Format: (start, end, decay)"
-        },
-    )
-
-    dropout: float = field(
-        default=0.1,
-        metadata={"help": "Dropout rate applied to features after encoder"}
-    )
-
-    nb_negatives: int = field(
-        default=100,
-        metadata={"help": "Number of negatives for contrastive loss"}
-    )
-
-    mask_length: int = field(
-        default=10,
-        metadata={"help": "Number of timesteps to mask"}
-    )
-
-    mask_prob: float = field(
-        default=0.65,
-        metadata={"help": "Probability of masking a timestep"}
-    )
-
-    cos_dist_temp: float = field(
-        default=0.1,
-        metadata={"help": "Temperature to divide cosine distance by"}
-    )
-
-    diversity_loss_weight: float = field(
-        default=0.1,
-        metadata={"help": "Weight of diversity loss used on quantizer"}
-    )
-
-    features_loss_weight: float = field(
-        default=10,
-        metadata={"help": "Weight of features penalty loss"}
-    )
\ No newline at end of file
diff --git a/sslforslr/models/wav2vec2/__init__.py b/sslforslr/models/wav2vec2/__init__.py
deleted file mode 100644
index ae9f9fe..0000000
--- a/sslforslr/models/wav2vec2/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .Wav2Vec2 import Wav2Vec2Model
-from .Wav2Vec2Config import Wav2Vec2Config
\ No newline at end of file
diff --git a/sslforslr/modules/TransformerEncoder.py b/sslforslr/modules/TransformerEncoder.py
deleted file mode 100644
index 2e4e10b..0000000
--- a/sslforslr/modules/TransformerEncoder.py
+++ /dev/null
@@ -1,94 +0,0 @@
-import tensorflow as tf
-from tensorflow.keras import Model
-from tensorflow.keras.layers import Conv1D, Dropout, Dense, LayerNormalization, MultiHeadAttention
-from tensorflow_addons.layers import GELU
-
-class TransformerEncoderLayer(Model):
-    '''
-    Self-attention layer composing TransformerEncoder model.
-    '''
-
-    def __init__(self, config):
-        super().__init__()
-
-        self.activation_fn = GELU()
-
-        self.self_attn = MultiHeadAttention(
-            config.transformer_attention_heads,
-            config.transformer_dim,
-            dropout=config.dropout
-        )
-
-        self.dropout1 = Dropout(config.dropout)
-        self.dropout2 = Dropout(0.0)
-        self.dropout3 = Dropout(config.dropout)
-
-        self.self_attn_layer_norm = LayerNormalization()
-        self.fc1 = Dense(config.transformer_ffn_dim)
-        self.fc2 = Dense(config.transformer_dim)
-
-        self.final_layer_norm = LayerNormalization()
-
-    def call(self, X):
-        residual = X
-
-        X, attn = self.self_attn(query=X,
-                                 key=X,
-                                 value=X,
-                                 return_attention_scores=True)
-
-        X = self.dropout1(X)
-        X = residual + X
-
-        X = self.self_attn_layer_norm(X)
-
-        residual = X
-        X = self.activation_fn(self.fc1(X))
-        X = self.dropout2(X)
-        X = self.fc2(X)
-        X = self.dropout3(X)
-        X = residual + X
-        X = self.final_layer_norm(X)
-
-        return X
-
-class TransformerEncoder(Model):
-    '''
-    Self-attention Transformer implemented as a Keras model.
-
-    This implementation is based on fairseq MultiheadAttention module.
-
-    "Attention Is All You Need"
-    Ashish Vaswani et al.
-    https://arxiv.org/pdf/1706.03762.pdf
-    '''
-
-    def __init__(self, config):
-        super().__init__()
-
-        self.config = config
-
-        self.pos_conv = Conv1D(
-            self.config.transformer_dim,
-            kernel_size=128,
-            padding='same',
-            groups=1 # FIXME: should be 16 but backprop does not work
-        )
-        self.pos_conv_activation = GELU()
-
-        self.layer_norm = LayerNormalization()
-        self.dropout = Dropout(self.config.dropout)
-
-        self.transformer_layers = [
-            TransformerEncoderLayer(self.config)
-            for _ in range(self.config.transformer_layers)
-        ]
-
-    def call(self, X):
-        X_conv = self.pos_conv_activation(self.pos_conv(X))
-        X = X + X_conv
-        X = self.layer_norm(X)
-        X = self.dropout(X)
-        for layer in self.transformer_layers:
-            X = layer(X)
-        return X
\ No newline at end of file
diff --git a/sslforslr/utils/callbacks.py b/sslforslr/utils/callbacks.py
new file mode 100644
index 0000000..21f1f4d
--- /dev/null
+++ b/sslforslr/utils/callbacks.py
@@ -0,0 +1,13 @@
+from tensorflow.keras.callbacks import Callback
+
+from sslforslr.utils.evaluate import speaker_verification_evaluate
+
+class SVMetricsCallback(Callback):
+    def __init__(self, config):
+        super().__init__()
+
+        self.config = config
+
+    def on_epoch_end(self, epoch, logs):
+        eer = speaker_verification_evaluate(self.model, self.config)
+        logs['EER'] = eer
\ No newline at end of file
diff --git a/sslforslr/utils/callbacks/TimeHistoryCallback.py b/sslforslr/utils/callbacks/TimeHistoryCallback.py
deleted file mode 100644
index f19b254..0000000
--- a/sslforslr/utils/callbacks/TimeHistoryCallback.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import tensorflow as tf
-import time
-
-class TimeHistoryCallback(tf.keras.callbacks.Callback):
-    def update_history(self, history):
-        history.history['duration'] = self.times
-        return history
-
-    def on_train_begin(self, logs={}):
-        self.times = []
-
-    def on_epoch_begin(self, batch, logs={}):
-        self.epoch_time_start = time.time()
-
-    def on_epoch_end(self, batch, logs={}):
-        self.times.append(time.time() - self.epoch_time_start)
\ No newline at end of file
diff --git a/sslforslr/utils/callbacks/__init__.py b/sslforslr/utils/callbacks/__init__.py
deleted file mode 100644
index 9fc1f6f..0000000
--- a/sslforslr/utils/callbacks/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .TimeHistoryCallback import TimeHistoryCallback
\ No newline at end of file
diff --git a/sslforslr/utils/evaluate.py b/sslforslr/utils/evaluate.py
new file mode 100644
index 0000000..88f61dd
--- /dev/null
+++ b/sslforslr/utils/evaluate.py
@@ -0,0 +1,52 @@
+from tqdm import tqdm
+import numpy as np
+import soundfile as sf
+from scipy.spatial.distance import cosine
+from sklearn.metrics import roc_curve
+
+def extract_embeddings(model, wav_list_path, frame_length):
+    embeddings = {}
+    for line in tqdm(open(wav_list_path)):
+        utterance_id, file = line.rstrip().split()
+
+        sample, sr = sf.read(file)
+        data = sample.reshape((len(sample), 1))
+
+        assert len(data) >= frame_length
+        offset = np.random.randint(0, len(data) - frame_length + 1)
+        data = data[offset:offset+frame_length]
+
+        feats = model(np.expand_dims(data, axis=0))
+        embeddings[utterance_id] = feats
+
+    return embeddings
+
+def score_trials(trials_path, embeddings):
+    scores, labels = [], []
+    for line in tqdm(open(trials_path)):
+        a, b, target = line.rstrip().split(' ')
+
+        score = 1 - cosine(embeddings[a], embeddings[b])
+        label = 1 if (target == 'target') else 0
+
+        scores.append(score)
+        labels.append(label)
+
+    return scores, labels
+
+def compute_eer(scores, labels):
+    fpr, tpr, thresholds = roc_curve(labels, scores, pos_label=1)
+    fnr = 1 - tpr
+    idxE = np.nanargmin(np.abs(fnr - fpr))
+    eer  = max(fpr[idxE], fnr[idxE]) * 100
+    return eer
+
+def speaker_verification_evaluate(model, config):
+    test_list_path = config['dataset']['test']
+    trials_path = config['dataset']['trials']
+    frame_length = config['dataset']['frame_length']
+
+    embeddings = extract_embeddings(model, test_list_path, frame_length)
+    scores, labels = score_trials(trials_path, embeddings)
+    eer = compute_eer(scores, labels)
+    return eer
\ No newline at end of file
diff --git a/sslforslr/utils/helpers.py b/sslforslr/utils/helpers.py
index 3c52fe1..cf761a5 100644
--- a/sslforslr/utils/helpers.py
+++ b/sslforslr/utils/helpers.py
@@ -16,13 +16,8 @@
 from sslforslr.models.lim import LIMModel
 from sslforslr.models.simclr import SimCLRModel
 from sslforslr.models.moco import MoCoModel
-from sslforslr.models.wav2vec2 import Wav2Vec2Model, Wav2Vec2Config
-from sslforslr.models.vqwav2vec import VQWav2VecModel, VQWav2VecConfig
-from sslforslr.models.multitask import MultiTaskModel
 from sslforslr.models.encoders import CPCEncoder, SincEncoder, Wav2SpkEncoder, XVectorEncoder, ThinResNet34Encoder
-from sslforslr.dataset.AudioDatasetLoader import AudioDatasetLoader
 from sslforslr.dataset.KaldiDatasetLoader import KaldiDatasetLoader
-from sslforslr.dataset.AudioAugmentationGenerator import AudioAugmentationGenerator
 
 def summary_for_shape(model, input_shape):
     x = Input(shape=input_shape)
@@ -32,7 +27,7 @@ def summary_for_shape(model, input_shape):
     model_ = Model(inputs=x, outputs=model_copy.call(x))
     return model_.summary()
 
-def load_config(config_path, evaluate=False):
+def load_config(config_path):
     # Load config file
     with open(config_path) as config_file:
         config = json.load(config_file)
@@ -48,52 +43,20 @@ def load_config(config_path, evaluate=False):
     checkpoint_dir = './checkpoints/' + config['name']
     Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
 
-    eval_checkpoint_dir = None
-    if evaluate:
-        eval_checkpoint_dir = checkpoint_dir + '___eval'
-        Path(eval_checkpoint_dir).mkdir(parents=True, exist_ok=True)
+    return config, checkpoint_dir
 
-    return config, checkpoint_dir, eval_checkpoint_dir
-
-def load_dataset(config, checkpoint_dir, key='training'):
-    dataset_config = config[key]['dataset']
-    batch_size = config[key]['batch_size']
+def load_dataset(config):
+    dataset_config = config['dataset']
+    batch_size = config['training']['batch_size']
     seed = config['seed']
 
-    if dataset_config['type'] == 'Kaldi':
-        dataset = KaldiDatasetLoader(seed, dataset_config)
-    else:
-        dataset = AudioDatasetLoader(seed, dataset_config)
-    gens, nb_categories = dataset.load(batch_size, checkpoint_dir)
-
-    # Add data augmentation generator on top of generators
-    if 'data_augment' in config[key]:
-        data_augment_config = config[key]['data_augment']
-        sample_frequency = config[key]['dataset']['sample_frequency']
-        for i in range(len(gens)):
-            gens[i] = AudioAugmentationGenerator(gens[i],
-                                                 data_augment_config,
-                                                 sample_frequency)
-
-    print("Number of training batches:", len(gens[0]))
-    print("Number of val batches:", len(gens[1]))
-    if len(gens) >= 3:
-        print("Number of test batches:", len(gens[2]))
-
-    # Determine input shape
-    # Usually 20480 (1.28s at 16kHz on LibriSpeech) => nb_timesteps = 128
-    frame_length = config[key]['dataset']['frames']['length']
-    input_shape = (frame_length, 1)
+    dataset = KaldiDatasetLoader(seed, dataset_config)
+    gens = dataset.load(batch_size)
 
-    return gens, input_shape, nb_categories
+    return gens
 
 def create_encoder(config):
     encoder_type = config['encoder']['type']
-
-    if encoder_type in ['wav2vec2', 'vq-wav2vec']:
-        encoder = None
-        return encoder
-
     encoded_dim = config['encoder']['encoded_dim']
     encoder_weight_regularizer = config['encoder'].get('weight_regularizer', 0.0)
 
@@ -117,7 +80,7 @@ def create_encoder(config):
     elif encoder_type == 'ThinResNet34':
         encoder = ThinResNet34Encoder(encoded_dim, encoder_weight_regularizer)
     else:
-        raise Exception('Config: encoder {} is not supported.'.format(encoder_type))
+        raise Exception('Encoder {} is not supported.'.format(encoder_type))
 
     return encoder
 
@@ -126,16 +89,6 @@
     model_type = model_config['type']
     weight_regularizer = model_config.get('weight_regularizer', 0.0)
 
-    if model_type == 'multitask':
-        modules = model_config['modules']
-        return MultiTaskModel(encoder, input_shape, modules)
-    elif model_type == 'wav2vec2':
-        config = Wav2Vec2Config()
-        return Wav2Vec2Model(config)
-    elif model_type == 'vq-wav2vec':
-        config = VQWav2VecConfig()
-        return VQWav2VecModel(config)
-
     encoder = create_encoder(config)
     encoder_output_shape = encoder.compute_output_shape(input_shape)
 
@@ -168,13 +121,16 @@
         encoder_k = create_encoder(config)
         model = MoCoModel(encoder, encoder_k, model_config, weight_regularizer)
     else:
-        raise Exception('Config: model {} is not supported.'.format(model_type))
+        raise Exception('Model {} is not supported.'.format(model_type))
 
     return model
 
-def load_model(config, input_shape):
-    mirrored_strategy = tf.distribute.MirroredStrategy()
+def load_model(config):
+    # Determine input shape
+    # Usually 20480 (1.28s at 16kHz on LibriSpeech) => nb_timesteps = 128
+    input_shape = (config['dataset']['frame_length'], 1)
 
+    mirrored_strategy = tf.distribute.MirroredStrategy()
     with mirrored_strategy.scope():
         model = create_model(config, input_shape)
 
@@ -197,7 +153,7 @@
         momentum = optimizer_config.get('momentum', 0.0)
         optimizer = SGD(learning_rate, momentum)
     else:
-        raise Exception('Config: optimizer {} is not supported.'.format(opt_type))
+        raise Exception('Optimizer {} is not supported.'.format(opt_type))
 
     # Compile and print model
     run_eagerly = config['training'].get('run_eagerly', False)
diff --git a/train.py b/train.py
index beb6747..8e4b9f8 100644
--- a/train.py
+++ b/train.py
@@ -1,3 +1,6 @@
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
+
 import argparse
 import numpy as np
 
@@ -7,31 +10,26 @@
 from tensorflow.keras.callbacks import TensorBoard
 
 from sslforslr.utils.helpers import load_config, load_dataset, load_model
-from sslforslr.utils.callbacks import TimeHistoryCallback
+from sslforslr.utils.callbacks import SVMetricsCallback
 from sslforslr.models.moco import MoCoUpdateCallback
 
 def train(config_path):
-    config, checkpoint_dir, _ = load_config(config_path)
+    # Load config, model and dataset
+    config, checkpoint_dir = load_config(config_path)
+    model = load_model(config)
+
+    gens = load_dataset(config)
+    train_gen, val_gen = gens
+    print("Number of training batches:", len(train_gen))
+    print("Number of val batches:", len(val_gen))
 
     # Prevent re-training model
     if tf.train.latest_checkpoint(checkpoint_dir):
-        raise Exception('%s already contains checkpoints.' % checkpoint_dir)
-
-    gens, input_shape, _ = load_dataset(config,
-                                        checkpoint_dir,
-                                        key='training')
-    train_gen = gens[0]
-    val_gen = gens[1]
-
-    model = load_model(config, input_shape)
-
-    # For multitask model: add targets to data generator
-    if config['model']['type'] == 'multitask':
-        for i in range(len(gens)):
-            gens[i] = model.add_targets_to_gen(gens[i])
+        raise Exception('%s has already been trained.' % config['name'])
 
     # Setup callbacks
+    sv_metrics = SVMetricsCallback(config)
     save_callback = ModelCheckpoint(filepath=checkpoint_dir + '/training',
                                     monitor='val_loss',
                                     save_best_only=True,
@@ -41,13 +39,10 @@
                                    patience=5)
     tensorboard = TensorBoard(log_dir=checkpoint_dir + '/logs/',
                               histogram_freq=1)
-    time_history = TimeHistoryCallback()
 
     # Start training
    nb_epochs = config['training']['epochs']
-    callbacks = [save_callback, early_stopping, time_history]
-    if config['training'].get('tensorboard', False):
-        callbacks.append(tensorboard)
+    callbacks = [sv_metrics, save_callback, early_stopping, tensorboard]
     if config['model']['type'] == 'MoCo':
         callbacks.append(MoCoUpdateCallback(train_gen))
     history = model.fit(train_gen,
@@ -57,10 +52,7 @@
                         use_multiprocessing=True,
                         workers=8)
 
-    # Save training history
-    hist_path = checkpoint_dir + '/history.npy'
-    history = time_history.update_history(history)
-    np.save(hist_path, history.history)
+    np.save(checkpoint_dir + '/history.npy', history.history)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
diff --git a/train_evaluate.py b/train_evaluate.py
deleted file mode 100644
index 246ed4d..0000000
--- a/train_evaluate.py
+++ /dev/null
@@ -1,107 +0,0 @@
-import argparse
-import numpy as np
-
-import tensorflow as tf
-from tensorflow.keras import Model
-from tensorflow.keras.layers import Input
-from tensorflow.keras.layers import Dense
-from tensorflow.keras.layers import Flatten
-from tensorflow.keras.optimizers import Adam
-from tensorflow.keras.callbacks import ModelCheckpoint
-from tensorflow.keras.callbacks import EarlyStopping
-from tensorflow.keras.callbacks import TensorBoard
-
-from sslforslr.utils.callbacks import TimeHistoryCallback
-from sslforslr.utils.helpers import load_config, load_dataset, load_model
-
-class Classifier(Model):
-
-    def __init__(self, nb_categories):
-        super(Classifier, self).__init__()
-
-        self.nb_categories = nb_categories
-
-        self.flatten = Flatten()
-        self.dense1 = Dense(units=256)
-        self.dense2 = Dense(units=nb_categories, activation='softmax')
-
-    def call(self, X):
-        X = self.flatten(X)
-        X = self.dense1(X)
-        X = self.dense2(X)
-        return X
-
-def create_classifier(config, input_shape, nb_categories, model):
-    learning_rate = config['evaluate']['learning_rate']
-
-    inputs = Input(input_shape)
-    inputs_encoded = model(inputs)
-    outputs = Classifier(nb_categories)(inputs_encoded)
-
-    classifier = Model(inputs, outputs)
-    classifier.compile(optimizer=Adam(learning_rate=learning_rate),
-                       loss='sparse_categorical_crossentropy',
-                       metrics=['accuracy'])
-    classifier.summary()
-
-    return classifier
-
-def train_evaluate(config_path):
-    config, checkpoint_dir, eval_checkpoint_dir = load_config(config_path, evaluate=True)
-
-    # Prevent re-training model
-    if tf.train.latest_checkpoint(eval_checkpoint_dir):
-        raise Exception('%s already contains checkpoints.' % eval_checkpoint_dir)
-
-    gens, input_shape, nb_categories = load_dataset(config,
-                                                    eval_checkpoint_dir,
-                                                    key='evaluate')
-
-    model = load_model(config, input_shape)
-
-    # Load pre-trained weights
-    last_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
-    if last_checkpoint_path:
-        mirrored_strategy = tf.distribute.MirroredStrategy()
-        with mirrored_strategy.scope():
-            model.load_weights(last_checkpoint_path)
-    print('Loading pretrained model: ', last_checkpoint_path is not None)
-
-    model.trainable = config['evaluate'].get('train_encoder', True)
-
-    # Create classifier
-    classifier = create_classifier(config, input_shape, nb_categories, model)
-
-    # Setup callbacks
-    save_callback = ModelCheckpoint(filepath=eval_checkpoint_dir + '/training',
-                                    monitor='val_loss',
-                                    save_best_only=True,
-                                    save_weights_only=True,
-                                    verbose=1)
-    early_stopping = EarlyStopping(monitor='val_loss',
-                                   patience=5)
-    tensorboard = TensorBoard(log_dir=eval_checkpoint_dir + '/logs/',
-                              histogram_freq=1)
-    time_history = TimeHistoryCallback()
-
-    # Start training
-    nb_epochs = config['evaluate']['epochs']
-    callbacks = [save_callback, early_stopping, time_history]
-    if config['evaluate'].get('tensorboard', False):
-        callbacks.append(tensorboard)
-    history = classifier.fit(gens[0],
-                             validation_data=gens[1],
-                             epochs=nb_epochs,
-                             callbacks=callbacks)
-
-    # Save training history
-    hist_path = eval_checkpoint_dir + '/history.npy'
-    history = time_history.update_history(history)
-    np.save(hist_path, history.history)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('config', help='Path to model config file.')
-    args = parser.parse_args()
-
-    train_evaluate(args.config)
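-- 

For reference, a minimal usage sketch of the evaluation utilities introduced by this patch; it is not part of the patch itself. The config path below is a placeholder, and the sketch assumes train.py has already saved a checkpoint under ./checkpoints/<name> and that the config defines the 'dataset' keys ('test', 'trials', 'frame_length') read by speaker_verification_evaluate.

import tensorflow as tf

from sslforslr.utils.helpers import load_config, load_model
from sslforslr.utils.evaluate import speaker_verification_evaluate

# Rebuild the model from the experiment config, as train.py does.
config, checkpoint_dir = load_config('configs/cpc-base.json')  # placeholder path
model = load_model(config)

# Restore the best weights saved by the ModelCheckpoint callback during training.
last_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if last_checkpoint:
    model.load_weights(last_checkpoint)

# Score every trial pair with cosine similarity and report the equal error
# rate, the same quantity SVMetricsCallback logs as 'EER' after each epoch.
eer = speaker_verification_evaluate(model, config)
print('EER (%):', eer)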