openspeech-team · sooftware · Jun 7, 2021 · Jun 7, 2021
diff --git a/README.md b/README.md
@@ -179,6 +179,32 @@ $ python ./openspeech_cli/hydra_train.py \
     criterion=ctc
 ```
 
+### Evaluation examples
+
+- Example1: Evaluate the `listen_attend_spell` model:
+
+```
+$ python ./openspeech_cli/hydra_eval.py \
+    audio=melspectrogram \
+    eval.model_name=listen_attend_spell \
+    eval.dataset_path=$DATASET_PATH \
+    eval.checkpoint_path=$CHECKPOINT_PATH \
+    eval.manifest_file_path=$MANIFEST_FILE_PATH  
+```
+
+- Example2: Evaluate the `listen_attend_spell` and `conformer_lstm` models as an ensemble:
+
+```
+$ python ./openspeech_cli/hydra_eval.py \
+    audio=melspectrogram \
+    eval.model_names=(listen_attend_spell, conformer_lstm) \
+    eval.dataset_path=$DATASET_PATH \
+    eval.checkpoint_paths=($CHECKPOINT_PATH1, $CHECKPOINT_PATH2) \
+    eval.ensemble_weights=(0.3, 0.7) \
+    eval.ensemble_method=weighted \
+    eval.manifest_file_path=$MANIFEST_FILE_PATH  
+```
+
 ## Installation
 
 This project recommends Python 3.7 or higher.  

diff --git a/openspeech/configs/eval.yaml b/openspeech/configs/eval.yaml
@@ -0,0 +1,5 @@
+# @package _group_
+
+defaults:
+  - audio: null
+  - eval: default
diff --git a/openspeech/configs/configs.yaml → openspeech/configs/train.yaml b/openspeech/configs/configs.yaml → openspeech/configs/train.yaml
diff --git a/openspeech/data/audio/filter_bank/configuration.py b/openspeech/data/audio/filter_bank/configuration.py
@@ -35,7 +35,7 @@ class FilterBankConfigs(OpenspeechDataclass):
 
     Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
 
-    Configurations:
+    Args:
         name (str): name of feature transform. (default: fbank)
         sample_rate (int): sampling rate of audio (default: 16000)
         frame_length (float): frame length for spectrogram (default: 20.0)

diff --git a/openspeech/data/audio/melspectrogram/configuration.py b/openspeech/data/audio/melspectrogram/configuration.py
@@ -35,7 +35,7 @@ class MelSpectrogramConfigs(OpenspeechDataclass):
 
     Configuration objects inherit from :class: `~openspeech.dataclass.OpenspeechDataclass`.
 
-    Configurations:
+    Args:
         name (str): name of feature transform. (default: melspectrogram)
         sample_rate (int): sampling rate of audio (default: 16000)
         frame_length (float): frame length for spectrogram (default: 20.0)

diff --git a/openspeech/data/audio/mfcc/configuration.py b/openspeech/data/audio/mfcc/configuration.py
@@ -35,7 +35,7 @@ class MFCCConfigs(OpenspeechDataclass):
 
     Configuration objects inherit from :class: `~openspeech.dataclass.OpenspeechDataclass`.
 
-    Configurations:
+    Args:
         name (str): name of feature transform. (default: mfcc)
         sample_rate (int): sampling rate of audio (default: 16000)
         frame_length (float): frame length for spectrogram (default: 20.0)

diff --git a/openspeech/data/audio/spectrogram/configuration.py b/openspeech/data/audio/spectrogram/configuration.py
@@ -35,7 +35,7 @@ class SpectrogramConfigs(OpenspeechDataclass):
 
     Configuration objects inherit from :class: `~openspeech.dataclass.OpenspeechDataclass`.
 
-    Configurations:
+    Args:
         name (str): name of feature transform. (default: spectrogram)
         sample_rate (int): sampling rate of audio (default: 16000)
         frame_length (float): frame length for spectrogram (default: 20.0)

diff --git a/openspeech/data/data_loader.py b/openspeech/data/data_loader.py
@@ -19,6 +19,7 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
+from typing import Tuple
 
 import torch
 import numpy as np
@@ -132,3 +133,27 @@ def __len__(self):
 
     def shuffle(self, epoch):
         np.random.shuffle(self.bins)
+
+
+def load_dataset(manifest_file_path: str) -> Tuple[list, list]:
+    """
+    Reads a tab-separated manifest file and returns its audio paths and transcripts.
+
+    Args:
+        manifest_file_path (str): evaluation manifest file path.
+
+    Returns: audio_paths, transcripts
+        * audio_paths, transcripts (list, list): first and third tab-separated field of each manifest line
+    """
+    audio_paths = list()
+    transcripts = list()
+
+    with open(manifest_file_path) as f:
+        for idx, line in enumerate(f.readlines()):
+            audio_path, korean_transcript, transcript = line.split('\t')
+            transcript = transcript.replace('\n', '')
+
+            audio_paths.append(audio_path)
+            transcripts.append(transcript)
+
+    return audio_paths, transcripts
diff --git a/openspeech/data/dataset.py b/openspeech/data/dataset.py
@@ -90,7 +90,7 @@ def __init__(
         self.apply_noise_augment = apply_noise_augment
         self.apply_time_stretch_augment = apply_time_stretch_augment
         self.apply_joining_augment = apply_joining_augment
-        self.transforms = AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY[configs.name](configs)
+        self.transforms = AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY[configs.audio.name](configs)
         self._load_audio = load_audio
 
         if self.apply_spec_augment:

diff --git a/openspeech/dataclass/__init__.py b/openspeech/dataclass/__init__.py
@@ -31,6 +31,8 @@
     Fp16GPUTrainerConfigs,
     Fp16TPUTrainerConfigs,
     Fp64CPUTrainerConfigs,
+    EvaluationConfigs,
+    EnsembleEvaluationConfigs,
 )
 
 OPENSPEECH_CONFIGS = [
@@ -62,3 +64,7 @@
 AUGMENT_DATACLASS_REGISTRY = {
     "default": AugmentConfigs,
 }
+EVAL_DATACLASS_REGISTRY = {
+    "default": EvaluationConfigs,
+    "ensemble": EnsembleEvaluationConfigs,
+}
diff --git a/openspeech/dataclass/configurations.py b/openspeech/dataclass/configurations.py
@@ -311,6 +311,62 @@ class VocabularyConfigs(OpenspeechDataclass):
     )
 
 
+@dataclass
+class EvaluationConfigs(OpenspeechDataclass):
+    model_name: str = field(
+        default=MISSING, metadata={"help": "Model name."}
+    )
+    dataset_path: str = field(
+        default=MISSING, metadata={"help": "Path of dataset."}
+    )
+    checkpoint_path: str = field(
+        default=MISSING, metadata={"help": "Path of model checkpoint."}
+    )
+    manifest_file_path: str = field(
+        default=MISSING, metadata={"help": "Path of evaluation manifest file."}
+    )
+    num_workers: int = field(
+        default=4, metadata={"help": "Number of worker."}
+    )
+    batch_size: int = field(
+        default=32, metadata={"help": "Batch size."}
+    )
+    beam_size: int = field(
+        default=1, metadata={"help": "Beam size of beam search."}
+    )
+
+
+@dataclass
+class EnsembleEvaluationConfigs(OpenspeechDataclass):
+    model_names: str = field(
+        default=MISSING, metadata={"help": "List of model name."}
+    )
+    dataset_paths: str = field(
+        default=MISSING, metadata={"help": "Path of dataset."}
+    )
+    checkpoint_paths: str = field(
+        default=MISSING, metadata={"help": "List of model checkpoint path."}
+    )
+    manifest_file_path: str = field(
+        default=MISSING, metadata={"help": "Path of evaluation manifest file."}
+    )
+    ensemble_method: str = field(
+        default="vanilla", metadata={"help": "Method of ensemble (vanilla, weighted)"}
+    )
+    ensemble_weights: str = field(
+        default="(1.0, 1.0, 1.0 ..)", metadata={"help": "Weights of ensemble models."}
+    )
+    num_workers: int = field(
+        default=4, metadata={"help": "Number of worker."}
+    )
+    batch_size: int = field(
+        default=32, metadata={"help": "Batch size."}
+    )
+    beam_size: int = field(
+        default=1, metadata={"help": "Beam size of beam search."}
+    )
+
+
 def generate_openspeech_configs_with_help():
     from openspeech.dataclass import OPENSPEECH_CONFIGS, TRAINER_DATACLASS_REGISTRY
     from openspeech.models import MODEL_DATACLASS_REGISTRY

diff --git a/openspeech/dataclass/initialize.py b/openspeech/dataclass/initialize.py
@@ -51,3 +51,21 @@ def hydra_init() -> None:
 
         for k, v in dataclass_registry.items():
             cs.store(group=group, name=k, node=v, provider="openspeech")
+
+
+def hydra_eval_init() -> None:
+    from openspeech.data import AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY
+    from openspeech.dataclass import EVAL_DATACLASS_REGISTRY
+
+    registries = {
+        "audio": AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY,
+        "eval": EVAL_DATACLASS_REGISTRY,
+    }
+
+    cs = ConfigStore.instance()
+
+    for group in registries.keys():
+        dataclass_registry = registries[group]
+
+        for k, v in dataclass_registry.items():
+            cs.store(group=group, name=k, node=v, provider="openspeech")
diff --git a/openspeech/decoders/transformer_decoder.py b/openspeech/decoders/transformer_decoder.py
@@ -255,7 +255,7 @@ def forward(
             input_var = input_var.fill_(self.pad_id)
             input_var[:, 0] = self.sos_id
 
-            for di in range(1, self.max_length):
+            for di in range(self.max_length):
                 input_lengths = torch.IntTensor(batch_size).fill_(di)
 
                 outputs = self.forward_step(

diff --git a/openspeech/models/conformer/configurations.py b/openspeech/models/conformer/configurations.py
@@ -35,7 +35,7 @@ class ConformerConfigs(OpenspeechDataclass):
 
     Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
 
-    Configurations:
+    Args:
         model_name (str): Model name (default: conformer)
         encoder_dim (int): Dimension of encoder. (default: 512)
         num_encoder_layers (int): The number of encoder layers. (default: 17)

diff --git a/openspeech/models/conformer_lstm/configurations.py b/openspeech/models/conformer_lstm/configurations.py
@@ -35,7 +35,7 @@ class ConformerLSTMConfigs(OpenspeechDataclass):
 
     Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
 
-    Configurations:
+    Args:
         model_name (str): Model name (default: conformer_lstm)
         encoder_dim (int): Dimension of encoder. (default: 512)
         num_encoder_layers (int): The number of encoder layers. (default: 17)

diff --git a/openspeech/models/conformer_lstm/model.py b/openspeech/models/conformer_lstm/model.py
@@ -85,13 +85,12 @@ def build_model(self):
             rnn_type=self.configs.model.rnn_type,
         )
 
-    def set_beam_decoder(self, batch_size: int, beam_size: int = 3):
+    def set_beam_decoder(self, beam_size: int = 3):
         """ Setting beam search decoder """
         from openspeech.search import BeamSearchLSTM
         self.decoder = BeamSearchLSTM(
             decoder=self.decoder,
             beam_size=beam_size,
-            batch_size=batch_size,
         )
 
     def forward(self, inputs: Tensor, input_lengths: Tensor) -> Dict[str, Tensor]:

diff --git a/openspeech/models/conformer_transducer/configurations.py b/openspeech/models/conformer_transducer/configurations.py
@@ -35,7 +35,7 @@ class ConformerTransducerConfigs(OpenspeechDataclass):
 
     Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
 
-    Configurations:
+    Args:
         model_name (str): Model name (default: conformer_transducer)
         encoder_dim (int): Dimension of encoder. (default: 512)
         num_encoder_layers (int): The number of encoder layers. (default: 17)

diff --git a/openspeech/models/deep_cnn_with_joint_ctc_listen_attend_spell/configurations.py b/openspeech/models/deep_cnn_with_joint_ctc_listen_attend_spell/configurations.py
@@ -35,7 +35,7 @@ class DeepCNNWithJointCTCListenAttendSpellConfigs(OpenspeechDataclass):
 
     Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
 
-    Configurations:
+    Args:
         model_name (str): Model name (default: deep_cnn_with_joint_ctc_listen_attend_spell)
         num_encoder_layers (int): The number of encoder layers. (default: 3)
         num_decoder_layers (int): The number of decoder layers. (default: 2)

diff --git a/openspeech/models/deep_cnn_with_joint_ctc_listen_attend_spell/model.py b/openspeech/models/deep_cnn_with_joint_ctc_listen_attend_spell/model.py
@@ -83,13 +83,12 @@ def build_model(self):
             rnn_type=self.configs.model.rnn_type,
         )
 
-    def set_beam_decoder(self, batch_size: int, beam_size: int = 3):
+    def set_beam_decoder(self, beam_size: int = 3):
         """ Setting beam search decoder """
         from openspeech.search import BeamSearchLSTM
         self.decoder = BeamSearchLSTM(
             decoder=self.decoder,
             beam_size=beam_size,
-            batch_size=batch_size,
         )
 
     def forward(self, inputs: Tensor, input_lengths: Tensor) -> Dict[str, Tensor]:

diff --git a/openspeech/models/deepspeech2/configurations.py b/openspeech/models/deepspeech2/configurations.py
@@ -35,7 +35,7 @@ class DeepSpeech2Configs(OpenspeechDataclass):
 
     Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
 
-    Configurations:
+    Args:
         model_name (str): Model name (default: deepspeech2)
         num_rnn_layers (int): The number of rnn layers. (default: 5)
         rnn_hidden_dim (int): The hidden state dimension of rnn. (default: 1024)

diff --git a/openspeech/models/jasper10x5/configurations.py b/openspeech/models/jasper10x5/configurations.py
@@ -35,7 +35,7 @@ class Jasper10x5Config(OpenspeechDataclass):
 
     Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
 
-    Configurations:
+    Args:
         model_name (str): Model name (default: jasper10x5)
         num_blocks (int): Number of jasper blocks (default: 10)
         num_sub_blocks (int): Number of jasper sub blocks (default: 5)

diff --git a/openspeech/models/jasper5x3/configurations.py b/openspeech/models/jasper5x3/configurations.py
@@ -35,7 +35,7 @@ class Jasper5x3Config(OpenspeechDataclass):
 
     Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
 
-    Configurations:
+    Args:
         model_name (str): Model name (default: jasper5x3)
         num_blocks (int): Number of jasper blocks (default: 5)
         num_sub_blocks (int): Number of jasper sub blocks (default: 3)

diff --git a/openspeech/models/joint_ctc_conformer_lstm/configurations.py b/openspeech/models/joint_ctc_conformer_lstm/configurations.py
@@ -35,7 +35,7 @@ class JointCTCConformerLSTMConfigs(OpenspeechDataclass):
 
     Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
 
-    Configurations:
+    Args:
         model_name (str): Model name (default: joint_ctc_conformer_lstm)
         encoder_dim (int): Dimension of encoder. (default: 512)
         num_encoder_layers (int): The number of encoder layers. (default: 17)

diff --git a/openspeech/models/joint_ctc_conformer_lstm/model.py b/openspeech/models/joint_ctc_conformer_lstm/model.py
@@ -85,13 +85,12 @@ def build_model(self):
             rnn_type=self.configs.model.rnn_type,
         )
 
-    def set_beam_decoder(self, batch_size: int, beam_size: int = 3):
+    def set_beam_decoder(self, beam_size: int = 3):
         """ Setting beam search decoder """
         from openspeech.search import BeamSearchLSTM
         self.decoder = BeamSearchLSTM(
             decoder=self.decoder,
             beam_size=beam_size,
-            batch_size=batch_size,
         )
 
     def forward(self, inputs: Tensor, input_lengths: Tensor) -> Dict[str, Tensor]:

diff --git a/openspeech/models/joint_ctc_listen_attend_spell/configurations.py b/openspeech/models/joint_ctc_listen_attend_spell/configurations.py
@@ -35,21 +35,21 @@ class JointCTCListenAttendSpellConfigs(OpenspeechDataclass):
 
     Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
 
-    Configurations:
-    model_name (str): Model name (default: joint_ctc_listen_attend_spell)
-    num_encoder_layers (int): The number of encoder layers. (default: 3)
-    num_decoder_layers (int): The number of decoder layers. (default: 2)
-    hidden_state_dim (int): The hidden state dimension of encoder. (default: 768)
-    encoder_dropout_p (float): The dropout probability of encoder. (default: 0.3)
-    encoder_bidirectional (bool): If True, becomes a bidirectional encoders (default: True)
-    rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm)
-    joint_ctc_attention (bool): Flag indication joint ctc attention or not (default: True)
-    max_length (int): Max decoding length. (default: 128)
-    num_attention_heads (int): The number of attention heads. (default: 1)
-    decoder_dropout_p (float): The dropout probability of decoder. (default: 0.2)
-    decoder_attn_mechanism (str): The attention mechanism for decoder. (default: loc)
-    teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0)
-    optimizer (str): Optimizer for training. (default: adam)
+    Args:
+        model_name (str): Model name (default: joint_ctc_listen_attend_spell)
+        num_encoder_layers (int): The number of encoder layers. (default: 3)
+        num_decoder_layers (int): The number of decoder layers. (default: 2)
+        hidden_state_dim (int): The hidden state dimension of encoder. (default: 768)
+        encoder_dropout_p (float): The dropout probability of encoder. (default: 0.3)
+        encoder_bidirectional (bool): If True, becomes a bidirectional encoders (default: True)
+        rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm)
+        joint_ctc_attention (bool): Flag indication joint ctc attention or not (default: True)
+        max_length (int): Max decoding length. (default: 128)
+        num_attention_heads (int): The number of attention heads. (default: 1)
+        decoder_dropout_p (float): The dropout probability of decoder. (default: 0.2)
+        decoder_attn_mechanism (str): The attention mechanism for decoder. (default: loc)
+        teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0)
+        optimizer (str): Optimizer for training. (default: adam)
     """
     model_name: str = field(
         default="joint_ctc_listen_attend_spell", metadata={"help": "Model name"}