diff --git a/examples/tts/conf/fastpitch_align_ipa_adapter.yaml b/examples/tts/conf/fastpitch_align_ipa_adapter.yaml
new file mode 100644
index 000000000000..3e7b7ec9b2ae
--- /dev/null
+++ b/examples/tts/conf/fastpitch_align_ipa_adapter.yaml
@@ -0,0 +1,324 @@
+# This config contains the default values for training FastPitch speaker adaptation.
+# If you want to train the model on another dataset, you can change the config values according to your dataset.
+# Most dataset-specific arguments are in the head of the config file, see below.
+
+name: FastPitch
+
+train_dataset: ???
+validation_datasets: ???
+sup_data_path: ???
+sup_data_types: [ "align_prior_matrix", "pitch", "speaker_id", "reference_audio"]
+
+
+# Default values from librosa.pyin
+pitch_fmin: 65.40639132514966
+pitch_fmax: 2093.004522404789
+
+# These frame-wise values depend on pitch_fmin and pitch_fmax; you can get them
+# by running `scripts/dataset_processing/tts/extract_sup_data.py`.
+pitch_mean: ??? # e.g. 212.35873413085938 for LJSpeech
+pitch_std: ??? # e.g. 68.52806091308594 for LJSpeech
+
+# Default values for a dataset with sample_rate=44100
+sample_rate: 44100
+n_mel_channels: 80
+n_window_size: 2048
+n_window_stride: 512
+n_fft: 2048
+lowfreq: 0
+highfreq: 8000
+window: hann
+
+phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt"
+heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
+
+model:
+  unfreeze_aligner: false
+  unfreeze_duration_predictor: false
+  unfreeze_pitch_predictor: false
+  unfreeze_energy_predictor: false
+  learn_alignment: true
+  bin_loss_warmup_epochs: 100
+
+  max_token_duration: 75
+  symbols_embedding_dim: 384
+  pitch_embedding_kernel_size: 3
+  energy_embedding_kernel_size: 3
+
+  pitch_fmin: ${pitch_fmin}
+  pitch_fmax: ${pitch_fmax}
+
+  pitch_mean: ${pitch_mean}
+  pitch_std: ${pitch_std}
+
+  sample_rate: ${sample_rate}
+  n_mel_channels: ${n_mel_channels}
+  n_window_size: ${n_window_size}
+  n_window_stride: ${n_window_stride}
+  n_fft: ${n_fft}
+  lowfreq: ${lowfreq}
+  highfreq: ${highfreq}
+  window: ${window}
+
+  text_normalizer:
+    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
+    lang: en
+    input_case: cased
+
+  text_normalizer_call_kwargs:
+    verbose: false
+    punct_pre_process: true
+    punct_post_process: true
+
+  text_tokenizer:
+    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
+    punct: true
+    apostrophe: true
+    pad_with_space: true
+    g2p:
+      _target_: nemo.collections.tts.g2p.modules.IPAG2P
+      phoneme_dict: ${phoneme_dict_path}
+      heteronyms: ${heteronyms_path}
+      phoneme_probability: 0.8
+      # Relies on the heteronyms list for anything that needs to be disambiguated
+      ignore_ambiguous_words: false
+      use_chars: true
+      use_stresses: true
+
+  adapter:
+    # Config of the adapter training/eval script.
+    adapter_name: "adapter" # Name of the adapter, used by the script
+    adapter_module_name: "encoder+decoder+duration_predictor+pitch_predictor+aligner" # Name of the adapter module. Combine multiple modules with '+' between module names.
+    adapter_state_dict_name: "adapters.pt" # If the individual adapters must be saved, a file name can be provided here. null disables this.
+
+    # Config of the adapter module itself
+    _target_: nemo.collections.common.parts.adapter_modules.LinearAdapter
+    in_features: ${model.symbols_embedding_dim} # The user must provide the output dimension of the model's layers, which is the input dimension of this adapter.
+    dim: 256 # The hidden dimension of the adapter, chosen by the user; small values are preferred to reduce the parameter count.
+    activation: swish
+    norm_position: 'pre' # Can be `pre` or `post`
+    dropout: 0.0 # float, dropout for the adapter
+
+    # Adapter strategy config
+    adapter_strategy:
+      _target_: nemo.core.classes.mixins.adapter_mixin_strategies.ResidualAddAdapterStrategy
+      stochastic_depth: 0.0 # float, setting to > 0 will enable stochastic depth for each adapter block.
+      l2_lambda: 0.0 # float, setting to > 0 will enable an l2 norm auxiliary loss for each adapter's output.
+
+    # Optional global config available to all adapters at a global level.
+    # A global config is shared across every layer of the adapters, defining global properties rather
+    # than properties local to the adapter (as defined above).
+    # This can be useful in order to select *which type of adapter* is added, *what adapters to enable*,
+    # and further global operations that can decide dynamically how to support the requested adapter.
+    global_cfg:
+      check_encoder_adapter: True # determines whether to check if encoder adapter modules are supported
+      check_decoder_adapter: True # determines whether to check if decoder adapter modules are supported
+      check_duration_predictor_adapter: True # determines whether to check if duration_predictor adapter modules are supported
+      check_pitch_predictor_adapter: True # determines whether to check if pitch_predictor adapter modules are supported
+      check_aligner_adapter: True # determines whether to check if aligner adapter modules are supported
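+
+    # Note: only the modules listed in adapter_module_name receive adapter parameters; during
+    # adapter finetuning the base FastPitch weights are expected to stay frozen (apart from any
+    # unfreeze_* flags set at the top of the model config), so only the adapter weights are trained.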
+
+  train_ds:
+    dataset:
+      _target_: nemo.collections.tts.data.dataset.TTSDataset
+      manifest_filepath: ${train_dataset}
+      sample_rate: ${model.sample_rate}
+      sup_data_path: ${sup_data_path}
+      sup_data_types: ${sup_data_types}
+      n_fft: ${model.n_fft}
+      win_length: ${model.n_window_size}
+      hop_length: ${model.n_window_stride}
+      window: ${model.window}
+      n_mels: ${model.n_mel_channels}
+      lowfreq: ${model.lowfreq}
+      highfreq: ${model.highfreq}
+      max_duration: null
+      min_duration: 0.1
+      ignore_file: null
+      trim: false
+      pitch_fmin: ${model.pitch_fmin}
+      pitch_fmax: ${model.pitch_fmax}
+      pitch_norm: true
+      pitch_mean: ${model.pitch_mean}
+      pitch_std: ${model.pitch_std}
+      use_beta_binomial_interpolator: true
+
+    dataloader_params:
+      drop_last: false
+      shuffle: true
+      batch_size: 32
+      num_workers: 12
+      pin_memory: true
+
+  validation_ds:
+    dataset:
+      _target_: nemo.collections.tts.data.dataset.TTSDataset
+      manifest_filepath: ${validation_datasets}
+      sample_rate: ${model.sample_rate}
+      sup_data_path: ${sup_data_path}
+      sup_data_types: ${sup_data_types}
+      n_fft: ${model.n_fft}
+      win_length: ${model.n_window_size}
+      hop_length: ${model.n_window_stride}
+      window: ${model.window}
+      n_mels: ${model.n_mel_channels}
+      lowfreq: ${model.lowfreq}
+      highfreq: ${model.highfreq}
+      max_duration: null
+      min_duration: 0.1
+      ignore_file: null
+      trim: false
+      pitch_fmin: ${model.pitch_fmin}
+      pitch_fmax: ${model.pitch_fmax}
+      pitch_norm: true
+      pitch_mean: ${model.pitch_mean}
+      pitch_std: ${model.pitch_std}
+      use_beta_binomial_interpolator: true
+
+    dataloader_params:
+      drop_last: false
+      shuffle: false
+      batch_size: 32
+      num_workers: 8
+      pin_memory: true
+
+  preprocessor:
+    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
+    features: ${model.n_mel_channels}
+    lowfreq: ${model.lowfreq}
+    highfreq: ${model.highfreq}
+    n_fft: ${model.n_fft}
+    n_window_size: ${model.n_window_size}
+    window_size: false
+    n_window_stride: ${model.n_window_stride}
+    window_stride: false
+    pad_to: 1
+    pad_value: 0
+    sample_rate: ${model.sample_rate}
+    window: ${model.window}
+    normalize: null
+    preemph: null
+    dither: 0.0
+    frame_splicing: 1
+    log: true
+    log_zero_guard_type: add
+    log_zero_guard_value: 1e-05
+    mag_power: 1.0
+
+  input_fft: # n_embed and padding_idx are added by the model
+    _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
+    n_layer: 6
+    n_head: 1
+    d_model: ${model.symbols_embedding_dim}
+    d_head: 64
+    d_inner: 1536
+    kernel_size: 3
+    dropout: 0.1
+    dropatt: 0.1
+    dropemb: 0.0
+    d_embed: ${model.symbols_embedding_dim}
+    condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ]
+
+  output_fft:
+    _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
+    n_layer: 6
+    n_head: 1
+    d_model: ${model.symbols_embedding_dim}
+    d_head: 64
+    d_inner: 1536
+    kernel_size: 3
+    dropout: 0.1
+    dropatt: 0.1
+    dropemb: 0.0
+    condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ]
+
+  alignment_module:
+    _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
+    n_text_channels: ${model.symbols_embedding_dim}
+    condition_types: [ "add" ] # options: [ "add", "concat" ]
+
+  duration_predictor:
+    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
+    input_size: ${model.symbols_embedding_dim}
+    kernel_size: 3
+    filter_size: 256
+    dropout: 0.1
+    n_layers: 2
+    condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ]
+
+  pitch_predictor:
+    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
+    input_size: ${model.symbols_embedding_dim}
+    kernel_size: 3
+    filter_size: 256
+    dropout: 0.1
+    n_layers: 2
+    condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ]
+
+  energy_predictor:
+    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
+    input_size: ${model.symbols_embedding_dim}
+    kernel_size: 3
+    filter_size: 256
+    dropout: 0.1
+    n_layers: 2
+    condition_types: [ "add", "layernorm" ] # options: [ "add", "concat", "layernorm" ]
+
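+  # The speaker_encoder below combines a lookup-table speaker embedding with a Global Style Token
+  # (GST) module; the reference_encoder is expected to consume the "reference_audio" mel spectrogram
+  # listed in sup_data_types so that the style embedding can capture the adaptation speaker's voice.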
+  speaker_encoder:
+    _target_: nemo.collections.tts.modules.submodules.SpeakerEncoder
+    precomputed_embedding_dim: null
+    lookup_module:
+      _target_: nemo.collections.tts.modules.submodules.SpeakerLookupTable
+      n_speakers: ???
+      embedding_dim: ${model.symbols_embedding_dim}
+    gst_module:
+      _target_: nemo.collections.tts.modules.submodules.GlobalStyleToken
+      gst_size: ${model.symbols_embedding_dim}
+      n_style_token: 10
+      n_style_attn_head: 4
+      reference_encoder:
+        _target_: nemo.collections.tts.modules.submodules.ReferenceEncoder
+        n_mels: ${model.n_mel_channels}
+        cnn_filters: [32, 32, 64, 64, 128, 128]
+        dropout: 0.2
+        gru_hidden: ${model.symbols_embedding_dim}
+        kernel_size: 3
+        stride: 2
+        padding: 1
+        bias: true
+
+  optim:
+    name: adamw
+    lr: 1e-3
+    betas: [0.9, 0.999]
+    weight_decay: 1e-6
+
+    sched:
+      name: NoamAnnealing
+      warmup_steps: 1000
+      last_epoch: -1
+      d_model: 1 # Disable scaling based on model dim
+
+trainer:
+  num_nodes: 1
+  devices: 1
+  accelerator: gpu
+  strategy: ddp
+  precision: 16
+  max_epochs: 1000
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1000.0
+  enable_checkpointing: false # Provided by exp_manager
+  logger: false # Provided by exp_manager
+  log_every_n_steps: 100
+  check_val_every_n_epoch: 1
+  benchmark: false
+
+exp_manager:
+  exp_dir: null
+  name: ${name}
+  create_tensorboard_logger: true
+  create_checkpoint_callback: true
+  checkpoint_callback_params:
+    monitor: val_loss
+  resume_if_exists: false
+  resume_ignore_no_checkpoint: false
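
For orientation before the notebook changes below, here is a minimal sketch (an illustration, not the verbatim script) of how examples/tts/fastpitch_finetune_adapters.py is expected to consume the model.adapter block defined above. The .nemo checkpoint path is a placeholder, and the per-module loop is only one plausible reading of the '+'-joined adapter_module_name convention; the exact call sequence in the shipped script may differ.

# Simplified sketch: attach the adapter config above to a pretrained FastPitch checkpoint
# and leave only the adapter weights trainable. Paths are placeholders.
from omegaconf import OmegaConf, open_dict

from nemo.collections.tts.models import FastPitchModel

cfg = OmegaConf.load("examples/tts/conf/fastpitch_align_ipa_adapter.yaml")
model = FastPitchModel.restore_from("pretrained_fastpitch.nemo")  # placeholder checkpoint

adapter_cfg = cfg.model.adapter
with open_dict(adapter_cfg):
    adapter_name = adapter_cfg.pop("adapter_name")          # "adapter"
    module_names = adapter_cfg.pop("adapter_module_name")   # "encoder+decoder+..."
    adapter_cfg.pop("adapter_state_dict_name", None)
    adapter_cfg.pop("global_cfg", None)

# "<module>:<name>" follows NeMo's module-scoped adapter naming; one adapter per listed module.
for module_name in module_names.split("+"):
    model.add_adapter(name=f"{module_name}:{adapter_name}", cfg=adapter_cfg)

model.set_enabled_adapters(enabled=True)   # enable every registered adapter
model.freeze()                             # freeze the base FastPitch weights
model.unfreeze_enabled_adapters()          # keep only adapter parameters trainable

In the shipped script this wiring is driven by the same adapter_* keys rather than hard-coded, and the train_ds, optim, and trainer sections of the config control the actual finetuning run.
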
"code", "execution_count": null, "id": "e5d5f281", - "metadata": {}, + "metadata": { + "scrolled": false + }, "outputs": [], "source": [ "# Normally 500 epochs\n", @@ -592,7 +598,7 @@ "model.optim.lr=0.0001 \\\n", "model/train_ds=train_ds_finetune \\\n", "model/validation_ds=val_ds_finetune \\\n", - "+trainer.max_epochs=5 \\\n", + "+trainer.max_epochs=500 \\\n", "trainer.check_val_every_n_epoch=5 \\\n", "trainer.devices=-1 \\\n", "trainer.strategy='ddp' \\\n", @@ -751,14 +757,6 @@ "print(f\"Finetuned FastPitch: {finetuned_fastpitch_checkpoint}\")\n", "print(f\"Finetuned HiFi-Gan: {finetuned_hifigan_on_adaptation_checkpoint}\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "66e8ab7d", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -777,7 +775,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.8.0" } }, "nbformat": 4,