diff --git a/hierarchicalforecast/_modidx.py b/hierarchicalforecast/_modidx.py
index 9c31fd68..3b2958f5 100644
--- a/hierarchicalforecast/_modidx.py
+++ b/hierarchicalforecast/_modidx.py
@@ -20,7 +20,19 @@
                                                  'hierarchicalforecast.evaluation.HierarchicalEvaluation.__init__': ( 'evaluation.html#hierarchicalevaluation.__init__',
                                                                                                                       'hierarchicalforecast/evaluation.py'),
                                                  'hierarchicalforecast.evaluation.HierarchicalEvaluation.evaluate': ( 'evaluation.html#hierarchicalevaluation.evaluate',
-                                                                                                                      'hierarchicalforecast/evaluation.py')},
+                                                                                                                      'hierarchicalforecast/evaluation.py'),
+                                                 'hierarchicalforecast.evaluation._metric_protections': ( 'evaluation.html#_metric_protections',
+                                                                                                          'hierarchicalforecast/evaluation.py'),
+                                                 'hierarchicalforecast.evaluation.energy_score': ( 'evaluation.html#energy_score',
+                                                                                                   'hierarchicalforecast/evaluation.py'),
+                                                 'hierarchicalforecast.evaluation.mqloss': ( 'evaluation.html#mqloss',
+                                                                                             'hierarchicalforecast/evaluation.py'),
+                                                 'hierarchicalforecast.evaluation.mse': ( 'evaluation.html#mse',
+                                                                                          'hierarchicalforecast/evaluation.py'),
+                                                 'hierarchicalforecast.evaluation.msse': ( 'evaluation.html#msse',
+                                                                                           'hierarchicalforecast/evaluation.py'),
+                                                 'hierarchicalforecast.evaluation.scaled_crps': ( 'evaluation.html#scaled_crps',
+                                                                                                  'hierarchicalforecast/evaluation.py')},
             'hierarchicalforecast.methods': { 'hierarchicalforecast.methods.BottomUp': ( 'methods.html#bottomup',
                                                                                          'hierarchicalforecast/methods.py'),
                                               'hierarchicalforecast.methods.BottomUp._get_PW_matrices': ( 'methods.html#bottomup._get_pw_matrices',
diff --git a/hierarchicalforecast/core.py b/hierarchicalforecast/core.py
index bf58938a..fa804235 100644
--- a/hierarchicalforecast/core.py
+++ b/hierarchicalforecast/core.py
@@ -38,7 +38,7 @@ def _build_fn_name(fn) -> str:
 def _reverse_engineer_sigmah(Y_hat_df, y_hat, model_name, uids):
     """
     This function assumes that the model creates prediction intervals
-    under a normality assumption with the following the Equation:
+    under a normality with the following the Equation:
     $\hat{y}_{t+h} + c \hat{sigma}_{h}$
 
     In the future, we might deprecate this function in favor of a 
@@ -93,8 +93,8 @@ def reconcile(self,
                   intervals_method: str = 'normality'):
         """Hierarchical Reconciliation Method.
 
-        The `reconcile` method is analogous to SKLearn `fit` method, it applies different 
-        reconciliation methods instantiated in the `reconcilers` list. 
+        The `reconcile` method is analogous to SKLearn `fit_predict` method, it 
+        applies different reconciliation methods instantiated in the `reconcilers` list. 
 
         Most reconciliation methods can be described by the following convenient 
         linear algebra notation:
@@ -207,9 +207,6 @@ def reconcile(self,
                 end = time.time()
                 self.execution_times[f'{model_name}/{reconcile_fn_name}'] = (end - start)
 
-                end = time.time()
-                self.execution_times[f'{model_name}/{reconcile_fn_name}'] = (end - start)
-
                 if self.insample and has_fitted:
                     del reconciler_args['y_hat_insample']
                 gc.collect()
diff --git a/hierarchicalforecast/evaluation.py b/hierarchicalforecast/evaluation.py
index 048d995e..296eabe4 100644
--- a/hierarchicalforecast/evaluation.py
+++ b/hierarchicalforecast/evaluation.py
@@ -1,17 +1,235 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/evaluation.ipynb.
 
 # %% auto 0
-__all__ = ['HierarchicalEvaluation']
+__all__ = ['msse', 'scaled_crps', 'energy_score', 'HierarchicalEvaluation']
 
-# %% ../nbs/evaluation.ipynb 2
+# %% ../nbs/evaluation.ipynb 3
 from inspect import signature
-from typing import Callable, Dict, List, Optional
+from typing import Callable, Dict, List, Optional, Union
 
 import numpy as np
 import pandas as pd
 
-# %% ../nbs/evaluation.ipynb 5
-class HierarchicalEvaluation:    
+# %% ../nbs/evaluation.ipynb 6
+def _metric_protections(y: np.ndarray, y_hat: np.ndarray, 
+                        weights: Optional[np.ndarray]) -> None:
+    if not ((weights is None) or (np.sum(weights) > 0)):
+        raise Exception('Sum of `weights` cannot be 0')
+    if not ((weights is None) or (weights.shape == y.shape)):
+        raise Exception(
+        f'Wrong weight dimension weights.shape {weights.shape}, y.shape {y.shape}')
+
+def mse(y: np.ndarray, y_hat: np.ndarray, 
+        weights: Optional[np.ndarray] = None,
+        axis: Optional[int] = None) -> Union[float, np.ndarray]:
+    """Mean Squared Error
+
+    Calculates Mean Squared Error between
+    `y` and `y_hat`. MSE measures the relative prediction
+    accuracy of a forecasting method by calculating the 
+    squared deviation of the prediction and the true
+    value at a given time, and averages these devations
+    over the length of the series.
+
+    $$ \mathrm{MSE}(\\mathbf{y}_{\\tau}, \\mathbf{\hat{y}}_{\\tau}) = \\frac{1}{H} \\sum^{t+H}_{\\tau=t+1} (y_{\\tau} - \hat{y}_{\\tau})^{2} $$
+
+    **Parameters:**<br>
+    `y`: numpy array, Actual values.<br>
+    `y_hat`: numpy array, Predicted values.<br>
+    `mask`: numpy array, Specifies date stamps per serie to consider in loss.<br>
+
+    **Returns:**<br>
+    `mse`: numpy array, (single value).
+    """
+    _metric_protections(y, y_hat, weights)
+
+    delta_y = np.square(y - y_hat)
+    if weights is not None:
+        mse = np.average(delta_y[~np.isnan(delta_y)],
+                         weights=weights[~np.isnan(delta_y)],
+                         axis=axis)
+    else:
+        mse = np.nanmean(delta_y, axis=axis)
+    return mse
+
+def mqloss(y: np.ndarray, y_hat: np.ndarray, 
+           quantiles: np.ndarray, 
+           weights: Optional[np.ndarray] = None,
+           axis: Optional[int] = None) -> Union[float, np.ndarray]:
+    """Multi-Quantile Loss
+
+    Calculates the Multi-Quantile loss (MQL) between `y` and `y_hat`.
+    MQL calculates the average multi-quantile Loss for
+    a given set of quantiles, based on the absolute 
+    difference between predicted quantiles and observed values.
+
+    $$ \mathrm{MQL}(\\mathbf{y}_{\\tau},[\\mathbf{\hat{y}}^{(q_{1})}_{\\tau}, ... ,\hat{y}^{(q_{n})}_{\\tau}]) = \\frac{1}{n} \\sum_{q_{i}} \mathrm{QL}(\\mathbf{y}_{\\tau}, \\mathbf{\hat{y}}^{(q_{i})}_{\\tau}) $$
+
+    The limit behavior of MQL allows to measure the accuracy 
+    of a full predictive distribution $\mathbf{\hat{F}}_{\\tau}$ with 
+    the continuous ranked probability score (CRPS). This can be achieved 
+    through a numerical integration technique, that discretizes the quantiles 
+    and treats the CRPS integral with a left Riemann approximation, averaging over 
+    uniformly distanced quantiles.    
+
+    $$ \mathrm{CRPS}(y_{\\tau}, \mathbf{\hat{F}}_{\\tau}) = \int^{1}_{0} \mathrm{QL}(y_{\\tau}, \hat{y}^{(q)}_{\\tau}) dq $$
+
+    **Parameters:**<br>
+    `y`: numpy array, Actual values.<br>
+    `y_hat`: numpy array, Predicted values.<br>
+    `quantiles`: numpy array. Quantiles between 0 and 1, to perform evaluation upon size (n_quantiles).<br>
+    `mask`: numpy array, Specifies date stamps per serie to consider in loss.<br>
+ 
+    **Returns:**<br>
+    `mqloss`: numpy array, (single value).
+
+    **References:**<br>
+    [Roger Koenker and Gilbert Bassett, Jr., "Regression Quantiles".](https://www.jstor.org/stable/1913643)<br>
+    [James E. Matheson and Robert L. Winkler, "Scoring Rules for Continuous Probability Distributions".](https://www.jstor.org/stable/2629907)
+    """
+    if weights is None: weights = np.ones(y.shape)
+    if (np.sum(quantiles>1)>0 or np.sum(quantiles<0)>0):
+        raise Exception('`quantiles` need to be between 0 and 1')
+        
+    _metric_protections(y, y_hat, weights)
+    n_q = len(quantiles)
+    
+    y_rep  = np.expand_dims(y, axis=-1)
+    error  = y_hat - y_rep
+    sq     = np.maximum(-error, np.zeros_like(error))
+    s1_q   = np.maximum(error, np.zeros_like(error))
+    mqloss = (quantiles * sq + (1 - quantiles) * s1_q)
+    
+    # Match y/weights dimensions and compute weighted average
+    weights = np.repeat(np.expand_dims(weights, axis=-1), repeats=n_q, axis=-1)
+    mqloss  = np.average(mqloss, weights=weights, axis=axis)
+
+    return mqloss
+
+# %% ../nbs/evaluation.ipynb 8
+def msse(y, y_hat, y_train, mask=None):
+    """Mean Squared Scaled Error
+
+    Computes Mean squared scaled error (MSSE), as proposed by Hyndman & Koehler (2006)
+    as an alternative to percentage errors, to avoid measure unstability.
+
+    $$ \mathrm{MSSE}(\\mathbf{y}, \\mathbf{\hat{y}}, \\mathbf{\hat{y}}^{naive1}) =
+    \\frac{\mathrm{MSE}(\\mathbf{y}, \\mathbf{\hat{y}})}{\mathrm{MSE}(\\mathbf{y}, \\mathbf{\hat{y}}^{naive1})} $$
+
+    **Parameters:**<br>
+    `y`: numpy array, Actual values of size (`n_series`, `horizon`).<br>
+    `y_hat`: numpy array, Predicted values (`n_series`, `horizon`).<br>
+    `mask`: numpy array, Specifies date stamps per serie to consider in loss.<br>
+
+    **Returns:**<br>
+    `loss`: float.    
+
+    **References:**<br>
+    - [Hyndman, R. J and Koehler, A. B. (2006).
+       "Another look at measures of forecast accuracy",
+       International Journal of Forecasting, Volume 22, Issue 4.](https://www.sciencedirect.com/science/article/pii/S0169207006000239)
+    - [Kin G. Olivares, O. Nganba Meetei, Ruijun Ma, Rohan Reddy, Mengfei Cao, Lee Dicker. 
+       "Probabilistic Hierarchical Forecasting with Deep Poisson Mixtures. 
+       Submitted to the International Journal Forecasting, Working paper available at arxiv.](https://arxiv.org/pdf/2110.13179.pdf)
+    """
+    if mask is None: 
+       mask = np.ones_like(y)
+    n_series, horizon = y.shape
+
+    eps = np.finfo(float).eps
+    y_naive = np.repeat(y_train[:,[-1]], horizon, axis=1)
+    norm = mse(y=y, y_hat=y_naive)
+    loss = mse(y=y, y_hat=y_hat, weights=mask)
+    loss = loss / (norm + eps)
+    return loss
+
+# %% ../nbs/evaluation.ipynb 11
+def scaled_crps(y, y_hat, quantiles):
+    """Scaled Continues Ranked Probability Score
+
+    Calculates a scaled variation of the CRPS, as proposed by Rangapuram (2021),
+    to measure the accuracy of predicted quantiles `y_hat` compared to the observation `y`.
+
+    This metric averages percentual weighted absolute deviations as 
+    defined by the quantile losses.
+
+    $$ \mathrm{sCRPS}(\hat{F}_{\\tau}, \mathbf{y}_{\\tau}) = 2 \sum_{i}
+    \int^{1}_{0}
+    \\frac{\mathrm{QL}(\hat{F}_{i,\\tau}, y_{i,\\tau})_{q}}{\sum_{i} | y_{i,\\tau} |} dq $$
+
+    where $\hat{F}_{\\tau}$ is the an estimated multivariate distribution, and $y_{i,\\tau}$
+    are its realizations.
+
+    **Parameters:**<br>
+    `y`: numpy array, Actual values of size (`n_series`, `horizon`).<br>
+    `y_hat`: numpy array, Predicted quantiles of size (`n_series`, `horizon`, `n_quantiles`).<br>
+    `quantiles`: numpy array,(`n_quantiles`). Quantiles to estimate from the distribution of y.<br>
+
+    **Returns:**<br>
+    `loss`: float.
+
+    **References:**<br>
+    - [Gneiting, Tilmann. (2011). \"Quantiles as optimal point forecasts\". 
+    International Journal of Forecasting.](https://www.sciencedirect.com/science/article/pii/S0169207010000063)
+    - [Spyros Makridakis, Evangelos Spiliotis, Vassilios Assimakopoulos, Zhi Chen, Anil Gaba, Ilia Tsetlin, Robert L. Winkler. (2022). 
+    \"The M5 uncertainty competition: Results, findings and conclusions\". 
+    International Journal of Forecasting.](https://www.sciencedirect.com/science/article/pii/S0169207021001722)
+    - [Syama Sundar Rangapuram, Lucien D Werner, Konstantinos Benidis, Pedro Mercado, Jan Gasthaus, Tim Januschowski. (2021). 
+    \"End-to-End Learning of Coherent Probabilistic Forecasts for Hierarchical Time Series\". 
+    Proceedings of the 38th International Conference on Machine Learning (ICML).](https://proceedings.mlr.press/v139/rangapuram21a.html)
+    """
+    eps = np.finfo(float).eps
+    norm  = np.sum(np.abs(y))
+    loss  = mqloss(y=y, y_hat=y_hat, quantiles=quantiles)
+    loss  = 2 * loss * np.sum(np.ones(y.shape)) / (norm + eps)
+    return loss
+
+# %% ../nbs/evaluation.ipynb 14
+def energy_score(y, y_sample1, y_sample2, beta=2):
+    """Energy Score
+
+    Calculates Gneiting's Energy Score sample approximation for
+    `y` and independent multivariate samples `y_sample1` and `y_sample2`.
+    The Energy Score generalizes the CRPS (`beta`=1) in the multivariate setting.
+
+    $$ \mathrm{ES}(\\mathbf{y}_{\\tau}, \\mathbf{\hat{y}}_{\\tau}, \\mathbf{\hat{y}}_{\\tau}') 
+    = \\frac{1}{2} \mathbb{E}_{\hat{P}} \\left[ ||\\mathbf{\hat{y}}_{\\tau} - \\mathbf{\hat{y}}_{\\tau}'||^{\\beta} \\right]
+    -  \mathbb{E}_{\hat{P}} \\left[ ||\\mathbf{y}_{\\tau} - \\mathbf{\hat{y}}_{\\tau}||^{\\beta} \\right] 
+    \quad \\beta \in (0,2]$$
+
+    where $\\mathbf{\hat{y}}_{\\tau}, \\mathbf{\hat{y}}_{\\tau}'$ are independent samples drawn from $\hat{P}$.
+
+    **Parameters:**<br>
+    `y`: numpy array, Actual values of size (`n_series`, `horizon`).<br>
+    `y_sample1`: numpy array, predictive distribution sample of size (`n_series`, `horizon`, `n_samples`).<br>
+    `y_sample2`: numpy array, predictive distribution sample of size (`n_series`, `horizon`, `n_samples`).<br>
+    `beta`: float in (0,2], defines the energy score's power for the euclidean metric.<br>
+
+    **Returns:**<br>
+    `loss`: float.
+
+    **References:**<br>
+    - [Gneiting, Tilmann, and Adrian E. Raftery. (2007). 
+    \"Strictly proper scoring rules, prediction and estimation\". 
+    Journal of the American Statistical Association.](https://sites.stat.washington.edu/raftery/Research/PDF/Gneiting2007jasa.pdf)
+    - [Anastasios Panagiotelis, Puwasala Gamakumara, George Athanasopoulos, Rob J. Hyndman. (2022). 
+    \"Probabilistic forecast reconciliation: Properties, evaluation and score optimisation\". 
+    European Journal of Operational Research.](https://www.sciencedirect.com/science/article/pii/S0377221722006087)    
+    """
+    if beta>2 or beta<0:
+        raise Exception("beta needs to be between 0 and 2.")
+
+    dif1 = (y_sample1 - y_sample2)
+    dif2 = (y[:,:,None] - y_sample1)
+
+    term1 = np.linalg.norm(dif1, axis=0) ** beta
+    term2 = np.linalg.norm(dif2, axis=0) ** beta
+
+    loss = np.mean(term2 - 0.5 * term1)
+    return loss
+
+# %% ../nbs/evaluation.ipynb 17
+class HierarchicalEvaluation:
     """Hierarchical Evaluation Class.
     
     You can use your own metrics to evaluate the performance of each level in the structure.
@@ -23,6 +241,8 @@ class HierarchicalEvaluation:
 
     **Parameters:**<br>
     `evaluators`: functions with arguments `y`, `y_hat` (numpy arrays).<br>
+
+    **References:**<br>
     """
     def __init__(self, 
                  evaluators: List[Callable]):
diff --git a/hierarchicalforecast/methods.py b/hierarchicalforecast/methods.py
index 4f4f1538..1a327b2c 100644
--- a/hierarchicalforecast/methods.py
+++ b/hierarchicalforecast/methods.py
@@ -108,9 +108,7 @@ def sample(self,
         if self.sampler is None:
             raise Exception("This model instance does not have sampler. Call fit with `intervals_method`.")
 
-        # [samples, N, H] -> [N, H, samples]
         samples = self.sampler.get_samples(num_samples=num_samples)
-        samples = samples.transpose((1, 2, 0))
         return samples
 
 # %% ../nbs/methods.ipynb 10
diff --git a/hierarchicalforecast/probabilistic_methods.py b/hierarchicalforecast/probabilistic_methods.py
index 5b089dd1..2c4f59ed 100644
--- a/hierarchicalforecast/probabilistic_methods.py
+++ b/hierarchicalforecast/probabilistic_methods.py
@@ -82,6 +82,9 @@ def get_samples(self, num_samples: int=None):
             partial_samples = state.multivariate_normal(mean=self.SP @ self.y_hat[:,t],
                                                   cov=self.cov_rec[t], size=num_samples)
             samples[:,:,t] = partial_samples
+
+        # [samples, N, H] -> [N, H, samples]
+        samples = samples.transpose((1, 2, 0))
         return samples
 
     def get_prediction_levels(self, res, level):
@@ -168,7 +171,11 @@ def get_samples(self, num_samples: int=None):
         SP = self.S @ self.P
         samples = np.apply_along_axis(lambda path: np.matmul(SP, path),
                                       axis=1, arr=samples)
-        return np.stack(samples)
+        samples = np.stack(samples)
+
+        # [samples, N, H] -> [N, H, samples]
+        samples = samples.transpose((1, 2, 0))
+        return samples
 
     def get_prediction_levels(self, res, level):
         """ Adds reconciled forecast levels to results dictionary """
@@ -176,8 +183,8 @@ def get_prediction_levels(self, res, level):
         for lv in level:
             min_q = (100 - lv) / 200
             max_q = min_q + lv / 100
-            res[f'lo-{lv}'] = np.quantile(samples, min_q, axis=0)
-            res[f'hi-{lv}'] = np.quantile(samples, max_q, axis=0)
+            res[f'lo-{lv}'] = np.quantile(samples, min_q, axis=2)
+            res[f'hi-{lv}'] = np.quantile(samples, max_q, axis=2)
         return res
 
 # %% ../nbs/probabilistic_methods.ipynb 14
@@ -382,8 +389,7 @@ def get_samples(self, num_samples: int=None):
             )
 
             rec_samples[parent_idxs,:,:] = parent_samples
-
-        return np.transpose(rec_samples, (2, 0, 1))
+        return rec_samples
 
     def get_prediction_levels(self, res, level):
         """ Adds reconciled forecast levels to results dictionary """
@@ -391,6 +397,6 @@ def get_prediction_levels(self, res, level):
         for lv in level:
             min_q = (100 - lv) / 200
             max_q = min_q + lv / 100
-            res[f'lo-{lv}'] = np.quantile(samples, min_q, axis=0)
-            res[f'hi-{lv}'] = np.quantile(samples, max_q, axis=0)
+            res[f'lo-{lv}'] = np.quantile(samples, min_q, axis=2)
+            res[f'hi-{lv}'] = np.quantile(samples, max_q, axis=2)
         return res
diff --git a/nbs/core.ipynb b/nbs/core.ipynb
index 215de23e..6ae0dbd8 100644
--- a/nbs/core.ipynb
+++ b/nbs/core.ipynb
@@ -131,7 +131,7 @@
     "def _reverse_engineer_sigmah(Y_hat_df, y_hat, model_name, uids):\n",
     "    \"\"\"\n",
     "    This function assumes that the model creates prediction intervals\n",
-    "    under a normality assumption with the following the Equation:\n",
+    "    under a normality with the following the Equation:\n",
     "    $\\hat{y}_{t+h} + c \\hat{sigma}_{h}$\n",
     "\n",
     "    In the future, we might deprecate this function in favor of a \n",
@@ -193,8 +193,8 @@
     "                  intervals_method: str = 'normality'):\n",
     "        \"\"\"Hierarchical Reconciliation Method.\n",
     "\n",
-    "        The `reconcile` method is analogous to SKLearn `fit` method, it applies different \n",
-    "        reconciliation methods instantiated in the `reconcilers` list. \n",
+    "        The `reconcile` method is analogous to SKLearn `fit_predict` method, it \n",
+    "        applies different reconciliation methods instantiated in the `reconcilers` list. \n",
     "\n",
     "        Most reconciliation methods can be described by the following convenient \n",
     "        linear algebra notation:\n",
@@ -307,9 +307,6 @@
     "                end = time.time()\n",
     "                self.execution_times[f'{model_name}/{reconcile_fn_name}'] = (end - start)\n",
     "\n",
-    "                end = time.time()\n",
-    "                self.execution_times[f'{model_name}/{reconcile_fn_name}'] = (end - start)\n",
-    "\n",
     "                if self.insample and has_fitted:\n",
     "                    del reconciler_args['y_hat_insample']\n",
     "                gc.collect()\n",
diff --git a/nbs/evaluation.ipynb b/nbs/evaluation.ipynb
index 5a4b4c36..17c13f44 100644
--- a/nbs/evaluation.ipynb
+++ b/nbs/evaluation.ipynb
@@ -6,18 +6,25 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "#| default_exp evaluation\n",
-    "# - [Anastasios Panagiotelis et. al. \"Probabilistic Forecast Reconciliation: Properties, Evaluation and Score Optimisation\"](https://www.monash.edu/business/ebs/research/publications/ebs/wp26-2020.pdf)"
+    "#| default_exp evaluation"
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Hierarchical Evaluation \n",
+    "# Hierarchical Evaluation"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To assist the evaluation of hierarchical forecasting systems, we make available accuracy metrics along with the `HierarchicalEvaluation` module that facilitates the measurement of prediction's accuracy through the hierarchy levels. \n",
     "\n",
-    "> To assist on the evaluation of hierarchical systems we make available the `HierarchicalEvaluation` module\n",
-    "that facilitates the measurement of prediction's accuracy through the levels of a hierarchy.<br><br>"
+    "The available metrics include point and probabilistic multivariate scoring rules that were used in previous hierarchical forecasting studies."
    ]
   },
   {
@@ -28,7 +35,7 @@
    "source": [
     "#| export\n",
     "from inspect import signature\n",
-    "from typing import Callable, Dict, List, Optional\n",
+    "from typing import Callable, Dict, List, Optional, Union\n",
     "\n",
     "import numpy as np\n",
     "import pandas as pd"
@@ -45,6 +52,309 @@
     "from nbdev.showdoc import add_docs, show_doc"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# <span style=\"color:DarkBlue\"> Accuracy Measurements </span>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| exporti\n",
+    "def _metric_protections(y: np.ndarray, y_hat: np.ndarray, \n",
+    "                        weights: Optional[np.ndarray]) -> None:\n",
+    "    if not ((weights is None) or (np.sum(weights) > 0)):\n",
+    "        raise Exception('Sum of `weights` cannot be 0')\n",
+    "    if not ((weights is None) or (weights.shape == y.shape)):\n",
+    "        raise Exception(\n",
+    "        f'Wrong weight dimension weights.shape {weights.shape}, y.shape {y.shape}')\n",
+    "\n",
+    "def mse(y: np.ndarray, y_hat: np.ndarray, \n",
+    "        weights: Optional[np.ndarray] = None,\n",
+    "        axis: Optional[int] = None) -> Union[float, np.ndarray]:\n",
+    "    \"\"\"Mean Squared Error\n",
+    "\n",
+    "    Calculates Mean Squared Error between\n",
+    "    `y` and `y_hat`. MSE measures the relative prediction\n",
+    "    accuracy of a forecasting method by calculating the \n",
+    "    squared deviation of the prediction and the true\n",
+    "    value at a given time, and averages these devations\n",
+    "    over the length of the series.\n",
+    "\n",
+    "    $$ \\mathrm{MSE}(\\\\mathbf{y}_{\\\\tau}, \\\\mathbf{\\hat{y}}_{\\\\tau}) = \\\\frac{1}{H} \\\\sum^{t+H}_{\\\\tau=t+1} (y_{\\\\tau} - \\hat{y}_{\\\\tau})^{2} $$\n",
+    "\n",
+    "    **Parameters:**<br>\n",
+    "    `y`: numpy array, Actual values.<br>\n",
+    "    `y_hat`: numpy array, Predicted values.<br>\n",
+    "    `mask`: numpy array, Specifies date stamps per serie to consider in loss.<br>\n",
+    "\n",
+    "    **Returns:**<br>\n",
+    "    `mse`: numpy array, (single value).\n",
+    "    \"\"\"\n",
+    "    _metric_protections(y, y_hat, weights)\n",
+    "\n",
+    "    delta_y = np.square(y - y_hat)\n",
+    "    if weights is not None:\n",
+    "        mse = np.average(delta_y[~np.isnan(delta_y)],\n",
+    "                         weights=weights[~np.isnan(delta_y)],\n",
+    "                         axis=axis)\n",
+    "    else:\n",
+    "        mse = np.nanmean(delta_y, axis=axis)\n",
+    "    return mse\n",
+    "\n",
+    "def mqloss(y: np.ndarray, y_hat: np.ndarray, \n",
+    "           quantiles: np.ndarray, \n",
+    "           weights: Optional[np.ndarray] = None,\n",
+    "           axis: Optional[int] = None) -> Union[float, np.ndarray]:\n",
+    "    \"\"\"Multi-Quantile Loss\n",
+    "\n",
+    "    Calculates the Multi-Quantile loss (MQL) between `y` and `y_hat`.\n",
+    "    MQL calculates the average multi-quantile Loss for\n",
+    "    a given set of quantiles, based on the absolute \n",
+    "    difference between predicted quantiles and observed values.\n",
+    "\n",
+    "    $$ \\mathrm{MQL}(\\\\mathbf{y}_{\\\\tau},[\\\\mathbf{\\hat{y}}^{(q_{1})}_{\\\\tau}, ... ,\\hat{y}^{(q_{n})}_{\\\\tau}]) = \\\\frac{1}{n} \\\\sum_{q_{i}} \\mathrm{QL}(\\\\mathbf{y}_{\\\\tau}, \\\\mathbf{\\hat{y}}^{(q_{i})}_{\\\\tau}) $$\n",
+    "\n",
+    "    The limit behavior of MQL allows to measure the accuracy \n",
+    "    of a full predictive distribution $\\mathbf{\\hat{F}}_{\\\\tau}$ with \n",
+    "    the continuous ranked probability score (CRPS). This can be achieved \n",
+    "    through a numerical integration technique, that discretizes the quantiles \n",
+    "    and treats the CRPS integral with a left Riemann approximation, averaging over \n",
+    "    uniformly distanced quantiles.    \n",
+    "\n",
+    "    $$ \\mathrm{CRPS}(y_{\\\\tau}, \\mathbf{\\hat{F}}_{\\\\tau}) = \\int^{1}_{0} \\mathrm{QL}(y_{\\\\tau}, \\hat{y}^{(q)}_{\\\\tau}) dq $$\n",
+    "\n",
+    "    **Parameters:**<br>\n",
+    "    `y`: numpy array, Actual values.<br>\n",
+    "    `y_hat`: numpy array, Predicted values.<br>\n",
+    "    `quantiles`: numpy array. Quantiles between 0 and 1, to perform evaluation upon size (n_quantiles).<br>\n",
+    "    `mask`: numpy array, Specifies date stamps per serie to consider in loss.<br>\n",
+    " \n",
+    "    **Returns:**<br>\n",
+    "    `mqloss`: numpy array, (single value).\n",
+    "\n",
+    "    **References:**<br>\n",
+    "    [Roger Koenker and Gilbert Bassett, Jr., \"Regression Quantiles\".](https://www.jstor.org/stable/1913643)<br>\n",
+    "    [James E. Matheson and Robert L. Winkler, \"Scoring Rules for Continuous Probability Distributions\".](https://www.jstor.org/stable/2629907)\n",
+    "    \"\"\"\n",
+    "    if weights is None: weights = np.ones(y.shape)\n",
+    "    if (np.sum(quantiles>1)>0 or np.sum(quantiles<0)>0):\n",
+    "        raise Exception('`quantiles` need to be between 0 and 1')\n",
+    "        \n",
+    "    _metric_protections(y, y_hat, weights)\n",
+    "    n_q = len(quantiles)\n",
+    "    \n",
+    "    y_rep  = np.expand_dims(y, axis=-1)\n",
+    "    error  = y_hat - y_rep\n",
+    "    sq     = np.maximum(-error, np.zeros_like(error))\n",
+    "    s1_q   = np.maximum(error, np.zeros_like(error))\n",
+    "    mqloss = (quantiles * sq + (1 - quantiles) * s1_q)\n",
+    "    \n",
+    "    # Match y/weights dimensions and compute weighted average\n",
+    "    weights = np.repeat(np.expand_dims(weights, axis=-1), repeats=n_q, axis=-1)\n",
+    "    mqloss  = np.average(mqloss, weights=weights, axis=axis)\n",
+    "\n",
+    "    return mqloss"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Mean Squared Scaled Error"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "def msse(y, y_hat, y_train, mask=None):\n",
+    "    \"\"\"Mean Squared Scaled Error\n",
+    "\n",
+    "    Computes Mean squared scaled error (MSSE), as proposed by Hyndman & Koehler (2006)\n",
+    "    as an alternative to percentage errors, to avoid measure unstability.\n",
+    "\n",
+    "    $$ \\mathrm{MSSE}(\\\\mathbf{y}, \\\\mathbf{\\hat{y}}, \\\\mathbf{\\hat{y}}^{naive1}) =\n",
+    "    \\\\frac{\\mathrm{MSE}(\\\\mathbf{y}, \\\\mathbf{\\hat{y}})}{\\mathrm{MSE}(\\\\mathbf{y}, \\\\mathbf{\\hat{y}}^{naive1})} $$\n",
+    "\n",
+    "    **Parameters:**<br>\n",
+    "    `y`: numpy array, Actual values of size (`n_series`, `horizon`).<br>\n",
+    "    `y_hat`: numpy array, Predicted values (`n_series`, `horizon`).<br>\n",
+    "    `mask`: numpy array, Specifies date stamps per serie to consider in loss.<br>\n",
+    "\n",
+    "    **Returns:**<br>\n",
+    "    `loss`: float.    \n",
+    "\n",
+    "    **References:**<br>\n",
+    "    - [Hyndman, R. J and Koehler, A. B. (2006).\n",
+    "       \"Another look at measures of forecast accuracy\",\n",
+    "       International Journal of Forecasting, Volume 22, Issue 4.](https://www.sciencedirect.com/science/article/pii/S0169207006000239)\n",
+    "    - [Kin G. Olivares, O. Nganba Meetei, Ruijun Ma, Rohan Reddy, Mengfei Cao, Lee Dicker. \n",
+    "       \"Probabilistic Hierarchical Forecasting with Deep Poisson Mixtures. \n",
+    "       Submitted to the International Journal Forecasting, Working paper available at arxiv.](https://arxiv.org/pdf/2110.13179.pdf)\n",
+    "    \"\"\"\n",
+    "    if mask is None: \n",
+    "       mask = np.ones_like(y)\n",
+    "    n_series, horizon = y.shape\n",
+    "\n",
+    "    eps = np.finfo(float).eps\n",
+    "    y_naive = np.repeat(y_train[:,[-1]], horizon, axis=1)\n",
+    "    norm = mse(y=y, y_hat=y_naive)\n",
+    "    loss = mse(y=y, y_hat=y_hat, weights=mask)\n",
+    "    loss = loss / (norm + eps)\n",
+    "    return loss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "show_doc(msse)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Scaled CRPS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "def scaled_crps(y, y_hat, quantiles):\n",
+    "    \"\"\"Scaled Continues Ranked Probability Score\n",
+    "\n",
+    "    Calculates a scaled variation of the CRPS, as proposed by Rangapuram (2021),\n",
+    "    to measure the accuracy of predicted quantiles `y_hat` compared to the observation `y`.\n",
+    "\n",
+    "    This metric averages percentual weighted absolute deviations as \n",
+    "    defined by the quantile losses.\n",
+    "\n",
+    "    $$ \\mathrm{sCRPS}(\\hat{F}_{\\\\tau}, \\mathbf{y}_{\\\\tau}) = 2 \\sum_{i}\n",
+    "    \\int^{1}_{0}\n",
+    "    \\\\frac{\\mathrm{QL}(\\hat{F}_{i,\\\\tau}, y_{i,\\\\tau})_{q}}{\\sum_{i} | y_{i,\\\\tau} |} dq $$\n",
+    "\n",
+    "    where $\\hat{F}_{\\\\tau}$ is the an estimated multivariate distribution, and $y_{i,\\\\tau}$\n",
+    "    are its realizations.\n",
+    "\n",
+    "    **Parameters:**<br>\n",
+    "    `y`: numpy array, Actual values of size (`n_series`, `horizon`).<br>\n",
+    "    `y_hat`: numpy array, Predicted quantiles of size (`n_series`, `horizon`, `n_quantiles`).<br>\n",
+    "    `quantiles`: numpy array,(`n_quantiles`). Quantiles to estimate from the distribution of y.<br>\n",
+    "\n",
+    "    **Returns:**<br>\n",
+    "    `loss`: float.\n",
+    "\n",
+    "    **References:**<br>\n",
+    "    - [Gneiting, Tilmann. (2011). \\\"Quantiles as optimal point forecasts\\\". \n",
+    "    International Journal of Forecasting.](https://www.sciencedirect.com/science/article/pii/S0169207010000063)\n",
+    "    - [Spyros Makridakis, Evangelos Spiliotis, Vassilios Assimakopoulos, Zhi Chen, Anil Gaba, Ilia Tsetlin, Robert L. Winkler. (2022). \n",
+    "    \\\"The M5 uncertainty competition: Results, findings and conclusions\\\". \n",
+    "    International Journal of Forecasting.](https://www.sciencedirect.com/science/article/pii/S0169207021001722)\n",
+    "    - [Syama Sundar Rangapuram, Lucien D Werner, Konstantinos Benidis, Pedro Mercado, Jan Gasthaus, Tim Januschowski. (2021). \n",
+    "    \\\"End-to-End Learning of Coherent Probabilistic Forecasts for Hierarchical Time Series\\\". \n",
+    "    Proceedings of the 38th International Conference on Machine Learning (ICML).](https://proceedings.mlr.press/v139/rangapuram21a.html)\n",
+    "    \"\"\"\n",
+    "    eps = np.finfo(float).eps\n",
+    "    norm  = np.sum(np.abs(y))\n",
+    "    loss  = mqloss(y=y, y_hat=y_hat, quantiles=quantiles)\n",
+    "    loss  = 2 * loss * np.sum(np.ones(y.shape)) / (norm + eps)\n",
+    "    return loss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "show_doc(scaled_crps)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Energy Score"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "def energy_score(y, y_sample1, y_sample2, beta=2):\n",
+    "    \"\"\"Energy Score\n",
+    "\n",
+    "    Calculates Gneiting's Energy Score sample approximation for\n",
+    "    `y` and independent multivariate samples `y_sample1` and `y_sample2`.\n",
+    "    The Energy Score generalizes the CRPS (`beta`=1) in the multivariate setting.\n",
+    "\n",
+    "    $$ \\mathrm{ES}(\\\\mathbf{y}_{\\\\tau}, \\\\mathbf{\\hat{y}}_{\\\\tau}, \\\\mathbf{\\hat{y}}_{\\\\tau}') \n",
+    "    = \\\\frac{1}{2} \\mathbb{E}_{\\hat{P}} \\\\left[ ||\\\\mathbf{\\hat{y}}_{\\\\tau} - \\\\mathbf{\\hat{y}}_{\\\\tau}'||^{\\\\beta} \\\\right]\n",
+    "    -  \\mathbb{E}_{\\hat{P}} \\\\left[ ||\\\\mathbf{y}_{\\\\tau} - \\\\mathbf{\\hat{y}}_{\\\\tau}||^{\\\\beta} \\\\right] \n",
+    "    \\quad \\\\beta \\in (0,2]$$\n",
+    "\n",
+    "    where $\\\\mathbf{\\hat{y}}_{\\\\tau}, \\\\mathbf{\\hat{y}}_{\\\\tau}'$ are independent samples drawn from $\\hat{P}$.\n",
+    "\n",
+    "    **Parameters:**<br>\n",
+    "    `y`: numpy array, Actual values of size (`n_series`, `horizon`).<br>\n",
+    "    `y_sample1`: numpy array, predictive distribution sample of size (`n_series`, `horizon`, `n_samples`).<br>\n",
+    "    `y_sample2`: numpy array, predictive distribution sample of size (`n_series`, `horizon`, `n_samples`).<br>\n",
+    "    `beta`: float in (0,2], defines the energy score's power for the euclidean metric.<br>\n",
+    "\n",
+    "    **Returns:**<br>\n",
+    "    `loss`: float.\n",
+    "\n",
+    "    **References:**<br>\n",
+    "    - [Gneiting, Tilmann, and Adrian E. Raftery. (2007). \n",
+    "    \\\"Strictly proper scoring rules, prediction and estimation\\\". \n",
+    "    Journal of the American Statistical Association.](https://sites.stat.washington.edu/raftery/Research/PDF/Gneiting2007jasa.pdf)\n",
+    "    - [Anastasios Panagiotelis, Puwasala Gamakumara, George Athanasopoulos, Rob J. Hyndman. (2022). \n",
+    "    \\\"Probabilistic forecast reconciliation: Properties, evaluation and score optimisation\\\". \n",
+    "    European Journal of Operational Research.](https://www.sciencedirect.com/science/article/pii/S0377221722006087)    \n",
+    "    \"\"\"\n",
+    "    if beta>2 or beta<0:\n",
+    "        raise Exception(\"beta needs to be between 0 and 2.\")\n",
+    "\n",
+    "    dif1 = (y_sample1 - y_sample2)\n",
+    "    dif2 = (y[:,:,None] - y_sample1)\n",
+    "\n",
+    "    term1 = np.linalg.norm(dif1, axis=0) ** beta\n",
+    "    term2 = np.linalg.norm(dif2, axis=0) ** beta\n",
+    "\n",
+    "    loss = np.mean(term2 - 0.5 * term1)\n",
+    "    return loss"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "show_doc(energy_score)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -59,7 +369,7 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "class HierarchicalEvaluation:    \n",
+    "class HierarchicalEvaluation:\n",
     "    \"\"\"Hierarchical Evaluation Class.\n",
     "    \n",
     "    You can use your own metrics to evaluate the performance of each level in the structure.\n",
@@ -71,6 +381,8 @@
     "\n",
     "    **Parameters:**<br>\n",
     "    `evaluators`: functions with arguments `y`, `y_hat` (numpy arrays).<br>\n",
+    "\n",
+    "    **References:**<br>\n",
     "    \"\"\"\n",
     "    def __init__(self, \n",
     "                 evaluators: List[Callable]):\n",
@@ -182,6 +494,7 @@
     "from hierarchicalforecast.core import HierarchicalReconciliation\n",
     "from hierarchicalforecast.methods import BottomUp, MinTrace, ERM\n",
     "from hierarchicalforecast.utils import aggregate\n",
+    "\n",
     "df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/tourism.csv')\n",
     "df = df.rename({'Trips': 'y', 'Quarter': 'ds'}, axis=1)\n",
     "df.insert(0, 'Country', 'Australia')\n",
@@ -283,11 +596,42 @@
     "                   Y_df=hier_grouped_df,\n",
     "                   benchmark='y_model')"
    ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# <span style=\"color:DarkBlue\"> References </span>\n",
+    "\n",
+    "- [Gneiting, Tilmann, and Adrian E. Raftery. (2007). \n",
+    "\\\"Strictly proper scoring rules, prediction and estimation\\\". \n",
+    "Journal of the American Statistical Association.](https://sites.stat.washington.edu/raftery/Research/PDF/Gneiting2007jasa.pdf)\n",
+    "- [Gneiting, Tilmann. (2011). \\\"Quantiles as optimal point forecasts\\\". \n",
+    "International Journal of Forecasting.](https://www.sciencedirect.com/science/article/pii/S0169207010000063)\n",
+    "- [Spyros Makridakis, Evangelos Spiliotis, Vassilios Assimakopoulos, Zhi Chen, Anil Gaba, Ilia Tsetlin, Robert L. Winkler. (2022). \n",
+    "\\\"The M5 uncertainty competition: Results, findings and conclusions\\\". \n",
+    "International Journal of Forecasting.](https://www.sciencedirect.com/science/article/pii/S0169207021001722)\n",
+    "- [Anastasios Panagiotelis, Puwasala Gamakumara, George Athanasopoulos, Rob J. Hyndman. (2022). \n",
+    "\\\"Probabilistic forecast reconciliation: Properties, evaluation and score optimisation\\\". \n",
+    "European Journal of Operational Research.](https://www.sciencedirect.com/science/article/pii/S0377221722006087)\n",
+    "- [Syama Sundar Rangapuram, Lucien D Werner, Konstantinos Benidis, Pedro Mercado, Jan Gasthaus, Tim Januschowski. (2021). \n",
+    "\\\"End-to-End Learning of Coherent Probabilistic Forecasts for Hierarchical Time Series\\\". \n",
+    "Proceedings of the 38th International Conference on Machine Learning (ICML).](https://proceedings.mlr.press/v139/rangapuram21a.html)\n",
+    "- [Kin G. Olivares, O. Nganba Meetei, Ruijun Ma, Rohan Reddy, Mengfei Cao, Lee Dicker (2022). \n",
+    "\"Probabilistic Hierarchical Forecasting with Deep Poisson Mixtures\". \n",
+    "Submitted to the International Journal Forecasting, Working paper available at arxiv.](https://arxiv.org/pdf/2110.13179.pdf)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "hierarchicalforecast",
    "language": "python",
    "name": "python3"
   }
diff --git a/nbs/examples/TourismLarge-Evaluation.ipynb b/nbs/examples/TourismLarge-Evaluation.ipynb
new file mode 100644
index 00000000..6d7a17a2
--- /dev/null
+++ b/nbs/examples/TourismLarge-Evaluation.ipynb
@@ -0,0 +1,261 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "9a2ffba7",
+   "metadata": {},
+   "source": [
+    "# • End to End Walkthrough\n",
+    "\n",
+    "> Hierarchical Forecast's reconciliation and evaluation."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3ebf6eb9",
+   "metadata": {},
+   "source": [
+    "This notebook offers a step to step guide to create a hierarchical forecasting pipeline.\n",
+    "\n",
+    "In the pipeline we will use `HierarchicalForecast` and `StatsForecast` core class, to create base predictions, reconcile and evaluate them. \n",
+    "\n",
+    "We will use the TourismL dataset that summarizes large Australian national visitor survey.\n",
+    "\n",
+    "Outline\n",
+    "1. Installing Packages\n",
+    "2. Prepare TourismL dataset\n",
+    "    - Read and aggregate\n",
+    "    - StatsForecast's Base Predictions\n",
+    "3. Reconciliar\n",
+    "4. Evaluar"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d08fca30",
+   "metadata": {},
+   "source": [
+    "<a href=\"https://colab.research.google.com/github/Nixtla/hierarchicalforecast/blob/main/nbs/examples/TourismLarge-Evaluation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "690aea05",
+   "metadata": {},
+   "source": [
+    "## 1. Installing HierarchicalForecast"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "225fd9fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %%capture\n",
+    "# !pip install hierarchicalforecast\n",
+    "# !pip install -U numba statsforecast datasetsforecast"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1624c79f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "from datasetsforecast.hierarchical import HierarchicalData\n",
+    "\n",
+    "from statsforecast.core import StatsForecast\n",
+    "from statsforecast.models import AutoARIMA, Naive\n",
+    "\n",
+    "from hierarchicalforecast.utils import HierarchicalPlot\n",
+    "from hierarchicalforecast.core import HierarchicalReconciliation\n",
+    "from hierarchicalforecast.evaluation import HierarchicalEvaluation\n",
+    "from hierarchicalforecast.methods import BottomUp, MinTrace, ERM"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "58961ccd",
+   "metadata": {},
+   "source": [
+    "## 2. Preparing TourismL Dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8d002ec0",
+   "metadata": {},
+   "source": [
+    "### 2.1 Read and Aggregate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0045589f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Y_df, S_df, tags = HierarchicalData.load('./data', 'TourismLarge')\n",
+    "Y_df['ds'] = pd.to_datetime(Y_df['ds'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3958dc40",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hplot = HierarchicalPlot(S=S_df, tags=tags)\n",
+    "hplot.plot_summing_matrix()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "da22dd3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Y_test_df = Y_df.groupby('unique_id').tail(12)\n",
+    "Y_train_df = Y_df.drop(Y_test_df.index)\n",
+    "\n",
+    "Y_test_df = Y_test_df.set_index('unique_id')\n",
+    "Y_train_df = Y_train_df.set_index('unique_id')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c56af1d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Y_train_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "98fec0eb",
+   "metadata": {},
+   "source": [
+    "### 2.2 StatsForecast's Base Predictions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "76df4d31",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%capture\n",
+    "if os.path.isfile('Y_hat.csv'):\n",
+    "    Y_hat_df = pd.read_csv('Y_hat.csv')\n",
+    "    Y_hat_df = Y_hat_df.set_index('unique_id')\n",
+    "else:\n",
+    "    fcst = StatsForecast(\n",
+    "        df=Y_train_df, \n",
+    "        models=[AutoARIMA(season_length=12)],\n",
+    "        fallback_model=[Naive()],\n",
+    "        freq='M', \n",
+    "        n_jobs=-1\n",
+    "    )\n",
+    "    Y_hat_df = fcst.forecast(h=12, level=[80])\n",
+    "    Y_hat_df.to_csv('Y_hat.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "740a5ea8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Y_hat_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e557adc7",
+   "metadata": {},
+   "source": [
+    "## 3. Reconciliate Predictions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e1982db8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# reconcilers = [\n",
+    "#     BottomUp(),\n",
+    "#     MinTrace(method='ols'),\n",
+    "#     ERM(method='reg_bu', lambda_reg=100)\n",
+    "# ]\n",
+    "# hrec = HierarchicalReconciliation(reconcilers=reconcilers)\n",
+    "# Y_rec_df = hrec.reconcile(Y_hat_df=Y_hat_df, Y_df=Y_train_df, \n",
+    "#                           S=S_df, tags=tags)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7109676b",
+   "metadata": {},
+   "source": [
+    "## 4. Evaluation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a524e1c2",
+   "metadata": {},
+   "source": [
+    "For reference we can check the performance when compared to that reported by previous hierarchical forecasting studies from the [PMM-CNN paper](https://arxiv.org/abs/2110.13179)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c0b2c842",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0821f68d",
+   "metadata": {},
+   "source": [
+    "## References"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ffa4a749",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "hierarchicalforecast",
+   "language": "python",
+   "name": "hierarchicalforecast"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/nbs/methods.ipynb b/nbs/methods.ipynb
index daec40a9..b4f4fd56 100644
--- a/nbs/methods.ipynb
+++ b/nbs/methods.ipynb
@@ -180,9 +180,7 @@
     "        if self.sampler is None:\n",
     "            raise Exception(\"This model instance does not have sampler. Call fit with `intervals_method`.\")\n",
     "\n",
-    "        # [samples, N, H] -> [N, H, samples]\n",
     "        samples = self.sampler.get_samples(num_samples=num_samples)\n",
-    "        samples = samples.transpose((1, 2, 0))\n",
     "        return samples"
    ]
   },
diff --git a/nbs/probabilistic_methods.ipynb b/nbs/probabilistic_methods.ipynb
index 0a028176..565cad80 100644
--- a/nbs/probabilistic_methods.ipynb
+++ b/nbs/probabilistic_methods.ipynb
@@ -50,7 +50,7 @@
    "source": [
     "#| hide\n",
     "from nbdev.showdoc import add_docs, show_doc\n",
-    "from fastcore.test import ExceptionExpected, test_close, test_eq\n",
+    "from fastcore.test import ExceptionExpected, test_close, test_eq, test_fail\n",
     "\n",
     "from hierarchicalforecast.methods import BottomUp, TopDown, MiddleOut, MinTrace, OptimalCombination, ERM"
    ]
@@ -138,6 +138,9 @@
     "            partial_samples = state.multivariate_normal(mean=self.SP @ self.y_hat[:,t],\n",
     "                                                  cov=self.cov_rec[t], size=num_samples)\n",
     "            samples[:,:,t] = partial_samples\n",
+    "\n",
+    "        # [samples, N, H] -> [N, H, samples]\n",
+    "        samples = samples.transpose((1, 2, 0))\n",
     "        return samples\n",
     "\n",
     "    def get_prediction_levels(self, res, level):\n",
@@ -256,7 +259,11 @@
     "        SP = self.S @ self.P\n",
     "        samples = np.apply_along_axis(lambda path: np.matmul(SP, path),\n",
     "                                      axis=1, arr=samples)\n",
-    "        return np.stack(samples)\n",
+    "        samples = np.stack(samples)\n",
+    "\n",
+    "        # [samples, N, H] -> [N, H, samples]\n",
+    "        samples = samples.transpose((1, 2, 0))\n",
+    "        return samples\n",
     "\n",
     "    def get_prediction_levels(self, res, level):\n",
     "        \"\"\" Adds reconciled forecast levels to results dictionary \"\"\"\n",
@@ -264,8 +271,8 @@
     "        for lv in level:\n",
     "            min_q = (100 - lv) / 200\n",
     "            max_q = min_q + lv / 100\n",
-    "            res[f'lo-{lv}'] = np.quantile(samples, min_q, axis=0)\n",
-    "            res[f'hi-{lv}'] = np.quantile(samples, max_q, axis=0)\n",
+    "            res[f'lo-{lv}'] = np.quantile(samples, min_q, axis=2)\n",
+    "            res[f'hi-{lv}'] = np.quantile(samples, max_q, axis=2)\n",
     "        return res"
    ]
   },
@@ -502,8 +509,7 @@
     "            )\n",
     "\n",
     "            rec_samples[parent_idxs,:,:] = parent_samples\n",
-    "\n",
-    "        return np.transpose(rec_samples, (2, 0, 1))\n",
+    "        return rec_samples\n",
     "\n",
     "    def get_prediction_levels(self, res, level):\n",
     "        \"\"\" Adds reconciled forecast levels to results dictionary \"\"\"\n",
@@ -511,8 +517,8 @@
     "        for lv in level:\n",
     "            min_q = (100 - lv) / 200\n",
     "            max_q = min_q + lv / 100\n",
-    "            res[f'lo-{lv}'] = np.quantile(samples, min_q, axis=0)\n",
-    "            res[f'hi-{lv}'] = np.quantile(samples, max_q, axis=0)\n",
+    "            res[f'lo-{lv}'] = np.quantile(samples, min_q, axis=2)\n",
+    "            res[f'hi-{lv}'] = np.quantile(samples, max_q, axis=2)\n",
     "        return res"
    ]
   },
@@ -541,6 +547,8 @@
    "outputs": [],
    "source": [
     "#| hide\n",
+    "from hierarchicalforecast.evaluation import msse, energy_score, scaled_crps\n",
+    "\n",
     "S = np.array([[1., 1., 1., 1.],\n",
     "              [1., 1., 0., 0.],\n",
     "              [0., 0., 1., 1.],\n",
@@ -568,7 +576,9 @@
     "y_hat_base_insample = S @ y_hat_bottom_insample\n",
     "sigma = np.nansum((y_base - y_hat_base_insample) ** 2, axis=1) / (y_base.shape[1] - 1)\n",
     "sigma = np.sqrt(sigma)\n",
-    "sigmah = sigma[:, None] * np.sqrt(np.vstack([np.arange(1, h + 1) for _ in range(y_base.shape[0])]))        "
+    "sigmah = sigma[:, None] * np.sqrt(np.vstack([np.arange(1, h + 1) for _ in range(y_base.shape[0])]))\n",
+    "noise = np.random.normal(scale=sigmah)\n",
+    "y_test = y_hat_base + noise"
    ]
   },
   {
@@ -624,6 +634,43 @@
     "test_eq(bootstrap_samples.shape, permbu_samples.shape)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "# test MSSE's execution\n",
+    "msse(y=y_test, y_hat=y_hat_base, y_train=y_base)\n",
+    "\n",
+    "# test energy score's execution\n",
+    "energy_score(y=y_test,\n",
+    "             y_sample1=bootstrap_samples, y_sample2=permbu_samples)\n",
+    "\n",
+    "# test scaled CRPS' execution\n",
+    "quantiles = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])\n",
+    "bootstrap_quantiles = np.quantile(bootstrap_samples, q=quantiles, axis=2)\n",
+    "bootstrap_quantiles = bootstrap_quantiles.transpose((1,2,0)) # [Q,N,H] -> [N,H,Q]\n",
+    "scaled_crps(y=y_test, y_hat=bootstrap_quantiles, quantiles=quantiles)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "# test quantile loss protections\n",
+    "quantiles = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.2])\n",
+    "test_fail(\n",
+    "    scaled_crps,\n",
+    "    contains='between 0 and 1',\n",
+    "    args=(y_test, bootstrap_quantiles, quantiles),   \n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},