diff --git a/examples/cat_breeds/README.md b/examples/cat_breeds/README.md
index e69de29b..8543717b 100644
--- a/examples/cat_breeds/README.md
+++ b/examples/cat_breeds/README.md
@@ -0,0 +1,247 @@
+# Cat breeds tutorial
+
+Download the images from https://www.kaggle.com/datasets/imbikramsaha/cat-breeds/code, or with the following code:
+
+```
+import kagglehub
+
+kagglehub.dataset_download("imbikramsaha/cat-breeds")
+```
+
+This tutorial demonstrates training a model in PyTorch, then loading the model and its data into ChAI for multi-locale inference.
+
+## Image Preprocessing (load_cats.py)
+
+The structure of the data varies between Kaggle datasets. For this specific one, the data consists of .jpg images of 12 different cat breeds, with each breed separated into its own directory. The following code iterates through every image in this structure.
+
+```
+import os
+
+import cv2
+import numpy as np
+
+classes = sorted(os.listdir(sdir))
+n = 0
+for i, c in enumerate(classes):
+    cpath = os.path.join(sdir, c)
+    files = os.listdir(cpath)
+    for f in files:
+        fpath = os.path.join(cpath, f)
+```
+
+PyTorch expects images to have (C, H, W) dimensions, which stand for channel, height, and width. It is also much easier to work with images that all share the same height and width. Within the for-loops above, every image in the dataset is resized down to (32, 32), transposed from (H, W, C) to (C, H, W), and saved as a .npy file. Labels are also saved as .npy files.
+
+```
+        image = cv2.imread(fpath)
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        resized_img = cv2.resize(
+            src=image,
+            dsize=(32, 32),
+            interpolation=cv2.INTER_CUBIC
+        )
+        transposed_img = np.transpose(resized_img, (2, 0, 1))
+        np.save(f"{save_path}/images/item{n}", transposed_img)
+        np.save(f"{save_path}/labels/item{n}", i)
+        n += 1
+```
+
+## Building the Model (models/for_cats.py)
+
+To create a customized model in PyTorch, we create a class that inherits from `nn.Module` and provides its own `__init__()` and `forward()` methods. Define the layers of the model in `__init__()` and specify how the data passes through the model in `forward()`.
+
+Although it is not required, using `nn.Sequential` helps to ensure that every layer or activation function of the model is readable by ChAI.
+
+```
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class SmallCNN(nn.Module):
+    def __init__(self):
+        super(SmallCNN, self).__init__()
+        self.layers = nn.Sequential(
+            nn.Conv2d(3, 64, 3, padding="same"),
+            nn.ReLU(),
+            nn.Conv2d(64, 128, 3, padding="same"),
+            nn.ReLU(),
+            nn.MaxPool2d(2, 2),
+            nn.Flatten(),
+            nn.Linear(128 * 16 * 16, 256),  # 32x32 inputs are halved to 16x16 by the pool
+            nn.Linear(256, 12)              # one output per cat breed
+        )
+
+    def forward(self, x):
+        return self.layers(x)
+```
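+
+The `nn.Linear(128 * 16 * 16, 256)` input size follows from the shapes: the `padding="same"` convolutions keep the 32x32 spatial size, the 2x2 max-pool halves it to 16x16, and the last convolution outputs 128 channels. As a quick, optional check (a sketch, not one of the tutorial files), push a dummy batch through the model:
+
+```
+import torch
+
+from models.for_cats import SmallCNN  # import path assumed from the section title
+
+model = SmallCNN()
+dummy = torch.randn(1, 3, 32, 32)  # one fake RGB image, shaped (N, C, H, W)
+print(model(dummy).shape)          # expected: torch.Size([1, 12])
+```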
+
+## Loading Data (utils.py and train_cnn.py)
+
+PyTorch provides the DataLoader class to shuffle and split data into batches for training. To use it, we implement a custom Dataset class to read, store, and retrieve our images. The following implementation of `cat_breed_dataset` iterates through every image in our dataset, collecting the images in one array and the labels in another. Here, we assume that the directory pointed to by `path_to_data` contains two subdirectories, one holding the images and the other holding the labels, ordered identically, with each file named "item#.npy", "#" being a number.
+
+```
+import os
+
+import numpy as np
+import torch
+from torchvision.datasets import VisionDataset
+
+class cat_breed_dataset(VisionDataset):
+    def __init__(self, path_to_data):
+        self.imgpath = os.path.join(path_to_data, "images")
+        self.labpath = os.path.join(path_to_data, "labels")
+        self.images, self.labels = [], []
+        # sort the listings so each image stays paired with its label
+        for lab in sorted(os.listdir(self.labpath)):
+            if "item" in lab:
+                self.labels.append(
+                    np.load(os.path.join(self.labpath, lab))
+                )
+        self.labels = np.array(self.labels)
+        for img in sorted(os.listdir(self.imgpath)):
+            if "item" in img:
+                self.images.append(
+                    np.load(os.path.join(self.imgpath, img))
+                )
+        self.images = np.array(self.images)
+
+        assert len(self.images) == len(self.labels)
+```
+
+Next, we implement `__len__` and `__getitem__`. For the latter, we return the image and the label as two separate tensors. We make the label a long tensor and the image a float tensor for compatibility with the model's weights and the loss function.
+
+```
+    def __len__(self):
+        return len(self.labels)
+
+    def __getitem__(self, idx):
+        if torch.is_tensor(idx):
+            idx = idx.tolist()
+        img = torch.tensor(self.images[idx]).float()
+        # `tensor` is lowercase to make `lab` a 0-dim tensor
+        lab = torch.tensor(self.labels[idx]).long()
+        return img, lab
+```
+
+This class is instantiated and passed to a DataLoader for training.
+
+```
+from torch.utils.data import DataLoader
+
+import utils
+
+cats_train = utils.cat_breed_dataset("./cat_breeds/data/catbreeds")
+trainloader = DataLoader(cats_train, batch_size=128, shuffle=True)
+```
+
+## Training the Model (utils.py and train_cnn.py)
+
+Before training, define a loss function and an optimizer.
+
+```
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
+criterion = torch.nn.CrossEntropyLoss()
+```
+
+During training, the loss is computed from the model's predictions and the provided labels. The model backpropagates the prediction error for the current batch and adjusts its parameters, then moves on to the next batch of data until the epoch is complete.
+
+```
+def train(model, device, train_loader, optimizer, criterion, epoch, one_pass=False, verbose=True):
+    model.train()
+    avg_loss = 0
+
+    for batch_idx, (data, target) in enumerate(train_loader):
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+
+        loss = criterion(output, target)
+        avg_loss += loss.item()
+        loss.backward()
+
+        optimizer.step()
+        if one_pass: break
+
+    avg_loss /= len(train_loader.dataset)
+
+    if verbose:
+        print(f'Average loss: {avg_loss:.6f}')
+```
+
+Next, train the model and save it as a .pt file.
+
+```
+for epoch in range(epochs):
+    utils.train(model, device, trainloader, optimizer, criterion, epoch, one_pass=False, verbose=True)
+
+model.to(torch.device("cpu"))
+torch.save(model, "./cat_breeds/models/pretest.pt")
+```
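+
+Before exporting, it is worth a quick sanity check that the model learned something. The following sketch is not part of the tutorial files; it reuses `model`, `device`, and `trainloader` from above to report accuracy on the training set.
+
+```
+def accuracy(model, device, loader):
+    model.eval()  # switch off training-only behavior
+    correct = 0
+    with torch.no_grad():
+        for data, target in loader:
+            data, target = data.to(device), target.to(device)
+            pred = model(data).argmax(dim=1)  # most likely breed per image
+            correct += (pred == target).sum().item()
+    return correct / len(loader.dataset)
+
+print(f"Train accuracy: {accuracy(model, device, trainloader):.3f}")
+```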
+
+## From PyTorch to ChAI (to_chai.py)
+
+Once the images, labels, and the model have been saved as .npy and .pt files, we can call `chai_dump` and `chai_save` to save them as files that are readable by the current ChAI functionality. The following saves the first 20 images for brevity.
+
+```
+import lib.chai
+import torch
+import os
+import numpy as np
+
+model = torch.load("./cat_breeds/models/pretest.pt")
+model.chai_dump("./cat_breeds/models/chai_model", "SmallCNN")
+
+load_path = "./cat_breeds/data/catbreeds/images"
+saved = 0
+for item in sorted(os.listdir(load_path)):
+    if "item" in item:  # skip anything that is not a saved image
+        img = torch.Tensor(np.load(f"{load_path}/{item}"))
+        img.chai_save("./cat_breeds/data/catbreeds/chai_images", f"item{saved}", verbose=False)
+        saved += 1
+        if saved == 20:
+            break
+```
+
+The specific layout that we follow here holds the data and the model in separate directories, as follows.
+
+```
+cat_breeds
+├───models
+│   ├───chai_model
+│   │   ├───conv1.bias.chdata
+│   │   ├───conv2.bias.json
+│   │   └───...
+│   └───pretest.pt
+└───data
+    └───catbreeds
+        ├───chai_images
+        │   ├───item0.chdata
+        │   ├───item0.json
+        │   └───...
+        ├───images
+        │   ├───item0.npy
+        │   ├───item1.npy
+        │   └───...
+        └───labels
+            ├───item0.npy
+            ├───item1.npy
+            └───...
+```
+
+## Single-Locale Inference in ChAI (single_locale.chpl)
+
+We can call `loadModel` to read the model's information into ChAI.
+
+```
+var model: owned Module(real(32)) = loadModel(
+    specFile="./cat_breeds/models/chai_model/specification.json",
+    weightsFolder="./cat_breeds/models/chai_model/",
+    dtype=real(32)
+);
+
+writeln(model.signature);
+```
+
+Next, we can call `Tensor.load` to read each image's data into ChAI. The following code reads `numImages` images into an array.
+
+```
+config const numImages = 1;
+// each image was written to chai_images/item<i>.chdata by `chai_save` above
+var images = forall i in 0..<numImages do
+    Tensor.load("./cat_breeds/data/catbreeds/chai_images/item" + i:string + ".chdata") : real(32);
+```
diff --git a/lib/NDArray.chpl b/lib/NDArray.chpl
--- a/lib/NDArray.chpl
+++ b/lib/NDArray.chpl
+// Training-mode batch norm: normalize with batch statistics, then update the
+// moving averages in place using `momentum`.
+proc type ndarray.batchNorm(
+    features: ndarray(?rank,?eltType),
+    weight: ndarray(1,eltType),
+    bias: ndarray(1, eltType),
+    movingAvg: ndarray(1, eltType),
+    movingVar: ndarray(1, eltType),
+    momentum: real,
+    n: int // num_features
+): ndarray(rank,eltType) {
+    if rank < 2 then halt("Rank must be greater than 2");
+    if rank > 4 then halt("Rank must be less than 4");
+    const fshape = features.shape;
+
+    var avgs = features.mean(0).reshape(n);
+    var vars = features.variance(0, correction=0).reshape(n);
+    const m = 1 - momentum;
+
+    ref a = avgs.data;
+    ref v = ndarray.sqrt(vars).data; // batch standard deviation
+    ref ma = movingAvg.data;
+    ref mv = movingVar.data;
+    ref f = features.data;
+    ref w = weight.data;
+    ref b = bias.data;
+
+    ma = m*ma + momentum*a;
+    mv = m*mv + momentum*v;
+
+    var outDom = util.domainFromShape((...fshape));
+    var outFeatures = new ndarray(outDom,eltType);
+    ref dat = outFeatures.data;
+
+    forall idx in outDom.every() {
+        var c = idx[1];
+        dat[idx] = w[c]*((f[idx]-a[c])/v[c])+b[c];
+    }
+
+    return outFeatures;
+}
 proc type ndarray.batchNorm(
     features: ndarray(?rank,?eltType),
     weight: ndarray(1,eltType),
     bias: ndarray(1, eltType),
     movingAvg: ndarray(1, eltType),
     movingVar: ndarray(1, eltType),
-    n: int // num_features
+    eps: real
 ): ndarray(rank,eltType) {
-    // writeln("IN ndarray.batchNorm");
     if rank < 2 then halt("Rank must be greater than 2");
     if rank > 4 then halt("Rank must be less than 4");
     const fshape = features.shape;
@@ -1616,7 +1664,6 @@ proc type ndarray.batchNorm(
     }
 
     return outFeatures;
-
 }
diff --git a/lib/Network.chpl b/lib/Network.chpl
index 2440850a..db44fe16 100644
--- a/lib/Network.chpl
+++ b/lib/Network.chpl
@@ -1053,19 +1053,25 @@ class BatchNorm : Module(?)
 {
     var movingVar: Tensor(eltType);
     var weight: owned Parameter(eltType);
     var bias: owned Parameter(eltType);
+    var eps: real;
+    var momentum: real;
+    var train: bool;
     var num_features: int;
-    proc init(type eltType = defaultEltType, num_features: int) {
+    proc init(type eltType = real, num_features: int, momentum: real = 0.1, eps: real = 1e-5, train: bool = false) {
         super.init(eltType);
         this.movingAvg = Tensor.zeros(num_features);
         this.movingVar = Tensor.ones(num_features);
         this.weight = new Parameter(Tensor.ones(num_features));
         this.bias = new Parameter(Tensor.zeros(num_features));
+        this.eps = eps;
+        this.momentum = momentum;
+        this.train = train;
         this.num_features = num_features;
     }
 
     override proc forward(input: Tensor(eltType)): Tensor(eltType) {
-        return Tensor.batchnorm(input, weight.data, bias.data, movingAvg, movingVar, num_features);
+        return Tensor.batchnorm(input, weight.data, bias.data, movingAvg, movingVar, eps, momentum, train, num_features);
     }
 
     override proc setup() {
diff --git a/lib/StaticTensor.chpl b/lib/StaticTensor.chpl
index 2c0ed118..e1cdfb8a 100644
--- a/lib/StaticTensor.chpl
+++ b/lib/StaticTensor.chpl
@@ -454,10 +454,12 @@ proc type staticTensor.batchNorm(
     weight: staticTensor(1,eltType),
     bias: staticTensor(1,eltType),
     movingAvg: staticTensor(1,eltType),
     movingVar: staticTensor(1,eltType),
+    eps: real,
+    momentum: real,
+    train: bool,
     numFeatures: int
 ): staticTensor(featureRank, eltType) {
-
-    var ctx = new batchNormOp(eltType, features.meta, weight.meta, bias.meta, movingAvg.meta, movingVar.meta, numFeatures);
+    var ctx = new batchNormOp(eltType, features.meta, weight.meta, bias.meta, movingAvg.meta, movingVar.meta, eps, momentum, train, numFeatures);
     return tensorFromCtx(featureRank, eltType, ctx);
 }