Commit 6786ea0 (1 parent: 572da37)
Showing 14 changed files with 1,064 additions and 1 deletion.
@@ -1 +1,2 @@
 dataset/
+.output/
@@ -0,0 +1,57 @@
from tqdm import tqdm
import os
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from model import LabelSmoothCrossEntropyLoss
from dataset import get_dataset


def train_and_validate(model, criterion, device, train_loader, val_loader, optimizer, epoch):
    model.train()

    for batch_idx, (data, target) in tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Training Epoch {epoch}"):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in tqdm(val_loader, total=len(val_loader), desc=f"Validating Epoch {epoch}"):
            data, target = data.to(device), target.to(device)
            output = model(data)
            val_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    val_loss /= len(val_loader.dataset)
    val_accuracy = 100. * correct / len(val_loader.dataset)

    print(f'Validation set: Average loss: {val_loss:.4f}, Accuracy: {correct}/{len(val_loader.dataset)} ({val_accuracy:.0f}%)')
    return val_loss, val_accuracy


def train_model(data_dir, model, device, num_classes=4, epochs=200, lr=0.01, momentum=0.9, weight_decay=0.0005):
    # num_classes, epochs, lr, and weight_decay were previously read from
    # undefined globals (NUM_CLASSES, EPOCHS, learning_rate, weight_decay);
    # they are explicit parameters now
    train_loader, val_loader = get_dataset(data_dir)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
    # the original instantiated an undefined LabelSmoothingLoss; use the imported class
    criterion = LabelSmoothCrossEntropyLoss(classes=num_classes, smoothing=0.1)
    scheduler = StepLR(optimizer, step_size=100, gamma=0.25)

    os.makedirs("result", exist_ok=True)  # checkpoint directory must exist before torch.save
    for epoch in range(1, epochs + 1):
        train_and_validate(model, criterion, device, train_loader, val_loader, optimizer, epoch)
        scheduler.step()
        if epoch % 10 == 0:
            checkpoint = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': criterion
            }
            torch.save(checkpoint, f"result/model_checkpoint_epoch_{epoch}.pth")
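
The criterion imported above, model.LabelSmoothCrossEntropyLoss, is defined in a file not shown in this diff. For orientation only, a minimal label-smoothing cross-entropy matching the classes=.../smoothing=... call made in train_model might look like the sketch below; the actual implementation in model.py may differ.

import torch
import torch.nn as nn
import torch.nn.functional as F

class LabelSmoothCrossEntropyLoss(nn.Module):
    # hedged sketch: smooths the one-hot targets, then takes cross-entropy
    def __init__(self, classes, smoothing=0.1):
        super().__init__()
        self.classes = classes
        self.smoothing = smoothing

    def forward(self, logits, target):
        log_probs = F.log_softmax(logits, dim=-1)
        with torch.no_grad():
            # 1 - smoothing on the true class, the remainder spread uniformly
            true_dist = torch.full_like(log_probs, self.smoothing / (self.classes - 1))
            true_dist.scatter_(1, target.unsqueeze(1), 1.0 - self.smoothing)
        return torch.mean(torch.sum(-true_dist * log_probs, dim=-1))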
5 binary files not shown.
@@ -0,0 +1,218 @@
import torch
import random
import torch.nn.functional as F
import numpy as np
import torch.distributed as dist
import copy

epsilon = 1e-8


class AugBasic:
    def __init__(self, fs):
        super().__init__()
        self.fs = fs
        self.fft_params = {}
        if fs == 22050:
            self.fft_params['win_len'] = [512, 1024, 2048]
            self.fft_params['hop_len'] = [128, 256, 1024]
            self.fft_params['n_fft'] = [512, 1024, 2048]
        elif fs == 16000:
            self.fft_params['win_len'] = [256, 512, 1024]
            self.fft_params['hop_len'] = [256 // 4, 512 // 4, 1024 // 4]
            self.fft_params['n_fft'] = [256, 512, 1024]
        elif fs == 8000:
            self.fft_params['win_len'] = [128, 256, 512]
            self.fft_params['hop_len'] = [32, 64, 128]
            self.fft_params['n_fft'] = [128, 256, 512]
        else:
            raise ValueError(f"unsupported sampling rate: {fs}")


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def make_weights_for_balanced_classes(samples, nclasses):
    count = [0] * nclasses
    for item in samples:
        count[item[1]] += 1
    weight_per_class = [0.] * nclasses
    N = float(sum(count))
    for i in range(nclasses):
        weight_per_class[i] = N / float(count[i])
    weight = [0] * len(samples)
    for idx, val in enumerate(samples):
        weight[idx] = weight_per_class[val[1]]
    return weight
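

def _demo_balanced_sampler(dataset, nclasses, batch_size=32):
    # Editor's hedged usage sketch, not part of the original commit:
    # make_weights_for_balanced_classes pairs with WeightedRandomSampler to
    # oversample rare classes. `dataset.samples` is assumed to be a list of
    # (path, class_index) tuples, as in torchvision-style datasets.
    from torch.utils.data import DataLoader, WeightedRandomSampler
    weights = make_weights_for_balanced_classes(dataset.samples, nclasses)
    sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler)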


def measure_inference_time(model, input, repetitions=300, use_16b=False):
    device = torch.device("cuda")
    model_ = copy.deepcopy(model)
    model_.eval()
    starter = torch.cuda.Event(enable_timing=True)
    ender = torch.cuda.Event(enable_timing=True)
    timings = np.zeros((repetitions, 1))
    if use_16b:
        input = input.half()
        model_.half()
    input = input.to(device)
    model_.to(device)
    with torch.no_grad():
        # GPU warm-up so the first timed iterations are not inflated
        for _ in range(10):
            _ = model_(input)
        for rep in range(repetitions):
            starter.record()
            _ = model_(input)
            ender.record()
            # wait for GPU sync before reading the elapsed time
            torch.cuda.synchronize()
            curr_time = starter.elapsed_time(ender)
            timings[rep] = curr_time
    mean_syn = np.sum(timings) / repetitions
    std_syn = np.std(timings)
    return mean_syn, std_syn


def collate_fn(batch):
    # note: labels are returned as a plain Python list, not a tensor
    x = [item[0] for item in batch]
    y = [item[1] for item in batch]
    x = torch.stack(x, dim=0).contiguous()
    return (x, y)


def files_to_list(filename):
    """
    Takes a text file of filenames and makes a list of filenames
    """
    with open(filename, encoding="utf-8") as f:
        files = f.readlines()
    files = [f.rstrip() for f in files]
    return files


def find_first_nnz(t, q, dim=1):
    # index of the first element equal to q along `dim` (argmax of a bool tensor)
    _, mask_max_indices = torch.max(t == q, dim=dim)
    return mask_max_indices


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)
    with torch.no_grad():
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        # reshape (not view): slices of the transposed tensor are non-contiguous
        return [correct[:k].reshape(-1).float().sum(0) * 100. / batch_size for k in topk]


def average_precision(output, target):
    # sort examples by descending score
    indices = output.argsort()[::-1]
    # Computes prec@i
    total_count_ = np.cumsum(np.ones((len(output), 1)))
    target_ = target[indices]
    ind = target_ == 1
    pos_count_ = np.cumsum(ind)
    total = pos_count_[-1]
    pos_count_[np.logical_not(ind)] = 0
    pp = pos_count_ / total_count_
    precision_at_i_ = np.sum(pp)
    precision_at_i = precision_at_i_ / (total + epsilon)
    return precision_at_i


def mAP(targs, preds):
    """Returns the model's average precision for each class
    Return:
        ap (FloatTensor): 1xK tensor, with avg precision for each class k
    """
    if np.size(preds) == 0:
        return 0
    ap = np.zeros((preds.shape[1]))
    # compute average precision for each class
    for k in range(preds.shape[1]):
        # sort scores
        scores = preds[:, k]
        targets = targs[:, k]
        # compute average precision
        ap[k] = average_precision(scores, targets)
    return 100 * ap.mean()
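

def _demo_map():
    # Editor's hedged usage sketch, not part of the original commit:
    # mAP expects a score matrix and a binary target matrix, both (N, K).
    preds = np.array([[0.9, 0.1], [0.2, 0.8], [0.7, 0.3]])
    targs = np.array([[1, 0], [0, 1], [0, 1]])
    return mAP(targs, preds)  # mean of per-class average precision, in percent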


def pad_sample_seq(x, n_samples):
    # random crop to n_samples, or right-pad with zeros if too short
    if x.size(-1) >= n_samples:
        max_x_start = x.size(-1) - n_samples
        x_start = random.randint(0, max_x_start)
        x = x[x_start: x_start + n_samples]
    else:
        x = F.pad(
            x, (0, n_samples - x.size(-1)), "constant"
        ).data
    return x


def pad_sample_seq_batch(x, n_samples):
    # batched variant: x is (batch, time); crop/pad along the time axis
    # (the original compared n_samples against x.size(0), the batch dimension)
    if x.size(1) >= n_samples:
        max_x_start = x.size(1) - n_samples
        x_start = random.randint(0, max_x_start)
        x = x[:, x_start: x_start + n_samples]
    else:
        x = F.pad(
            x, (0, n_samples - x.size(1)), "constant"
        ).data
    return x


def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
    # split parameters so biases and 1-D (norm) weights skip weight decay
    decay = []
    no_decay = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if len(param.shape) == 1 or name in skip_list:
            no_decay.append(param)
        else:
            decay.append(param)
    return [
        {'params': no_decay, 'weight_decay': 0.},
        {'params': decay, 'weight_decay': weight_decay}]
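

def _demo_add_weight_decay(model, lr=0.01):
    # Editor's hedged usage sketch, not part of the original commit:
    # the returned parameter groups plug straight into any torch optimizer,
    # which then applies decay only to the 'decay' group.
    import torch.optim as optim
    param_groups = add_weight_decay(model, weight_decay=5e-4)
    return optim.SGD(param_groups, lr=lr, momentum=0.9)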


def _get_bn_param_ids(net):
    # ids of norm-layer weights/biases and conv/linear biases (e.g. to skip decay)
    bn_ids = []
    for m in net.modules():
        if isinstance(m, torch.nn.BatchNorm1d) or isinstance(m, torch.nn.LayerNorm):
            bn_ids.append(id(m.weight))
            bn_ids.append(id(m.bias))
        elif isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Linear):
            if m.bias is not None:
                bn_ids.append(id(m.bias))
    return bn_ids


def reduce_tensor(tensor, n):
    # average a tensor across n distributed processes
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n
    return rt


def gather_tensor(tensor, n):
    # gather a copy of the tensor from each of the n processes
    # (the original allocated with torch.cuda.float(), which is not a valid dtype)
    rt = tensor.clone()
    tensor_list = [torch.zeros_like(rt) for _ in range(n)]
    dist.all_gather(tensor_list, rt)
    return tensor_list


def parse_gpu_ids(gpu_ids):  # list of ints
    s = ''.join(str(x) + ',' for x in gpu_ids)
    s = s.rstrip().rstrip(',')
    return s
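
A quick usage note (editor's sketch, not part of the commit): parse_gpu_ids joins device indices into the comma-separated form that CUDA_VISIBLE_DEVICES expects, so a typical call is:

import os
os.environ['CUDA_VISIBLE_DEVICES'] = parse_gpu_ids([0, 1])  # sets "0,1"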
@@ -0,0 +1,52 @@
from utils import AudioAugs
import os
import pandas as pd
import torch
import torchaudio


def load_audio_files_with_torchaudio(path, file_paths, augmentor=None):
    features = []
    for file_path in file_paths:
        full_path = os.path.join(path, file_path)
        waveform, sample_rate = torchaudio.load(full_path)
        waveform = waveform.mean(dim=0, keepdim=True)  # Ensure mono by averaging channels
        if augmentor is not None:
            augmented_waveform, _ = augmentor(waveform.squeeze(0).numpy())
            waveform = torch.tensor(augmented_waveform, dtype=torch.float32).unsqueeze(0)
        mfccs = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=13)(waveform)
        mfccs_mean = mfccs.mean(dim=2).squeeze(0).numpy()
        features.append(mfccs_mean)
    return features


def get_dataset(data_dir, apply_augmentation=True):
    """
    Load dataset and process it for classification task with optional augmentation.
    """
    train_audio_path = os.path.join(data_dir, 'train_mp3s')
    test_audio_path = os.path.join(data_dir, 'test_mp3s')
    label_file = os.path.join(data_dir, 'train_label.txt')

    labels = pd.read_csv(label_file, header=None, names=['file', 'label'])

    train_files = os.listdir(train_audio_path)
    test_files = os.listdir(test_audio_path)

    # Instantiate the augmentor
    augmentor = AudioAugs(k_augs=['flip', 'tshift', 'mulaw'], fs=22050) if apply_augmentation else None

    # Load and process audio files (the original called undefined helpers
    # load_audio_files_with_augmentation / load_audio_files)
    train_features = load_audio_files_with_torchaudio(train_audio_path, train_files, augmentor)
    test_features = load_audio_files_with_torchaudio(test_audio_path, test_files)  # no augmentation for testing

    train_df = pd.DataFrame(train_features)
    train_df['label'] = labels['label'].values[:len(train_features)]  # Make sure labels align correctly

    test_df = pd.DataFrame(test_features)

    return train_df, test_df


# # Example usage
# data_dir = '/scratch/hy2611/ML_Competition/dataset'
# train_data, test_data = get_dataset(data_dir)
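
One interface caveat: get_dataset returns pandas DataFrames, while train_model in the trainer file unpacks its result as (train_loader, val_loader) and iterates over (data, target) batches. A minimal bridge (editor's assumption, not part of the commit; names are illustrative) would wrap the frames in TensorDatasets:

from torch.utils.data import DataLoader, TensorDataset

def loaders_from_frames(train_df, batch_size=32):
    # split the MFCC feature columns from the label column
    x = torch.tensor(train_df.drop(columns=['label']).values, dtype=torch.float32)
    y = torch.tensor(train_df['label'].values, dtype=torch.long)
    n_val = len(x) // 10  # hold out 10% for validation (arbitrary split)
    train_ds = TensorDataset(x[n_val:], y[n_val:])
    val_ds = TensorDataset(x[:n_val], y[:n_val])
    return (DataLoader(train_ds, batch_size=batch_size, shuffle=True),
            DataLoader(val_ds, batch_size=batch_size))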
@@ -0,0 +1,55 @@
from model import SoundNetRaw
from Trainer import train_model
from dataset import get_dataset
import torch
from tqdm import tqdm
import pandas as pd


NUM_CLASSES = 4
EPOCHS = 200
BATCH_SIZE = 32
learning_rate = 0.01
momentum = 0.9
weight_decay = 0.0005
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def predict(model, device, test_loader):
    model.eval()
    predictions = []

    with torch.no_grad():
        for data in tqdm(test_loader, total=len(test_loader), desc="Predicting"):
            images = data[0].to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            predictions.extend(predicted.cpu().numpy())

    return predictions


def save_predictions_to_csv(predictions, file_name):
    df = pd.DataFrame({'id': range(len(predictions)), 'category': predictions})
    df.to_csv(file_name, index=False)


if __name__ == '__main__':
    model = SoundNetRaw(
        nf=32,  # Number of filters in the initial convolution layer
        clip_length=66150 // 256,  # Total samples (66150 for 3 s at 22050 Hz) divided by the product of the downsampling factors
        embed_dim=128,  # Embedding dimension
        n_layers=4,  # Number of layers
        nhead=8,  # Number of attention heads
        factors=[4, 4, 4, 4],  # Downsampling factors for each layer
        n_classes=NUM_CLASSES,  # Number of classes
        dim_feedforward=512  # Dimensionality of the feedforward network within the transformer layers
    )
    model.to(device)
    data_dir = '/scratch/hy2611/ML_Competition/dataset'
    train_model(data_dir, model, device, num_classes=NUM_CLASSES, epochs=EPOCHS,
                lr=learning_rate, momentum=momentum, weight_decay=weight_decay)

    torch.save(model, "Limbo.pth")

    _, test_loader = get_dataset(data_dir)
    predictions = predict(model, device, test_loader)
    save_predictions_to_csv(predictions, 'predictions.csv')