# malware_classifier.py

import time
import lightgbm as lgb
import os
import json
from memory_profiler import memory_usage
import joblib
import pathlib

# Absolute path of the "data/models" directory located next to this file.
saved_models = os.path.join(
    os.fspath(pathlib.Path(__file__).parent.resolve()),
    "data/models",
)

def benchmark_function(func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` once and report its duration and memory use.

    The call is executed through ``memory_profiler.memory_usage`` so that the
    maximum memory usage (child processes included) is sampled while ``func``
    runs. A short summary is printed to stdout.

    Args:
        func: Callable to benchmark.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        tuple: ``(result, benchmark_stats)`` where ``result`` is whatever
        ``func`` returned and ``benchmark_stats`` is a dict with the keys
        ``"time"`` (elapsed seconds) and ``"memory"`` (the maximum usage
        reported by ``memory_usage``; printed with an "MB" label).
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments that happen while func is running.
    started_at = time.perf_counter()

    # memory_usage accepts a (callable, args, kwargs) tuple; passing kwargs
    # through lets callers benchmark functions that need keyword arguments.
    memory_consumed, result = memory_usage(
        proc=(func, args, kwargs),
        max_usage=True,
        retval=True,
        include_children=True,
        multiprocess=True,
    )

    time_elapsed = time.perf_counter() - started_at

    # Not every callable carries __name__ (e.g. functools.partial objects).
    func_name = getattr(func, "__name__", repr(func))
    print(f"Benchmarking function {func_name}")
    print(f"Time elapsed: {time_elapsed:.2f} s")
    print(f"Memory usage: {memory_consumed:.2f} MB")

    benchmark_stats = {
        "time": time_elapsed,
        "memory": memory_consumed,
    }

    return result, benchmark_stats


# Common interface shared by all malware detection models
class MalwareClassifier:
    """
    Base interface for malware detection models.
    Subclasses are expected to override every method that raises
    NotImplementedError here.
    Attributes:
        name (str): The name of the classifier.
        model_path (str): Location associated with the persisted model.
        model: The wrapped model object; None until loaded or assigned.
    """

    def __init__(self, name, is_trained, model_path):
        """Store the identifying fields and load the model when is_trained is truthy."""
        self.model = None
        self.model_path = model_path
        self.name = name

        # Loading is delegated to the subclass implementation of _load_model.
        if is_trained:
            self._load_model()

    def _load_model(self):
        """Load the persisted model (to be provided by subclasses)."""
        raise NotImplementedError

    def train(self, X_train, y_train, training_info=None):
        """Train the model (to be provided by subclasses)."""
        raise NotImplementedError

    def _save_model(self, training_info=None):
        """Persist the model (to be provided by subclasses)."""
        raise NotImplementedError

    def retrain(self, mode, X_train, y_train, training_info=None):
        """Retrain the model in the given mode (to be provided by subclasses)."""
        raise NotImplementedError

    def predict_proba(self, X):
        """Return prediction scores for X (to be provided by subclasses)."""
        raise NotImplementedError

    def print_info(self):
        """Print the classifier's name."""
        print("Name:", self.name)

# Class representing LightGBM Malware Detection Model
class GBDTMalwareClassifier(MalwareClassifier):
    """
    A Gradient Boosting Decision Tree (GBDT) based malware classifier using LightGBM.
    Attributes:
        name (str): The name of the classifier.
        model_path (str): Path to the directory where the model is saved.
        model (lgb.Booster): The underlying LightGBM booster (None until
            trained or loaded).
        _params (dict): Parameters for the LightGBM model.
    Methods:
        predict_proba(X):
            Returns the booster's prediction for the input samples.
        train(X_train, y_train, training_info=None):
            Trains the model using the provided training data and saves it.
        retrain(mode, X_train, y_train, training_info=None):
            Retrains the model based on the specified mode and saves it.
        print_info():
            Prints information about the model.
        _save_model(training_info=None):
            Saves the trained model to the specified path.
        _load_model():
            Loads the model from the specified path.
    """

    # Number of additional boosting rounds used by the "medium" retraining mode.
    _MEDIUM_RETRAIN_ROUNDS = 100

    def __init__(self, name, is_trained, model_path=os.path.join(saved_models, "trained_models", "LightGBM")):
        super().__init__(name, is_trained, model_path)
        self._params = {
            "boosting": "gbdt",
            "objective": "binary",
            "num_iterations": 1000,
            "learning_rate": 0.05,
            "num_leaves": 2048,
            "max_depth": 15,
            "min_data_in_leaf": 50,
            "feature_fraction": 0.5,
            "application": "binary"
        }

    def predict_proba(self, X):
        """Return the booster's predictions for X."""
        return self.model.predict(X)

    def train(self, X_train, y_train, training_info=None):
        """Train a new booster on the given data, then save it.

        Args:
            X_train: Training features, passed to ``lgb.Dataset``.
            y_train: Training labels, passed to ``lgb.Dataset``.
            training_info (dict, optional): If given, it is updated in place
                with the benchmark statistics and written next to the model.
        """
        lgbm_dataset = lgb.Dataset(X_train, y_train)
        # Replace this with lgb.train for faster training if you don't need resource usage statistics
        self.model, benchmark_stats = benchmark_function(lgb.train, self._params, lgbm_dataset)

        if training_info is not None:
            training_info.update(benchmark_stats)

        self._save_model(training_info)

    def retrain(self, mode, X_train, y_train, training_info=None):
        """Retrain the model and save it under the name ``"{mode}AT-{name}"``.

        Args:
            mode (str): One of
                "full"   - a new model is trained from scratch;
                "small"  - the current model is refitted to the new data
                           (no trees are added, only leaves are updated);
                "medium" - training continues from the current model for
                           ``_MEDIUM_RETRAIN_ROUNDS`` additional iterations.
            X_train: Training features (must expose ``.shape``).
            y_train: Training labels (must expose ``.shape`` and
                ``.value_counts()``).
            training_info (dict, optional): If given, it is updated in place
                with the benchmark statistics and written next to the model.
        """
        assert mode in ["full", "small", "medium"], "Unknown mode."
        print(50*"#")
        print("Before retraining:")
        self.print_info()

        print("Dataset info:")
        print("X_train:", X_train.shape, "y_train:", y_train.shape)
        print(y_train.value_counts())
        print("Training...")

        if mode == "full": # Full retraining, i.e., new model is trained from scratch
            lgbm_dataset = lgb.Dataset(X_train, y_train)
            retrained, benchmark_stats = benchmark_function(lgb.train, self._params, lgbm_dataset)
        elif mode == "small": # Refitting the model to the new train data: no trees are added, only leaves are updated
            retrained, benchmark_stats = benchmark_function(self.model.refit, X_train, y_train)
        else: # "medium": continue training from the current model for a fixed number of iterations
            lgbm_dataset = lgb.Dataset(X_train, y_train)
            # lgb.train gives precedence to "num_iterations" found in params over
            # its num_boost_round argument, so the round count has to be
            # overridden in the params themselves.
            medium_params = {**self._params, "num_iterations": self._MEDIUM_RETRAIN_ROUNDS}
            current_model = self.model

            # Continued training is a module-level lgb.train call with
            # init_model (Booster has no train method). The keyword argument is
            # bound in a closure so only a plain callable is benchmarked.
            def continue_training():
                return lgb.train(medium_params, lgbm_dataset, init_model=current_model)

            retrained, benchmark_stats = benchmark_function(continue_training)

        self.model = retrained

        if training_info is not None:
            training_info.update(benchmark_stats)

        # Rename only after training succeeded so that a failed retraining
        # does not leave the classifier with a name it never earned.
        self.name = f"{mode}AT-{self.name}"
        self._save_model(training_info)

        print("After retraining:")
        self.print_info()
        print(50*"#")

    def print_info(self):
        """Print the classifier's name and the number of trees in the booster."""
        super().print_info()
        print("Num trees:", self.model.num_trees())

    def _save_model(self, training_info=None):
        """Write the booster to ``<model_path>/<name>.txt``.

        When ``training_info`` is given it is dumped as JSON to
        ``<saved_models>/training_info/LightGBM/<name>_training_stats.json``.
        Missing target directories are created.
        """
        print("Saving model...")
        os.makedirs(self.model_path, exist_ok=True)
        self.model.save_model(os.path.join(self.model_path, f"{self.name}.txt"))

        if training_info is not None:
            stats_dir = os.path.join(saved_models, "training_info", "LightGBM")
            os.makedirs(stats_dir, exist_ok=True)
            with open(os.path.join(stats_dir, f"{self.name}_training_stats.json"), "w") as f:
                json.dump(training_info, f)

        print(50*"#")

    def _load_model(self):
        """Load the booster from ``<model_path>/<name>.txt`` and print its info."""
        print(50*"#")
        self.model = lgb.Booster(model_file=os.path.join(self.model_path, f"{self.name}.txt"))
        print("Model loaded:")
        self.print_info()
        print(50*"#")

class SklearnMalwareClassifier(MalwareClassifier):
    """
    SklearnMalwareClassifier is a specialized classifier that integrates scikit-learn
    models with the MalwareClassifier base class. It handles tasks such as model
    training, saving, loading, and probability prediction.
    Attributes:
        name (str): The name of the classifier.
        model_path (str): Path to the directory where the model is stored.
        model: The model instance used for classification; it must provide
            ``fit``, ``predict_proba`` and ``get_params``.
    Methods:
        predict_proba(X):
            Returns column 1 of the model's ``predict_proba`` output.
        train(X_train, y_train, training_info=None):
            Fits the model on the given data while measuring time and memory.
            Saves the trained model and optional training information
            afterward.
        print_info():
            Prints the classifier's name and the model's parameters.
        _save_model(training_info=None):
            Persists the trained model to disk using joblib, and optionally
            saves any provided training metadata in JSON format.
        _load_model():
            Loads the trained model from disk, then prints diagnostic information.
    """
    def __init__(self, name, is_trained, model_path=os.path.join(saved_models, "trained_models", "Sklearn"), model=None):
        super().__init__(name, is_trained, model_path)

        assert model is not None or is_trained, "Model must be provided or loaded from file."
        # An explicitly supplied model takes precedence over one that was
        # just loaded from disk because is_trained was set.
        if model is not None:
            self.model = model

    def predict_proba(self, X):
        """Return column 1 of the model's ``predict_proba`` output for X."""
        return self.model.predict_proba(X)[:, 1]

    def train(self, X_train, y_train, training_info=None):
        """Fit the model on the given data, then save it.

        Args:
            X_train: Training features, passed to the model's ``fit``.
            y_train: Training labels, passed to the model's ``fit``.
            training_info (dict, optional): If given, it is updated in place
                with the benchmark statistics and written next to the model.
        """
        print("Training Sklearn model -", self.name, "...")
        # The value returned by fit replaces self.model.
        self.model, benchmark_stats = benchmark_function(self.model.fit, X_train, y_train)

        if training_info is not None:
            training_info.update(benchmark_stats)

        print("Saving trained model...")
        self._save_model(training_info=training_info)

    def print_info(self):
        """Print the classifier's name and the model's parameters."""
        super().print_info()
        print("Sklearn clf params:", self.model.get_params())

    def _save_model(self, training_info=None):
        """Dump the model to ``<model_path>/<name>.joblib``.

        When ``training_info`` is given it is dumped as JSON to
        ``<saved_models>/training_info/Sklearn/<name>_training_stats.json``.
        Missing target directories are created.
        """
        print("Saving model...")
        os.makedirs(self.model_path, exist_ok=True)
        joblib.dump(self.model, os.path.join(self.model_path, f"{self.name}.joblib"))

        if training_info is not None:
            stats_dir = os.path.join(saved_models, "training_info", "Sklearn")
            os.makedirs(stats_dir, exist_ok=True)
            with open(os.path.join(stats_dir, f"{self.name}_training_stats.json"), "w") as f:
                json.dump(training_info, f)
        print(50*"#")

    def _load_model(self):
        """Load the model from ``<model_path>/<name>.joblib`` and print its info."""
        print(50*"#")
        # SECURITY NOTE: joblib.load unpickles the file and can therefore
        # execute arbitrary code; only load model files from trusted sources.
        self.model = joblib.load(os.path.join(self.model_path, f"{self.name}.joblib"))
        print("Model loaded:")
        self.print_info()
        print(50*"#")