diff --git a/components/outlier-detection/README.md b/components/outlier-detection/README.md deleted file mode 100644 index 987d98a553..0000000000 --- a/components/outlier-detection/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# Outlier Detection in Seldon Core - -## Description - -[Anomaly or outlier detection](https://en.wikipedia.org/wiki/Anomaly_detection) has many applications, ranging from preventing credit card fraud to detecting computer network intrusions. Seldon Core provides a number of outlier detectors suitable for different use cases. The detectors can be run as models or transformers which are part of the pre-defined types of [predictive units](../../docs/reference/seldon-deployment.md#proto-buffer-definition) in Seldon Core. Models are microservices that make predictions and can receive feedback rewards while the input transformers add the anomaly predictions to the metadata of the underlying model. The REST and gRPC internal APIs that the model and transformer components must conform to are covered in the [internal API](../../docs/reference/internal-api.md) reference. - -## Implementations - -The following types of outlier detectors are implemented and showcased with demos on Seldon Core: -* [Sequence-to-Sequence LSTM](./seq2seq-lstm) -* [Variational Auto-Encoder](./vae) -* [Isolation Forest](./isolation-forest) -* [Mahalanobis Distance](./mahalanobis) - -The Sequence-to-Sequence LSTM algorithm can be used to detect outliers in time series data, while the other algorithms spot anomalies in tabular data. The Mahalanobis detector works online and does not need to be trained first. The other algorithms are ideally trained on a batch of normal data or data with a low fraction of outliers. - -## Implementing custom outlier detectors - -An outlier detection component can be implemented either as a model or input transformer component. If the component is defined as a model, a ```predict``` method needs to be implemented to return the detected anomalies. Optionally, a ```send_feedback``` method can return additional information about the performance of the algorithm. When the component is used as a transformer, the anomaly predictions will occur in the ```transform_input``` method which returns the unchanged input features. The anomaly predictions will then be added to the underlying model's metadata via the ```tags``` method. Both models and transformers can make use of custom metrics defined by the ```metrics``` function. - -The required methods to use the outlier detection algorithms as models or transformers are implemented in the Python files with the ```Core``` prefix. The demos contain clear instructions on how to run your component as a model or transformer. - -## Language specific templates - -Reference templates for custom model and input transformer components written in several languages are available: -* Python - * [model](../../wrappers/s2i/python/test/model-template-app/MyModel.py) - * [transformer](../../wrappers/s2i/python/test/transformer-template-app/MyTransformer.py) -* R - * [model](../../wrappers/s2i/R/test/model-template-app/MyModel.R) - * [transformer](../../wrappers/s2i/R/test/transformer-template-app/MyTransformer.R) - -Additionally, the [wrappers](../../wrappers/s2i) provide guidelines for implementing the model component in other languages. 
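To make this contract concrete, the sketch below shows the minimal shape of a custom detector component. It is only an illustration: the class name, the distance-based scoring rule and the threshold are assumptions, not part of Seldon Core.

``` python
import numpy as np

class MyOutlierDetector(object):
    """ Minimal custom outlier detector sketch; the scoring rule is a placeholder. """

    def __init__(self, threshold=0.5):
        self.threshold = threshold

    def _score(self, X):
        # placeholder anomaly score: each row's distance from the batch mean
        return np.linalg.norm(X - X.mean(axis=0), axis=1)

    def predict(self, X, feature_names):
        # model usage: return the outlier predictions directly (1 = outlier)
        self.prediction = (self._score(X) > self.threshold).astype(int)
        return self.prediction

    def transform_input(self, X, feature_names):
        # transformer usage: score the batch but pass the features through unchanged
        self.prediction = (self._score(X) > self.threshold).astype(int)
        return X

    def tags(self):
        # metadata added to the underlying model's response when used as a transformer
        return {"outlier-predictions": self.prediction.tolist()}

    def metrics(self):
        # custom metrics collected with each request
        return [{"type": "GAUGE", "key": "is_outlier", "value": float(np.mean(self.prediction))}]
```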
\ No newline at end of file diff --git a/components/outlier-detection/isolation-forest/.s2i/environment b/components/outlier-detection/isolation-forest/.s2i/environment deleted file mode 100644 index ce589f54a3..0000000000 --- a/components/outlier-detection/isolation-forest/.s2i/environment +++ /dev/null @@ -1,4 +0,0 @@ -MODEL_NAME=OutlierIsolationForest -API_TYPE=REST -SERVICE_TYPE=MODEL -PERSISTENCE=0 diff --git a/components/outlier-detection/isolation-forest/CoreIsolationForest.py b/components/outlier-detection/isolation-forest/CoreIsolationForest.py deleted file mode 100644 index 0db0fca41c..0000000000 --- a/components/outlier-detection/isolation-forest/CoreIsolationForest.py +++ /dev/null @@ -1,117 +0,0 @@ -import logging -import numpy as np -import pickle -from sklearn.ensemble import IsolationForest - -logger = logging.getLogger(__name__) - -class CoreIsolationForest(object): - """ Outlier detection using Isolation Forests. - - Parameters - ---------- - threshold (float) : anomaly score threshold; scores below threshold are outliers - - Functions - ---------- - predict : detect and return outliers - transform_input : detect outliers and return input features - send_feedback : add target labels as part of the feedback loop - tags : add metadata for input transformer - metrics : return custom metrics - """ - - def __init__(self,threshold=0.,model_name='if',load_path='./models/'): - - logger.info("Initializing model") - self.threshold = threshold - self.N = 0 # total sample count up until now - self.nb_outliers = 0 - - # load pre-trained model - with open(load_path + model_name + '.pickle', 'rb') as f: - self.clf = pickle.load(f) - - - def predict(self, X, feature_names): - """ Return outlier predictions. - - Parameters - ---------- - X : array-like - feature_names : array of feature names (optional) - """ - logger.info("Using component as a model") - return self._get_preds(X) - - - def transform_input(self, X, feature_names): - """ Transform the input. - Used when the outlier detector sits on top of another model. - - Parameters - ---------- - X : array-like - feature_names : array of feature names (optional) - """ - logger.info("Using component as an outlier-detector transformer") - self.prediction_meta = self._get_preds(X) - return X - - - def _get_preds(self,X): - """ Detect outliers below the anomaly score threshold. - - Parameters - ---------- - X : array-like - """ - self.decision_val = self.clf.decision_function(X) # anomaly scores - - # make prediction - self.prediction = (self.decision_val < self.threshold).astype(int) # scores below threshold are outliers - - self.N+=self.prediction.shape[0] # update counter - - return self.prediction - - - def send_feedback(self,X,feature_names,reward,truth): - """ Return additional data as part of the feedback loop. - - Parameters - ---------- - X : array of the features sent in the original predict request - feature_names : array of feature names. May be None if not available. - reward (float): the reward - truth : array with correct value (optional) - """ - logger.info("Send feedback called") - return [] - - - def tags(self): - """ - Use predictions made within transform to add these as metadata - to the response. Tags will only be collected if the component is - used as an input-transformer. - """ - try: - return {"outlier-predictions": self.prediction_meta.tolist()} - except AttributeError: - logger.info("No metadata about outliers") - - - def metrics(self): - """ Return custom metrics averaged over the prediction batch. 
- """ - self.nb_outliers += np.sum(self.prediction) - - is_outlier = {"type":"GAUGE","key":"is_outlier","value":np.mean(self.prediction)} - anomaly_score = {"type":"GAUGE","key":"anomaly_score","value":np.mean(self.decision_val)} - nb_outliers = {"type":"GAUGE","key":"nb_outliers","value":int(self.nb_outliers)} - fraction_outliers = {"type":"GAUGE","key":"fraction_outliers","value":int(self.nb_outliers)/self.N} - obs = {"type":"GAUGE","key":"observation","value":self.N} - threshold = {"type":"GAUGE","key":"threshold","value":self.threshold} - - return [is_outlier,anomaly_score,nb_outliers,fraction_outliers,obs,threshold] \ No newline at end of file diff --git a/components/outlier-detection/isolation-forest/OutlierIsolationForest.py b/components/outlier-detection/isolation-forest/OutlierIsolationForest.py deleted file mode 100644 index a56ba32085..0000000000 --- a/components/outlier-detection/isolation-forest/OutlierIsolationForest.py +++ /dev/null @@ -1,115 +0,0 @@ -import numpy as np - -from CoreIsolationForest import CoreIsolationForest -from utils import flatten, performance, outlier_stats - -class OutlierIsolationForest(CoreIsolationForest): - """ Outlier detection using Isolation Forests. - - Parameters - ---------- - threshold (float) : anomaly score threshold; scores below threshold are outliers - - Functions - ---------- - send_feedback : add target labels as part of the feedback loop - metrics : return custom metrics - """ - def __init__(self,threshold=0.,model_name='if',load_path='./models/'): - - super().__init__(threshold=threshold, model_name=model_name, load_path=load_path) - - self._predictions = [] - self._labels = [] - self._anomaly_score = [] - self.roll_window = 100 - self.metric = [float('nan') for i in range(18)] - - - def send_feedback(self,X,feature_names,reward,truth): - """ Return outlier labels as part of the feedback loop. - - Parameters - ---------- - X : array of the features sent in the original predict request - feature_names : array of feature names. May be None if not available. - reward (float): the reward - truth : array with correct value (optional) - """ - _ = super().send_feedback(X,feature_names,reward,truth) - - # historical reconstruction errors and predictions - self._anomaly_score.append(self.decision_val) - self._anomaly_score = flatten(self._anomaly_score) - self._predictions.append(self.prediction) - self._predictions = flatten(self._predictions) - - # target labels - self.label = truth - self._labels.append(self.label) - self._labels = flatten(self._labels) - - # performance metrics - scores = performance(self._labels,self._predictions,roll_window=self.roll_window) - stats = outlier_stats(self._labels,self._predictions,roll_window=self.roll_window) - - convert = flatten([scores,stats]) - metric = [] - for c in convert: # convert from np to native python type to jsonify - metric.append(np.asscalar(np.asarray(c))) - self.metric = metric - - return [] - - - def metrics(self): - """ Return custom metrics. - Printed with a delay of 1 prediction because the labels are returned in the feedback step. 
- """ - - if self.prediction.shape[0]>1: - raise ValueError('Metrics can only handle single observations.') - - if self.N==1: - pred = float('nan') - dec_val = float('nan') - y_true = float('nan') - else: - pred = int(self._predictions[-1]) - dec_val = self._anomaly_score[-1] - y_true = int(self.label[0]) - - is_outlier = {"type":"GAUGE","key":"is_outlier","value":pred} - anomaly_score = {"type":"GAUGE","key":"anomaly_score","value":dec_val} - obs = {"type":"GAUGE","key":"observation","value":self.N - 1} - threshold = {"type":"GAUGE","key":"threshold","value":self.threshold} - - label = {"type":"GAUGE","key":"label","value":y_true} - - accuracy_tot = {"type":"GAUGE","key":"accuracy_tot","value":self.metric[4]} - precision_tot = {"type":"GAUGE","key":"precision_tot","value":self.metric[5]} - recall_tot = {"type":"GAUGE","key":"recall_tot","value":self.metric[6]} - f1_score_tot = {"type":"GAUGE","key":"f1_tot","value":self.metric[7]} - f2_score_tot = {"type":"GAUGE","key":"f2_tot","value":self.metric[8]} - - accuracy_roll = {"type":"GAUGE","key":"accuracy_roll","value":self.metric[9]} - precision_roll = {"type":"GAUGE","key":"precision_roll","value":self.metric[10]} - recall_roll = {"type":"GAUGE","key":"recall_roll","value":self.metric[11]} - f1_score_roll = {"type":"GAUGE","key":"f1_roll","value":self.metric[12]} - f2_score_roll = {"type":"GAUGE","key":"f2_roll","value":self.metric[13]} - - true_negative = {"type":"GAUGE","key":"true_negative","value":self.metric[0]} - false_positive = {"type":"GAUGE","key":"false_positive","value":self.metric[1]} - false_negative = {"type":"GAUGE","key":"false_negative","value":self.metric[2]} - true_positive = {"type":"GAUGE","key":"true_positive","value":self.metric[3]} - - nb_outliers_roll = {"type":"GAUGE","key":"nb_outliers_roll","value":self.metric[14]} - nb_labels_roll = {"type":"GAUGE","key":"nb_labels_roll","value":self.metric[15]} - nb_outliers_tot = {"type":"GAUGE","key":"nb_outliers_tot","value":self.metric[16]} - nb_labels_tot = {"type":"GAUGE","key":"nb_labels_tot","value":self.metric[17]} - - return [is_outlier,anomaly_score,obs,threshold,label, - accuracy_tot,precision_tot,recall_tot,f1_score_tot,f2_score_tot, - accuracy_roll,precision_roll,recall_roll,f1_score_roll,f2_score_roll, - true_negative,false_positive,false_negative,true_positive, - nb_outliers_roll,nb_labels_roll,nb_outliers_tot,nb_labels_tot] \ No newline at end of file diff --git a/components/outlier-detection/isolation-forest/README.md b/components/outlier-detection/isolation-forest/README.md deleted file mode 100644 index 0ec8c10f10..0000000000 --- a/components/outlier-detection/isolation-forest/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# Isolation Forest Outlier Detector - -## Description - -[Anomaly or outlier detection](https://en.wikipedia.org/wiki/Anomaly_detection) has many applications, ranging from preventing credit card fraud to detecting computer network intrusions. The implemented [Isolation Forest](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html) outlier detector aims to predict anomalies in tabular data. The anomaly detector predicts whether the input features represent normal behaviour or not, dependent on a threshold level set by the user. - -## Implementation - -The Isolation Forest is trained by running the ```train.py``` script. The ```OutlierIsolationForest``` class inherits from ```CoreIsolationForest``` which loads a pre-trained model and can make predictions on new data. 
- -A detailed explanation of the implementation and usage of Isolation Forests as outlier detectors can be found in the [isolation forest doc](./doc.md). - -## Running on Seldon - -An end-to-end example running an Isolation Forest outlier detector on GCP or Minikube using Seldon to identify computer network intrusions is available [here](./isolation_forest.ipynb). - -Docker images to use the generic Isolation Forest outlier detector as a model or transformer can be found on Docker Hub: -* [seldonio/outlier-if-model](https://hub.docker.com/r/seldonio/outlier-if-model) -* [seldonio/outlier-if-transformer](https://hub.docker.com/r/seldonio/outlier-if-transformer) - -A model docker image specific for the demo is also available: -* [seldonio/outlier-if-model-demo](https://hub.docker.com/r/seldonio/outlier-if-model-demo) \ No newline at end of file diff --git a/components/outlier-detection/isolation-forest/__init__.py b/components/outlier-detection/isolation-forest/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/components/outlier-detection/isolation-forest/doc.md b/components/outlier-detection/isolation-forest/doc.md deleted file mode 100644 index 111341ed7f..0000000000 --- a/components/outlier-detection/isolation-forest/doc.md +++ /dev/null @@ -1,130 +0,0 @@ -# Isolation Forest (IF) Algorithm Documentation - -The aim of this document is to explain the Isolation Forest algorithm in Seldon's outlier detection framework. - -First, we provide a high-level overview of the algorithm and its use case; then we give a detailed explanation of the implementation. - -## Overview - -Outlier detection has many applications, ranging from preventing credit card fraud to detecting computer network intrusions. The available data is typically unlabeled and detection needs to be done in real time. The outlier detector can be used as a standalone algorithm, or to detect anomalies in the input data of another predictive model. - -The IF outlier detection algorithm predicts whether the input features represent an outlier, depending on a threshold level set by the user. The algorithm needs to be pretrained on a representative batch of data. - -As observations arrive, the algorithm will: -- calculate an anomaly score for the observation -- predict that the observation is an outlier if the anomaly score is below the threshold level - -## Why Isolation Forests? - -Isolation forests are tree-based models specifically used for outlier detection. The IF isolates observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature. The number of splits required to isolate a sample is equivalent to the path length from the root node to the terminating node. This path length, averaged over a forest of random trees, is a measure of normality and is used to define an anomaly score. Outliers can typically be isolated more quickly, leading to shorter paths. In the scikit-learn implementation, lower anomaly scores indicate a higher probability that an observation is an outlier. - -## Implementation - -### 1.
Defining and training the IF model - -The model takes 4 hyperparameters: - -- contamination: the fraction of expected outliers in the data set -- number of estimators: the number of base estimators; number of trees in the forest -- max samples: fraction of samples used for each base estimator -- max features: fraction of features used for each base estimator - -``` python -!python train.py \ ---dataset 'kddcup99' \ ---samples 50000 \ ---keep_cols "$cols_str" \ ---contamination .1 \ ---n_estimators 100 \ ---max_samples .8 \ ---max_features 1. \ ---save_path './models/' -``` - -The model is saved in the folder specified by "save_path". - -### 2. Making predictions - -In order to make predictions, which can then be served by Seldon Core, the pre-trained model is loaded when defining an OutlierIsolationForest object. The "threshold" argument defines below which anomaly score a sample is classified as an outlier. The threshold is a key hyperparameter and needs to be picked carefully for each application. The OutlierIsolationForest class inherits from the CoreIsolationForest class in ```CoreIsolationForest.py```. - -``` python -class CoreIsolationForest(object): - """ Outlier detection using Isolation Forests. - - Parameters - ---------- - threshold (float) : anomaly score threshold; scores below threshold are outliers - - Functions - ---------- - predict : detect and return outliers - transform_input : detect outliers and return input features - send_feedback : add target labels as part of the feedback loop - tags : add metadata for input transformer - metrics : return custom metrics - """ - - def __init__(self,threshold=0.,load_path='./models/'): - - logger.info("Initializing model") - self.threshold = threshold - self.N = 0 # total sample count up until now - self.nb_outliers = 0 - - # load pre-trained model - with open(load_path + 'model.pickle', 'rb') as f: - self.clf = pickle.load(f) -``` - -```python -class OutlierIsolationForest(CoreIsolationForest): - """ Outlier detection using Isolation Forests. - - Parameters - ---------- - threshold (float) : anomaly score threshold; scores below threshold are outliers - - Functions - ---------- - send_feedback : add target labels as part of the feedback loop - metrics : return custom metrics - """ - def __init__(self,threshold=0.,load_path='./models/'): - - super().__init__(threshold=threshold, load_path=load_path) -``` - -The actual outlier detection is done by the ```_get_preds``` method which is invoked by ```predict``` or ```transform_input``` dependent on whether the detector is defined as respectively a model or a transformer. - -``` python -def predict(self, X, feature_names): - """ Return outlier predictions. - - Parameters - ---------- - X : array-like - feature_names : array of feature names (optional) - """ - logger.info("Using component as a model") - return self._get_preds(X) -``` - -```python -def transform_input(self, X, feature_names): - """ Transform the input. - Used when the outlier detector sits on top of another model. 
- - Parameters - ---------- - X : array-like - feature_names : array of feature names (optional) - """ - logger.info("Using component as an outlier-detector transformer") - self.prediction_meta = self._get_preds(X) - return X -``` - -## References - -Scikit-learn Isolation Forest: -- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html \ No newline at end of file diff --git a/components/outlier-detection/isolation-forest/isolation_forest.ipynb b/components/outlier-detection/isolation-forest/isolation_forest.ipynb deleted file mode 100644 index d0aecb7606..0000000000 --- a/components/outlier-detection/isolation-forest/isolation_forest.ipynb +++ /dev/null @@ -1,608 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Isolation Forest (IF) outlier detector deployment" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Wrap a scikit-learn Isolation Forest python model for use as a prediction microservice in seldon-core and deploy on seldon-core running on minikube or a Kubernetes cluster using GCP." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dependencies" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- [helm](https://github.com/helm/helm)\n", - "- [minikube](https://github.com/kubernetes/minikube) \n", - "- [s2i](https://github.com/openshift/source-to-image) >= 1.1.13 \n", - "\n", - "python packages:\n", - "- scikit-learn: pip install scikit-learn --> 0.20.1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Task" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The outlier detector needs to detect computer network intrusions using TCP dump data for a local-area network (LAN) simulating a typical U.S. Air Force LAN. A connection is a sequence of TCP packets starting and ending at some well defined times, between which data flows to and from a source IP address to a target IP address under some well defined protocol. Each connection is labeled as either normal, or as an attack. \n", - "\n", - "There are 4 types of attacks in the dataset:\n", - "- DOS: denial-of-service, e.g. syn flood;\n", - "- R2L: unauthorized access from a remote machine, e.g. guessing password;\n", - "- U2R: unauthorized access to local superuser (root) privileges;\n", - "- probing: surveillance and other probing, e.g., port scanning.\n", - " \n", - "The dataset contains about 5 million connection records.\n", - "\n", - "There are 3 types of features:\n", - "- basic features of individual connections, e.g. duration of connection\n", - "- content features within a connection, e.g. number of failed log in attempts\n", - "- traffic features within a 2 second window, e.g. number of connections to the same host as the current connection\n", - "\n", - "The outlier detector is only using 40 out of 41 features." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train locally\n", - "\n", - "Train on small dataset where you roughly know the fraction of outliers, defined by the \"contamination\" parameter." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# define columns to keep\n", - "cols=['duration','protocol_type','flag','src_bytes','dst_bytes','land',\n", - " 'wrong_fragment','urgent','hot','num_failed_logins','logged_in',\n", - " 'num_compromised','root_shell','su_attempted','num_root','num_file_creations',\n", - " 'num_shells','num_access_files','num_outbound_cmds','is_host_login',\n", - " 'is_guest_login','count','srv_count','serror_rate','srv_serror_rate',\n", - " 'rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate',\n", - " 'srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate',\n", - " 'dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate',\n", - " 'dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate',\n", - " 'dst_host_srv_rerror_rate','target']\n", - "cols_str = str(cols)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!python train.py \\\n", - "--dataset 'kddcup99' \\\n", - "--samples 50000 \\\n", - "--keep_cols \"$cols_str\" \\\n", - "--contamination .1 \\\n", - "--n_estimators 100 \\\n", - "--max_samples .8 \\\n", - "--max_features 1. \\\n", - "--save_path './models/'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test using Kubernetes cluster on GCP or Minikube" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the outlier detector as a model or a transformer. If you want to run the anomaly detector as a transformer, change the SERVICE_TYPE variable from MODEL to TRANSFORMER [here](./.s2i/environment), set MODEL = False and change ```OutlierIsolationForest.py``` to:\n", - "\n", - "```python\n", - "from CoreIsolationForest import CoreIsolationForest\n", - "\n", - "class OutlierIsolationForest(CoreIsolationForest):\n", - " \"\"\" Outlier detection using Isolation Forests.\n", - "\n", - " Parameters\n", - " ----------\n", - " threshold (float) : anomaly score threshold; scores below threshold are outliers\n", - " \"\"\"\n", - " def __init__(self,threshold=0.,load_path='./models/'):\n", - "\n", - " super().__init__(threshold=threshold, load_path=load_path)\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Pick Kubernetes cluster on GCP or Minikube." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MINIKUBE = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if MINIKUBE:\n", - " !minikube start --memory 4096\n", - "else:\n", - " !gcloud container clusters get-credentials standard-cluster-1 --zone europe-west1-b --project seldon-demos" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a cluster-wide cluster-admin role assigned to a service account named “default” in the namespace “kube-system”." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl create clusterrolebinding kube-system-cluster-admin --clusterrole=cluster-admin \\\n", - "--serviceaccount=kube-system:default" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl create namespace seldon" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add current context details to the configuration file in the seldon namespace." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl config set-context $(kubectl config current-context) --namespace=seldon" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create tiller service account and give it a cluster-wide cluster-admin role." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl -n kube-system create sa tiller\n", - "!kubectl create clusterrolebinding tiller --clusterrole cluster-admin --serviceaccount=kube-system:tiller\n", - "!helm init --service-account tiller" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check deployment rollout status and deploy seldon/spartakus helm charts." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl rollout status deploy/tiller-deploy -n kube-system" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!helm install ../../../helm-charts/seldon-core-operator --name seldon-core --set usage_metrics.enabled=true --namespace seldon-system" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check deployment rollout status for seldon core." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!kubectl rollout status deploy/seldon-controller-manager -n seldon-system" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install Ambassador API gateway" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!helm install stable/ambassador --name ambassador --set crds.keep=false" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl rollout status deployment.apps/ambassador" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If Minikube used: create docker image for outlier detector inside Minikube using s2i. Besides the transformer image and the demo specific model image, the general model image for the Isolation Forest outlier detector is also available from Docker Hub as ***seldonio/outlier-if-model:0.1***." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "if MINIKUBE & MODEL:\n", - " !eval $(minikube docker-env) && \\\n", - " s2i build . seldonio/seldon-core-s2i-python3:0.4 seldonio/outlier-if-model-demo:0.1\n", - "elif MINIKUBE:\n", - " !eval $(minikube docker-env) && \\\n", - " s2i build . 
seldonio/seldon-core-s2i-python3:0.4 seldonio/outlier-if-transformer:0.1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install outlier detector helm charts either as a model or transformer and set *threshold* hyperparameter value." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if MODEL:\n", - " !helm install ../../../helm-charts/seldon-od-model \\\n", - " --name outlier-detector \\\n", - " --namespace=seldon \\\n", - " --set model.type=isolationforest \\\n", - " --set model.isolationforest.image.name=seldonio/outlier-if-model-demo:0.1 \\\n", - " --set model.isolationforest.threshold=0 \\\n", - " --set oauth.key=oauth-key \\\n", - " --set oauth.secret=oauth-secret \\\n", - " --set replicas=1\n", - "else:\n", - " !helm install ../../../helm-charts/seldon-od-transformer \\\n", - " --name outlier-detector \\\n", - " --namespace=seldon \\\n", - " --set outlierDetection.enabled=true \\\n", - " --set outlierDetection.name=outlier-if \\\n", - " --set outlierDetection.type=isolationforest \\\n", - " --set outlierDetection.isolationforest.image.name=seldonio/outlier-if-transformer:0.1 \\\n", - " --set outlierDetection.isolationforest.threshold=0 \\\n", - " --set oauth.key=oauth-key \\\n", - " --set oauth.secret=oauth-secret \\\n", - " --set model.image.name=seldonio/mock_classifier:1.0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Port forward Ambassador\n", - "\n", - "Run command in terminal:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```\n", - "kubectl port-forward $(kubectl get pods -n seldon -l app.kubernetes.io/name=ambassador -o jsonpath='{.items[0].metadata.name}') -n seldon 8003:8080\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import rest requests, load data and test requests" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from utils import get_payload, rest_request_ambassador, send_feedback_rest, get_kdd_data, generate_batch\n", - "\n", - "data = get_kdd_data(keep_cols=cols,percent10=True) # load dataset\n", - "print(data.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Generate a random batch from the data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "samples = 1\n", - "fraction_outlier = 0.\n", - "X, labels = generate_batch(data,samples,fraction_outlier)\n", - "print(X.shape)\n", - "print(labels.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Test the rest requests with the generated data. It is important that the order of requests is respected. First we make predictions, then we get the \"true\" labels back using the feedback request. If we do not respect the order and eg keep making predictions without getting the feedback for each prediction, there will be a mismatch between the predicted and \"true\" labels. This will result in errors in the produced metrics." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "request = get_payload(X)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "response = rest_request_ambassador(\"outlier-detector\",\"seldon\",request,endpoint=\"localhost:8003\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If the outlier detector is used as a transformer, the output of the anomaly detection is added as part of the metadata. If it is used as a model, we send model feedback to retrieve custom performance metrics." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if MODEL:\n", - " send_feedback_rest(\"outlier-detector\",\"seldon\",request,response,0,labels,endpoint=\"localhost:8003\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Analytics" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the helm charts for prometheus and the grafana dashboard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!helm install ../../../helm-charts/seldon-core-analytics --name seldon-core-analytics \\\n", - " --set grafana_prom_admin_password=password \\\n", - " --set persistence.enabled=false \\\n", - " --namespace seldon" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Port forward Grafana dashboard" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run command in terminal:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```\n", - "kubectl port-forward $(kubectl get pods -n seldon -l app=grafana-prom-server -o jsonpath='{.items[0].metadata.name}') -n seldon 3000:3000\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can then view an analytics dashboard inside the cluster at http://localhost:3000/dashboard/db/prediction-analytics?refresh=5s&orgId=1. Your IP address may be different. get it via minikube ip. Login with:\n", - "\n", - "Username : admin\n", - "\n", - "password : password (as set when starting seldon-core-analytics above)\n", - "\n", - "Import the outlier-detector-if dashboard from ../../../helm-charts/seldon-core-analytics/files/grafana/configs." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Run simulation\n", - "\n", - "- Sample random network intrusion data with a certain outlier probability.\n", - "- Get payload for the observation.\n", - "- Make a prediction.\n", - "- Send the \"true\" label with the feedback if the detector is run as a model.\n", - "\n", - "It is important that the prediction-feedback order is maintained. Otherwise there will be a mismatch between the predicted and \"true\" labels.\n", - "\n", - "View the progress on the grafana \"Outlier Detection\" dashboard. Most metrics need the outlier detector to be run as a model since they need model feedback." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "n_requests = 100\n", - "samples = 1\n", - "for i in range(n_requests):\n", - " fraction_outlier = .1\n", - " X, labels = generate_batch(data,samples,fraction_outlier)\n", - " request = get_payload(X)\n", - " response = rest_request_ambassador(\"outlier-detector\",\"seldon\",request,endpoint=\"localhost:8003\")\n", - " if MODEL:\n", - " send_feedback_rest(\"outlier-detector\",\"seldon\",request,response,0,labels,endpoint=\"localhost:8003\")\n", - " time.sleep(1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if MINIKUBE:\n", - " !minikube delete" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/components/outlier-detection/isolation-forest/models/.keep b/components/outlier-detection/isolation-forest/models/.keep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/components/outlier-detection/isolation-forest/requirements.txt b/components/outlier-detection/isolation-forest/requirements.txt deleted file mode 100644 index 772f39f27b..0000000000 --- a/components/outlier-detection/isolation-forest/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -numpy==1.14.5 -argparse==1.1 -pandas==0.23.4 -scikit-learn==0.20.1 -scipy==1.1.0 -requests>=2.20.0 \ No newline at end of file diff --git a/components/outlier-detection/isolation-forest/train.py b/components/outlier-detection/isolation-forest/train.py deleted file mode 100644 index e799a37120..0000000000 --- a/components/outlier-detection/isolation-forest/train.py +++ /dev/null @@ -1,77 +0,0 @@ -import argparse -import numpy as np -import pickle -import random -from sklearn.ensemble import IsolationForest - -from utils import get_kdd_data, generate_batch - -np.random.seed(2018) -np.random.RandomState(2018) -random.seed(2018) - -# default args -DATASET = 'kddcup99' -SAMPLES = 50000 -COLS = str(['duration','protocol_type','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot', - 'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations', - 'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count', - 'serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate', - 'srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate', - 'dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate', - 'dst_host_rerror_rate','dst_host_srv_rerror_rate','target']) -MODEL_NAME = 'if' -SAVE = True -SAVE_PATH = './models/' - -# Isolation Forest hyperparameters -CONTAMINATION = .1 -N_ESTIMATORS = 50 -MAX_SAMPLES = .8 -MAX_FEATURES = 1. - -def train(X,args): - """ Fit Isolation Forest. 
""" - - clf = IsolationForest(n_estimators=args.n_estimators, max_samples=args.max_samples, max_features=args.max_features, - contamination=args.contamination,behaviour='new') - clf.fit(X) - - if args.save: # save model - with open(args.save_path + args.model_name + '.pickle', 'wb') as f: - pickle.dump(clf,f) - -def run(args): - """ Load data, generate training batch and train Isolation Forest. """ - - print('\nLoad dataset') - if args.dataset=='kddcup99': - keep_cols = args.keep_cols[1:-1].replace("'","").replace(" ","").split(",") - data = get_kdd_data(keep_cols=keep_cols) - else: - raise ValueError('Only "kddcup99" dataset supported.') - - print('\nGenerate training batch') - X, _ = generate_batch(data,args.samples,args.contamination) - - print('\nTrain outlier detector') - train(X,args) - - print('\nTraining done!') - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description="Train Isolation Forest outlier detector.") - parser.add_argument('--dataset',type=str,choices=DATASET,default=DATASET) - parser.add_argument('--keep_cols',type=str,default=COLS) - parser.add_argument('--samples',type=int,default=SAMPLES) - parser.add_argument('--contamination',type=float,default=CONTAMINATION) - parser.add_argument('--n_estimators',type=int,default=N_ESTIMATORS) - parser.add_argument('--max_samples',type=float,default=MAX_SAMPLES) - parser.add_argument('--max_features',type=float,default=MAX_FEATURES) - parser.add_argument('--model_name',type=str,default=MODEL_NAME) - parser.add_argument('--save', default=SAVE, action='store_false') - parser.add_argument('--save_path',type=str,default=SAVE_PATH) - args = parser.parse_args() - - run(args) \ No newline at end of file diff --git a/components/outlier-detection/isolation-forest/utils.py b/components/outlier-detection/isolation-forest/utils.py deleted file mode 100644 index 4515e7ef3d..0000000000 --- a/components/outlier-detection/isolation-forest/utils.py +++ /dev/null @@ -1,179 +0,0 @@ -import collections -import json -import numpy as np -import pandas as pd -import requests -from sklearn.datasets import fetch_kddcup99 -from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, fbeta_score - -pd.options.mode.chained_assignment = None # default='warn' - -def get_kdd_data(target=['dos','r2l','u2r','probe'], - keep_cols=['srv_count','serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate', - 'same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count', - 'dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate', - 'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate', - 'dst_host_rerror_rate','dst_host_srv_rerror_rate','target'], - percent10=False): - """ Load KDD Cup 1999 data and return in dataframe. 
""" - - data_raw = fetch_kddcup99(subset=None, data_home=None, percent10=percent10) - - # specify columns - cols=['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot', - 'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations', - 'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count', - 'serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate', - 'srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate', - 'dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate', - 'dst_host_rerror_rate','dst_host_srv_rerror_rate'] - - # create dataframe - data = pd.DataFrame(data=data_raw['data'],columns=cols) - - # add target to dataframe - data['attack_type'] = data_raw['target'] - - # specify and map attack types - attack_list = np.unique(data['attack_type']) - attack_category = ['dos','u2r','r2l','r2l','r2l','probe','dos','u2r','r2l','dos','probe','normal','u2r', - 'r2l','dos','probe','u2r','probe','dos','r2l','dos','r2l','r2l'] - - attack_types = {} - for i,j in zip(attack_list,attack_category): - attack_types[i] = j - - data['attack_category'] = 'normal' - for key,value in attack_types.items(): - data['attack_category'][data['attack_type'] == key] = value - - # define target - data['target'] = 0 - for t in target: - data['target'][data['attack_category'] == t] = 1 - - # define columns to be dropped - drop_cols = [] - for col in data.columns.values: - if col not in keep_cols: - drop_cols.append(col) - - if drop_cols!=[]: - data.drop(columns=drop_cols,inplace=True) - - # apply OHE if necessary - cols_ohe = ['protocol_type','service','flag'] - for col in cols_ohe: - if col in keep_cols: - col_ohe = pd.get_dummies(data[col],prefix=col) - data = data.join(col_ohe) - data.drop([col],axis=1,inplace=True) - - return data - - -def sample_df(df,n): - """ Sample from df. """ - if n < df.shape[0]+1: - replace = False - else: - replace = True - return df.sample(n=n,replace=replace) - - -def generate_batch(data,n_samples,frac_outliers): - """ Generate random batch from data with fixed size and fraction of outliers. """ - - normal = data[data['target']==0] - outlier = data[data['target']==1] - - if n_samples==1: - n_outlier = np.random.binomial(1,frac_outliers) - n_normal = 1 - n_outlier - else: - n_normal = int((1-frac_outliers) * n_samples) - n_outlier = int(frac_outliers * n_samples) - - batch_normal = sample_df(normal,n_normal) - batch_outlier = sample_df(outlier,n_outlier) - - batch = pd.concat([batch_normal,batch_outlier]) - batch = batch.sample(frac=1).reset_index(drop=True) - - outlier_true = batch['target'].values - batch.drop(columns=['target'],inplace=True) - - return batch.values.astype('float'), outlier_true - -def flatten(x): - if isinstance(x, collections.Iterable): - return [a for i in x for a in flatten(i)] - else: - return [x] - -def performance(y_true,y_pred,roll_window=100): - """ Return a confusion matrix and calculate rolling accuracy, precision, recall, F1 and F2 scores. 
""" - - # confusion matrix - cm = confusion_matrix(y_true,y_pred,labels=[0,1]) - tn, fp, fn, tp = cm.ravel() - - # total scores - acc_tot = accuracy_score(y_true,y_pred) - prec_tot = precision_score(y_true,y_pred) - rec_tot = recall_score(y_true,y_pred) - f1_tot = f1_score(y_true,y_pred) - f2_tot = fbeta_score(y_true,y_pred,beta=2) - - # rolling scores - y_true_roll = y_true[-roll_window:] - y_pred_roll = y_pred[-roll_window:] - acc_roll = accuracy_score(y_true_roll,y_pred_roll) - prec_roll = precision_score(y_true_roll,y_pred_roll) - rec_roll = recall_score(y_true_roll,y_pred_roll) - f1_roll = f1_score(y_true_roll,y_pred_roll) - f2_roll = fbeta_score(y_true_roll,y_pred_roll,beta=2) - - scores = [tn, fp, fn, tp, acc_tot, prec_tot, rec_tot, f1_tot, f2_tot, - acc_roll, prec_roll, rec_roll, f1_roll, f2_roll] - - return scores - -def outlier_stats(y_true,y_pred,roll_window=100): - """ Calculate number and percentage of predicted and labeled outliers. """ - - y_pred_roll = np.sum(y_pred[-roll_window:]) - y_true_roll = np.sum(y_true[-roll_window:]) - y_pred_tot = np.sum(y_pred) - y_true_tot = np.sum(y_true) - - return y_pred_roll, y_true_roll, y_pred_tot, y_true_tot - -def get_payload(arr): - features = ["srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate", - "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate", - "dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate", - "dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate"] - datadef = {"names":features,"ndarray":arr.tolist()} - payload = {"meta":{},"data":datadef} - return payload - -def rest_request_ambassador(deploymentName,namespace,request,endpoint="localhost:8003"): - response = requests.post( - "http://"+endpoint+"/seldon/"+namespace+"/"+deploymentName+"/api/v0.1/predictions", - json=request) - print(response.status_code) - print(response.text) - return response.json() - -def send_feedback_rest(deploymentName,namespace,request,response,reward,truth,endpoint="localhost:8003"): - feedback = { - "request": request, - "response": response, - "reward": reward, - "truth": {"data":{"ndarray":truth.tolist()}} - } - ret = requests.post( - "http://"+endpoint+"/seldon/"+namespace+"/"+deploymentName+"/api/v0.1/feedback", - json=feedback) - return diff --git a/components/outlier-detection/mahalanobis/.s2i/environment b/components/outlier-detection/mahalanobis/.s2i/environment deleted file mode 100644 index 8bbf65edae..0000000000 --- a/components/outlier-detection/mahalanobis/.s2i/environment +++ /dev/null @@ -1,4 +0,0 @@ -MODEL_NAME=OutlierMahalanobis -API_TYPE=REST -SERVICE_TYPE=MODEL -PERSISTENCE=0 diff --git a/components/outlier-detection/mahalanobis/CoreMahalanobis.py b/components/outlier-detection/mahalanobis/CoreMahalanobis.py deleted file mode 100644 index ac90c553de..0000000000 --- a/components/outlier-detection/mahalanobis/CoreMahalanobis.py +++ /dev/null @@ -1,192 +0,0 @@ -import logging -import numpy as np -from scipy.linalg import eigh - -logger = logging.getLogger(__name__) - -class CoreMahalanobis(object): - """ Outlier detection using the Mahalanobis distance. 
- - Parameters - ---------- - threshold (float) : Mahalanobis distance threshold used to classify outliers - n_components (int) : number of principal components used - n_stdev (float) : stdev used for feature-wise clipping of observations - start_clip (int) : number of observations before clipping is applied - max_n (int) : algorithm behaves as if it has seen at most max_n points - - Functions - ---------- - predict : detect and return outliers - transform_input : detect outliers and return input features - send_feedback : add target labels as part of the feedback loop - tags : add metadata for input transformer - metrics : return custom metrics - """ - def __init__(self,threshold=25,n_components=3,n_stdev=3,start_clip=50,max_n=-1): - - logger.info("Initializing model") - self.threshold = threshold - self.n_components = n_components - self.max_n = max_n - self.n_stdev = n_stdev - self.start_clip = start_clip - - self.clip = None - self.mean = 0 - self.C = 0 - self.n = 0 - self.nb_outliers = 0 - - - def predict(self, X, feature_names): - """ Return outlier predictions. - - Parameters - ---------- - X : array-like - feature_names : array of feature names (optional) - """ - logger.info("Using component as a model") - return self._get_preds(X) - - - def transform_input(self, X, feature_names): - """ Transform the input. - Used when the outlier detector sits on top of another model. - - Parameters - ---------- - X : array-like - feature_names : array of feature names (optional) - """ - logger.info("Using component as an outlier-detector transformer") - self.prediction_meta = self._get_preds(X) - return X - - - def _get_preds(self,X): - """ Detect outliers using the Mahalanobis distance threshold. - - Parameters - ---------- - X : array-like - """ - - nb = X.shape[0] # batch size - p = X.shape[1] # number of features - n_components = min(self.n_components,p) - if self.max_n>0: - n = min(self.n,self.max_n) # n can never be above max_n - else: - n = self.n - - # Clip X - if self.n > self.start_clip: - Xclip = np.clip(X,self.clip[0],self.clip[1]) - else: - Xclip = X - - # Tracking the mean and covariance matrix - roll_partial_means = Xclip.cumsum(axis=0)/(np.arange(nb)+1).reshape((nb,1)) - coefs = (np.arange(nb)+1.)/(np.arange(nb)+n+1.) 
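        # Clarifying note on the streaming mean update below: for the k-th point in the
        # batch, the running mean over n previously seen points becomes
        #   mean_new = mean + k/(n+k) * (partial_batch_mean_k - mean)
        # so `coefs` holds the k/(n+k) blending weights for every batch position.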
- new_means = self.mean + coefs.reshape((nb,1))*(roll_partial_means-self.mean) - new_means_offset = np.empty_like(new_means) - new_means_offset[0] = self.mean - new_means_offset[1:] = new_means[:-1] - - coefs = ((n+np.arange(nb))/(n+np.arange(nb)+1.)).reshape((nb,1,1)) - B = coefs*np.matmul((Xclip - new_means_offset)[:,:,None],(Xclip - new_means_offset)[:,None,:]) - cov_batch = (n-1.)/(n+max(1,nb-1.))*self.C + 1./(n+max(1,nb-1.))*B.sum(axis=0) - - # PCA - eigvals, eigvects = eigh(cov_batch,eigvals=(p-n_components,p-1)) - - # Projections - proj_x = np.matmul(X,eigvects) - proj_x_clip = np.matmul(Xclip,eigvects) - proj_means = np.matmul(new_means_offset,eigvects) - if type(self.C) == int and self.C == 0: - proj_cov = np.diag(np.zeros(n_components)) - else: - proj_cov = np.matmul(eigvects.transpose(),np.matmul(self.C,eigvects)) - - # Outlier detection in the PC subspace - coefs = (1./(n+np.arange(nb)+1.)).reshape((nb,1,1)) - B = coefs*np.matmul((proj_x_clip - proj_means)[:,:,None],(proj_x_clip - proj_means)[:,None,:]) - - all_C_inv = np.zeros_like(B) - c_inv = None - _EPSILON = 1e-8 - - for i, b in enumerate(B): - if c_inv is None: - if abs(np.linalg.det(proj_cov)) > _EPSILON: - c_inv = np.linalg.inv(proj_cov) - all_C_inv[i] = c_inv - continue - else: - if n + i == 0: - continue - proj_cov = (n + i -1. )/(n + i)*proj_cov + b - continue - else: - c_inv = (n + i - 1.)/float(n + i - 2.)*all_C_inv[i-1] - BC1 = np.matmul(B[i-1],c_inv) - all_C_inv[i] = c_inv - 1./(1.+np.trace(BC1))*np.matmul(c_inv,BC1) - - # Updates - self.mean = new_means[-1] - self.C = cov_batch - stdev = np.sqrt(np.diag(cov_batch)) - self.n += nb - if self.n > self.start_clip: - self.clip = [self.mean-self.n_stdev*stdev,self.mean+self.n_stdev*stdev] - - # Outlier scores and predictions - x_diff = proj_x-proj_means - self.score = np.matmul(x_diff[:,None,:],np.matmul(all_C_inv,x_diff[:,:,None])).reshape(nb) - self.prediction = np.array([1 if s > self.threshold else 0 for s in self.score]).astype(int) - - return self.prediction - - - def send_feedback(self,X,feature_names,reward,truth): - """ Return additional data as part of the feedback loop. - - Parameters - ---------- - X : array of the features sent in the original predict request - feature_names : array of feature names. May be None if not available. - reward (float): the reward - truth : array with correct value (optional) - """ - logger.info("Send feedback called") - return [] - - - def tags(self): - """ - Use predictions made within transform to add these as metadata - to the response. Tags will only be collected if the component is - used as an input-transformer. - """ - try: - return {"outlier-predictions": self.prediction_meta.tolist()} - except AttributeError: - logger.info("No metadata about outliers") - - - def metrics(self): - """ Return custom metrics averaged over the prediction batch. 
- """ - self.nb_outliers += np.sum(self.prediction) - - is_outlier = {"type":"GAUGE","key":"is_outlier","value":np.mean(self.prediction)} - outlier_score = {"type":"GAUGE","key":"outlier_score","value":np.mean(self.score)} - nb_outliers = {"type":"GAUGE","key":"nb_outliers","value":int(self.nb_outliers)} - fraction_outliers = {"type":"GAUGE","key":"fraction_outliers","value":int(self.nb_outliers)/self.n} - obs = {"type":"GAUGE","key":"observation","value":self.n} - threshold = {"type":"GAUGE","key":"threshold","value":self.threshold} - - return [is_outlier,outlier_score,nb_outliers,fraction_outliers,obs,threshold] \ No newline at end of file diff --git a/components/outlier-detection/mahalanobis/OutlierMahalanobis.py b/components/outlier-detection/mahalanobis/OutlierMahalanobis.py deleted file mode 100644 index 916440289e..0000000000 --- a/components/outlier-detection/mahalanobis/OutlierMahalanobis.py +++ /dev/null @@ -1,120 +0,0 @@ -import numpy as np - -from CoreMahalanobis import CoreMahalanobis -from utils import flatten, performance, outlier_stats - -class OutlierMahalanobis(CoreMahalanobis): - """ Outlier detection using the Mahalanobis distance. - - Parameters - ---------- - threshold (float) : Mahalanobis distance threshold used to classify outliers - n_components (int) : number of principal components used - n_stdev (float) : stdev used for feature-wise clipping of observations - start_clip (int) : number of observations before clipping is applied - max_n (int) : algorithm behaves as if it has seen at most max_n points - - Functions - ---------- - send_feedback : add target labels as part of the feedback loop - metrics : return custom metrics - """ - def __init__(self,threshold=25,n_components=3,n_stdev=3,start_clip=50,max_n=-1): - - super().__init__(threshold=threshold,n_components=n_components,n_stdev=n_stdev, - start_clip=start_clip,max_n=max_n) - - self._predictions = [] - self._labels = [] - self._scores = [] - self.roll_window = 100 - self.metric = [float('nan') for i in range(18)] - - - def send_feedback(self,X,feature_names,reward,truth): - """ Return outlier labels as part of the feedback loop. - - Parameters - ---------- - X : array of the features sent in the original predict request - feature_names : array of feature names. May be None if not available. - reward (float): the reward - truth : array with correct value (optional) - """ - _ = super().send_feedback(X,feature_names,reward,truth) - - # historical reconstruction errors and predictions - self._scores.append(self.score) - self._scores = flatten(self._scores) - self._predictions.append(self.prediction) - self._predictions = flatten(self._predictions) - - # target labels - self.label = truth - self._labels.append(self.label) - self._labels = flatten(self._labels) - - # performance metrics - scores = performance(self._labels,self._predictions,roll_window=self.roll_window) - stats = outlier_stats(self._labels,self._predictions,roll_window=self.roll_window) - - convert = flatten([scores,stats]) - metric = [] - for c in convert: # convert from np to native python type to jsonify - metric.append(np.asscalar(np.asarray(c))) - self.metric = metric - - return [] - - - def metrics(self): - """ Return custom metrics. - Printed with a delay of 1 prediction because the labels are returned in the feedback step. 
- """ - - if self.score.shape[0]>1: - raise ValueError('Metrics can only handle single observations.') - - if self.n==1: - pred = float('nan') - err = float('nan') - y_true = float('nan') - else: - pred = int(self._predictions[-1]) - err = self._scores[-1] - y_true = int(self.label[0]) - - is_outlier = {"type":"GAUGE","key":"is_outlier","value":pred} - outlier_score = {"type":"GAUGE","key":"outlier_score","value":err} - obs = {"type":"GAUGE","key":"observation","value":self.n - 1} - threshold = {"type":"GAUGE","key":"threshold","value":self.threshold} - - label = {"type":"GAUGE","key":"label","value":y_true} - - accuracy_tot = {"type":"GAUGE","key":"accuracy_tot","value":self.metric[4]} - precision_tot = {"type":"GAUGE","key":"precision_tot","value":self.metric[5]} - recall_tot = {"type":"GAUGE","key":"recall_tot","value":self.metric[6]} - f1_score_tot = {"type":"GAUGE","key":"f1_tot","value":self.metric[7]} - f2_score_tot = {"type":"GAUGE","key":"f2_tot","value":self.metric[8]} - - accuracy_roll = {"type":"GAUGE","key":"accuracy_roll","value":self.metric[9]} - precision_roll = {"type":"GAUGE","key":"precision_roll","value":self.metric[10]} - recall_roll = {"type":"GAUGE","key":"recall_roll","value":self.metric[11]} - f1_score_roll = {"type":"GAUGE","key":"f1_roll","value":self.metric[12]} - f2_score_roll = {"type":"GAUGE","key":"f2_roll","value":self.metric[13]} - - true_negative = {"type":"GAUGE","key":"true_negative","value":self.metric[0]} - false_positive = {"type":"GAUGE","key":"false_positive","value":self.metric[1]} - false_negative = {"type":"GAUGE","key":"false_negative","value":self.metric[2]} - true_positive = {"type":"GAUGE","key":"true_positive","value":self.metric[3]} - - nb_outliers_roll = {"type":"GAUGE","key":"nb_outliers_roll","value":self.metric[14]} - nb_labels_roll = {"type":"GAUGE","key":"nb_labels_roll","value":self.metric[15]} - nb_outliers_tot = {"type":"GAUGE","key":"nb_outliers_tot","value":self.metric[16]} - nb_labels_tot = {"type":"GAUGE","key":"nb_labels_tot","value":self.metric[17]} - - return [is_outlier,outlier_score,obs,threshold,label, - accuracy_tot,precision_tot,recall_tot,f1_score_tot,f2_score_tot, - accuracy_roll,precision_roll,recall_roll,f1_score_roll,f2_score_roll, - true_negative,false_positive,false_negative,true_positive, - nb_outliers_roll,nb_labels_roll,nb_outliers_tot,nb_labels_tot] diff --git a/components/outlier-detection/mahalanobis/README.md b/components/outlier-detection/mahalanobis/README.md deleted file mode 100644 index bf95332b2c..0000000000 --- a/components/outlier-detection/mahalanobis/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# Mahalanobis Online Outlier Detector - -## Description - -[Anomaly or outlier detection](https://en.wikipedia.org/wiki/Anomaly_detection) has many applications, ranging from preventing credit card fraud to detecting computer network intrusions. - -The Mahalanobis online outlier detector aims to predict anomalies in tabular data. The algorithm calculates an outlier score, which is a measure of distance from the center of the features distribution ([Mahalanobis distance](https://en.wikipedia.org/wiki/Mahalanobis_distance)). If this outlier score is higher than a user-defined threshold, the observation is flagged as an outlier. The algorithm is online, which means that it starts without knowledge about the distribution of the features and learns as requests arrive. Consequently you should expect the output to be bad at the start and to improve over time. 
- -## Implementation - -The algorithm is implemented in the ```CoreMahalanobis``` class and a detailed explanation of the implementation and usage of the algorithm to spot anomalies can be found in the [mahalanobis doc](./doc.ipynb). - -## Running on Seldon - -An end-to-end example running a Mahalanobis outlier detector on GCP or Minikube using Seldon to identify computer network intrusions is available [here](./outlier_mahalanobis.ipynb). - -Docker images to use the generic Mahalanobis outlier detector as a model or transformer can be found on Docker Hub: -* [seldonio/outlier-mahalanobis-model](https://hub.docker.com/r/seldonio/outlier-mahalanobis-model) -* [seldonio/outlier-mahalanobis-transformer](https://hub.docker.com/r/seldonio/outlier-mahalanobis-transformer) - -A model docker image specific for the demo is also available: -* [seldonio/outlier-mahalanobis-model-demo](https://hub.docker.com/r/seldonio/outlier-mahalanobis-model-demo) \ No newline at end of file diff --git a/components/outlier-detection/mahalanobis/__init__.py b/components/outlier-detection/mahalanobis/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/components/outlier-detection/mahalanobis/doc.ipynb b/components/outlier-detection/mahalanobis/doc.ipynb deleted file mode 100644 index 76c608a5c6..0000000000 --- a/components/outlier-detection/mahalanobis/doc.ipynb +++ /dev/null @@ -1,350 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Mahalanobis Outlier Algorithm Documentation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The aim of this document is to explain the algorithm used in Seldon's Mahalanobis Online Outlier Detector.\n", - "\n", - "In the first part, we give a high-level overview of the algorithm; then we explain the implementation in detail." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Overview" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Outlier detection has many applications, ranging from preventing credit card fraud to detecting computer network intrusions. The available data is typically unlabeled and detection needs to be done in real-time. The outlier detector can be used as a standalone algorithm, or to detect anomalies in the input data of another predictive model.\n", - "\n", - "The Mahalanobis outlier detection algorithm calculates an outlier score, which is a measure of distance from the center of the feature distribution (Mahalanobis distance). If this outlier score is higher than a user-defined threshold, the observation is flagged as an outlier. The algorithm is online, which means that it starts without knowledge about the distribution of the features and learns as requests arrive.
Consequently, you should expect the output to be poor at first and to improve over time.\n", - "\n", - "As observations arrive, the algorithm will:\n", - "- Track and update the mean and sample covariance matrix of the dataset\n", - "- Apply a principal component analysis using these moments and project the new observations onto the first 3 principal components (default value, can be changed)\n", - "- Compute the Mahalanobis distance from these projections to the projected mean\n", - "- Predict that the observation is an outlier if the Mahalanobis distance is larger than the threshold level" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Implementation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The core of the algorithm uses efficient and numerically stable streaming techniques to keep track of the mean and covariance matrix as new points are observed. The PCA is done by finding the eigenvectors of the covariance matrix using a function implemented in scipy. We also use an efficient algorithm to avoid inverting a new covariance matrix for each point in the batch." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Object state" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The outlier detector class has 9 attributes:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "class CoreMahalanobis(object):\n", - "    def __init__(self,threshold=25,n_components=3,n_stdev=3,start_clip=50,max_n=-1):\n", - "        \n", - "        self.threshold = threshold\n", - "        self.n_components = n_components\n", - "        self.max_n = max_n\n", - "        self.n_stdev = n_stdev\n", - "        self.start_clip = start_clip\n", - "        \n", - "        self.clip = None\n", - "        self.mean = 0\n", - "        self.C = 0\n", - "        self.n = 0\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- ***threshold***: If the Mahalanobis distance for an observation is above the threshold, the observation is classified as an outlier.\n", - "- ***n_components***: Number of principal components used for projection of the features.\n", - "- ***max_n***: Used to make the algorithm non-stationary by capping the number of observations to max_n. When specified, the algorithm will behave as if it had seen at most max_n points, thus adapting faster to changes in the underlying distribution. Turned off (set to -1) by default.\n", - "- ***n_stdev***: Number of standard deviations away from the mean for each feature beyond which the feature's value is clipped before updating the mean and covariance matrix.\n", - "- ***start_clip***: Number of observations before feature-wise clipping is applied.\n", - "- ***clip***: List with lower and upper values for each feature beyond which clipping is applied after start_clip observations. Initialized to None.\n", - "- ***mean***: Online mean of the observed values.\n", - "- ***C***: Online covariance matrix of the observed values.\n", - "- ***n***: Number of observations so far." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### First Step: Tracking the mean and covariance matrix" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "def _get_preds(self,X):\n", - "    \"\"\" Detect outliers using the Mahalanobis distance threshold.
\n", - "\n", - " Parameters\n", - " ----------\n", - " X : array-like\n", - " \"\"\"\n", - "\n", - " nb = X.shape[0] # batch size\n", - " p = X.shape[1] # number of features\n", - " n_components = min(self.n_components,p)\n", - " if self.max_n>0:\n", - " n = min(self.n,self.max_n) # n can never be above max_n\n", - " else:\n", - " n = self.n\n", - "\n", - " # Clip X\n", - " if self.n > self.start_clip:\n", - " Xclip = np.clip(X,self.clip[0],self.clip[1])\n", - " else:\n", - " Xclip = X\n", - "\n", - " # Tracking the mean and covariance matrix\n", - " roll_partial_means = Xclip.cumsum(axis=0)/(np.arange(nb)+1).reshape((nb,1))\n", - " coefs = (np.arange(nb)+1.)/(np.arange(nb)+n+1.)\n", - " new_means = self.mean + coefs.reshape((nb,1))*(roll_partial_means-self.mean)\n", - " new_means_offset = np.empty_like(new_means)\n", - " new_means_offset[0] = self.mean\n", - " new_means_offset[1:] = new_means[:-1]\n", - "\n", - " coefs = ((n+np.arange(nb))/(n+np.arange(nb)+1.)).reshape((nb,1,1))\n", - " B = coefs*np.matmul((Xclip - new_means_offset)[:,:,None],(Xclip - new_means_offset)[:,None,:])\n", - " cov_batch = (n-1.)/(n+max(1,nb-1.))*self.C + 1./(n+max(1,nb-1.))*B.sum(axis=0)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we have implemented a numerically stable algorithm for updating the mean and covariance matrix based on the following formulas.\n", - "\n", - "**Batch Online Mean**\n", - "\n", - "Let $\\bar{X}_n = \\frac{1}{n} \\sum_{k=1}^n{X_k} $ the rolling mean of $ (X_n)_n $\n", - "\n", - "Let $\\bar{X}_{n,N} = \\frac{1}{N-n} \\sum_{k=n+1}^N{X_k} $ the batch mean of $X_n$ between $n$ and $N$\n", - "\n", - "Then we have: \n", - "\n", - "$ \\bar{X}_{n+b} = \\bar{X}_n + \\frac{b}{n+b}(\\bar{X}_{n,n+b} - \\bar{X}_n) $ $ (1) $\n", - "\n", - "**Batch Online Covariance Matrix**\n", - "\n", - "Let $C_n = \\frac{1}{n-1} \\sum_{k=1}^n{(X_k - \\bar{X}_n)(X_k - \\bar{X}_n)^t} $ the rolling sample covariance matrix of $ (X_n)_n $\n", - "\n", - "Then we have:\n", - "\n", - "$ C_{n+b} = \\frac{n-1}{n+b-1}C_{n} + \\frac{1}{n+b-1}\\sum^{b-1}_{i=0}\\frac{n+i}{n+i+1}(X_{n+i+1}-\\bar{X}_{n+i})(X_{n+i+1}-\\bar{X}_{n+i})^t $ $ (2) $\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The online mean and covariance matrix are updated with clipped values of new observations. As a result, we can limit the impact of outliers on the estimated mean and covariance matrix. This can be particularly helpful when outliers arrive in sequences instead of uniformly distributed over time. Clipping is applied to each feature that has a value beyond the lower or upper boundary defined by the *n_stdev* hyperparameter. \n", - "\n", - "The 2 figures below illustrate the impact of clipping on the detection of computer network intrusions by showing the outlier score per observation for a sequence of network data. Scores above the red threshold line are classified as outliers by the algorithm. Please check out the [case study](./outlier_mahalanobis.ipynb) for more information regarding the dataset. During the first 500 observations, the fraction of anomalies is set at 2%. We then increase the amount of network intrusions temporarily to 20% over the next 500 observations before settling at an anomaly rate of 5%. No clipping is applied in the first figure, while figure 2 clips observations 3 standard deviations away from the online mean of each feature. The higher fraction of outliers is quickly incorporated in the unclipped covariance matrix. 
Consequently, the Mahalanobis distances of both outliers and inliers become similar and it is hard to spot anomalies. The covariance matrix in the clipped outlier detector is less impacted by the anomalies. As a result, the Mahalanobis distance of outliers is much larger than for normal data and less affected by the temporary spike in outliers." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![outliers_unclipped](images/outliers_no_clipping.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![outliers_clipped](images/outliers_3stdev_clipped.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Second Step: PCA and projection" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - " # PCA\n", - " eigvals, eigvects = eigh(cov_batch,eigvals=(p-n_components,p-1))\n", - "\n", - " # Projections\n", - " proj_x = np.matmul(X,eigvects)\n", - " proj_x_clip = np.matmul(Xclip,eigvects)\n", - " proj_means = np.matmul(new_means_offset,eigvects)\n", - " if type(self.C) == int and self.C == 0:\n", - " proj_cov = np.diag(np.zeros(n_components))\n", - " else:\n", - " proj_cov = np.matmul(eigvects.transpose(),np.matmul(self.C,eigvects))\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We compute the first principal components: these are the eigenvectors of the sample covariance matrix associated to the largest eigenvalues. For this we use the function [eigh](https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.eigh.html) from ```scipy.linalg```.\n", - "\n", - "We then project the new, both original and clipped, observations, the rolling means and the previous covariance matrix on the principal components subspace." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Third Step: Outlier detection in the Principal Components Subspace" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Substep: Fast computation of the inverses of the covariance matrices" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To compute the outlier score of each point in the new batch, we need the inverse of the covariance matrix of all the points up to this one. This means inverting $b$ matrices. We made this operation faster by leveraging the fact that each covariance matrix is a rank one update of the previous one. \n", - "Knowing this we can use the following theorem.\n", - "\n", - "**Theorem:**\n", - "\n", - "if $A$ and $A+B$ are invertible and $rank(B) = 1$ then\n", - "\n", - "$ (A + B)^{-1} = A^{-1} - \\frac{1}{1+trace(BA^{-1})}A^{-1}BA^{-1} $\n", - "\n", - "The implementation is:\n", - "\n", - "```python\n", - " coefs = (1./(n+np.arange(nb)+1.)).reshape((nb,1,1))\n", - " B = coefs*np.matmul((proj_x_clip - proj_means)[:,:,None],(proj_x_clip - proj_means)[:,None,:])\n", - "\n", - " all_C_inv = np.zeros_like(B)\n", - " c_inv = None\n", - " _EPSILON = 1e-8\n", - "\n", - " for i, b in enumerate(B):\n", - " if c_inv is None:\n", - " if abs(np.linalg.det(proj_cov)) > _EPSILON:\n", - " c_inv = np.linalg.inv(proj_cov)\n", - " all_C_inv[i] = c_inv\n", - " continue\n", - " else:\n", - " if n + i == 0:\n", - " continue\n", - " proj_cov = (n + i -1. 
)/(n + i)*proj_cov + b\n", - " continue\n", - " else:\n", - " c_inv = (n + i - 1.)/float(n + i - 2.)*all_C_inv[i-1]\n", - " BC1 = np.matmul(B[i-1],c_inv)\n", - " all_C_inv[i] = c_inv - 1./(1.+np.trace(BC1))*np.matmul(c_inv,BC1)\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Finally, we update the attributes including the clip ranges, compute the outlier scores and return the outlier predictions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - " self.mean = new_means[-1]\n", - " self.C = cov_batch\n", - " stdev = np.sqrt(np.diag(cov_batch))\n", - " self.n += nb\n", - " if self.n > self.start_clip:\n", - " self.clip = [self.mean-self.n_stdev*stdev,self.mean+self.n_stdev*stdev]\n", - "\n", - " # Outlier scores and predictions\n", - " x_diff = proj_x-proj_means\n", - " self.score = np.matmul(x_diff[:,None,:],np.matmul(all_C_inv,x_diff[:,:,None])).reshape(nb)\n", - " self.prediction = np.array([1 if s > self.threshold else 0 for s in self.score]).astype(int)\n", - " \n", - " return self.prediction\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The outlier score is the Mahalanobis distance in the Principal Components Subspace.\n", - "\n", - "$ score_n = (X_n - \\bar{X}_{n-1})^tC_{n-1}^{-1}(X_n - \\bar{X}_{n-1}) $" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/components/outlier-detection/mahalanobis/images/outliers_3stdev_clipped.png b/components/outlier-detection/mahalanobis/images/outliers_3stdev_clipped.png deleted file mode 100644 index b3ba224108..0000000000 Binary files a/components/outlier-detection/mahalanobis/images/outliers_3stdev_clipped.png and /dev/null differ diff --git a/components/outlier-detection/mahalanobis/images/outliers_no_clipping.png b/components/outlier-detection/mahalanobis/images/outliers_no_clipping.png deleted file mode 100644 index 980c6ae77c..0000000000 Binary files a/components/outlier-detection/mahalanobis/images/outliers_no_clipping.png and /dev/null differ diff --git a/components/outlier-detection/mahalanobis/outlier_mahalanobis.ipynb b/components/outlier-detection/mahalanobis/outlier_mahalanobis.ipynb deleted file mode 100644 index da3aecb912..0000000000 --- a/components/outlier-detection/mahalanobis/outlier_mahalanobis.ipynb +++ /dev/null @@ -1,577 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Mahalanobis outlier detector deployment" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Wrap a Mahalanobis anomaly detection model for use as a prediction microservice in seldon-core and deploy on seldon-core running on minikube or a Kubernetes cluster using GCP." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dependencies" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- [helm](https://github.com/helm/helm)\n", - "- [minikube](https://github.com/kubernetes/minikube)\n", - "- [s2i](https://github.com/openshift/source-to-image) >= 1.1.13\n", - "\n", - "python packages:\n", - "- scikit-learn: pip install scikit-learn --> 0.20.1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Task" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The outlier detector needs to detect computer network intrusions using TCP dump data for a local-area network (LAN) simulating a typical U.S. Air Force LAN. A connection is a sequence of TCP packets starting and ending at some well defined times, between which data flows to and from a source IP address to a target IP address under some well defined protocol. Each connection is labeled as either normal, or as an attack. \n", - "\n", - "There are 4 types of attacks in the dataset:\n", - "- DOS: denial-of-service, e.g. syn flood;\n", - "- R2L: unauthorized access from a remote machine, e.g. guessing password;\n", - "- U2R: unauthorized access to local superuser (root) privileges;\n", - "- probing: surveillance and other probing, e.g., port scanning.\n", - " \n", - "The dataset contains about 5 million connection records.\n", - "\n", - "There are 3 types of features:\n", - "- basic features of individual connections, e.g. duration of connection\n", - "- content features within a connection, e.g. number of failed log in attempts\n", - "- traffic features within a 2 second window, e.g. number of connections to the same host as the current connection\n", - "\n", - "The outlier detector is only using the continuous (18 out of 41) features." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test using Kubernetes cluster on GCP or Minikube" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the outlier detector as a model or a transformer. If you want to run the anomaly detector as a transformer, change the SERVICE_TYPE variable from MODEL to TRANSFORMER [here](./.s2i/environment), set MODEL = False and change ```OutlierMahalanobis.py``` to:\n", - "\n", - "```python\n", - "from CoreMahalanobis import CoreMahalanobis\n", - "\n", - "class OutlierMahalanobis(CoreMahalanobis):\n", - " \"\"\" Outlier detection using the Mahalanobis distance.\n", - " \n", - " Parameters\n", - " ----------\n", - " threshold (float) : Mahalanobis distance threshold used to classify outliers\n", - " n_components (int) : number of principal components used\n", - " n_stdev (float) : stdev used for feature-wise clipping of observations\n", - " start_clip (int) : number of observations before clipping is applied\n", - " max_n (int) : algorithm behaves as if it has seen at most max_n points\n", - " \"\"\"\n", - " def __init__(self,threshold=25,n_components=3,n_stdev=3,start_clip=50,max_n=-1):\n", - " \n", - " super().__init__(threshold=threshold,n_components=n_components,n_stdev=n_stdev,\n", - " start_clip=start_clip,max_n=max_n)\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Pick Kubernetes cluster on GCP or Minikube." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MINIKUBE = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "if MINIKUBE:\n", - " !minikube start --memory 4096 \n", - "else:\n", - " !gcloud container clusters get-credentials standard-cluster-1 --zone europe-west1-b --project seldon-demos" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a cluster-wide cluster-admin role assigned to a service account named “default” in the namespace “kube-system”." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl create clusterrolebinding kube-system-cluster-admin --clusterrole=cluster-admin \\\n", - "--serviceaccount=kube-system:default" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl create namespace seldon" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add current context details to the configuration file in the seldon namespace." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl config set-context $(kubectl config current-context) --namespace=seldon" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create tiller service account and give it a cluster-wide cluster-admin role." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "!kubectl -n kube-system create sa tiller\n", - "!kubectl create clusterrolebinding tiller --clusterrole cluster-admin --serviceaccount=kube-system:tiller\n", - "!helm init --service-account tiller" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check deployment rollout status and deploy seldon/spartakus helm charts." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl rollout status deploy/tiller-deploy -n kube-system" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!helm install ../../../helm-charts/seldon-core-operator --name seldon-core --set usage_metrics.enabled=true --namespace seldon-system" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check deployment rollout status for seldon core." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl rollout status deploy/seldon-controller-manager -n seldon-system" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install Ambassador API gateway" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!helm install stable/ambassador --name ambassador --set crds.keep=false" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl rollout status deployment.apps/ambassador" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If Minikube used: create docker image for outlier detector inside Minikube using s2i. 
Besides the transformer image and the demo specific model image, the general model image for the Mahalanobis outlier detector is also available from Docker Hub as ***seldonio/outlier-mahalanobis-model:0.1***." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "if MINIKUBE & MODEL:\n", - " !eval $(minikube docker-env) && \\\n", - " s2i build . seldonio/seldon-core-s2i-python3:0.4 seldonio/outlier-mahalanobis-model-demo:0.1\n", - "elif MINIKUBE:\n", - " !eval $(minikube docker-env) && \\\n", - " s2i build . seldonio/seldon-core-s2i-python3:0.4 seldonio/outlier-mahalanobis-transformer:0.1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install outlier detector helm charts either as a model or transformer and set *threshold*, *n_components*, *n_stdev* and *start_clip* hyperparameter values." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if MODEL:\n", - " !helm install ../../../helm-charts/seldon-od-model \\\n", - " --name outlier-detector \\\n", - " --namespace=seldon \\\n", - " --set model.type=mahalanobis \\\n", - " --set model.mahalanobis.image.name=seldonio/outlier-mahalanobis-model-demo:0.1 \\\n", - " --set model.mahalanobis.threshold=25 \\\n", - " --set model.mahalanobis.n_components=3 \\\n", - " --set model.mahalanobis.n_stdev=3 \\\n", - " --set model.mahalanobis.start_clip=50 \\\n", - " --set oauth.key=oauth-key \\\n", - " --set oauth.secret=oauth-secret \\\n", - " --set replicas=1\n", - "else:\n", - " !helm install ../../../helm-charts/seldon-od-transformer \\\n", - " --name outlier-detector \\\n", - " --namespace=seldon \\\n", - " --set outlierDetection.enabled=true \\\n", - " --set outlierDetection.name=outlier-mahalanobis \\\n", - " --set outlierDetection.type=mahalanobis \\\n", - " --set outlierDetection.mahalanobis.image.name=seldonio/outlier-mahalanobis-transformer:0.1 \\\n", - " --set outlierDetection.mahalanobis.threshold=25 \\\n", - " --set outlierDetection.mahalanobis.n_components=3 \\\n", - " --set outlierDetection.mahalanobis.n_stdev=3 \\\n", - " --set outlierDetection.mahalanobis.start_clip=50 \\\n", - " --set oauth.key=oauth-key \\\n", - " --set oauth.secret=oauth-secret \\\n", - " --set model.image.name=seldonio/mock_classifier:1.0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Port forward Ambassador\n", - "\n", - "Run command in terminal:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```\n", - "kubectl port-forward $(kubectl get pods -n seldon -l app.kubernetes.io/name=ambassador -o jsonpath='{.items[0].metadata.name}') -n seldon 8003:8080\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import rest requests, load data and test requests" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from utils import get_payload, rest_request_ambassador, send_feedback_rest, get_kdd_data, generate_batch\n", - "\n", - "data = get_kdd_data(percent10=True) # load dataset\n", - "print(data.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Generate a random batch from the data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "samples = 1\n", - "fraction_outlier = 0.\n", - "X, labels = 
generate_batch(data,samples,fraction_outlier)\n", - "print(X.shape)\n", - "print(labels.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Test the rest requests with the generated data. It is important that the order of requests is respected. First we make predictions, then we get the \"true\" labels back using the feedback request. If we do not respect the order and eg keep making predictions without getting the feedback for each prediction, there will be a mismatch between the predicted and \"true\" labels. This will result in errors in the produced metrics." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "request = get_payload(X)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "response = rest_request_ambassador(\"outlier-detector\",\"seldon\",request,endpoint=\"localhost:8003\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If the outlier detector is used as a transformer, the output of the anomaly detection is added as part of the metadata. If it is used as a model, we send model feedback to retrieve custom performance metrics." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if MODEL:\n", - " send_feedback_rest(\"outlier-detector\",\"seldon\",request,response,0,labels,endpoint=\"localhost:8003\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Analytics" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the helm charts for prometheus and the grafana dashboard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!helm install ../../../helm-charts/seldon-core-analytics --name seldon-core-analytics \\\n", - " --set grafana_prom_admin_password=password \\\n", - " --set persistence.enabled=false \\\n", - " --namespace seldon" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Port forward Grafana dashboard" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run command in terminal:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```\n", - "kubectl port-forward $(kubectl get pods -n seldon -l app=grafana-prom-server -o jsonpath='{.items[0].metadata.name}') -n seldon 3000:3000\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can then view an analytics dashboard inside the cluster at http://localhost:3000/dashboard/db/prediction-analytics?refresh=5s&orgId=1. Your IP address may be different. get it via minikube ip. Login with:\n", - "\n", - "Username : admin\n", - "\n", - "password : password (as set when starting seldon-core-analytics above)\n", - "\n", - "Import the outlier-detector-md dashboard from ../../../helm-charts/seldon-core-analytics/files/grafana/configs." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Run simulation\n", - "\n", - "- Sample random network intrusion data with a certain outlier probability.\n", - "- Get payload for the observation.\n", - "- Make a prediction.\n", - "- Send the \"true\" label with the feedback if the detector is run as a model.\n", - "\n", - "It is important that the prediction-feedback order is maintained. 
Otherwise there will be a mismatch between the predicted and \"true\" labels.\n", - "\n", - "View the progress on the grafana \"Outlier Detection\" dashboard. Most metrics need the outlier detector to be run as a model since they need model feedback." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import time\n", - "n_requests = 100\n", - "samples = 1\n", - "for i in range(n_requests):\n", - " fraction_outlier = .1\n", - " X, labels = generate_batch(data,samples,fraction_outlier)\n", - " request = get_payload(X)\n", - " response = rest_request_ambassador(\"outlier-detector\",\"seldon\",request,endpoint=\"localhost:8003\")\n", - " if MODEL:\n", - " send_feedback_rest(\"outlier-detector\",\"seldon\",request,response,0,labels,endpoint=\"localhost:8003\")\n", - " time.sleep(1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if MINIKUBE:\n", - " !minikube delete" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/components/outlier-detection/mahalanobis/requirements.txt b/components/outlier-detection/mahalanobis/requirements.txt deleted file mode 100644 index daeb42f09c..0000000000 --- a/components/outlier-detection/mahalanobis/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -numpy==1.15.4 -pandas==0.23.4 -requests>=2.20.0 -scikit-learn==0.20.1 -scipy==0.19.1 \ No newline at end of file diff --git a/components/outlier-detection/mahalanobis/utils.py b/components/outlier-detection/mahalanobis/utils.py deleted file mode 100644 index 569dd54ba9..0000000000 --- a/components/outlier-detection/mahalanobis/utils.py +++ /dev/null @@ -1,171 +0,0 @@ -import collections -import json -import numpy as np -import pandas as pd -import requests -from sklearn.datasets import fetch_kddcup99 -from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, fbeta_score - -pd.options.mode.chained_assignment = None # default='warn' - -def get_kdd_data(target=['dos','r2l','u2r','probe'], - keep_cols=['srv_count','serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate', - 'same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count', - 'dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate', - 'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate', - 'dst_host_rerror_rate','dst_host_srv_rerror_rate','target'], - percent10=False): - """ Load KDD Cup 1999 data and return in dataframe. 
""" - - data_raw = fetch_kddcup99(subset=None, data_home=None, percent10=percent10) - - # specify columns - cols=['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot', - 'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations', - 'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count', - 'serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate', - 'srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate', - 'dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate', - 'dst_host_rerror_rate','dst_host_srv_rerror_rate'] - - # create dataframe - data = pd.DataFrame(data=data_raw['data'],columns=cols) - - # add target to dataframe - data['attack_type'] = data_raw['target'] - - # specify and map attack types - attack_list = np.unique(data['attack_type']) - attack_category = ['dos','u2r','r2l','r2l','r2l','probe','dos','u2r','r2l','dos','probe','normal','u2r', - 'r2l','dos','probe','u2r','probe','dos','r2l','dos','r2l','r2l'] - - attack_types = {} - for i,j in zip(attack_list,attack_category): - attack_types[i] = j - - data['attack_category'] = 'normal' - for key,value in attack_types.items(): - data['attack_category'][data['attack_type'] == key] = value - - # define target - data['target'] = 0 - for t in target: - data['target'][data['attack_category'] == t] = 1 - - # define columns to be dropped - drop_cols = [] - for col in data.columns.values: - if col not in keep_cols: - drop_cols.append(col) - - if drop_cols!=[]: - data.drop(columns=drop_cols,inplace=True) - - return data - - -def sample_df(df,n): - """ Sample from df. """ - if n < df.shape[0]+1: - replace = False - else: - replace = True - return df.sample(n=n,replace=replace) - - -def generate_batch(data,n_samples,frac_outliers): - """ Generate random batch from data with fixed size and fraction of outliers. """ - - normal = data[data['target']==0] - outlier = data[data['target']==1] - - if n_samples==1: - n_outlier = np.random.binomial(1,frac_outliers) - n_normal = 1 - n_outlier - else: - n_normal = int((1-frac_outliers) * n_samples) - n_outlier = int(frac_outliers * n_samples) - - batch_normal = sample_df(normal,n_normal) - batch_outlier = sample_df(outlier,n_outlier) - - batch = pd.concat([batch_normal,batch_outlier]) - batch = batch.sample(frac=1).reset_index(drop=True) - - outlier_true = batch['target'].values - batch.drop(columns=['target'],inplace=True) - - return batch.values.astype('float'), outlier_true - -def flatten(x): - if isinstance(x, collections.Iterable): - return [a for i in x for a in flatten(i)] - else: - return [x] - -def performance(y_true,y_pred,roll_window=100): - """ Return a confusion matrix and calculate rolling accuracy, precision, recall, F1 and F2 scores. 
""" - - # confusion matrix - cm = confusion_matrix(y_true,y_pred,labels=[0,1]) - tn, fp, fn, tp = cm.ravel() - - # total scores - acc_tot = accuracy_score(y_true,y_pred) - prec_tot = precision_score(y_true,y_pred) - rec_tot = recall_score(y_true,y_pred) - f1_tot = f1_score(y_true,y_pred) - f2_tot = fbeta_score(y_true,y_pred,beta=2) - - # rolling scores - y_true_roll = y_true[-roll_window:] - y_pred_roll = y_pred[-roll_window:] - acc_roll = accuracy_score(y_true_roll,y_pred_roll) - prec_roll = precision_score(y_true_roll,y_pred_roll) - rec_roll = recall_score(y_true_roll,y_pred_roll) - f1_roll = f1_score(y_true_roll,y_pred_roll) - f2_roll = fbeta_score(y_true_roll,y_pred_roll,beta=2) - - scores = [tn, fp, fn, tp, acc_tot, prec_tot, rec_tot, f1_tot, f2_tot, - acc_roll, prec_roll, rec_roll, f1_roll, f2_roll] - - return scores - -def outlier_stats(y_true,y_pred,roll_window=100): - """ Calculate number and percentage of predicted and labeled outliers. """ - - y_pred_roll = np.sum(y_pred[-roll_window:]) - y_true_roll = np.sum(y_true[-roll_window:]) - y_pred_tot = np.sum(y_pred) - y_true_tot = np.sum(y_true) - - return y_pred_roll, y_true_roll, y_pred_tot, y_true_tot - -def get_payload(arr): - features = ["srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate", - "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate", - "dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate", - "dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate"] - datadef = {"names":features,"ndarray":arr.tolist()} - payload = {"meta":{},"data":datadef} - return payload - -def rest_request_ambassador(deploymentName,namespace,request,endpoint="localhost:8003"): - response = requests.post( - "http://"+endpoint+"/seldon/"+namespace+"/"+deploymentName+"/api/v0.1/predictions", - json=request) - print(response.status_code) - print(response.text) - return response.json() - -def send_feedback_rest(deploymentName,namespace,request,response,reward,truth,endpoint="localhost:8003"): - feedback = { - "request": request, - "response": response, - "reward": reward, - "truth": {"data":{"ndarray":truth.tolist()}} - } - ret = requests.post( - "http://"+endpoint+"/seldon/"+namespace+"/"+deploymentName+"/api/v0.1/feedback", - json=feedback) - return diff --git a/components/outlier-detection/seq2seq-lstm/.s2i/environment b/components/outlier-detection/seq2seq-lstm/.s2i/environment deleted file mode 100644 index 867d00a693..0000000000 --- a/components/outlier-detection/seq2seq-lstm/.s2i/environment +++ /dev/null @@ -1,4 +0,0 @@ -MODEL_NAME=OutlierSeq2SeqLSTM -API_TYPE=REST -SERVICE_TYPE=MODEL -PERSISTENCE=0 diff --git a/components/outlier-detection/seq2seq-lstm/CoreSeq2SeqLSTM.py b/components/outlier-detection/seq2seq-lstm/CoreSeq2SeqLSTM.py deleted file mode 100644 index 32036198c5..0000000000 --- a/components/outlier-detection/seq2seq-lstm/CoreSeq2SeqLSTM.py +++ /dev/null @@ -1,215 +0,0 @@ -import logging -import numpy as np -import pickle -import random - -from model import model - -logger = logging.getLogger(__name__) - -class CoreSeq2SeqLSTM(object): - """ Outlier detection using a sequence-to-sequence (seq2seq) LSTM model. 
- - Parameters - ---------- - threshold (float): reconstruction error (mse) threshold used to classify outliers - reservoir_size (int) : number of observations kept in memory using reservoir sampling - - Functions - ---------- - reservoir_sampling : applies reservoir sampling to incoming data - predict : detect and return outliers - transform_input : detect outliers and return input features - send_feedback : add target labels as part of the feedback loop - tags : add metadata for input transformer - metrics : return custom metrics - """ - - def __init__(self,threshold=0.003,reservoir_size=50000,model_name='seq2seq',load_path='./models/'): - - logger.info("Initializing model") - self.threshold = threshold - self.reservoir_size = reservoir_size - self.batch = [] - self.N = 0 # total sample count up until now for reservoir sampling - self.nb_outliers = 0 - - # load model architecture parameters - with open(load_path + model_name + '.pickle', 'rb') as f: - self.timesteps, self.n_features, encoder_dim, decoder_dim, output_activation = pickle.load(f) - - # instantiate model - self.s2s, self.enc, self.dec = model(self.n_features,encoder_dim=encoder_dim, - decoder_dim=decoder_dim,output_activation=output_activation) - self.s2s.load_weights(load_path + model_name + '_weights.h5') # load pretrained model weights - self.s2s._make_predict_function() - self.enc._make_predict_function() - self.dec._make_predict_function() - - # load data preprocessing info - with open(load_path + 'preprocess_' + model_name + '.pickle', 'rb') as f: - preprocess = pickle.load(f) - self.preprocess, self.clip, self.axis = preprocess[:3] - if self.preprocess=='minmax': - self.xmin, self.xmax = preprocess[3:5] - self.min, self.max = preprocess[5:] - elif self.preprocess=='standardized': - self.mu, self.sigma = preprocess[3:] - - - def reservoir_sampling(self,X,update_stand=False): - """ Keep batch of data in memory using reservoir sampling. """ - for item in X: - self.N+=1 - if len(self.batch) < self.reservoir_size: - self.batch.append(item) - else: - s = int(random.random() * self.N) - if s < self.reservoir_size: - self.batch[s] = item - - if update_stand: - if self.preprocess=='minmax': - self.xmin = np.array(self.batch).min(axis=self.axis) - self.xmax = np.array(self.batch).max(axis=self.axis) - elif self.preprocess=='standardized': - self.mu = np.array(self.batch).mean(axis=self.axis) - self.sigma = np.array(self.batch).std(axis=self.axis) - return - - - def predict(self, X, feature_names): - """ Return outlier predictions. - - Parameters - ---------- - X : array-like - feature_names : array of feature names (optional) - """ - logger.info("Using component as a model") - return self._get_preds(X) - - - def transform_input(self, X, feature_names): - """ Transform the input. - Used when the outlier detector sits on top of another model. - - Parameters - ---------- - X : array-like - feature_names : array of feature names (optional) - """ - logger.info("Using component as an outlier-detector transformer") - self.prediction_meta = self._get_preds(X) - return X - - - def decode_sequence(self,input_seq): - """ Feed output of encoder to decoder and make sequential predictions. 
""" - - # use encoder the get state vectors - states_value = self.enc.predict(input_seq) - - # generate initial target sequence - target_seq = input_seq[0,0,:].reshape((1,1,self.n_features)) - - # sequential prediction of time series - decoded_seq = np.zeros((1, self.timesteps, self.n_features)) - decoded_seq[0,0,:] = target_seq[0,0,:] - i = 1 - while i < self.timesteps: - - decoder_output = self.dec.predict([target_seq] + states_value) - - # update the target sequence - target_seq = np.zeros((1, 1, self.n_features)) - target_seq[0, 0, :] = decoder_output[0] - - # update output - decoded_seq[0, i, :] = decoder_output[0] - - # update states - states_value = decoder_output[1:] - - i+=1 - - return decoded_seq - - - def _get_preds(self,X): - """ Detect outliers if the reconstruction error is above the threshold. - - Parameters - ---------- - X : array-like - """ - - # clip data per feature - for col,clip in enumerate(self.clip): - X[:,:,col] = np.clip(X[:,:,col],-clip,clip) - - # update reservoir - if self.N < self.reservoir_size: - update_stand = False - else: - update_stand = True - - self.reservoir_sampling(X,update_stand=update_stand) - - # apply scaling - if self.preprocess=='minmax': - X = ((X - self.xmin) / (self.xmax - self.xmin)) * (self.max - self.min) + self.min - elif self.preprocess=='standardized': - X = (X - self.mu) / (self.sigma + 1e-10) - - # make predictions - n_obs = X.shape[0] - self.mse = np.zeros(n_obs) - for obs in range(n_obs): - input_seq = X[obs:obs+1,:,:] - decoded_seq = self.decode_sequence(input_seq) - self.mse[obs] = np.mean(np.power(input_seq[0,:,:] - decoded_seq[0,:,:], 2)) - self.prediction = np.array([1 if e > self.threshold else 0 for e in self.mse]).astype(int) - - return self.prediction - - - def send_feedback(self,X,feature_names,reward,truth): - """ Return additional data as part of the feedback loop. - - Parameters - ---------- - X : array of the features sent in the original predict request - feature_names : array of feature names. May be None if not available. - reward (float): the reward - truth : array with correct value (optional) - """ - logger.info("Send feedback called") - return [] - - - def tags(self): - """ - Use predictions made within transform to add these as metadata - to the response. Tags will only be collected if the component is - used as an input-transformer. - """ - try: - return {"outlier-predictions": self.prediction_meta.tolist()} - except AttributeError: - logger.info("No metadata about outliers") - - - def metrics(self): - """ Return custom metrics averaged over the prediction batch. 
- """ - self.nb_outliers += np.sum(self.prediction) - - is_outlier = {"type":"GAUGE","key":"is_outlier","value":np.mean(self.prediction)} - mse = {"type":"GAUGE","key":"mse","value":np.mean(self.mse)} - nb_outliers = {"type":"GAUGE","key":"nb_outliers","value":int(self.nb_outliers)} - fraction_outliers = {"type":"GAUGE","key":"fraction_outliers","value":int(self.nb_outliers)/self.N} - obs = {"type":"GAUGE","key":"observation","value":self.N} - threshold = {"type":"GAUGE","key":"threshold","value":self.threshold} - - return [is_outlier,mse,nb_outliers,fraction_outliers,obs,threshold] \ No newline at end of file diff --git a/components/outlier-detection/seq2seq-lstm/OutlierSeq2SeqLSTM.py b/components/outlier-detection/seq2seq-lstm/OutlierSeq2SeqLSTM.py deleted file mode 100644 index 6dd72afe2d..0000000000 --- a/components/outlier-detection/seq2seq-lstm/OutlierSeq2SeqLSTM.py +++ /dev/null @@ -1,117 +0,0 @@ -import numpy as np - -from CoreSeq2SeqLSTM import CoreSeq2SeqLSTM -from utils import flatten, performance, outlier_stats - -class OutlierSeq2SeqLSTM(CoreSeq2SeqLSTM): - """ Outlier detection using a sequence-to-sequence (seq2seq) LSTM model. - - Parameters - ---------- - threshold (float) : reconstruction error (mse) threshold used to classify outliers - reservoir_size (int) : number of observations kept in memory using reservoir sampling - - Functions - ---------- - send_feedback : add target labels as part of the feedback loop - metrics : return custom metrics - """ - def __init__(self,threshold=0.003,reservoir_size=50000,model_name='seq2seq',load_path='./models/'): - - super().__init__(threshold=threshold,reservoir_size=reservoir_size, - model_name=model_name,load_path=load_path) - - self._predictions = [] - self._labels = [] - self._mse = [] - self.roll_window = 100 - self.metric = [float('nan') for i in range(18)] - - - def send_feedback(self,X,feature_names,reward,truth): - """ Return outlier labels as part of the feedback loop. - - Parameters - ---------- - X : array of the features sent in the original predict request - feature_names : array of feature names. May be None if not available. - reward (float): the reward - truth : array with correct value (optional) - """ - _ = super().send_feedback(X,feature_names,reward,truth) - - # historical reconstruction errors and predictions - self._mse.append(self.mse) - self._mse = flatten(self._mse) - self._predictions.append(self.prediction) - self._predictions = flatten(self._predictions) - - # target labels - self.label = truth - self._labels.append(self.label) - self._labels = flatten(self._labels) - - # performance metrics - scores = performance(self._labels,self._predictions,roll_window=self.roll_window) - stats = outlier_stats(self._labels,self._predictions,roll_window=self.roll_window) - - convert = flatten([scores,stats]) - metric = [] - for c in convert: # convert from np to native python type to jsonify - metric.append(np.asscalar(np.asarray(c))) - self.metric = metric - - return [] - - - def metrics(self): - """ Return custom metrics. - Printed with a delay of 1 prediction because the labels are returned in the feedback step. 
- """ - - if self.mse.shape[0]>1: - raise ValueError('Metrics can only handle single observations.') - - if self.N==1: - pred = float('nan') - err = float('nan') - y_true = float('nan') - else: - pred = int(self._predictions[-1]) - err = self._mse[-1] - y_true = int(self.label[0]) - - is_outlier = {"type":"GAUGE","key":"is_outlier","value":pred} - mse = {"type":"GAUGE","key":"mse","value":err} - obs = {"type":"GAUGE","key":"observation","value":self.N - 1} - threshold = {"type":"GAUGE","key":"threshold","value":self.threshold} - - label = {"type":"GAUGE","key":"label","value":y_true} - - accuracy_tot = {"type":"GAUGE","key":"accuracy_tot","value":self.metric[4]} - precision_tot = {"type":"GAUGE","key":"precision_tot","value":self.metric[5]} - recall_tot = {"type":"GAUGE","key":"recall_tot","value":self.metric[6]} - f1_score_tot = {"type":"GAUGE","key":"f1_tot","value":self.metric[7]} - f2_score_tot = {"type":"GAUGE","key":"f2_tot","value":self.metric[8]} - - accuracy_roll = {"type":"GAUGE","key":"accuracy_roll","value":self.metric[9]} - precision_roll = {"type":"GAUGE","key":"precision_roll","value":self.metric[10]} - recall_roll = {"type":"GAUGE","key":"recall_roll","value":self.metric[11]} - f1_score_roll = {"type":"GAUGE","key":"f1_roll","value":self.metric[12]} - f2_score_roll = {"type":"GAUGE","key":"f2_roll","value":self.metric[13]} - - true_negative = {"type":"GAUGE","key":"true_negative","value":self.metric[0]} - false_positive = {"type":"GAUGE","key":"false_positive","value":self.metric[1]} - false_negative = {"type":"GAUGE","key":"false_negative","value":self.metric[2]} - true_positive = {"type":"GAUGE","key":"true_positive","value":self.metric[3]} - - nb_outliers_roll = {"type":"GAUGE","key":"nb_outliers_roll","value":self.metric[14]} - nb_labels_roll = {"type":"GAUGE","key":"nb_labels_roll","value":self.metric[15]} - nb_outliers_tot = {"type":"GAUGE","key":"nb_outliers_tot","value":self.metric[16]} - nb_labels_tot = {"type":"GAUGE","key":"nb_labels_tot","value":self.metric[17]} - - return [is_outlier,mse,obs,threshold,label, - accuracy_tot,precision_tot,recall_tot,f1_score_tot,f2_score_tot, - accuracy_roll,precision_roll,recall_roll,f1_score_roll,f2_score_roll, - true_negative,false_positive,false_negative,true_positive, - nb_outliers_roll,nb_labels_roll,nb_outliers_tot,nb_labels_tot] \ No newline at end of file diff --git a/components/outlier-detection/seq2seq-lstm/README.md b/components/outlier-detection/seq2seq-lstm/README.md deleted file mode 100644 index fa18b1d4d1..0000000000 --- a/components/outlier-detection/seq2seq-lstm/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# Sequence-to-Sequence LSTM (seq2seq-LSTM) Outlier Detector - -## Description - -[Anomaly or outlier detection](https://en.wikipedia.org/wiki/Anomaly_detection) has many applications, ranging from preventing credit card fraud to detecting computer network intrusions. - -The implemented seq2seq outlier detector aims to predict anomalies in a sequence of input features. The model can be trained in an unsupervised or semi-supervised way, which is helpful since labeled training data is often scarce. The outlier detector predicts whether the input features represent normal behaviour or not, dependent on a threshold level set by the user. - -## Implementation - -The architecture of the seq2seq model is defined in ```model.py``` and it is trained by running the ```train.py``` script. The ```OutlierSeq2SeqLSTM``` class loads a pre-trained model and makes predictions on new data. 
- -A detailed explanation of the implementation and usage of the seq2seq model as an outlier detector can be found in the [seq2seq documentation](./doc.md). - -## Running on Seldon - -An end-to-end example running a seq2seq outlier detector on GCP or Minikube using Seldon to identify anomalies in ECGs is available [here](./seq2seq_lstm.ipynb). - -Docker images to use the generic seq2seq-LSTM outlier detector as a model or transformer can be found on Docker Hub: -* [seldonio/outlier-s2s-lstm-model](https://hub.docker.com/r/seldonio/outlier-s2s-lstm-model) -* [seldonio/outlier-s2s-lstm-transformer](https://hub.docker.com/r/seldonio/outlier-s2s-lstm-transformer) - -A model docker image specific for the demo is also available: -* [seldonio/outlier-s2s-lstm-model-demo](https://hub.docker.com/r/seldonio/outlier-s2s-lstm-model-demo) \ No newline at end of file diff --git a/components/outlier-detection/seq2seq-lstm/__init__.py b/components/outlier-detection/seq2seq-lstm/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/components/outlier-detection/seq2seq-lstm/data/.keep b/components/outlier-detection/seq2seq-lstm/data/.keep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/components/outlier-detection/seq2seq-lstm/doc.md b/components/outlier-detection/seq2seq-lstm/doc.md deleted file mode 100644 index d1911d311f..0000000000 --- a/components/outlier-detection/seq2seq-lstm/doc.md +++ /dev/null @@ -1,336 +0,0 @@ -# Sequence-to-Sequence LSTM (seq2seq-LSTM) Outlier Algorithm Documentation - -The aim of this document is to explain the seq2seq-LSTM algorithm in Seldon's outlier detection framework. - -First, we provide a high-level overview of the algorithm and the use case; then we give a detailed explanation of the implementation. - -## Overview - -Outlier detection has many applications, ranging from preventing credit card fraud to detecting computer network intrusions. The available data is typically unlabeled and detection needs to be done in real-time. The outlier detector can be used as a standalone algorithm, or to detect anomalies in the input data of another predictive model. - -The seq2seq-LSTM outlier detection algorithm is suitable for time series data and predicts whether a sequence of input features is an outlier or not, depending on a threshold level set by the user. The algorithm needs to be pretrained first on a batch of (preferably) inliers. - -As observations arrive, the algorithm will: - clip and scale the input features - first encode, and then sequentially decode the input time series data in an attempt to reconstruct the initial observations - compute a reconstruction error between the output of the decoder and the input data - predict that the observation is an outlier if the error is larger than the threshold level - -## Why Sequence-to-Sequence Models? - -Seq2seq models convert sequences from one domain into sequences in another domain. A typical example would be sentence translation between different languages. A seq2seq model consists of two main building blocks: an encoder and a decoder. The encoder processes the input sequence and initializes the decoder. The decoder then makes sequential predictions for the output sequence. In our case, the decoder aims to reconstruct the input sequence. Both the encoder and decoder are typically implemented with recurrent or 1D convolutional neural networks. Our implementation uses a type of recurrent neural network known as the LSTM (long short-term memory) network.
An excellent explanation of how LSTM units work is available [here](http://colah.github.io/posts/2015-08-Understanding-LSTMs/). The loss function to be minimized with stochastic gradient descent is the mean squared error between the input and output sequence, and is called the reconstruction error. - -If we train the seq2seq model with inliers, it will be able to replicate new inlier data well with a low reconstruction error. However, if outliers are fed to the seq2seq model, the reconstruction error becomes large and we can classify the sequence as an anomaly. - -## Implementation - -The implementation is inspired by [this blog post](https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html). - -### 1. Building the seq2seq-LSTM Model - -The seq2seq model definition in ```model.py``` takes 4 arguments that define the architecture: -- the number of features in the input -- a list with the number of units per [bidirectional](https://en.wikipedia.org/wiki/Bidirectional_recurrent_neural_networks) LSTM layer in the encoder -- a list with the number of units per LSTM layer in the decoder -- the output activation type for the dense output layer on top of the last LSTM unit in the decoder - -``` python -def model(n_features, encoder_dim = [20], decoder_dim = [20], dropout=0., learning_rate=.001, - loss='mean_squared_error', output_activation='sigmoid'): - """ Build seq2seq model. - - Arguments: - - n_features (int): number of features in the data - - encoder_dim (list): list with number of units per encoder layer - - decoder_dim (list): list with number of units per decoder layer - - dropout (float): dropout for LSTM units - - learning_rate (float): learning rate used during training - - loss (str): loss function used - - output_activation (str): activation type for the dense output layer in the decoder - """ -``` - -First, we define the bidirectional LSTM layers in the encoder and keep the state of the last LSTM unit to initialise the decoder: - -```python -# add encoder hidden layers -encoder_lstm = [] -for i in range(enc_dim-1): - encoder_lstm.append(Bidirectional(LSTM(encoder_dim[i], dropout=dropout, - return_sequences=True,name='encoder_lstm_' + str(i)))) - encoder_hidden = encoder_lstm[i](encoder_hidden) - -encoder_lstm.append(Bidirectional(LSTM(encoder_dim[-1], dropout=dropout, return_state=True, - name='encoder_lstm_' + str(enc_dim-1)))) -encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm[-1](encoder_hidden) - -# only need to keep encoder states -state_h = Concatenate()([forward_h, backward_h]) -state_c = Concatenate()([forward_c, backward_c]) -encoder_states = [state_h, state_c] -``` - -We can then define the LSTM units in the decoder, with the states initialised by the encoder: - -```python -# initialise decoder states with encoder states -decoder_lstm = [] -for i in range(dec_dim): - decoder_lstm.append(LSTM(decoder_dim[i], dropout=dropout, return_sequences=True, - return_state=True, name='decoder_lstm_' + str(i))) - decoder_hidden, _, _ = decoder_lstm[i](decoder_hidden, initial_state=encoder_states) -``` - -We add a dense layer with output activation of choice on top of the last LSTM layer in the decoder and compile the model: - -```python -# add linear layer on top of LSTM -decoder_dense = Dense(n_features, activation=output_activation, name='dense_output') -decoder_outputs = decoder_dense(decoder_hidden) - -# define seq2seq model -model = Model([encoder_inputs, decoder_inputs], decoder_outputs) -optimizer = 
Adam(lr=learning_rate) -model.compile(optimizer=optimizer, loss=loss) -``` - -The decoder predictions are sequential and we only need the encoder states to initialise the decoder for the first item in the sequence. From then on, the output and state of the decoder at each step in the sequence are used to predict the next item. As a result, we define separate encoder and decoder models for the prediction stage: - -```python -# define encoder model returning encoder states -encoder_model = Model(encoder_inputs, encoder_states * dec_dim) - -# define decoder model -# need state inputs for each LSTM layer -decoder_states_inputs = [] -for i in range(dec_dim): - decoder_state_input_h = Input(shape=(decoder_dim[i],), name='decoder_state_input_h_' + str(i)) - decoder_state_input_c = Input(shape=(decoder_dim[i],), name='decoder_state_input_c_' + str(i)) - decoder_states_inputs.append([decoder_state_input_h, decoder_state_input_c]) -decoder_states_inputs = [state for states in decoder_states_inputs for state in states] - -decoder_inference = decoder_inputs -decoder_states = [] -for i in range(dec_dim): - decoder_inference, state_h, state_c = decoder_lstm[i](decoder_inference, - initial_state=decoder_states_inputs[2*i:2*i+2]) - decoder_states.append([state_h,state_c]) -decoder_states = [state for states in decoder_states for state in states] - -decoder_outputs = decoder_dense(decoder_inference) -decoder_model = Model([decoder_inputs] + decoder_states_inputs, - [decoder_outputs] + decoder_states) -``` - -### 2. Training the model - -The seq2seq-LSTM model can be trained on a batch of (ideally) inliers by running the ```train.py``` script with the desired hyperparameters. The example below trains the model on the first 2627 ECGs of the ECG5000 dataset. The input/output sequence has a length of 140, the encoder has 1 bidirectional LSTM layer with 20 units, and the decoder consists of 1 LSTM layer with 40 units. This has to be 2x the number of units of the bidirectional encoder because both the forward and backward encoder states are used to initialise the decoder. Feature-wise minmax scaling between 0 and 1 is applied to the input sequence so we can use a sigmoid activation in the decoder's output layer. - -```python -!python train.py \ ---dataset './data/ECG5000_TEST.arff' \ ---data_range 0 2627 \ ---minmax \ ---timesteps 140 \ ---encoder_dim 20 \ ---decoder_dim 40 \ ---output_activation 'sigmoid' \ ---dropout 0 \ ---learning_rate 0.005 \ ---loss 'mean_squared_error' \ ---epochs 100 \ ---batch_size 32 \ ---validation_split 0.2 \ ---model_name 'seq2seq' \ ---print_progress \ ---save \ ---save_path './models/' -``` - -The model weights and hyperparameters are saved in the folder specified by "save_path". - -### 3. Making predictions - -In order to make predictions, which can then be served by Seldon Core, the pre-trained model weights and hyperparameters are loaded when defining an OutlierSeq2SeqLSTM object. The "threshold" argument defines above which reconstruction error a sample is classified as an outlier. The threshold is a key hyperparameter and needs to be picked carefully for each application. The OutlierSeq2SeqLSTM class inherits from the CoreSeq2SeqLSTM class in ```CoreSeq2SeqLSTM.py```. - -```python -class CoreSeq2SeqLSTM(object): - """ Outlier detection using a sequence-to-sequence (seq2seq) LSTM model.
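- A pre-trained seq2seq model (architecture pickle, weights and preprocessing parameters saved by train.py) is loaded from load_path when the object is instantiated.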
- - Parameters - ---------- - threshold (float): reconstruction error (mse) threshold used to classify outliers - reservoir_size (int) : number of observations kept in memory using reservoir sampling - - Functions - ---------- - reservoir_sampling : applies reservoir sampling to incoming data - predict : detect and return outliers - transform_input : detect outliers and return input features - send_feedback : add target labels as part of the feedback loop - tags : add metadata for input transformer - metrics : return custom metrics - """ - - def __init__(self,threshold=0.003,reservoir_size=50000,model_name='seq2seq',load_path='./models/'): - - logger.info("Initializing model") - self.threshold = threshold - self.reservoir_size = reservoir_size - self.batch = [] - self.N = 0 # total sample count up until now for reservoir sampling - self.nb_outliers = 0 - - # load model architecture parameters - with open(load_path + model_name + '.pickle', 'rb') as f: - self.timesteps, self.n_features, encoder_dim, decoder_dim, output_activation = pickle.load(f) - - # instantiate model - self.s2s, self.enc, self.dec = model(self.n_features,encoder_dim=encoder_dim, - decoder_dim=decoder_dim,output_activation=output_activation) - self.s2s.load_weights(load_path + model_name + '_weights.h5') # load pretrained model weights - self.s2s._make_predict_function() - self.enc._make_predict_function() - self.dec._make_predict_function() - - # load data preprocessing info - with open(load_path + 'preprocess_' + model_name + '.pickle', 'rb') as f: - preprocess = pickle.load(f) - self.preprocess, self.clip, self.axis = preprocess[:3] - if self.preprocess=='minmax': - self.xmin, self.xmax = preprocess[3:5] - self.min, self.max = preprocess[5:] - elif self.preprocess=='standardized': - self.mu, self.sigma = preprocess[3:] -``` - -```python -class OutlierSeq2SeqLSTM(CoreSeq2SeqLSTM): - """ Outlier detection using a sequence-to-sequence (seq2seq) LSTM model. - - Parameters - ---------- - threshold (float) : reconstruction error (mse) threshold used to classify outliers - reservoir_size (int) : number of observations kept in memory using reservoir sampling - - Functions - ---------- - send_feedback : add target labels as part of the feedback loop - metrics : return custom metrics - """ - def __init__(self,threshold=0.003,reservoir_size=50000,model_name='seq2seq',load_path='./models/'): - - super().__init__(threshold=threshold,reservoir_size=reservoir_size, - model_name=model_name,load_path=load_path) -``` - -The actual outlier detection is done by the ```_get_preds``` method, which is invoked by ```predict``` or ```transform_input```, depending on whether the detector is deployed as a model or as a transformer. - -```python -def predict(self, X, feature_names): - """ Return outlier predictions. - - Parameters - ---------- - X : array-like - feature_names : array of feature names (optional) - """ - logger.info("Using component as a model") - return self._get_preds(X) -``` - -```python -def transform_input(self, X, feature_names): - """ Transform the input. - Used when the outlier detector sits on top of another model. - - Parameters - ---------- - X : array-like - feature_names : array of feature names (optional) - """ - logger.info("Using component as an outlier-detector transformer") - self.prediction_meta = self._get_preds(X) - return X -``` - -First the data is (optionally) clipped.
If the number of observations fed to the outlier detector up until now is at least equal to the defined reservoir size, the feature-wise scaling parameters are updated using the observations in the reservoir. The reservoir is updated with each observation using reservoir sampling. We can then scale the input data. - -```python -# clip data per feature -for col,clip in enumerate(self.clip): - X[:,:,col] = np.clip(X[:,:,col],-clip,clip) - -# update reservoir -if self.N < self.reservoir_size: - update_stand = False -else: - update_stand = True - -self.reservoir_sampling(X,update_stand=update_stand) - -# apply scaling -if self.preprocess=='minmax': - X = ((X - self.xmin) / (self.xmax - self.xmin)) * (self.max - self.min) + self.min -elif self.preprocess=='standardized': - X = (X - self.mu) / (self.sigma + 1e-10) -``` - -We then make predictions using the ```decode_sequence``` function and calculate the mean squared error between the input and output sequences. If this value is above the threshold, an outlier is predicted. - -```python -# make predictions -n_obs = X.shape[0] -self.mse = np.zeros(n_obs) -for obs in range(n_obs): - input_seq = X[obs:obs+1,:,:] - decoded_seq = self.decode_sequence(input_seq) - self.mse[obs] = np.mean(np.power(input_seq[0,:,:] - decoded_seq[0,:,:], 2)) -self.prediction = np.array([1 if e > self.threshold else 0 for e in self.mse]).astype(int) -``` - -The ```decode_sequence``` function takes an input sequence and uses the encoder model to retrieve the state vectors of the last LSTM layer in the encoder so they can be used to initialise the LSTM layers in the decoder. The feature values of the first step in the input sequence are used to initialise the output sequence. We can then use the decoder model to make sequential predictions for the output sequence. At each step, we use the previous step's output value and state as decoder inputs. - -```python -def decode_sequence(self,input_seq): - """ Feed output of encoder to decoder and make sequential predictions. """ - - # use the encoder to get state vectors - states_value = self.enc.predict(input_seq) - - # generate initial target sequence - target_seq = input_seq[0,0,:].reshape((1,1,self.n_features)) - - # sequential prediction of time series - decoded_seq = np.zeros((1, self.timesteps, self.n_features)) - decoded_seq[0,0,:] = target_seq[0,0,:] - i = 1 - while i < self.timesteps: - - decoder_output = self.dec.predict([target_seq] + states_value) - - # update the target sequence - target_seq = np.zeros((1, 1, self.n_features)) - target_seq[0, 0, :] = decoder_output[0] - - # update output - decoded_seq[0, i, :] = decoder_output[0] - - # update states - states_value = decoder_output[1:] - - i+=1 - - return decoded_seq -``` - -## References - -Francois Chollet. A ten-minute introduction to sequence-to-sequence learning in Keras -- https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html - -Christopher Olah. Understanding LSTM Networks -- http://colah.github.io/posts/2015-08-Understanding-LSTMs/ - -Ilya Sutskever, Oriol Vinyals and Quoc V. Le. Sequence to Sequence Learning with Neural Networks.
2014 -- https://arxiv.org/abs/1409.3215 \ No newline at end of file diff --git a/components/outlier-detection/seq2seq-lstm/images/ecg.png b/components/outlier-detection/seq2seq-lstm/images/ecg.png deleted file mode 100644 index 5947a605a2..0000000000 Binary files a/components/outlier-detection/seq2seq-lstm/images/ecg.png and /dev/null differ diff --git a/components/outlier-detection/seq2seq-lstm/images/inlier_ecg.png b/components/outlier-detection/seq2seq-lstm/images/inlier_ecg.png deleted file mode 100644 index d935412581..0000000000 Binary files a/components/outlier-detection/seq2seq-lstm/images/inlier_ecg.png and /dev/null differ diff --git a/components/outlier-detection/seq2seq-lstm/images/outlier_ecg.png b/components/outlier-detection/seq2seq-lstm/images/outlier_ecg.png deleted file mode 100644 index b5c31df48b..0000000000 Binary files a/components/outlier-detection/seq2seq-lstm/images/outlier_ecg.png and /dev/null differ diff --git a/components/outlier-detection/seq2seq-lstm/model.py b/components/outlier-detection/seq2seq-lstm/model.py deleted file mode 100644 index 6ff072b47f..0000000000 --- a/components/outlier-detection/seq2seq-lstm/model.py +++ /dev/null @@ -1,93 +0,0 @@ -from keras.layers import Input, LSTM, Dense, Bidirectional, Concatenate -from keras.models import Model -from keras.optimizers import Adam -import numpy as np - -def model(n_features, encoder_dim = [20], decoder_dim = [20], dropout=0., learning_rate=.001, - loss='mean_squared_error', output_activation='sigmoid'): - """ Build seq2seq model. - - Arguments: - - n_features (int): number of features in the data - - encoder_dim (list): list with number of units per encoder layer - - decoder_dim (list): list with number of units per decoder layer - - dropout (float): dropout for LSTM units - - learning_rate (float): learning rate used during training - - loss (str): loss function used - - output_activation (str): activation type for the dense output layer in the decoder - """ - - enc_dim = len(encoder_dim) - dec_dim = len(decoder_dim) - - # seq2seq = encoder + decoder - # encoder - encoder_hidden = encoder_inputs = Input(shape=(None, n_features), name='encoder_input') - - # add encoder hidden layers - encoder_lstm = [] - for i in range(enc_dim-1): - encoder_lstm.append(Bidirectional(LSTM(encoder_dim[i], dropout=dropout, - return_sequences=True,name='encoder_lstm_' + str(i)))) - encoder_hidden = encoder_lstm[i](encoder_hidden) - - encoder_lstm.append(Bidirectional(LSTM(encoder_dim[-1], dropout=dropout, return_state=True, - name='encoder_lstm_' + str(enc_dim-1)))) - encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm[-1](encoder_hidden) - - # only need to keep encoder states - state_h = Concatenate()([forward_h, backward_h]) - state_c = Concatenate()([forward_c, backward_c]) - encoder_states = [state_h, state_c] - - # decoder - decoder_hidden = decoder_inputs = Input(shape=(None, n_features), name='decoder_input') - - # add decoder hidden layers - # check if dimensions are correct - dim_check = [(idx,dim) for idx,dim in enumerate(decoder_dim) if dim!=encoder_dim[-1]*2] - if len(dim_check)>0: - raise ValueError('\nDecoder (layer,units) {0} is not compatible with encoder hidden ' \ - 'states. 
Units should be equal to {1}'.format(dim_check,encoder_dim[-1]*2)) - - # initialise decoder states with encoder states - decoder_lstm = [] - for i in range(dec_dim): - decoder_lstm.append(LSTM(decoder_dim[i], dropout=dropout, return_sequences=True, - return_state=True, name='decoder_lstm_' + str(i))) - decoder_hidden, _, _ = decoder_lstm[i](decoder_hidden, initial_state=encoder_states) - - # add linear layer on top of LSTM - decoder_dense = Dense(n_features, activation=output_activation, name='dense_output') - decoder_outputs = decoder_dense(decoder_hidden) - - # define seq2seq model - model = Model([encoder_inputs, decoder_inputs], decoder_outputs) - optimizer = Adam(lr=learning_rate) - model.compile(optimizer=optimizer, loss=loss) - - # define encoder model returning encoder states - encoder_model = Model(encoder_inputs, encoder_states * dec_dim) - - # define decoder model - # need state inputs for each LSTM layer - decoder_states_inputs = [] - for i in range(dec_dim): - decoder_state_input_h = Input(shape=(decoder_dim[i],), name='decoder_state_input_h_' + str(i)) - decoder_state_input_c = Input(shape=(decoder_dim[i],), name='decoder_state_input_c_' + str(i)) - decoder_states_inputs.append([decoder_state_input_h, decoder_state_input_c]) - decoder_states_inputs = [state for states in decoder_states_inputs for state in states] - - decoder_inference = decoder_inputs - decoder_states = [] - for i in range(dec_dim): - decoder_inference, state_h, state_c = decoder_lstm[i](decoder_inference, - initial_state=decoder_states_inputs[2*i:2*i+2]) - decoder_states.append([state_h,state_c]) - decoder_states = [state for states in decoder_states for state in states] - - decoder_outputs = decoder_dense(decoder_inference) - decoder_model = Model([decoder_inputs] + decoder_states_inputs, - [decoder_outputs] + decoder_states) - - return model, encoder_model, decoder_model \ No newline at end of file diff --git a/components/outlier-detection/seq2seq-lstm/models/.keep b/components/outlier-detection/seq2seq-lstm/models/.keep deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/components/outlier-detection/seq2seq-lstm/models/preprocess_seq2seq.pickle b/components/outlier-detection/seq2seq-lstm/models/preprocess_seq2seq.pickle deleted file mode 100644 index fd41523e06..0000000000 Binary files a/components/outlier-detection/seq2seq-lstm/models/preprocess_seq2seq.pickle and /dev/null differ diff --git a/components/outlier-detection/seq2seq-lstm/models/seq2seq.pickle b/components/outlier-detection/seq2seq-lstm/models/seq2seq.pickle deleted file mode 100644 index 5340cc328d..0000000000 Binary files a/components/outlier-detection/seq2seq-lstm/models/seq2seq.pickle and /dev/null differ diff --git a/components/outlier-detection/seq2seq-lstm/models/seq2seq_weights.h5 b/components/outlier-detection/seq2seq-lstm/models/seq2seq_weights.h5 deleted file mode 100644 index 794d5b87b2..0000000000 Binary files a/components/outlier-detection/seq2seq-lstm/models/seq2seq_weights.h5 and /dev/null differ diff --git a/components/outlier-detection/seq2seq-lstm/requirements.txt b/components/outlier-detection/seq2seq-lstm/requirements.txt deleted file mode 100644 index b011520221..0000000000 --- a/components/outlier-detection/seq2seq-lstm/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -keras==2.2.2 -tensorflow==1.10.1 -numpy==1.14.5 -argparse==1.1 -pandas==0.23.4 -scikit-learn==0.20.1 -scipy==1.1.0 -requests>=2.20.0 \ No newline at end of file diff --git a/components/outlier-detection/seq2seq-lstm/seq2seq_lstm.ipynb 
b/components/outlier-detection/seq2seq-lstm/seq2seq_lstm.ipynb deleted file mode 100644 index 8d66c67abe..0000000000 --- a/components/outlier-detection/seq2seq-lstm/seq2seq_lstm.ipynb +++ /dev/null @@ -1,610 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Sequence-to-sequence LSTM outlier detector deployment" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Wrap a Keras seq2seq-LSTM Python model for use as a prediction microservice in seldon-core and deploy on seldon-core running on Minikube or a Kubernetes cluster using GCP." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dependencies" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- [helm](https://github.com/helm/helm)\n", - "- [minikube](https://github.com/kubernetes/minikube)\n", - "- [s2i](https://github.com/openshift/source-to-image) >= 1.1.13\n", - "\n", - "Python packages:\n", - "- keras: pip install keras\n", - "- tensorflow: https://www.tensorflow.org/install/pip" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Task" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The outlier detector needs to spot anomalies in electrocardiograms (ECGs). The dataset \"ECG5000\" contains 5000 ECGs, originally obtained from [Physionet](https://physionet.org/cgi-bin/atm/ATM) under the name \"BIDMC Congestive Heart Failure Database (chfdb)\", record \"chf07\". The data has been pre-processed in 2 steps: first each heartbeat is extracted, and then each beat is made equal length via interpolation. The data is labeled and contains 5 classes. The first class, which contains almost 60% of the observations, is seen as \"normal\" while the others are outliers. The seq2seq-LSTM algorithm is trained on some heartbeats from the first class and needs to flag the other classes as anomalies. The plot below shows an example ECG for each of the classes." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![ECGs](images/ecg.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train locally\n", - "\n", - "Train on some inlier ECGs. The data can be downloaded [here](http://www.timeseriesclassification.com/description.php?Dataset=ECG5000) and should be extracted in the [data](./data) folder." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!python train.py \\\n", - "--dataset './data/ECG5000_TEST.arff' \\\n", - "--data_range 0 2627 \\\n", - "--minmax \\\n", - "--timesteps 140 \\\n", - "--encoder_dim 20 \\\n", - "--decoder_dim 40 \\\n", - "--output_activation 'sigmoid' \\\n", - "--dropout 0 \\\n", - "--learning_rate 0.005 \\\n", - "--loss 'mean_squared_error' \\\n", - "--epochs 100 \\\n", - "--batch_size 32 \\\n", - "--validation_split 0.2 \\\n", - "--print_progress \\\n", - "--save" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The plot below shows a typical prediction (*red line*) of an inlier (class 1) ECG compared to the original (*blue line*) after training the seq2seq-LSTM model."
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![inlier_ecg](images/inlier_ecg.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "On the other hand, the model is not good at fitting ECGs from the other classes, as illustrated in the chart below:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![outlier_ecg](images/outlier_ecg.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The predictions in the above charts are made on ECGs the model has not seen before. The differences in scale are due to the sigmoid output layer and do not affect the prediction accuracy." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test using Kubernetes cluster on GCP or Minikube" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the outlier detector as a model or a transformer. If you want to run the anomaly detector as a transformer, change the SERVICE_TYPE variable from MODEL to TRANSFORMER [here](./.s2i/environment), set MODEL = False and change ```OutlierSeq2SeqLSTM.py``` to:\n", - "\n", - "```python\n", - "from CoreSeq2SeqLSTM import CoreSeq2SeqLSTM\n", - "\n", - "class OutlierSeq2SeqLSTM(CoreSeq2SeqLSTM):\n", - " \"\"\" Outlier detection using a sequence-to-sequence (seq2seq) LSTM model.\n", - " \n", - " Parameters\n", - " ----------\n", - " threshold (float) : reconstruction error (mse) threshold used to classify outliers\n", - " reservoir_size (int) : number of observations kept in memory using reservoir sampling\n", - " \"\"\"\n", - " def __init__(self,threshold=0.003,reservoir_size=50000,model_name='seq2seq',load_path='./models/'):\n", - " \n", - " super().__init__(threshold=threshold,reservoir_size=reservoir_size,\n", - " model_name=model_name,load_path=load_path)\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Pick Kubernetes cluster on GCP or Minikube." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MINIKUBE = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if MINIKUBE:\n", - " !minikube start --memory 4096 \n", - "else:\n", - " !gcloud container clusters get-credentials standard-cluster-1 --zone europe-west1-b --project seldon-demos" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a cluster-wide cluster-admin role assigned to a service account named “default” in the namespace “kube-system”." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl create clusterrolebinding kube-system-cluster-admin --clusterrole=cluster-admin \\\n", - "--serviceaccount=kube-system:default" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl create namespace seldon" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add current context details to the configuration file in the seldon namespace."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl config set-context $(kubectl config current-context) --namespace=seldon" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create tiller service account and give it a cluster-wide cluster-admin role." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl -n kube-system create sa tiller\n", - "!kubectl create clusterrolebinding tiller --clusterrole cluster-admin --serviceaccount=kube-system:tiller\n", - "!helm init --service-account tiller" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check deployment rollout status and deploy seldon/spartakus helm charts." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl rollout status deploy/tiller-deploy -n kube-system" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!helm install ../../../helm-charts/seldon-core-operator --name seldon-core --set usage_metrics.enabled=true --namespace seldon-system" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check deployment rollout status for seldon core." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl rollout status deploy/seldon-controller-manager -n seldon-system" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install Ambassador API gateway." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!helm install stable/ambassador --name ambassador --set crds.keep=false" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kubectl rollout status deployment.apps/ambassador" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If Minikube is used: create the docker image for the outlier detector inside Minikube using s2i. Besides the transformer image and the demo-specific model image, the general model image for the Seq2Seq LSTM outlier detector is also available from Docker Hub as ***seldonio/outlier-s2s-lstm-model:0.1***." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if MINIKUBE & MODEL:\n", - " !eval $(minikube docker-env) && \\\n", - " s2i build . seldonio/seldon-core-s2i-python3:0.4 seldonio/outlier-s2s-lstm-model-demo:0.1\n", - "elif MINIKUBE:\n", - " !eval $(minikube docker-env) && \\\n", - " s2i build . seldonio/seldon-core-s2i-python3:0.4 seldonio/outlier-s2s-lstm-transformer:0.1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install outlier detector helm charts and set *threshold* and *reservoir_size* hyperparameter values."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if MODEL:\n", - " !helm install ../../../helm-charts/seldon-od-model \\\n", - " --name outlier-detector \\\n", - " --namespace=seldon \\\n", - " --set model.type=seq2seq \\\n", - " --set model.seq2seq.image.name=seldonio/outlier-s2s-lstm-model-demo:0.1 \\\n", - " --set model.seq2seq.threshold=0.002 \\\n", - " --set model.seq2seq.reservoir_size=50000 \\\n", - " --set oauth.key=oauth-key \\\n", - " --set oauth.secret=oauth-secret \\\n", - " --set replicas=1\n", - "else:\n", - " !helm install ../../../helm-charts/seldon-od-transformer \\\n", - " --name outlier-detector \\\n", - " --namespace=seldon \\\n", - " --set outlierDetection.enabled=true \\\n", - " --set outlierDetection.name=outlier-s2s-lstm \\\n", - " --set outlierDetection.type=seq2seq \\\n", - " --set outlierDetection.seq2seq.image.name=seldonio/outlier-s2s-lstm-transformer:0.1 \\\n", - " --set outlierDetection.seq2seq.threshold=0.002 \\\n", - " --set outlierDetection.seq2seq.reservoir_size=50000 \\\n", - " --set oauth.key=oauth-key \\\n", - " --set oauth.secret=oauth-secret \\\n", - " --set model.image.name=seldonio/outlier-s2s-lstm-model:0.1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Port forward Ambassador\n", - "\n", - "Run command in terminal:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```\n", - "kubectl port-forward $(kubectl get pods -n seldon -l app.kubernetes.io/name=ambassador -o jsonpath='{.items[0].metadata.name}') -n seldon 8003:8080\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import REST requests, load data and test requests" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from utils import get_payload, rest_request_ambassador, send_feedback_rest, ecg_data\n", - "\n", - "ecg_data, ecg_labels = ecg_data(dataset='TRAIN')\n", - "X = ecg_data[0,:].reshape(1,ecg_data.shape[1],1)\n", - "label = ecg_labels[0].reshape(1)\n", - "print(X.shape)\n", - "print(label.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Test the REST requests with the generated data. It is important that the order of requests is respected. First we make predictions, then we get the \"true\" labels back using the feedback request. If we do not respect the order and e.g. keep making predictions without getting the feedback for each prediction, there will be a mismatch between the predicted and \"true\" labels. This will result in errors in the produced metrics." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "request = get_payload(X)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "response = rest_request_ambassador(\"outlier-detector\",\"seldon\",request,endpoint=\"localhost:8003\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If the outlier detector is used as a transformer, the output of the anomaly detection is added as part of the metadata. If it is used as a model, we send model feedback to retrieve custom performance metrics."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if MODEL:\n", - " send_feedback_rest(\"outlier-detector\",\"seldon\",request,response,0,label,endpoint=\"localhost:8003\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Analytics" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the helm charts for prometheus and the grafana dashboard." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!helm install ../../../helm-charts/seldon-core-analytics --name seldon-core-analytics \\\n", - " --set grafana_prom_admin_password=password \\\n", - " --set persistence.enabled=false \\\n", - " --namespace seldon" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Port forward Grafana dashboard" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run command in terminal:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```\n", - "kubectl port-forward $(kubectl get pods -n seldon -l app=grafana-prom-server -o jsonpath='{.items[0].metadata.name}') -n seldon 3000:3000\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can then view an analytics dashboard inside the cluster at http://localhost:3000/dashboard/db/prediction-analytics?refresh=5s&orgId=1. If you are running on Minikube, the address may differ; get it via `minikube ip`. Login with:\n", - "\n", - "Username : admin\n", - "\n", - "Password : password (as set when starting seldon-core-analytics above)\n", - "\n", - "Import the outlier-detector-s2s-lstm dashboard from ../../../helm-charts/seldon-core-analytics/files/grafana/configs." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Run simulation\n", - "\n", - "- Sample random ECG from dataset.\n", - "- Get payload for the observation.\n", - "- Make a prediction.\n", - "- Send the \"true\" label with the feedback if the detector is run as a model.\n", - "\n", - "It is important that the prediction-feedback order is maintained. Otherwise there will be a mismatch between the predicted and \"true\" labels.\n", - "\n", - "View the progress on the grafana \"Outlier Detection\" dashboard. Most metrics need the outlier detector to be run as a model since they need model feedback."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import time\n", - "n_requests = 100\n", - "n_samples, sample_length = ecg_data.shape\n", - "for i in range(n_requests):\n", - " idx = np.random.choice(n_samples)\n", - " X = ecg_data[idx,:].reshape(1,sample_length,1)\n", - " label = ecg_labels[idx].reshape(1)\n", - " request = get_payload(X)\n", - " response = rest_request_ambassador(\"outlier-detector\",\"seldon\",request,endpoint=\"localhost:8003\")\n", - " if MODEL:\n", - " send_feedback_rest(\"outlier-detector\",\"seldon\",request,response,0,label,endpoint=\"localhost:8003\")\n", - " time.sleep(1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if MINIKUBE:\n", - " !minikube delete" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/components/outlier-detection/seq2seq-lstm/train.py b/components/outlier-detection/seq2seq-lstm/train.py deleted file mode 100644 index ee7a697e2e..0000000000 --- a/components/outlier-detection/seq2seq-lstm/train.py +++ /dev/null @@ -1,155 +0,0 @@ -import argparse -from keras.callbacks import ModelCheckpoint -import numpy as np -import pandas as pd -import pickle -import random -from scipy.io import arff - -from model import model - -np.random.seed(2018) -np.random.RandomState(2018) -random.seed(2018) - -# default args -DATASET = './data/ECG5000_TEST.arff' -SAVE_PATH = './models/' -MODEL_NAME = 'seq2seq' -DATA_RANGE = [0,2627] - -# data preprocessing -STANDARDIZED = False -MINMAX = False -CLIP = [99999] - -# architecture -TIMESTEPS = 140 # length of 1 ECG -ENCODER_DIM = [20] -DECODER_DIM = [40] -OUTPUT_ACTIVATION = 'sigmoid' - -# training -EPOCHS = 100 -BATCH_SIZE = 32 -LEARNING_RATE = .005 -LOSS = 'mean_squared_error' -DROPOUT = 0. -VALIDATION_SPLIT = 0.2 -SAVE = False -PRINT_PROGRESS = False -CONTINUE_TRAINING = False -LOAD_PATH = SAVE_PATH - -def train(model,X,args): - """ Train seq2seq-LSTM model. 
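- Clips and scales X, saves the preprocessing parameters, builds the decoder targets by shifting X one step along the time axis, and fits the model, optionally checkpointing the best weights to disk.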
""" - - # clip data per feature - for col,clip in enumerate(args.clip): - X[:,:,col] = np.clip(X[:,:,col],-clip,clip) - - # apply scaling and save data preprocessing method - axis = (0,1) # scaling per feature over all observations - if args.standardized: - print('\nStandardizing data') - mu, sigma = np.mean(X,axis=axis), np.std(X,axis=axis) - X = (X - mu) / (sigma + 1e-10) - - with open(args.save_path + 'preprocess_' + args.model_name + '.pickle', 'wb') as f: - pickle.dump(['standardized',args.clip,axis,mu,sigma], f) - - if args.minmax: - print('\nMinmax scaling of data') - xmin, xmax = X.min(axis=axis), X.max(axis=axis) - min, max = 0, 1 - X = ((X - xmin) / (xmax - xmin)) * (max - min) + min - - with open(args.save_path + 'preprocess_' + args.model_name + '.pickle', 'wb') as f: - pickle.dump(['minmax',args.clip,axis,xmin,xmax,min,max], f) - - # define inputs - encoder_input_data = X - decoder_input_data = X - decoder_target_data = np.roll(X, -1, axis=1) # offset decoder_input_data by 1 across time axis - - # set training arguments - if args.print_progress: - verbose = 1 - else: - verbose = 0 - - kwargs = {} - kwargs['epochs'] = args.epochs - kwargs['batch_size'] = args.batch_size - kwargs['shuffle'] = True - kwargs['validation_split'] = args.validation_split - kwargs['verbose'] = verbose - - if args.save: # create callback - print('\nSave stuff') - checkpointer = ModelCheckpoint(filepath=args.save_path + args.model_name + '_weights.h5',verbose=0, - save_best_only=True,save_weights_only=True) - kwargs['callbacks'] = [checkpointer] - - # save model architecture - with open(args.save_path + args.model_name + '.pickle', 'wb') as f: - pickle.dump([X.shape[1],X.shape[2],args.encoder_dim, - args.decoder_dim,args.output_activation],f) - - model.fit([encoder_input_data, decoder_input_data], decoder_target_data, **kwargs) - -def run(args): - """ Load data, generate training batch, initiate and train model. 
""" - - print('\nLoad dataset') - data = arff.loadarff(args.dataset) - data = pd.DataFrame(data[0]) - data.drop(columns='target',inplace=True) - - print('\nGenerate training batch') - args.n_features = 1 # only 1 feature in the ECG dataset - X = data.values[args.data_range[0]:args.data_range[1],:] - X = np.reshape(X, (X.shape[0],X.shape[1],args.n_features)) - - print('\nInitiate outlier detector model') - s2s, enc, dec = model(args.n_features,encoder_dim=args.encoder_dim,decoder_dim=args.decoder_dim, - dropout=args.dropout,learning_rate=args.learning_rate,loss=args.loss, - output_activation=args.output_activation) - - if args.continue_training: - print('\nLoad pre-trained model') - s2s.load_weights(args.load_path + args.model_name + '_weights.h5') # load pretrained model weights - - if args.print_progress: - s2s.summary() - - print('\nTrain outlier detector') - train(s2s,X,args) - -if __name__ == '__main__': - - parser = argparse.ArgumentParser(description="Train seq2seq-LSTM outlier detector.") - parser.add_argument('--dataset',type=str,choices=DATASET,default=DATASET) - parser.add_argument('--data_range',type=int,nargs=2,default=DATA_RANGE) - parser.add_argument('--timesteps',type=int,default=TIMESTEPS) - parser.add_argument('--encoder_dim',type=int,nargs='+',default=ENCODER_DIM) - parser.add_argument('--decoder_dim',type=int,nargs='+',default=DECODER_DIM) - parser.add_argument('--output_activation',type=str,default=OUTPUT_ACTIVATION) - parser.add_argument('--dropout',type=float,default=DROPOUT) - parser.add_argument('--learning_rate',type=float,default=LEARNING_RATE) - parser.add_argument('--loss',type=str,default=LOSS) - parser.add_argument('--validation_split',type=float,default=VALIDATION_SPLIT) - parser.add_argument('--epochs',type=int,default=EPOCHS) - parser.add_argument('--batch_size',type=int,default=BATCH_SIZE) - parser.add_argument('--clip',type=float,nargs='+',default=CLIP) - parser.add_argument('--standardized', default=STANDARDIZED, action='store_true') - parser.add_argument('--minmax', default=MINMAX, action='store_true') - parser.add_argument('--print_progress', default=PRINT_PROGRESS, action='store_true') - parser.add_argument('--save', default=SAVE, action='store_true') - parser.add_argument('--save_path',type=str,default=SAVE_PATH) - parser.add_argument('--load_path',type=str,default=LOAD_PATH) - parser.add_argument('--model_name',type=str,default=MODEL_NAME) - parser.add_argument('--continue_training', default=CONTINUE_TRAINING, action='store_true') - args = parser.parse_args() - - run(args) \ No newline at end of file diff --git a/components/outlier-detection/seq2seq-lstm/utils.py b/components/outlier-detection/seq2seq-lstm/utils.py deleted file mode 100644 index bbcb44cafb..0000000000 --- a/components/outlier-detection/seq2seq-lstm/utils.py +++ /dev/null @@ -1,91 +0,0 @@ -import collections -import json -import numpy as np -import pandas as pd -import requests -from scipy.io import arff -from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, fbeta_score - -def ecg_data(dataset='TEST',data_range=None, outlier=[2,3,4,5]): - """ Return ECG dataset with outlier labels. 
""" - - data = arff.loadarff('./data/ECG5000_' + dataset + '.arff') - data = pd.DataFrame(data[0]) - data['target'] = data['target'].astype(int) - if data_range is None: - data_range = [0,data.shape[0]] - outlier_true = data['target'][data_range[0]:data_range[1]].isin(outlier).astype(int).values - data.drop(columns='target',inplace=True) - X = data.values[data_range[0]:data_range[1],:] - return X, outlier_true - -def flatten(x): - """ Flatten list. """ - if isinstance(x, collections.Iterable): - return [a for i in x for a in flatten(i)] - else: - return [x] - -def performance(y_true,y_pred,roll_window=100): - """ Return a confusion matrix and calculate rolling accuracy, precision, recall, F1 and F2 scores. """ - - # confusion matrix - cm = confusion_matrix(y_true,y_pred,labels=[0,1]) - tn, fp, fn, tp = cm.ravel() - - # total scores - acc_tot = accuracy_score(y_true,y_pred) - prec_tot = precision_score(y_true,y_pred) - rec_tot = recall_score(y_true,y_pred) - f1_tot = f1_score(y_true,y_pred) - f2_tot = fbeta_score(y_true,y_pred,beta=2) - - # rolling scores - y_true_roll = y_true[-roll_window:] - y_pred_roll = y_pred[-roll_window:] - acc_roll = accuracy_score(y_true_roll,y_pred_roll) - prec_roll = precision_score(y_true_roll,y_pred_roll) - rec_roll = recall_score(y_true_roll,y_pred_roll) - f1_roll = f1_score(y_true_roll,y_pred_roll) - f2_roll = fbeta_score(y_true_roll,y_pred_roll,beta=2) - - scores = [tn, fp, fn, tp, acc_tot, prec_tot, rec_tot, f1_tot, f2_tot, - acc_roll, prec_roll, rec_roll, f1_roll, f2_roll] - - return scores - -def outlier_stats(y_true,y_pred,roll_window=100): - """ Calculate number and percentage of predicted and labeled outliers. """ - - y_pred_roll = np.sum(y_pred[-roll_window:]) - y_true_roll = np.sum(y_true[-roll_window:]) - y_pred_tot = np.sum(y_pred) - y_true_tot = np.sum(y_true) - - return y_pred_roll, y_true_roll, y_pred_tot, y_true_tot - -def get_payload(arr): - features = ["x{}".format(str(i)) for i in range(arr.size)] - datadef = {"names":features,"ndarray":arr.tolist()} - payload = {"meta":{},"data":datadef} - return payload - -def rest_request_ambassador(deploymentName,namespace,request,endpoint="localhost:8003"): - response = requests.post( - "http://"+endpoint+"/seldon/"+namespace+"/"+deploymentName+"/api/v0.1/predictions", - json=request) - print(response.status_code) - print(response.text) - return response.json() - -def send_feedback_rest(deploymentName,namespace,request,response,reward,truth,endpoint="localhost:8003"): - feedback = { - "request": request, - "response": response, - "reward": reward, - "truth": {"data":{"ndarray":truth.tolist()}} - } - ret = requests.post( - "http://"+endpoint+"/seldon/"+namespace+"/"+deploymentName+"/api/v0.1/feedback", - json=feedback) - return diff --git a/components/outlier-detection/vae/.s2i/environment b/components/outlier-detection/vae/.s2i/environment deleted file mode 100644 index 273df4bc3c..0000000000 --- a/components/outlier-detection/vae/.s2i/environment +++ /dev/null @@ -1,4 +0,0 @@ -MODEL_NAME=OutlierVAE -API_TYPE=REST -SERVICE_TYPE=MODEL -PERSISTENCE=0 diff --git a/components/outlier-detection/vae/CoreVAE.py b/components/outlier-detection/vae/CoreVAE.py deleted file mode 100644 index 79d736435c..0000000000 --- a/components/outlier-detection/vae/CoreVAE.py +++ /dev/null @@ -1,182 +0,0 @@ -import logging -import numpy as np -import pickle -import random - -from model import model - -logger = logging.getLogger(__name__) - - -class CoreVAE(object): - """ Outlier detection using variational autoencoders (VAE). 
- - Parameters - ---------- - threshold (float) : reconstruction error (mse) threshold used to classify outliers - reservoir_size (int) : number of observations kept in memory using reservoir sampling - - Functions - ---------- - reservoir_sampling : applies reservoir sampling to incoming data - predict : detect and return outliers - transform_input : detect outliers and return input features - send_feedback : add target labels as part of the feedback loop - tags : add metadata for input transformer - metrics : return custom metrics - """ - - def __init__(self,threshold=10,reservoir_size=50000,model_name='vae',load_path='./models/'): - - logger.info("Initializing model") - self.threshold = threshold - self.reservoir_size = reservoir_size - self.batch = [] - self.N = 0 # total sample count up until now for reservoir sampling - self.nb_outliers = 0 - - # load model architecture parameters - with open(load_path + model_name + '.pickle', 'rb') as f: - n_features, hidden_layers, latent_dim, hidden_dim, output_activation = pickle.load(f) - - # instantiate model - self.vae = model(n_features,hidden_layers=hidden_layers,latent_dim=latent_dim, - hidden_dim=hidden_dim,output_activation=output_activation) - self.vae.load_weights(load_path + model_name + '_weights.h5') # load pretrained model weights - self.vae._make_predict_function() - - # load data preprocessing info - with open(load_path + 'preprocess_' + model_name + '.pickle', 'rb') as f: - preprocess = pickle.load(f) - self.preprocess, self.clip, self.axis = preprocess[:3] - if self.preprocess=='minmax': - self.xmin, self.xmax = preprocess[3:5] - self.min, self.max = preprocess[5:] - elif self.preprocess=='standardized': - self.mu, self.sigma = preprocess[3:] - - - def reservoir_sampling(self,X,update_stand=False): - """ Keep batch of data in memory using reservoir sampling. """ - for item in X: - self.N+=1 - if len(self.batch) < self.reservoir_size: - self.batch.append(item) - else: - s = int(random.random() * self.N) - if s < self.reservoir_size: - self.batch[s] = item - - if update_stand: - if self.preprocess=='minmax': - self.xmin = np.array(self.batch).min(axis=self.axis) - self.xmax = np.array(self.batch).max(axis=self.axis) - elif self.preprocess=='standardized': - self.mu = np.array(self.batch).mean(axis=self.axis) - self.sigma = np.array(self.batch).std(axis=self.axis) - return - - - def predict(self, X, feature_names): - """ Return outlier predictions. - - Parameters - ---------- - X : array-like - feature_names : array of feature names (optional) - """ - logger.info("Using component as a model") - return self._get_preds(X) - - - def transform_input(self, X, feature_names): - """ Transform the input. - Used when the outlier detector sits on top of another model. - - Parameters - ---------- - X : array-like - feature_names : array of feature names (optional) - """ - logger.info("Using component as an outlier-detector transformer") - self.prediction_meta = self._get_preds(X) - return X - - - def _get_preds(self, X): - """ Detect outliers if the reconstruction error is above the threshold. 
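- The reconstruction error is averaged over 10 stochastic forward passes, since the VAE resamples the latent variable on every predict call.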
- - Parameters - ---------- - X : array-like - """ - - # clip data per feature - X = np.clip(X,[-c for c in self.clip],self.clip) - - if self.N < self.reservoir_size: - update_stand = False - else: - update_stand = True - - self.reservoir_sampling(X,update_stand=update_stand) - - # apply scaling - if self.preprocess=='minmax': - X_scaled = ((X - self.xmin) / (self.xmax - self.xmin)) * (self.max - self.min) + self.min - elif self.preprocess=='standardized': - X_scaled = (X - self.mu) / (self.sigma + 1e-10) - - # sample latent variables and calculate reconstruction errors - N = 10 - mse = np.zeros([X.shape[0],N]) - for i in range(N): - preds = self.vae.predict(X_scaled) - mse[:,i] = np.mean(np.power(X_scaled - preds, 2), axis=1) - self.mse = np.mean(mse, axis=1) - - # make prediction - self.prediction = np.array([1 if e > self.threshold else 0 for e in self.mse]).astype(int) - - return self.prediction - - - def send_feedback(self,X,feature_names,reward,truth): - """ Return additional data as part of the feedback loop. - - Parameters - ---------- - X : array of the features sent in the original predict request - feature_names : array of feature names. May be None if not available. - reward (float): the reward - truth : array with correct value (optional) - """ - logger.info("Send feedback called") - return [] - - - def tags(self): - """ - Use predictions made within transform to add these as metadata - to the response. Tags will only be collected if the component is - used as an input-transformer. - """ - try: - return {"outlier-predictions": self.prediction_meta.tolist()} - except AttributeError: - logger.info("No metadata about outliers") - - - def metrics(self): - """ Return custom metrics averaged over the prediction batch. - """ - self.nb_outliers += np.sum(self.prediction) - - is_outlier = {"type":"GAUGE","key":"is_outlier","value":np.mean(self.prediction)} - mse = {"type":"GAUGE","key":"mse","value":np.mean(self.mse)} - nb_outliers = {"type":"GAUGE","key":"nb_outliers","value":int(self.nb_outliers)} - fraction_outliers = {"type":"GAUGE","key":"fraction_outliers","value":int(self.nb_outliers)/self.N} - obs = {"type":"GAUGE","key":"observation","value":self.N} - threshold = {"type":"GAUGE","key":"threshold","value":self.threshold} - - return [is_outlier,mse,nb_outliers,fraction_outliers,obs,threshold] \ No newline at end of file diff --git a/components/outlier-detection/vae/OutlierVAE.py b/components/outlier-detection/vae/OutlierVAE.py deleted file mode 100644 index 7f92dcf866..0000000000 --- a/components/outlier-detection/vae/OutlierVAE.py +++ /dev/null @@ -1,119 +0,0 @@ -import numpy as np - -from CoreVAE import CoreVAE -from utils import flatten, performance, outlier_stats - - -class OutlierVAE(CoreVAE): - """ Outlier detection using variational autoencoders (VAE). 
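- Extends CoreVAE with a feedback loop: historical predictions, reconstruction errors and target labels are stored to compute rolling and total performance metrics.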
- - Parameters - ---------- - threshold (float) : reconstruction error (mse) threshold used to classify outliers - reservoir_size (int) : number of observations kept in memory using reservoir sampling - - Functions - ---------- - send_feedback : add target labels as part of the feedback loop - metrics : return custom metrics - """ - - def __init__(self,threshold=10,reservoir_size=50000,model_name='vae',load_path='./models/'): - - super().__init__(threshold=threshold,reservoir_size=reservoir_size, - model_name=model_name,load_path=load_path) - - self._predictions = [] - self._labels = [] - self._mse = [] - self.roll_window = 100 - self.metric = [float('nan') for i in range(18)] - - - def send_feedback(self,X,feature_names,reward,truth): - """ Return outlier labels as part of the feedback loop. - - Parameters - ---------- - X : array of the features sent in the original predict request - feature_names : array of feature names. May be None if not available. - reward (float): the reward - truth : array with correct value (optional) - """ - _ = super().send_feedback(X,feature_names,reward,truth) - - # historical reconstruction errors and predictions - self._mse.append(self.mse) - self._mse = flatten(self._mse) - self._predictions.append(self.prediction) - self._predictions = flatten(self._predictions) - - # target labels - self.label = truth - self._labels.append(self.label) - self._labels = flatten(self._labels) - - # performance metrics - scores = performance(self._labels,self._predictions,roll_window=self.roll_window) - stats = outlier_stats(self._labels,self._predictions,roll_window=self.roll_window) - - convert = flatten([scores,stats]) - metric = [] - for c in convert: # convert from np to native python type to jsonify - metric.append(np.asscalar(np.asarray(c))) - self.metric = metric - - return [] - - - def metrics(self): - """ Return custom metrics. - Printed with a delay of 1 prediction because the labels are returned in the feedback step. 
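- Only single-observation batches are supported; a ValueError is raised for larger batches.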
- """ - - if self.mse.shape[0]>1: - raise ValueError('Metrics can only handle single observations.') - - if self.N==1: - pred = float('nan') - err = float('nan') - y_true = float('nan') - else: - pred = int(self._predictions[-1]) - err = self._mse[-1] - y_true = int(self.label[0]) - - is_outlier = {"type":"GAUGE","key":"is_outlier","value":pred} - mse = {"type":"GAUGE","key":"mse","value":err} - obs = {"type":"GAUGE","key":"observation","value":self.N - 1} - threshold = {"type":"GAUGE","key":"threshold","value":self.threshold} - - label = {"type":"GAUGE","key":"label","value":y_true} - - accuracy_tot = {"type":"GAUGE","key":"accuracy_tot","value":self.metric[4]} - precision_tot = {"type":"GAUGE","key":"precision_tot","value":self.metric[5]} - recall_tot = {"type":"GAUGE","key":"recall_tot","value":self.metric[6]} - f1_score_tot = {"type":"GAUGE","key":"f1_tot","value":self.metric[7]} - f2_score_tot = {"type":"GAUGE","key":"f2_tot","value":self.metric[8]} - - accuracy_roll = {"type":"GAUGE","key":"accuracy_roll","value":self.metric[9]} - precision_roll = {"type":"GAUGE","key":"precision_roll","value":self.metric[10]} - recall_roll = {"type":"GAUGE","key":"recall_roll","value":self.metric[11]} - f1_score_roll = {"type":"GAUGE","key":"f1_roll","value":self.metric[12]} - f2_score_roll = {"type":"GAUGE","key":"f2_roll","value":self.metric[13]} - - true_negative = {"type":"GAUGE","key":"true_negative","value":self.metric[0]} - false_positive = {"type":"GAUGE","key":"false_positive","value":self.metric[1]} - false_negative = {"type":"GAUGE","key":"false_negative","value":self.metric[2]} - true_positive = {"type":"GAUGE","key":"true_positive","value":self.metric[3]} - - nb_outliers_roll = {"type":"GAUGE","key":"nb_outliers_roll","value":self.metric[14]} - nb_labels_roll = {"type":"GAUGE","key":"nb_labels_roll","value":self.metric[15]} - nb_outliers_tot = {"type":"GAUGE","key":"nb_outliers_tot","value":self.metric[16]} - nb_labels_tot = {"type":"GAUGE","key":"nb_labels_tot","value":self.metric[17]} - - return [is_outlier,mse,obs,threshold,label, - accuracy_tot,precision_tot,recall_tot,f1_score_tot,f2_score_tot, - accuracy_roll,precision_roll,recall_roll,f1_score_roll,f2_score_roll, - true_negative,false_positive,false_negative,true_positive, - nb_outliers_roll,nb_labels_roll,nb_outliers_tot,nb_labels_tot] \ No newline at end of file diff --git a/components/outlier-detection/vae/README.md b/components/outlier-detection/vae/README.md deleted file mode 100644 index 07bc4ecc70..0000000000 --- a/components/outlier-detection/vae/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# Variational Auto-Encoder (VAE) Outlier Detector - -## Description - -[Anomaly or outlier detection](https://en.wikipedia.org/wiki/Anomaly_detection) has many applications, ranging from preventing credit card fraud to detecting computer network intrusions. The implemented VAE outlier detector aims to predict anomalies in tabular data. The VAE model can be trained in an unsupervised or semi-supervised way, which is helpful since labeled training data is often scarce. The outlier detector predicts whether the input features represent normal behaviour or not, dependent on a threshold level set by the user. - -## Implementation - -The architecture of the VAE is defined in ```model.py``` and the model is trained by running the ```train.py``` script. The ```OutlierVAE``` class loads a pre-trained model and makes predictions on new data. 
- -A detailed explanation of the implementation and usage of the Variational Auto-Encoder as an outlier detector can be found in the [VAE documentation](./doc.md). - -## Running on Seldon - -An end-to-end example running a VAE outlier detector on GCP or Minikube using Seldon to identify computer network intrusions is available [here](./outlier_vae.ipynb). - -Docker images to use the generic VAE outlier detector as a model or transformer can be found on Docker Hub: -* [seldonio/outlier-vae-model](https://hub.docker.com/r/seldonio/outlier-vae-model) -* [seldonio/outlier-vae-transformer](https://hub.docker.com/r/seldonio/outlier-vae-transformer) - -A model docker image specific for the demo is also available: -* [seldonio/outlier-vae-model-demo](https://hub.docker.com/r/seldonio/outlier-vae-model-demo) \ No newline at end of file diff --git a/components/outlier-detection/vae/__init__.py b/components/outlier-detection/vae/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/components/outlier-detection/vae/doc.md b/components/outlier-detection/vae/doc.md deleted file mode 100644 index d26290affc..0000000000 --- a/components/outlier-detection/vae/doc.md +++ /dev/null @@ -1,292 +0,0 @@ -# Variational Auto-Encoder Outlier (VAE) Algorithm Documentation - -The aim of this document is to explain the Variational Auto-Encoder algorithm in Seldon's outlier detection framework. - -First, we provide a high level overview of the algorithm and the use case, then we will give a detailed explanation of the implementation. - -## Overview - -Outlier detection has many applications, ranging from preventing credit card fraud to detecting computer network intrusions. The available data is typically unlabeled and detection needs to be done in real-time. The outlier detector can be used as a standalone algorithm, or to detect anomalies in the input data of another predictive model. - -The VAE outlier detection algorithm predicts whether the input features are an outlier or not, dependent on a threshold level set by the user. The algorithm needs to be pretrained first on a batch of (preferably) inliers. - -As observations arrive, the algorithm will: -- scale (standardize or minmax) the input features -- first encode, and then decode the input data in an attempt to reconstruct the initial observations -- compute a reconstruction error between the output of the decoder and the input data -- predict that the observation is an outlier if the error is larger than the threshold level - -## Why Variational Auto-Encoders? - -An Auto-Encoder is an algorithm that consists of 2 main building blocks: an encoder and a decoder. The encoder tries to find a compressed representation of the input data. The compressed data is then fed into the decoder, which aims to replicate the input data. Both the encoder and decoder are typically implemented with neural networks. The loss function to be minimized with stochastic gradient descent is a distance function between the input data and output of the decoder, and is called the reconstruction error. - -If we train the Auto-Encoder with inliers, it will be able to replicate new inlier data well with a low reconstruction error. However, if outliers are fed to the Auto-Encoder, the reconstruction error becomes large and we can classify the observation as an anomaly. - -A Variational Auto-Encoder adds constraints to the encoded representations of the input. The encodings are parameters of a probability distribution modeling the data.
-
-A Variational Auto-Encoder adds constraints to the encoded representations of the input. The encodings are parameters of a probability distribution modeling the data. The decoder can then generate new data by sampling from the learned distribution.
-
-## Implementation
-
-### 1. Building the VAE model
-
-The VAE model definition in ```model.py``` takes 4 arguments that define the architecture:
-- the number of features in the input
-- the number of hidden layers used in the encoder and decoder
-- the dimension of the latent variable
-- the dimensions of each hidden layer
-
-``` python
-def model(n_features, hidden_layers=1, latent_dim=2, hidden_dim=None,
-          output_activation='sigmoid', learning_rate=0.001):
-    """ Build VAE model.
-
-    Arguments:
-        - n_features (int): number of features in the data
-        - hidden_layers (int): number of hidden layers used in encoder/decoder
-        - latent_dim (int): dimension of latent variable
-        - hidden_dim (list): list with dimension of each hidden layer (inferred from n_features if not provided)
-        - output_activation (str): activation type for last dense layer in the decoder
-        - learning_rate (float): learning rate used during training
-    """
-```
-
-First, the input data is fed into the encoder and compressed by mapping it onto the latent space, which defines the probability distribution of the encodings:
-
-``` python
-    # encoder
-    inputs = Input(shape=(n_features,), name='encoder_input')
-    # define hidden layers
-    enc_hidden = Dense(hidden_dim[0], activation='relu', name='encoder_hidden_0')(inputs)
-    i = 1
-    while i < hidden_layers:
-        enc_hidden = Dense(hidden_dim[i],activation='relu',name='encoder_hidden_'+str(i))(enc_hidden)
-        i+=1
-
-    z_mean = Dense(latent_dim, name='z_mean')(enc_hidden)
-    z_log_var = Dense(latent_dim, name='z_log_var')(enc_hidden)
-```
-
-We can then sample data from the latent space.
-
-``` python
-def sampling(args):
-    """ Reparameterization trick by sampling from an isotropic unit Gaussian.
-
-    Arguments:
-        - args (tensor): mean and log of variance of Q(z|X)
-
-    Returns:
-        - z (tensor): sampled latent vector
-    """
-    z_mean, z_log_var = args
-    batch = K.shape(z_mean)[0]
-    dim = K.int_shape(z_mean)[1]
-    epsilon = K.random_normal(shape=(batch, dim)) # by default, random_normal has mean=0 and std=1.0
-    return z_mean + K.exp(0.5 * z_log_var) * epsilon # mean + stdev * eps
-```
-
-``` python
-    # reparametrization trick to sample z
-    z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])
-```
-
-The sampled data passes through the decoder, which aims to reconstruct the input.
-
-``` python
-    # decoder
-    latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
-    # define hidden layers
-    dec_hidden = Dense(hidden_dim[-1], activation='relu', name='decoder_hidden_0')(latent_inputs)
-
-    i = 2
-    while i < hidden_layers+1:
-        dec_hidden = Dense(hidden_dim[-i],activation='relu',name='decoder_hidden_'+str(i-1))(dec_hidden)
-        i+=1
-
-    outputs = Dense(n_features, activation=output_activation, name='decoder_output')(dec_hidden)
-```
-
-The loss function is the sum of the reconstruction error and the KL-divergence. While the reconstruction error quantifies how well we can recreate the input data, the KL-divergence measures how close the latent representation is to the unit Gaussian distribution. This trade-off is important because we want our encodings to parameterize a probability distribution from which we can sample data.
-
-``` python
-    # define VAE loss, optimizer and compile model
-    reconstruction_loss = mse(inputs, outputs)
-    reconstruction_loss *= n_features
-    kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
-    kl_loss = K.sum(kl_loss, axis=-1)
-    kl_loss *= -0.5
-    vae_loss = K.mean(reconstruction_loss + kl_loss)
-    vae.add_loss(vae_loss)
-```
-
-### 2. Training the model
-
-The VAE model can be trained on a batch of inliers by running the ```train.py``` script with the desired hyperparameters:
-
-``` python
-!python train.py \
---dataset 'kddcup99' \
---samples 50000 \
---keep_cols "$cols_str" \
---hidden_layers 1 \
---latent_dim 2 \
---hidden_dim 9 \
---output_activation 'sigmoid' \
---clip 999999 \
---standardized \
---epochs 10 \
---batch_size 32 \
---learning_rate 0.001 \
---print_progress \
---model_name 'vae' \
---save \
---save_path './models/'
-```
-
-The model weights and hyperparameters are saved in the folder specified by "save_path".
-
-### 3. Making predictions
-
-In order to make predictions, which can then be served by Seldon Core, the pre-trained model weights and hyperparameters are loaded when defining an OutlierVAE object. The "threshold" argument defines the reconstruction error above which a sample is classified as an outlier. The threshold is a key hyperparameter and needs to be picked carefully for each application. The OutlierVAE class inherits from the CoreVAE class in ```CoreVAE.py```.
-
-```python
-class CoreVAE(object):
-    """ Outlier detection using variational autoencoders (VAE).
-
-    Parameters
-    ----------
-    threshold (float) : reconstruction error (mse) threshold used to classify outliers
-    reservoir_size (int) : number of observations kept in memory using reservoir sampling
-
-    Functions
-    ----------
-    reservoir_sampling : applies reservoir sampling to incoming data
-    predict : detect and return outliers
-    transform_input : detect outliers and return input features
-    send_feedback : add target labels as part of the feedback loop
-    tags : add metadata for input transformer
-    metrics : return custom metrics
-    """
-
-    def __init__(self,threshold=10,reservoir_size=50000,model_name='vae',load_path='./models/'):
-
-        logger.info("Initializing model")
-        self.threshold = threshold
-        self.reservoir_size = reservoir_size
-        self.batch = []
-        self.N = 0 # total sample count up until now for reservoir sampling
-        self.nb_outliers = 0
-
-        # load model architecture parameters
-        with open(load_path + model_name + '.pickle', 'rb') as f:
-            n_features, hidden_layers, latent_dim, hidden_dim, output_activation = pickle.load(f)
-
-        # instantiate model
-        self.vae = model(n_features,hidden_layers=hidden_layers,latent_dim=latent_dim,
-                         hidden_dim=hidden_dim,output_activation=output_activation)
-        self.vae.load_weights(load_path + model_name + '_weights.h5') # load pretrained model weights
-        self.vae._make_predict_function()
-
-        # load data preprocessing info
-        with open(load_path + 'preprocess_' + model_name + '.pickle', 'rb') as f:
-            preprocess = pickle.load(f)
-        self.preprocess, self.clip, self.axis = preprocess[:3]
-        if self.preprocess=='minmax':
-            self.xmin, self.xmax = preprocess[3:5]
-            self.min, self.max = preprocess[5:]
-        elif self.preprocess=='standardized':
-            self.mu, self.sigma = preprocess[3:]
-```
-
-``` python
-class OutlierVAE(CoreVAE):
-    """ Outlier detection using variational autoencoders (VAE).
-
-    Parameters
-    ----------
-    threshold (float) : reconstruction error (mse) threshold used to classify outliers
-    reservoir_size (int) : number of observations kept in memory using reservoir sampling
-
-    Functions
-    ----------
-    send_feedback : add target labels as part of the feedback loop
-    metrics : return custom metrics
-    """
-
-    def __init__(self,threshold=10,reservoir_size=50000,model_name='vae',load_path='./models/'):
-
-        super().__init__(threshold=threshold,reservoir_size=reservoir_size,
-                         model_name=model_name,load_path=load_path)
-```
-
-The actual outlier detection is done by the ```_get_preds``` method, which is invoked by ```predict``` or ```transform_input```, depending on whether the detector is defined as a model or as a transformer.
-
-```python
-def predict(self, X, feature_names):
-    """ Return outlier predictions.
-
-    Parameters
-    ----------
-    X : array-like
-    feature_names : array of feature names (optional)
-    """
-    logger.info("Using component as a model")
-    return self._get_preds(X)
-```
-
-```python
-def transform_input(self, X, feature_names):
-    """ Transform the input.
-    Used when the outlier detector sits on top of another model.
-
-    Parameters
-    ----------
-    X : array-like
-    feature_names : array of feature names (optional)
-    """
-    logger.info("Using component as an outlier-detector transformer")
-    self.prediction_meta = self._get_preds(X)
-    return X
-```
-
-In ```_get_preds```, the observations are first clipped. If the number of observations fed to the outlier detector up until now is at least equal to the defined reservoir size, the feature-wise scaling parameters are updated using the observations in the reservoir. The reservoir itself is updated with each incoming observation using reservoir sampling. The input data is then scaled using either standardization or minmax scaling.
-
-``` python
-    # clip data per feature
-    X = np.clip(X,[-c for c in self.clip],self.clip)
-
-    if self.N < self.reservoir_size:
-        update_stand = False
-    else:
-        update_stand = True
-
-    self.reservoir_sampling(X,update_stand=update_stand)
-
-    # apply scaling
-    if self.preprocess=='minmax':
-        X_scaled = ((X - self.xmin) / (self.xmax - self.xmin)) * (self.max - self.min) + self.min
-    elif self.preprocess=='standardized':
-        X_scaled = (X - self.mu) / (self.sigma + 1e-10)
-```
-
-We then make multiple predictions for an observation by sampling N times from the latent space. The mean squared error between the input data and the output of the decoder is averaged across the N samples. If this value is above the threshold, an outlier is predicted.
-
-``` python
-    # sample latent variables and calculate reconstruction errors
-    N = 10
-    mse = np.zeros([X.shape[0],N])
-    for i in range(N):
-        preds = self.vae.predict(X_scaled)
-        mse[:,i] = np.mean(np.power(X_scaled - preds, 2), axis=1)
-    self.mse = np.mean(mse, axis=1)
-
-    # make prediction
-    self.prediction = np.array([1 if e > self.threshold else 0 for e in self.mse]).astype(int)
-```
-
-## References
-
-Diederik P. Kingma and Max Welling. Auto-Encoding Variational Bayes. ICLR 2014.
-- https://arxiv.org/pdf/1312.6114.pdf
-
-Francois Chollet. Building Autoencoders in Keras.
-- https://blog.keras.io/building-autoencoders-in-keras.html
\ No newline at end of file
diff --git a/components/outlier-detection/vae/model.py b/components/outlier-detection/vae/model.py
deleted file mode 100644
index e54c61e65f..0000000000
--- a/components/outlier-detection/vae/model.py
+++ /dev/null
@@ -1,92 +0,0 @@
-from keras.layers import Lambda, Input, Dense
-from keras.models import Model
-from keras.losses import mse
-from keras import backend as K
-from keras.optimizers import Adam
-import numpy as np
-
-def sampling(args):
-    """ Reparameterization trick by sampling from an isotropic unit Gaussian.
-
-    Arguments:
-        - args (tensor): mean and log of variance of Q(z|X)
-
-    Returns:
-        - z (tensor): sampled latent vector
-    """
-    z_mean, z_log_var = args
-    batch = K.shape(z_mean)[0]
-    dim = K.int_shape(z_mean)[1]
-    epsilon = K.random_normal(shape=(batch, dim)) # by default, random_normal has mean=0 and std=1.0
-    return z_mean + K.exp(0.5 * z_log_var) * epsilon # mean + stdev * eps
-
-def model(n_features, hidden_layers=1, latent_dim=2, hidden_dim=None,
-          output_activation='sigmoid', learning_rate=0.001):
-    """ Build VAE model.
-
-    Arguments:
-        - n_features (int): number of features in the data
-        - hidden_layers (int): number of hidden layers used in encoder/decoder
-        - latent_dim (int): dimension of latent variable
-        - hidden_dim (list): list with dimension of each hidden layer (inferred from n_features if not provided)
-        - output_activation (str): activation type for last dense layer in the decoder
-        - learning_rate (float): learning rate used during training
-    """
-
-    # set dimensions hidden layers
-    # (a None default avoids the mutable default argument pitfall: appending to a
-    #  shared default list would leak dimensions between calls)
-    if not hidden_dim:
-        hidden_dim = []
-        i = 0
-        dim = n_features
-        while i < hidden_layers:
-            hidden_dim.append(int(np.max([dim/2,2])))
-            dim/=2
-            i+=1
-
-    # VAE = encoder + decoder
-    # encoder
-    inputs = Input(shape=(n_features,), name='encoder_input')
-    # define hidden layers
-    enc_hidden = Dense(hidden_dim[0], activation='relu', name='encoder_hidden_0')(inputs)
-    i = 1
-    while i < hidden_layers:
-        enc_hidden = Dense(hidden_dim[i],activation='relu',name='encoder_hidden_'+str(i))(enc_hidden)
-        i+=1
-
-    z_mean = Dense(latent_dim, name='z_mean')(enc_hidden)
-    z_log_var = Dense(latent_dim, name='z_log_var')(enc_hidden)
-    # reparametrization trick to sample z
-    z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])
-    # instantiate encoder model
-    encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
-
-    # decoder
-    latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
-    # define hidden layers
-    dec_hidden = Dense(hidden_dim[-1], activation='relu', name='decoder_hidden_0')(latent_inputs)
-
-    i = 2
-    while i < hidden_layers+1:
-        dec_hidden = Dense(hidden_dim[-i],activation='relu',name='decoder_hidden_'+str(i-1))(dec_hidden)
-        i+=1
-
-    outputs = Dense(n_features, activation=output_activation, name='decoder_output')(dec_hidden)
-    # instantiate decoder model
-    decoder = Model(latent_inputs, outputs, name='decoder')
-
-    # instantiate VAE model
-    outputs = decoder(encoder(inputs)[2])
-    vae = Model(inputs, outputs, name='vae')
-
-    # define VAE loss, optimizer and compile model
-    reconstruction_loss = mse(inputs, outputs)
-    reconstruction_loss *= n_features
-    kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
-    kl_loss = K.sum(kl_loss, axis=-1)
-    kl_loss *= -0.5
-    vae_loss = K.mean(reconstruction_loss + kl_loss)
-    vae.add_loss(vae_loss)
-
-    optimizer = Adam(lr=learning_rate)
-    vae.compile(optimizer=optimizer)
-
-    return vae
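-
-if __name__ == '__main__':
-    # minimal smoke test with hypothetical dimensions (an illustration, not part of
-    # the Seldon demo): build a VAE for 18 features with one hidden layer of 9 units
-    vae = model(18, hidden_layers=1, latent_dim=2, hidden_dim=[9])
-    vae.summary()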
\ No newline at end of file
diff --git a/components/outlier-detection/vae/models/preprocess_vae.pickle b/components/outlier-detection/vae/models/preprocess_vae.pickle
deleted file mode 100644
index 0496d54491..0000000000
Binary files a/components/outlier-detection/vae/models/preprocess_vae.pickle and /dev/null differ
diff --git a/components/outlier-detection/vae/models/vae.pickle b/components/outlier-detection/vae/models/vae.pickle
deleted file mode 100644
index 7dc3774cc1..0000000000
Binary files a/components/outlier-detection/vae/models/vae.pickle and /dev/null differ
diff --git a/components/outlier-detection/vae/models/vae_weights.h5 b/components/outlier-detection/vae/models/vae_weights.h5
deleted file mode 100644
index a6d56ac30b..0000000000
Binary files a/components/outlier-detection/vae/models/vae_weights.h5 and /dev/null differ
diff --git a/components/outlier-detection/vae/outlier_vae.ipynb b/components/outlier-detection/vae/outlier_vae.ipynb
deleted file mode 100644
index 7290ac7c7e..0000000000
--- a/components/outlier-detection/vae/outlier_vae.ipynb
+++ /dev/null
@@ -1,622 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# VAE (variational autoencoder) outlier detector deployment"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Wrap a Keras VAE python model for use as a prediction microservice in seldon-core, and deploy it on seldon-core running on Minikube or on a Kubernetes cluster using GCP."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Dependencies"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "- [helm](https://github.com/helm/helm)\n",
-    "- [minikube](https://github.com/kubernetes/minikube)\n",
-    "- [s2i](https://github.com/openshift/source-to-image) >= 1.1.13\n",
-    "\n",
-    "python packages:\n",
-    "- keras: pip install keras\n",
-    "- tensorflow: https://www.tensorflow.org/install/pip\n",
-    "- scikit-learn: pip install scikit-learn (tested with 0.20.1)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Task"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The outlier detector needs to detect computer network intrusions using TCP dump data for a local-area network (LAN) simulating a typical U.S. Air Force LAN. A connection is a sequence of TCP packets starting and ending at some well-defined times, between which data flows to and from a source IP address to a target IP address under some well-defined protocol. Each connection is labeled as either normal, or as an attack.\n",
-    "\n",
-    "There are 4 types of attacks in the dataset:\n",
-    "- DOS: denial-of-service, e.g. syn flood;\n",
-    "- R2L: unauthorized access from a remote machine, e.g. guessing password;\n",
-    "- U2R: unauthorized access to local superuser (root) privileges;\n",
-    "- probing: surveillance and other probing, e.g., port scanning.\n",
-    "\n",
-    "The dataset contains about 5 million connection records.\n",
-    "\n",
-    "There are 3 types of features:\n",
-    "- basic features of individual connections, e.g. duration of connection\n",
-    "- content features within a connection, e.g. number of failed login attempts\n",
-    "- traffic features within a 2 second window, e.g. number of connections to the same host as the current connection\n",
-    "\n",
-    "The outlier detector only uses the continuous features (18 out of 41)."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Train locally\n",
-    "\n",
-    "Train on a small dataset of normal traffic."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# define columns to keep\n", - "cols=['srv_count','serror_rate','srv_serror_rate','rerror_rate',\n", - " 'srv_rerror_rate','same_srv_rate','diff_srv_rate',\n", - " 'srv_diff_host_rate','dst_host_count','dst_host_srv_count',\n", - " 'dst_host_same_srv_rate','dst_host_diff_srv_rate',\n", - " 'dst_host_same_src_port_rate','dst_host_srv_diff_host_rate',\n", - " 'dst_host_serror_rate','dst_host_srv_serror_rate',\n", - " 'dst_host_rerror_rate','dst_host_srv_rerror_rate','target']\n", - "cols_str = str(cols)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!python train.py \\\n", - "--dataset 'kddcup99' \\\n", - "--samples 50000 \\\n", - "--keep_cols \"$cols_str\" \\\n", - "--hidden_layers 1 \\\n", - "--latent_dim 2 \\\n", - "--hidden_dim 9 \\\n", - "--output_activation 'sigmoid' \\\n", - "--clip 999999 \\\n", - "--standardized \\\n", - "--epochs 10 \\\n", - "--batch_size 32 \\\n", - "--learning_rate 0.001 \\\n", - "--print_progress \\\n", - "--model_name 'vae' \\\n", - "--save \\\n", - "--save_path './models/'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test using Kubernetes cluster on GCP or Minikube" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the outlier detector as a model or a transformer. If you want to run the anomaly detector as a transformer, change the SERVICE_TYPE variable from MODEL to TRANSFORMER [here](./.s2i/environment), set MODEL = False and change ```OutlierVAE.py``` to:\n", - "\n", - "```python\n", - "from CoreVAE import CoreVAE\n", - "\n", - "class OutlierVAE(CoreVAE):\n", - " \"\"\" Outlier detection using variational autoencoders (VAE).\n", - " \n", - " Parameters\n", - " ----------\n", - " threshold (float) : reconstruction error (mse) threshold used to classify outliers\n", - " reservoir_size (int) : number of observations kept in memory using reservoir sampling\n", - " \"\"\"\n", - " \n", - " def __init__(self,threshold=10,reservoir_size=50000,model_name='vae',load_path='./models/'):\n", - " \n", - " super().__init__(threshold=threshold,reservoir_size=reservoir_size,\n", - " model_name=model_name,load_path=load_path)\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MODEL = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Pick Kubernetes cluster on GCP or Minikube." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MINIKUBE = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "if MINIKUBE:\n", - " !minikube start --memory 4096\n", - "else:\n", - " !gcloud container clusters get-credentials standard-cluster-1 --zone europe-west1-b --project seldon-demos" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a cluster-wide cluster-admin role assigned to a service account named “default” in the namespace “kube-system”." 
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!kubectl create clusterrolebinding kube-system-cluster-admin --clusterrole=cluster-admin \\\n",
-    "--serviceaccount=kube-system:default"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!kubectl create namespace seldon"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Add current context details to the configuration file in the seldon namespace."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!kubectl config set-context $(kubectl config current-context) --namespace=seldon"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Create a tiller service account and give it a cluster-wide cluster-admin role."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!kubectl -n kube-system create sa tiller\n",
-    "!kubectl create clusterrolebinding tiller --clusterrole cluster-admin --serviceaccount=kube-system:tiller\n",
-    "!helm init --service-account tiller"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Check the deployment rollout status and deploy the seldon/spartakus helm charts."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!kubectl rollout status deploy/tiller-deploy -n kube-system"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "!helm install ../../../helm-charts/seldon-core-operator --name seldon-core --set usage_metrics.enabled=true --namespace seldon-system"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Check the deployment rollout status for seldon core."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!kubectl rollout status deploy/seldon-controller-manager -n seldon-system"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Install the Ambassador API gateway."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!helm install stable/ambassador --name ambassador --set crds.keep=false"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!kubectl rollout status deployment.apps/ambassador"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "If Minikube is used: create the docker image for the outlier detector inside Minikube using s2i. Besides the transformer image and the demo-specific model image, the general model image for the VAE outlier detector is also available from Docker Hub as ***seldonio/outlier-vae-model:0.1***."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "if MINIKUBE & MODEL:\n",
-    "    !eval $(minikube docker-env) && \\\n",
-    "    s2i build . seldonio/seldon-core-s2i-python3:0.4 seldonio/outlier-vae-model-demo:0.1\n",
-    "elif MINIKUBE:\n",
-    "    !eval $(minikube docker-env) && \\\n",
-    "    s2i build . seldonio/seldon-core-s2i-python3:0.4 seldonio/outlier-vae-transformer:0.1"
-   ]
-  },
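-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "An optional sanity check (not part of the original demo): verify that the freshly built image is visible to the Minikube docker daemon before deploying.\n",
-    "\n",
-    "```\n",
-    "eval $(minikube docker-env) && docker images | grep outlier-vae\n",
-    "```"
-   ]
-  },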
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Install the outlier detector helm charts either as a model or a transformer, and set the *threshold* and *reservoir_size* hyperparameter values."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if MODEL:\n",
-    "    !helm install ../../../helm-charts/seldon-od-model \\\n",
-    "    --name outlier-detector \\\n",
-    "    --namespace=seldon \\\n",
-    "    --set model.type=vae \\\n",
-    "    --set model.vae.image.name=seldonio/outlier-vae-model-demo:0.1 \\\n",
-    "    --set model.vae.threshold=10 \\\n",
-    "    --set model.vae.reservoir_size=50000 \\\n",
-    "    --set oauth.key=oauth-key \\\n",
-    "    --set oauth.secret=oauth-secret \\\n",
-    "    --set replicas=1\n",
-    "else:\n",
-    "    !helm install ../../../helm-charts/seldon-od-transformer \\\n",
-    "    --name outlier-detector \\\n",
-    "    --namespace=seldon \\\n",
-    "    --set outlierDetection.enabled=true \\\n",
-    "    --set outlierDetection.name=outlier-vae \\\n",
-    "    --set outlierDetection.type=vae \\\n",
-    "    --set outlierDetection.vae.image.name=seldonio/outlier-vae-transformer:0.1 \\\n",
-    "    --set outlierDetection.vae.threshold=10 \\\n",
-    "    --set outlierDetection.vae.reservoir_size=50000 \\\n",
-    "    --set oauth.key=oauth-key \\\n",
-    "    --set oauth.secret=oauth-secret \\\n",
-    "    --set model.image.name=seldonio/mock_classifier:1.0"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Port forward Ambassador\n",
-    "\n",
-    "Run this command in a terminal:"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "```\n",
-    "kubectl port-forward $(kubectl get pods -n seldon -l app.kubernetes.io/name=ambassador -o jsonpath='{.items[0].metadata.name}') -n seldon 8003:8080\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Import REST requests, load data and test requests"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from utils import get_payload, rest_request_ambassador, send_feedback_rest, get_kdd_data, generate_batch\n",
-    "\n",
-    "data = get_kdd_data(keep_cols=cols,percent10=True) # load dataset\n",
-    "print(data.shape)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Generate a random batch from the data."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "\n",
-    "samples = 1\n",
-    "fraction_outlier = 0.\n",
-    "X, labels = generate_batch(data,samples,fraction_outlier)\n",
-    "print(X.shape)\n",
-    "print(labels.shape)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Test the REST requests with the generated data. It is important that the order of the requests is respected: first we make predictions, then we get the \"true\" labels back using the feedback request. If we do not respect the order and, e.g., keep making predictions without getting the feedback for each prediction, there will be a mismatch between the predicted and \"true\" labels. This will result in errors in the produced metrics."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "request = get_payload(X)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "response = rest_request_ambassador(\"outlier-detector\",\"seldon\",request,endpoint=\"localhost:8003\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "If the outlier detector is used as a transformer, the output of the anomaly detection is added as part of the metadata. If it is used as a model, we send model feedback to retrieve custom performance metrics."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "if MODEL:\n",
-    "    send_feedback_rest(\"outlier-detector\",\"seldon\",request,response,0,labels,endpoint=\"localhost:8003\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Analytics"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Install the helm charts for Prometheus and the Grafana dashboard."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "!helm install ../../../helm-charts/seldon-core-analytics --name seldon-core-analytics \\\n",
-    "    --set grafana_prom_admin_password=password \\\n",
-    "    --set persistence.enabled=false \\\n",
-    "    --namespace seldon"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Port forward Grafana dashboard"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Run this command in a terminal:"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "```\n",
-    "kubectl port-forward $(kubectl get pods -n seldon -l app=grafana-prom-server -o jsonpath='{.items[0].metadata.name}') -n seldon 3000:3000\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "You can then view the analytics dashboard inside the cluster at http://localhost:3000/dashboard/db/prediction-analytics?refresh=5s&orgId=1. On Minikube the IP address may differ; get it via ```minikube ip```. Log in with:\n",
-    "\n",
-    "Username: admin\n",
-    "\n",
-    "Password: password (as set when starting seldon-core-analytics above)\n",
-    "\n",
-    "Import the outlier-detector-vae dashboard from ../../../helm-charts/seldon-core-analytics/files/grafana/configs."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Run simulation\n",
-    "\n",
-    "- Sample random network intrusion data with a certain outlier probability.\n",
-    "- Get the payload for the observation.\n",
-    "- Make a prediction.\n",
-    "- Send the \"true\" label with the feedback if the detector is run as a model.\n",
-    "\n",
-    "It is important that the prediction-feedback order is maintained; otherwise there will be a mismatch between the predicted and \"true\" labels.\n",
-    "\n",
-    "View the progress on the Grafana \"Outlier Detection\" dashboard. Most metrics need the outlier detector to be run as a model since they need model feedback."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import time\n", - "n_requests = 100\n", - "samples = 1\n", - "for i in range(n_requests):\n", - " fraction_outlier = .1\n", - " X, labels = generate_batch(data,samples,fraction_outlier)\n", - " request = get_payload(X)\n", - " response = rest_request_ambassador(\"outlier-detector\",\"seldon\",request,endpoint=\"localhost:8003\")\n", - " if MODEL:\n", - " send_feedback_rest(\"outlier-detector\",\"seldon\",request,response,0,labels,endpoint=\"localhost:8003\")\n", - " time.sleep(1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if MINIKUBE:\n", - " !minikube delete" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/components/outlier-detection/vae/requirements.txt b/components/outlier-detection/vae/requirements.txt deleted file mode 100644 index dcf2c1bcef..0000000000 --- a/components/outlier-detection/vae/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -keras==2.2.2 -tensorflow==1.15.2 -numpy==1.14.5 -argparse==1.1 -pandas==0.23.4 -scikit-learn==0.19.1 -requests>=2.20.0 diff --git a/components/outlier-detection/vae/train.py b/components/outlier-detection/vae/train.py deleted file mode 100644 index 333c1f0501..0000000000 --- a/components/outlier-detection/vae/train.py +++ /dev/null @@ -1,147 +0,0 @@ -import argparse -from keras.callbacks import ModelCheckpoint -import numpy as np -import pickle -import random - -from model import model -from utils import get_kdd_data, generate_batch - -np.random.seed(2018) -np.random.RandomState(2018) -random.seed(2018) - -# default args -DATASET = 'kddcup99' -SAMPLES = 50000 -COLS = str(['srv_count','serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate', - 'same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count', - 'dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate', - 'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate', - 'dst_host_rerror_rate','dst_host_srv_rerror_rate','target']) -MODEL_NAME = 'vae' -SAVE_PATH = './models/' - -# data preprocessing -STANDARDIZED = False -MINMAX = False -CLIP = [99999] - -# architecture -HIDDEN_LAYERS = 2 -LATENT_DIM = 2 -HIDDEN_DIM = [15,7] -OUTPUT_ACTIVATION = 'sigmoid' - -# training -EPOCHS = 20 -BATCH_SIZE = 32 -LEARNING_RATE = .001 -SAVE = False -PRINT_PROGRESS = False -CONTINUE_TRAINING = False -LOAD_PATH = SAVE_PATH - -def train(model,X,args): - """ Train VAE. 
""" - - # clip data per feature - X = np.clip(X,[-c for c in args.clip],args.clip) - - # apply scaling and save data preprocessing method - axis = 0 - if args.standardized: - print('\nStandardizing data') - mu, sigma = np.mean(X,axis=axis), np.std(X,axis=axis) - X = (X - mu) / (sigma + 1e-10) - - with open(args.save_path + 'preprocess_' + args.model_name + '.pickle', 'wb') as f: - pickle.dump(['standardized',args.clip,axis,mu,sigma], f) - - if args.minmax: - print('\nMinmax scaling of data') - xmin, xmax = X.min(axis=axis), X.max(axis=axis) - min, max = 0, 1 - X = ((X - xmin) / (xmax - xmin)) * (max - min) + min - - with open(args.save_path + 'preprocess_' + args.model_name + '.pickle', 'wb') as f: - pickle.dump(['minmax',args.clip,axis,xmin,xmax,min,max], f) - - # set training arguments - if args.print_progress: - verbose = 1 - else: - verbose = 0 - - kwargs = {} - kwargs['epochs'] = args.epochs - kwargs['batch_size'] = args.batch_size - kwargs['shuffle'] = True - kwargs['validation_data'] = (X,None) - kwargs['verbose'] = verbose - - if args.save: # create callback - checkpointer = ModelCheckpoint(filepath=args.save_path + args.model_name + '_weights.h5',verbose=0, - save_best_only=True,save_weights_only=True) - kwargs['callbacks'] = [checkpointer] - - # save model architecture - with open(args.save_path + args.model_name + '.pickle', 'wb') as f: - pickle.dump([X.shape[1],args.hidden_layers,args.latent_dim, - args.hidden_dim,args.output_activation],f) - - model.fit(X,**kwargs) - -def run(args): - """ Load data, generate training batch, initiate model and train VAE. """ - - print('\nLoad dataset') - if args.dataset=='kddcup99': - keep_cols = args.keep_cols[1:-1].replace("'","").replace(" ","").split(",") - data = get_kdd_data(keep_cols=keep_cols) - else: - raise ValueError('Only "kddcup99" dataset supported.') - - print('\nGenerate training batch') - X, _ = generate_batch(data,args.samples,0.) 
-
-    print('\nInitiate outlier detector model')
-    n_features = data.shape[1]-1 # nb of features
-    vae = model(n_features,hidden_layers=args.hidden_layers,latent_dim=args.latent_dim,hidden_dim=args.hidden_dim,
-                output_activation=args.output_activation,learning_rate=args.learning_rate)
-
-    if args.continue_training:
-        print('\nLoad pre-trained model')
-        vae.load_weights(args.load_path + args.model_name + '_weights.h5') # load pretrained model weights
-
-    if args.print_progress:
-        vae.summary()
-
-    print('\nTrain outlier detector')
-    train(vae,X,args)
-
-if __name__ == '__main__':
-
-    parser = argparse.ArgumentParser(description="Train VAE outlier detector.")
-    # note: choices must be a list; a bare string would let any substring pass the check
-    parser.add_argument('--dataset',type=str,choices=[DATASET],default=DATASET)
-    parser.add_argument('--samples',type=int,default=SAMPLES)
-    parser.add_argument('--keep_cols',type=str,default=COLS)
-    parser.add_argument('--hidden_layers',type=int,default=HIDDEN_LAYERS)
-    parser.add_argument('--latent_dim',type=int,default=LATENT_DIM)
-    parser.add_argument('--hidden_dim',type=int,nargs='+',default=HIDDEN_DIM)
-    parser.add_argument('--output_activation',type=str,default=OUTPUT_ACTIVATION)
-    parser.add_argument('--epochs',type=int,default=EPOCHS)
-    parser.add_argument('--batch_size',type=int,default=BATCH_SIZE)
-    parser.add_argument('--learning_rate',type=float,default=LEARNING_RATE)
-    parser.add_argument('--clip',type=float,nargs='+',default=CLIP)
-    parser.add_argument('--standardized', default=STANDARDIZED, action='store_true')
-    parser.add_argument('--minmax', default=MINMAX, action='store_true')
-    parser.add_argument('--print_progress', default=PRINT_PROGRESS, action='store_true')
-    parser.add_argument('--save', default=SAVE, action='store_true')
-    parser.add_argument('--save_path',type=str,default=SAVE_PATH)
-    parser.add_argument('--load_path',type=str,default=LOAD_PATH)
-    parser.add_argument('--model_name',type=str,default=MODEL_NAME)
-    parser.add_argument('--continue_training', default=CONTINUE_TRAINING, action='store_true')
-    args = parser.parse_args()
-
-    run(args)
\ No newline at end of file
diff --git a/components/outlier-detection/vae/utils.py b/components/outlier-detection/vae/utils.py
deleted file mode 100644
index 569dd54ba9..0000000000
--- a/components/outlier-detection/vae/utils.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import collections
-import json
-import numpy as np
-import pandas as pd
-import requests
-from sklearn.datasets import fetch_kddcup99
-from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, fbeta_score
-
-pd.options.mode.chained_assignment = None  # default='warn'
-
-def get_kdd_data(target=['dos','r2l','u2r','probe'],
-                 keep_cols=['srv_count','serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate',
-                            'same_srv_rate','diff_srv_rate','srv_diff_host_rate','dst_host_count','dst_host_srv_count',
-                            'dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate',
-                            'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate',
-                            'dst_host_rerror_rate','dst_host_srv_rerror_rate','target'],
-                 percent10=False):
-    """ Load KDD Cup 1999 data and return in dataframe.
""" - - data_raw = fetch_kddcup99(subset=None, data_home=None, percent10=percent10) - - # specify columns - cols=['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot', - 'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations', - 'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count', - 'serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate', - 'srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate', - 'dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate', - 'dst_host_rerror_rate','dst_host_srv_rerror_rate'] - - # create dataframe - data = pd.DataFrame(data=data_raw['data'],columns=cols) - - # add target to dataframe - data['attack_type'] = data_raw['target'] - - # specify and map attack types - attack_list = np.unique(data['attack_type']) - attack_category = ['dos','u2r','r2l','r2l','r2l','probe','dos','u2r','r2l','dos','probe','normal','u2r', - 'r2l','dos','probe','u2r','probe','dos','r2l','dos','r2l','r2l'] - - attack_types = {} - for i,j in zip(attack_list,attack_category): - attack_types[i] = j - - data['attack_category'] = 'normal' - for key,value in attack_types.items(): - data['attack_category'][data['attack_type'] == key] = value - - # define target - data['target'] = 0 - for t in target: - data['target'][data['attack_category'] == t] = 1 - - # define columns to be dropped - drop_cols = [] - for col in data.columns.values: - if col not in keep_cols: - drop_cols.append(col) - - if drop_cols!=[]: - data.drop(columns=drop_cols,inplace=True) - - return data - - -def sample_df(df,n): - """ Sample from df. """ - if n < df.shape[0]+1: - replace = False - else: - replace = True - return df.sample(n=n,replace=replace) - - -def generate_batch(data,n_samples,frac_outliers): - """ Generate random batch from data with fixed size and fraction of outliers. """ - - normal = data[data['target']==0] - outlier = data[data['target']==1] - - if n_samples==1: - n_outlier = np.random.binomial(1,frac_outliers) - n_normal = 1 - n_outlier - else: - n_normal = int((1-frac_outliers) * n_samples) - n_outlier = int(frac_outliers * n_samples) - - batch_normal = sample_df(normal,n_normal) - batch_outlier = sample_df(outlier,n_outlier) - - batch = pd.concat([batch_normal,batch_outlier]) - batch = batch.sample(frac=1).reset_index(drop=True) - - outlier_true = batch['target'].values - batch.drop(columns=['target'],inplace=True) - - return batch.values.astype('float'), outlier_true - -def flatten(x): - if isinstance(x, collections.Iterable): - return [a for i in x for a in flatten(i)] - else: - return [x] - -def performance(y_true,y_pred,roll_window=100): - """ Return a confusion matrix and calculate rolling accuracy, precision, recall, F1 and F2 scores. 
""" - - # confusion matrix - cm = confusion_matrix(y_true,y_pred,labels=[0,1]) - tn, fp, fn, tp = cm.ravel() - - # total scores - acc_tot = accuracy_score(y_true,y_pred) - prec_tot = precision_score(y_true,y_pred) - rec_tot = recall_score(y_true,y_pred) - f1_tot = f1_score(y_true,y_pred) - f2_tot = fbeta_score(y_true,y_pred,beta=2) - - # rolling scores - y_true_roll = y_true[-roll_window:] - y_pred_roll = y_pred[-roll_window:] - acc_roll = accuracy_score(y_true_roll,y_pred_roll) - prec_roll = precision_score(y_true_roll,y_pred_roll) - rec_roll = recall_score(y_true_roll,y_pred_roll) - f1_roll = f1_score(y_true_roll,y_pred_roll) - f2_roll = fbeta_score(y_true_roll,y_pred_roll,beta=2) - - scores = [tn, fp, fn, tp, acc_tot, prec_tot, rec_tot, f1_tot, f2_tot, - acc_roll, prec_roll, rec_roll, f1_roll, f2_roll] - - return scores - -def outlier_stats(y_true,y_pred,roll_window=100): - """ Calculate number and percentage of predicted and labeled outliers. """ - - y_pred_roll = np.sum(y_pred[-roll_window:]) - y_true_roll = np.sum(y_true[-roll_window:]) - y_pred_tot = np.sum(y_pred) - y_true_tot = np.sum(y_true) - - return y_pred_roll, y_true_roll, y_pred_tot, y_true_tot - -def get_payload(arr): - features = ["srv_count","serror_rate","srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate", - "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate", - "dst_host_diff_srv_rate","dst_host_same_src_port_rate","dst_host_srv_diff_host_rate", - "dst_host_serror_rate","dst_host_srv_serror_rate","dst_host_rerror_rate","dst_host_srv_rerror_rate"] - datadef = {"names":features,"ndarray":arr.tolist()} - payload = {"meta":{},"data":datadef} - return payload - -def rest_request_ambassador(deploymentName,namespace,request,endpoint="localhost:8003"): - response = requests.post( - "http://"+endpoint+"/seldon/"+namespace+"/"+deploymentName+"/api/v0.1/predictions", - json=request) - print(response.status_code) - print(response.text) - return response.json() - -def send_feedback_rest(deploymentName,namespace,request,response,reward,truth,endpoint="localhost:8003"): - feedback = { - "request": request, - "response": response, - "reward": reward, - "truth": {"data":{"ndarray":truth.tolist()}} - } - ret = requests.post( - "http://"+endpoint+"/seldon/"+namespace+"/"+deploymentName+"/api/v0.1/feedback", - json=feedback) - return