From ce103c89aad81f7e31de27f7d3aa30a2ace8ffa6 Mon Sep 17 00:00:00 2001
From: Andres Gomez
Date: Fri, 26 Apr 2024 23:56:50 +0000
Subject: [PATCH] improve anomaly detection and adapt the forta bot for that
 model

---
 .gitignore                                    |   3 +
 ...ipynb => GPT_anomaly_classification.ipynb} | 375 ++++++++++++------
 .../GPT_anomaly_pretraining.ipynb             |  27 +-
 .../GPT_finetuning_data_collection.ipynb      |   2 +-
 4 files changed, 275 insertions(+), 132 deletions(-)
 rename analysis/ethereum_smart_contracts/{GPT_evaluate.ipynb => GPT_anomaly_classification.ipynb} (90%)

diff --git a/.gitignore b/.gitignore
index 0af1abe..ab9ead0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,6 @@ data
 .env*
 venv*
 *_env
+.ipynb_checkpoints/
+tikuna_model_data/
+nohup.out
diff --git a/analysis/ethereum_smart_contracts/GPT_evaluate.ipynb b/analysis/ethereum_smart_contracts/GPT_anomaly_classification.ipynb
similarity index 90%
rename from analysis/ethereum_smart_contracts/GPT_evaluate.ipynb
rename to analysis/ethereum_smart_contracts/GPT_anomaly_classification.ipynb
index b7feeca..ddfbe79 100644
--- a/analysis/ethereum_smart_contracts/GPT_evaluate.ipynb
+++ b/analysis/ethereum_smart_contracts/GPT_anomaly_classification.ipynb
@@ -18,7 +18,8 @@
     "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
     "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
     "# See the License for the specific language governing permissions and\n",
-    "# limitations under the License."
+    "# limitations under the License.\n",
+    "# Based on: https://gmihaila.github.io/tutorial_notebooks/gpt2_finetune_classification/"
   ]
  },
  {
@@ -29,98 +30,168 @@
   },
   "outputs": [],
   "source": [
+    "import io\n",
+    "import os\n",
     "import torch\n",
     "import warnings\n",
-    "import pandas as pd\n",
-    "import os\n",
-    "import io\n",
-    "import numpy\n",
+    "import itertools\n",
     "\n",
+    "from io import StringIO\n",
+    "import pandas as pd\n",
     "from tqdm.notebook import tqdm\n",
     "from torch.utils.data import Dataset, DataLoader\n",
-    "from transformers import (set_seed,\n",
-    "                          GPT2Config,\n",
-    "                          GPT2Tokenizer,\n",
-    "                          GPT2ForSequenceClassification)\n",
+    "from ml_things import plot_dict, plot_confusion_matrix, fix_text\n",
+    "from sklearn.metrics import classification_report, accuracy_score\n",
+    "from transformers import (\n",
+    "    AutoConfig,\n",
+    "    AutoTokenizer,\n",
+    "    AutoModelForCausalLM,\n",
+    "    PretrainedConfig,\n",
+    "    set_seed,\n",
+    "    )\n",
     "\n",
     "# Suppress deprecation warnings\n",
     "warnings.filterwarnings('ignore', category=DeprecationWarning)\n",
     "warnings.filterwarnings('ignore', category=FutureWarning)\n",
     "\n",
+    "# Set seed for reproducibility.\n",
+    "set_seed(4444)\n",
+    "\n",
     "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
     "\n",
-    "model_name_or_path = '/data/forta/ethereum/model_128_stride/'\n",
-    "tokenizer_name_or_path = '/data/forta/ethereum/tokenizer'\n",
-    "evaluation_data_file = \"/data/forta/ethereum/text/evaluation/malicious-eval.csv\"\n",
+    "model = None\n",
+    "tokenizer_path = '/data/forta/ethereum/tokenizer'\n",
+    "model_path = '/data/forta/ethereum/model_anomaly'\n",
     "\n",
-    "labels_ids = {'malicious': 0, 'normal': 1}\n",
-    "n_labels = len(labels_ids)\n",
+    "# Regular perplexity: 1.6968108415603638\n",
+    "mean = 1.9269466400146484\n",
+    "std_dev = 1.5235518217086792\n",
+    "delta = 3\n",
+    "anomaly_threshold = mean + delta * std_dev\n",
+    "# Original Anomaly Threshold: 4.974050521850586\n",
+    "print(\"anomaly_threshold: \"+str(anomaly_threshold))\n",
+    "stride = 128\n",
+    "max_length = 1024\n",
1024\n", + "batch_size = 1\n", "\n", - "# Set seed for reproducibility.\n", - "set_seed(4444)" + "labels_ids = {'malicious': 0, 'normal': 1}" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "EDEubgJIt23C" + }, "outputs": [], "source": [ "class SmartContractOpcodeDataset(Dataset):\n", - " r\"\"\" PyTorch Dataset class for loading data. \"\"\"\n", "\n", - " def __init__(self, file_path, use_tokenizer):\n", - " self.texts = []\n", - " self.labels = []\n", - " current_file = io.open(file_path, mode='r', encoding='utf-8')\n", - " for line in current_file:\n", - " self.texts.append(line)\n", - " self.labels.append(0)\n", - " self.n_examples = len(self.labels)\n", - " return\n", + " def __init__(self, path, use_tokenizer):\n", + " if not os.path.isdir(path):\n", + " raise ValueError('Invalid `path` variable! Needs to be a directory')\n", + " self.texts = []\n", + " self.labels = []\n", + " # Since the labels are defined by folders with data we loop \n", + " # through each label.\n", + " for label in ['normal', 'malicious']:\n", + " opcode_path = os.path.join(path, label)\n", + " # Get all files from path.\n", + " files_names = os.listdir(opcode_path)\n", + " # Go through each file and read its content.\n", + " for file_name in tqdm(files_names, desc=f'{label} files'):\n", + " file_path = os.path.join(opcode_path, file_name)\n", + " current_file = io.open(file_path, mode='r', encoding='utf-8')\n", + " file_data = current_file.read()\n", + " if file_data != \"\":\n", + " self.texts.append(file_data)\n", + " self.labels.append(label)\n", + " self.n_examples = len(self.labels)\n", "\n", - " def __len__(self):\n", - " return self.n_examples\n", + " return\n", "\n", - " def __getitem__(self, item):\n", - " return {'text':self.texts[item],\n", - " 'labels':self.labels[item]}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def extract_sequences(data, stride):\n", - " encodings = tokenizer(data, return_tensors=\"pt\")\n", - " max_length = model.config.n_positions\n", - " seq_len = encodings.input_ids.size(1)\n", + " def __len__(self):\n", + " return self.n_examples\n", " \n", - " prev_end_loc = 0\n", - " sequences = []\n", - " for begin_loc in range(0, seq_len, stride):\n", - " end_loc = min(begin_loc + max_length, seq_len)\n", - " trg_len = end_loc - prev_end_loc\n", - " input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)\n", - " list = input_ids[0].tolist()\n", - " list[0:0] = [0] * (max_length - len(list))\n", - " sequences.append(list)\n", - " prev_end_loc = end_loc\n", - " if end_loc == seq_len:\n", - " break\n", - " sequence_data = pd.DataFrame(sequences)\n", - " return torch.tensor(sequence_data.values).type(torch.long)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eval_dataset = SmartContractOpcodeDataset(file_path=evaluation_data_file, use_tokenizer=None)" + " def __getitem__(self, item):\n", + " return {'text':self.texts[item],\n", + " 'label':self.labels[item]}\n", + "\n", + "class Gpt2ClassificationCollator(object):\n", + "\n", + " def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):\n", + " # Tokenizer to be used inside the class.\n", + " self.use_tokenizer = use_tokenizer\n", + " # Check max sequence length.\n", + " self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len\n", + " # Label encoder used inside the class.\n", + " 
self.labels_encoder = labels_encoder\n", + "\n", + " return\n", + "\n", + " def __call__(self, sequences):\n", + " # List comprehension to read lines, strip whitespace, and convert to int\n", + " texts = []\n", + " for sequence in sequences:\n", + " for line in sequence['text'].split('\\n'):\n", + " if line != \"\":\n", + " integers = [int(float(num)) for num in line.split('\\t')]\n", + " texts.append(torch.tensor(integers, dtype=torch.int))\n", + " labels = [sequence['label'] for sequence in sequences]\n", + " labels = [self.labels_encoder[label] for label in labels]\n", + "\n", + " # We don't need to use the tokenizer since the data is already in numeric format\n", + " inputs = {'input_ids':torch.stack(texts, dim=0)}\n", + " # Update the inputs with the associated encoded labels as tensor.\n", + " inputs.update({'labels':torch.tensor(labels)})\n", + "\n", + " return inputs\n", + "\n", + "def validation(dataloader, device_):\n", + " # Use global variable for model.\n", + " global model\n", + " \n", + " # Tracking variables\n", + " predictions_labels = []\n", + " true_labels = []\n", + " #total loss for this epoch.\n", + " total_loss = 0\n", + " trg_len = 1024\n", + " \n", + " # Put the model in evaluation mode--the dropout layers behave differently\n", + " # during evaluation.\n", + " model.eval()\n", + "\n", + " # Evaluate data for one epoch\n", + " for batch in tqdm(dataloader, total=len(dataloader)):\n", + " # add original labels\n", + " true_labels += batch['labels'].numpy().flatten().tolist()\n", + " # move batch to device\n", + " batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n", + " # Collect predictions for each row\n", + " predicted_label = None\n", + " trg_len = 1024\n", + " for row in batch['input_ids']:\n", + " input_ids = row.view(1, max_length).to(device_)\n", + " target_ids = input_ids.clone()\n", + " target_ids[:, :-trg_len] = -100\n", + " trg_len = stride\n", + " # The predicted label is normal until a anomalous sequence is found\n", + " predicted_label = [labels_ids['normal']]\n", + " # Telling the model not to compute or store gradients, saving memory andbatch.items()\n", + " # speeding up validation\n", + " with torch.no_grad(): \n", + " outputs = model(**{'input_ids':input_ids}, labels=target_ids)\n", + " neg_log_likelihood = outputs.loss\n", + " local_perplexity = torch.exp(neg_log_likelihood)\n", + " if local_perplexity > anomaly_threshold:\n", + " predicted_label = [labels_ids['malicious']]\n", + " break\n", + " predictions_labels += [torch.all(torch.Tensor(predicted_label)).long()]\n", + " # Calculate the average loss over the training data.\n", + " avg_epoch_loss = total_loss / len(dataloader)\n", + " # Return all true labels and prediciton for future evaluations.\n", + " return true_labels, predictions_labels, avg_epoch_loss" ] }, { @@ -135,84 +206,138 @@ }, "outputs": [], "source": [ - "# Get model configuration.\n", - "print('Loading configuration...')\n", - "model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name_or_path,\n", - " num_labels=n_labels, local_files_only=True,\n", - " use_safetensors=True)\n", + "# Load model configuration.\n", + "print('Loading model configuration...')\n", + "model_config = AutoConfig.from_pretrained(model_path)\n", "\n", - "# Get model's tokenizer.\n", - "print('Loading tokenizer...')\n", - "tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_name_or_path,\n", - " local_files_only=True, use_safetensors=True)\n", - "# default to left padding\n", - 
"tokenizer.padding_side = \"left\"\n", - "# Define PAD Token = EOS Token = 0\n", + "# Load model tokenizer.\n", + "print('Loading model`s tokenizer...')\n", + "tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)\n", "tokenizer.pad_token = tokenizer.eos_token\n", "\n", - "# Get the actual model.\n", - "print('Loading model...')\n", - "model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,\n", - " config=model_config, local_files_only=True,\n", - " use_safetensors=True)\n", + "# Loading model.\n", + "print('Loading actual model...')\n", + "model = AutoModelForCausalLM.from_pretrained(model_path, config=model_config)\n", "\n", - "# resize model embedding to match new tokenizer\n", + "# Resize model to fit all tokens in tokenizer.\n", "model.resize_token_embeddings(len(tokenizer))\n", - "# fix model padding token id\n", - "model.config.pad_token_id = model.config.eos_token_id\n", "\n", - "# Load model to defined device.\n", - "model.to(device)\n", - "print('Model loaded to `%s`'%device)" + "# fix model padding token id\n", + "model.config.pad_token_id = model.config.eos_token_id" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 332, + "referenced_widgets": [ + "e9972d8d49074defbe6232b8ca9c3998", + "0c94315204c648a4a8427ed19abd0b02", + "b8aca6ca6b7043e2a94c43c01578ab36", + "ec1e5a789a2c4e8fbcaac513c58e04fc", + "0fde44f436cc49eebfc9627c49ff3eb2", + "48158320f750495586f815aae5e74a6e", + "72e35b75c2dd4be38396d7d2dfc5916f", + "f9e624a958a343fd8ae0d00ed09172dc", + "b638077e5aef4d9f9ae18513fe348c9d", + "691d3b5ff274407aa9e8e20c989adaf5", + "d9ff8a49c31e4b068e9577784fe37a88", + "85132b1b278b4336845b1cc519066929", + "b53a0ea4433341a0b00314627ac4fd4c", + "5f6fded3e9734d02925ed1665ada214e", + "0a8e6b854f644ac790d1b60e00559524", + "e3f731bec3534a1cab0854b9192f491e", + "97f350df143d4da3b0111180998afc4d", + "5baa034ed79c450980ced4a1f7de2374", + "6e98a119a4874424be55b571ca8fdd62", + "ec918106d85d470d83c9491c4c7e283c", + "8de61314c47647c5af0e42ba8121d7ec", + "97deaa1813e64282a3a678cf1a719706", + "ff649a06a95641038987e810e1dcf065", + "29eb9184d1ff4574991ca2e6f11a7ee0", + "1632298dbbb64696a3c18976214ed3d6", + "9671cba109ab4ac19ef02e40905fbfd4", + "82d96937e37e4590921cd4dd2dd2e058", + "6277b8b253e6475a928f737995b8af4d", + "653587d9fefb4e4b9264ba80c2ada175", + "1d27fa237e2b499a889b3fe6ecb82a6d", + "06d2b4a7c9814917a53e002e142acee7", + "5c361029c19c4e81963046dbb03515d6" + ] + }, + "id": "OlXROUWu5Osq", + "outputId": "8a4ba877-dac3-4e84-b005-bb27df05deb0" + }, "outputs": [], "source": [ - "def evaluate(dataloader, device_):\n", - " global model\n", - " # Tracking variables\n", - " predictions_labels = []\n", - " model.eval()\n", - " index = 0\n", - " normal = 0\n", - " malicious = 0\n", - " \n", - " for batch in tqdm(dataloader, total=len(dataloader)):\n", - " processed_batch = {}\n", - " label = 0\n", - " for k,v in batch.items():\n", - " if k == \"text\":\n", - " processed_batch[\"input_ids\"] = extract_sequences(v, 128).to(device_)\n", - " # sequence_predictions = []\n", - " # for row in processed_batch['input_ids']:\n", - " #sequence = {}\n", - " #sequence['input_ids'] = row.view(1, 1024)\n", - " # sequence['labels'] = batch['labels'][0]\n", - " with torch.no_grad():\n", - " outputs = model(**processed_batch)\n", - " logits = outputs.logits.detach().cpu().numpy()\n", - " predict_content = logits.argmax(axis=-1).flatten()\n", - " if not 
predict_content.all():\n", - " malicious = malicious + 1\n", - " else:\n", - " normal = normal + 1\n", - " index = index + 1\n", - " print(\"Amount of normal samples: \"+str(normal))\n", - " print(\"Amount of malicious samples: \"+str(malicious))\n", - " print(\"Proportion of malicious over total: \"+str(malicious/len(dataloader)))" + "# Create data collator to encode text and labels into numbers.\n", + "gpt2_classificaiton_collator = Gpt2ClassificationCollator(use_tokenizer=tokenizer, \n", + " labels_encoder=labels_ids, \n", + " max_sequence_len=max_length)\n", + "\n", + "print('Dealing with Validation...')\n", + "# Create pytorch dataset.\n", + "valid_dataset = SmartContractOpcodeDataset(\n", + " path='/data/forta/ethereum/text/finetuning/validation',\n", + " use_tokenizer=tokenizer\n", + " )\n", + "print('Created `valid_dataset` with %d examples!'%len(valid_dataset))\n", + "\n", + "# Move pytorch dataset into dataloader.\n", + "valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size,\n", + " shuffle=False,\n", + " collate_fn=gpt2_classificaiton_collator\n", + " )\n", + "print('Created `eval_dataloader` with %d batches!'%len(valid_dataloader))" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 618, + "referenced_widgets": [ + "88d93f5b31c14ae3b2d6a889e5caa2a4", + "c745cb7c913448f59d3c017335e3ea22", + "04f2861f3e7f4255aed47e0b0e77eecd", + "5c2faf9a7d8e48caaa6d59875ed056e1", + "99caa2e95506457d89f167abe62d9f32", + "e492f5b344714a83bb8265fa9161b52d", + "6236711e6e76467fae7f79f266a7f8ed", + "138ae3e683c04a388e3d9d03cc425fec" + ] + }, + "id": "7Sifp6ocoSng", + "outputId": "1dccfb2a-242d-47e1-ed54-a7bfaa676f97", + "scrolled": true + }, "outputs": [], "source": [ - "evaluate(eval_dataset, device)" + "# Get prediction form model on validation data. 
+    "# your test data.\n",
+    "true_labels, predictions_labels, avg_epoch_loss = validation(valid_dataloader,\n",
+    "                                                              device)\n",
+    "\n",
+    "# Create the evaluation report.\n",
+    "evaluation_report = classification_report(true_labels,\n",
+    "                                          predictions_labels,\n",
+    "                                          labels=list(labels_ids.values()),\n",
+    "                                          target_names=list(labels_ids.keys()))\n",
+    "# Show the evaluation report.\n",
+    "print(evaluation_report)\n",
+    "\n",
+    "# Plot confusion matrix.\n",
+    "plot_confusion_matrix(y_true=true_labels,\n",
+    "                      y_pred=predictions_labels, \n",
+    "                      classes=list(labels_ids.keys()),\n",
+    "                      normalize=True, \n",
+    "                      magnify=0.1,\n",
+    "                      );"
   ]
  }
 ],
diff --git a/analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb b/analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb
index 31eec25..172670f 100644
--- a/analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb
+++ b/analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb
@@ -70,7 +70,8 @@
     "\n",
     "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
     "\n",
-    "model = None"
+    "model = None\n",
+    "anomaly_validation = False"
   ]
  },
 {
@@ -463,7 +464,7 @@
   "outputs": [],
   "source": [
     "# if training_args.do_eval:\n",
-    "if False:\n",
+    "if training_args.do_eval:\n",
     "    eval_output = trainer.evaluate()\n",
     "    print(eval_output[\"eval_loss\"])\n",
     "    perplexity = math.exp(eval_output[\"eval_loss\"])\n",
@@ -490,7 +491,9 @@
     "    trg_len = end_loc - prev_end_loc\n",
     "    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)\n",
     "    list = input_ids[0].tolist()\n",
-    "    list[0:0] = [0] * (max_length - len(list))\n",
+    "    # padding to the left\n",
+    "    endoftext = 0\n",
+    "    list[0:0] = [endoftext] * (max_length - len(list))\n",
     "    normal_sequences.append(list)\n",
     "    prev_end_loc = end_loc\n",
     "    if end_loc == seq_len:\n",
@@ -568,7 +571,7 @@
     "    )\n",
     "\n",
     "print(\"Extracting normal sequences for validation...\")\n",
-    "for index in tqdm(range(0, 100)):\n",
+    "for index in tqdm(range(0, 200)):\n",
     "    extract_normal_sequences(\n",
     "        training[\"val\"][\"text\"][index],\n",
     "        \"/data/forta/ethereum/text/finetuning/validation/normal/\",\n",
     "        stride,\n",
     "        index\n",
     "    )\n",
     "\n",
+    "if anomaly_validation:\n",
+    "    print(\"Extracting malicious sequences for anomaly validation...\")\n",
+    "    for index in tqdm(range(0, len(test[\"train\"][\"text\"]))):\n",
+    "        extract_normal_sequences(\n",
+    "            test[\"train\"][\"text\"][index],\n",
+    "            \"/data/forta/ethereum/text/finetuning/validation/malicious/\",\n",
+    "            stride,\n",
+    "            index\n",
+    "        )\n",
+    "\n",
     "# Calculate threshold for extraction of malicious samples\n",
     "print(\"Calculating threshold\")\n",
     "threshold = calculate_anomaly_threshold(training[\"train\"][\"text\"][:100], stride)\n",
-    "print(\"Calculated threshold:\" % threshold)"
+    "print(\"Calculated threshold: %s\" % str(threshold))"
   ]
  },
 {
@@ -610,7 +623,9 @@
     "    local_ppl = torch.exp(neg_log_likelihood)\n",
     "    if local_ppl > anomaly_threshold:\n",
     "        list = input_ids[0].tolist()\n",
-    "        list[0:0] = [0] * (max_length - len(list))\n",
+    "        # padding to the left\n",
+    "        endoftext = 0\n",
+    "        list[0:0] = [endoftext] * (max_length - len(list))\n",
     "        anomalous_sequences.append(list)\n",
     "    \n",
     "    nlls.append(neg_log_likelihood)\n",
diff --git a/analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb b/analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb
index 03f1267..0573999 100644
--- a/analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb
+++ b/analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb
@@ -11,7 +11,7 @@
     "This notebook collects smart contract creation bytecode and decompiled opcodes for malicious contract classification. \n",
     "Benign contracts are gathered from blockchain explorers and malicious contracts from [Forta Network's labelled datasets GitHub repo](https://github.com/forta-network/labelled-datasets).\n",
     "\n",
-    "# Code provided by the Forta project"
+    "Code provided by the Forta project"
   ]
  },
 {
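
Note on the detection rule these notebooks share: a contract's opcode sequence is scored window by window with a sliding stride, and the contract is flagged as malicious as soon as any window's perplexity under the pretrained GPT model exceeds mean + delta * std_dev of the perplexity distribution measured on normal contracts. The sketch below restates that rule standalone so it can be reviewed in one place; it assumes `model` is a Hugging Face causal language model and `input_ids` is a 1 x seq_len tensor of opcode token ids, and the helper names (`window_perplexities`, `is_malicious`) are illustrative, not functions defined in the notebooks.

import torch

def window_perplexities(model, input_ids, max_length=1024, stride=128):
    # Sliding-window perplexity, mirroring the validation loop in
    # GPT_anomaly_classification.ipynb: the first window scores every token,
    # each later window only scores the tokens that are new to it.
    ppls = []
    prev_end = 0
    seq_len = input_ids.size(1)
    for begin in range(0, seq_len, stride):
        end = min(begin + max_length, seq_len)
        window = input_ids[:, begin:end]
        targets = window.clone()
        trg_len = end - prev_end
        if trg_len < window.size(1):
            targets[:, :-trg_len] = -100  # -100 is ignored by the LM loss
        with torch.no_grad():
            loss = model(input_ids=window, labels=targets).loss
        ppls.append(torch.exp(loss).item())
        prev_end = end
        if end == seq_len:
            break
    return ppls

def is_malicious(ppls, mean=1.9269466400146484, std_dev=1.5235518217086792, delta=3):
    # Same constants the classification notebook hard-codes: flag the
    # contract as soon as one window exceeds mean + delta * std_dev.
    return any(p > mean + delta * std_dev for p in ppls)

With batch_size = 1 this reproduces the notebook's per-contract prediction: the label stays 'normal' unless is_malicious(window_perplexities(model, input_ids)) returns True, in which case it becomes 'malicious'.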