diff --git a/analysis/ethereum_smart_contracts/pretrain_gpt.ipynb b/analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb similarity index 93% rename from analysis/ethereum_smart_contracts/pretrain_gpt.ipynb rename to analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb index 13299e8..b4d1f0d 100644 --- a/analysis/ethereum_smart_contracts/pretrain_gpt.ipynb +++ b/analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb @@ -35,6 +35,8 @@ "import math\n", "import torch\n", "import warnings\n", + "import pandas as pd\n", + "\n", "from tqdm.notebook import tqdm\n", "from ml_things import plot_dict, fix_text\n", "from transformers import (\n", @@ -57,6 +59,7 @@ " Trainer,\n", " set_seed,\n", " )\n", + "from datasets import load_dataset\n", "\n", "# Supress deprecation warnings\n", "warnings.filterwarnings('ignore', category=DeprecationWarning)\n", @@ -249,9 +252,9 @@ "source": [ "# Define arguments for data, tokenizer and model arguments.\n", "model_data_args = ModelDataArguments(\n", - " train_data_file='/data/forta/ethereum/text/pretraining/pretraining_train.txt',\n", - " eval_data_file='/data/forta/ethereum/text/pretraining/pretraining_val.txt',\n", - " line_by_line=True, \n", + " train_data_file='/data/forta/ethereum/text/pretraining/small_pretraining_train.txt',\n", + " eval_data_file='/data/forta/ethereum/text/pretraining/small_pretraining_val.txt',\n", + " line_by_line=False,\n", " mlm=False,\n", " whole_word_mask=False,\n", " plm_probability=float(1/6),\n", @@ -275,9 +278,9 @@ " do_train=True, \n", " do_eval=True,\n", " per_device_train_batch_size=10,\n", - " per_device_eval_batch_size=100,\n", + " per_device_eval_batch_size=10,\n", " evaluation_strategy='steps',\n", - " logging_steps=700,\n", + " logging_steps=500,\n", " eval_steps = None,\n", " prediction_loss_only=True,\n", " learning_rate = 5e-5,\n", @@ -463,6 +466,8 @@ "loss_history = {'train_loss':[], 'eval_loss':[]}\n", "perplexity_history = {'train_perplexity':[], 'eval_perplexity':[]}\n", "\n", + "print(trainer.state.log_history)\n", + "\n", "for log_history in trainer.state.log_history:\n", " if 'loss' in log_history.keys():\n", " loss_history['train_loss'].append(log_history['loss'])\n", @@ -499,12 +504,102 @@ "outputs": [], "source": [ "if training_args.do_eval:\n", - " eval_output = trainer.evaluate()\n", - " perplexity = math.exp(eval_output[\"eval_loss\"])\n", - " print('\\nEvaluate Perplexity: {:10,.2f}'.format(perplexity))\n", + " eval_output = trainer.evaluate()\n", + " print(eval_output[\"eval_loss\"])\n", + " perplexity = math.exp(eval_output[\"eval_loss\"])\n", + " print('\\nEvaluate Perplexity: {:10,.2f}'.format(perplexity))\n", "else:\n", - " print('No evaluation needed. No evaluation data provided, `do_eval=False`!')" + " print('No evaluation needed. 
No evaluation data provided, `do_eval=False`!')" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_normal_sequences(data, output_file):\n", + " encodings = tokenizer(\"\\n\\n\".join(data), return_tensors=\"pt\")\n", + " max_length = model.config.n_positions\n", + " stride = 512\n", + " seq_len = encodings.input_ids.size(1)\n", + " \n", + " prev_end_loc = 0\n", + " normal_sequences = []\n", + " for begin_loc in tqdm(range(0, seq_len, stride)):\n", + " end_loc = min(begin_loc + max_length, seq_len)\n", + " trg_len = end_loc - prev_end_loc # may be different from stride on last loop\n", + " input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)\n", + " normal_sequences.append(input_ids[0].tolist()) \n", + " prev_end_loc = end_loc\n", + " if end_loc == seq_len:\n", + " break \n", + " normal_data = pd.DataFrame(normal_sequences)\n", + " normal_data.to_csv(output_file, sep='\\t', index=False)\n", + "\n", + "def extract_anomalous_sequences(data, output_file):\n", + " encodings = tokenizer(\"\\n\\n\".join(data), return_tensors=\"pt\")\n", + " max_length = model.config.n_positions\n", + " stride = 512\n", + " seq_len = encodings.input_ids.size(1)\n", + " anomaly_threshold = 1.2\n", + " \n", + " nlls = []\n", + " prev_end_loc = 0\n", + " anomalous_sequences = []\n", + " for begin_loc in tqdm(range(0, seq_len, stride)):\n", + " end_loc = min(begin_loc + max_length, seq_len)\n", + " trg_len = end_loc - prev_end_loc # may be different from stride on last loop\n", + " input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)\n", + " target_ids = input_ids.clone()\n", + " target_ids[:, :-trg_len] = -100\n", + " \n", + " with torch.no_grad():\n", + " outputs = model(input_ids, labels=target_ids)\n", + " \n", + " # loss is calculated using CrossEntropyLoss which averages over valid labels\n", + " # N.B. 
the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels\n", + " # to the left by 1.\n", + " neg_log_likelihood = outputs.loss\n", + " local_ppl = torch.exp(neg_log_likelihood)\n", + " if local_ppl > anomaly_threshold:\n", + " # print(\"local_ppl:\"+str(local_ppl))\n", + " # print(\"input_ids.shape:\"+str(input_ids.shape))\n", + " # anomalous_sequences.append(tokenizer.decode(input_ids[0]))\n", + " anomalous_sequences.append(input_ids[0].tolist())\n", + " \n", + " nlls.append(neg_log_likelihood)\n", + " \n", + " prev_end_loc = end_loc\n", + " if end_loc == seq_len:\n", + " break\n", + " \n", + " ppl = torch.exp(torch.stack(nlls).mean())\n", + " anomalous_data = pd.DataFrame(anomalous_sequences)\n", + " anomalous_data.to_csv(output_file, sep='\\t', index=False)\n", + "\n", + "# Load normal SC opcode files\n", + "training = load_dataset(\"text\", data_files={\"train\": \"/data/forta/ethereum/text/finetuning/training/normal/normal.txt\",\n", + " \"val\": \"/data/forta/ethereum/text/finetuning/validation/normal/normal.txt\"})\n", + "\n", + "# Load malicious SC opcode files\n", + "test = load_dataset(\"text\", data_files={\"train\": \"/data/forta/ethereum/text/finetuning/training/malicious/malicious.txt\",\n", + " \"val\": \"/data/forta/ethereum/text/finetuning/validation/malicious/malicious.txt\"})\n", + "# Extract normal SC opcode encodings\n", + "extract_normal_sequences(training[\"train\"][\"text\"], \"/data/forta/ethereum/text/finetuning/normal_training.csv\")\n", + "extract_normal_sequences(training[\"val\"][\"text\"], \"/data/forta/ethereum/text/finetuning/normal_validation.csv\")\n", + "\n", + "#Extract malicious SC opcode encodings\n", + "extract_anomalous_sequences(test[\"train\"][\"text\"], \"/data/forta/ethereum/text/finetuning/anomalous_training.csv\")\n", + "extract_anomalous_sequences(test[\"val\"][\"text\"], \"/data/forta/ethereum/text/finetuning/anomalous_validation.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/analysis/ethereum_smart_contracts/anomaly_gpt.ipynb b/analysis/ethereum_smart_contracts/GPT_finetuning_classification.ipynb similarity index 73% rename from analysis/ethereum_smart_contracts/anomaly_gpt.ipynb rename to analysis/ethereum_smart_contracts/GPT_finetuning_classification.ipynb index 17d661f..d1fbe42 100644 --- a/analysis/ethereum_smart_contracts/anomaly_gpt.ipynb +++ b/analysis/ethereum_smart_contracts/GPT_finetuning_classification.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "id": "3NmlcqdctXlB" }, @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "id": "gjr_J342tOPq" }, @@ -34,6 +34,8 @@ "import os\n", "import torch\n", "import warnings\n", + "from io import StringIO\n", + "import pandas as pd\n", "from tqdm.notebook import tqdm\n", "from torch.utils.data import Dataset, DataLoader\n", "from ml_things import plot_dict, plot_confusion_matrix, fix_text\n", @@ -70,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "EDEubgJIt23C" }, @@ -95,9 +97,9 @@ " file_path = os.path.join(opcode_path, file_name)\n", " current_file = io.open(file_path, mode='r', encoding='utf-8')\n", " for line in current_file:\n", - " # Fix any unicode issues.\n", - " line = fix_text(line)\n", - " self.texts.append(line)\n", + " # List comprehension to read 
lines, strip whitespace, and convert to int\n", + " integers = [int(float(num)) for num in line.strip().split('\\t')]\n", + " self.texts.append(torch.tensor(integers, dtype=torch.int))\n", " self.labels.append(label)\n", " self.n_examples = len(self.labels)\n", "\n", @@ -128,9 +130,8 @@ " labels = [sequence['label'] for sequence in sequences]\n", " labels = [self.labels_encoder[label] for label in labels]\n", "\n", - " # Call tokenizer on all texts to convert into tensors of numbers with \n", - " # appropriate padding.\n", - " inputs = self.use_tokenizer(text=texts, return_tensors=\"pt\", padding=True, truncation=True, max_length=self.max_sequence_len)\n", + " # We don't need to use the tokenizer since the data is already in numeric format\n", + " inputs = {'input_ids':torch.stack(texts, dim=0)}\n", " # Update the inputs with the associated encoded labels as tensor.\n", " inputs.update({'labels':torch.tensor(labels)})\n", "\n", @@ -150,7 +151,7 @@ " # For each batch of training data...\n", " for batch in tqdm(dataloader, total=len(dataloader)):\n", " # Add original labels - use later for evaluation.\n", - " true_labels += batch['labels'].numpy().flatten().tolist() \n", + " true_labels += batch['labels'].numpy().flatten().tolist()\n", " # move batch to device\n", " batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n", " # Always clear any previously calculated gradients before performing a\n", @@ -225,7 +226,7 @@ " true_labels += batch['labels'].numpy().flatten().tolist()\n", " # move batch to device\n", " batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n", - " # Telling the model not to compute or store gradients, saving memory and\n", + " # Telling the model not to compute or store gradients, saving memory andbatch.items()\n", " # speeding up validation\n", " with torch.no_grad(): \n", " # Forward pass, calculate logit predictions.\n", @@ -266,7 +267,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -274,7 +275,26 @@ "id": "ctTxnW903_VF", "outputId": "a447c89a-a48b-4262-b6c4-e28220db3ffc" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at /data/forta/ethereum/model and are newly initialized: ['score.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading configuraiton...\n", + "Loading tokenizer...\n", + "Loading model...\n", + "Model loaded to `cpu`\n" + ] + } + ], "source": [ "# Get model configuration.\n", "print('Loading configuraiton...')\n", @@ -304,7 +324,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -347,7 +367,88 @@ "id": "OlXROUWu5Osq", "outputId": "8a4ba877-dac3-4e84-b005-bb27df05deb0" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dealing with Train...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4ca70ca74eb54408bc0e6dfb709ffaed", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "normal files: 0%| | 0/1 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Get prediction form model on 
validation data. This is where you should use\n", "# your test data.\n", diff --git a/analysis/ethereum_smart_contracts/finetuning_data_collection.ipynb b/analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb similarity index 100% rename from analysis/ethereum_smart_contracts/finetuning_data_collection.ipynb rename to analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb diff --git a/analysis/ethereum_smart_contracts/GPT_finetuning_preprocessing.ipynb b/analysis/ethereum_smart_contracts/GPT_finetuning_preprocessing.ipynb new file mode 100644 index 0000000..8526de7 --- /dev/null +++ b/analysis/ethereum_smart_contracts/GPT_finetuning_preprocessing.ipynb @@ -0,0 +1,170 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "05322c8e-75ea-4e7f-a87a-fd1e4fb81bb0", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import re\n", + "import tiktoken\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.linear_model import SGDClassifier\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db906716-1c88-4fb6-b31b-6c1b8150cd99", + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn\n", + "print(sklearn.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "207fafbd-f345-42c8-a009-3df1e7c7e817", + "metadata": {}, + "outputs": [], + "source": [ + "COLS = ['contract_creator', 'contract_address', 'contract_name', 'decompiled_opcodes', 'malicious']\n", + "pretraining_data = pd.read_parquet('/data/forta/ethereum/text/pretraining/malicious_contract_training_dataset_final.parquet', columns=COLS)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "984692e4-1a1d-4019-a81e-465f9c6dee06", + "metadata": {}, + "outputs": [], + "source": [ + "pretraining_data['malicious'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25de7beb-1de1-4101-8ff1-fa9748016597", + "metadata": {}, + "outputs": [], + "source": [ + "pretraining_data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77d37ada-03ce-46a0-8ced-cfce09fcd886", + "metadata": {}, + "outputs": [], + "source": [ + "def get_exp_2_features(row):\n", + " creator = row['contract_creator']\n", + " opcodes = row['decompiled_opcodes'].split()\n", + " mask = '0xffffffffffffffffffffffffffffffffffffffff'\n", + " features = []\n", + " for i in range(len(opcodes)-1):\n", + " first = opcodes[i]\n", + " second = opcodes[i+1]\n", + " if not first.startswith('0x'):\n", + " token = first\n", + " if first.startswith('UNKNOWN') or first.startswith('INVALID'):\n", + " token = first.split('_')[0]\n", + " features.append(token)\n", + " elif first == 'PUSH4':\n", + " features.append(second)\n", + " elif first == 'PUSH20':\n", + " if second == creator:\n", + " features.append('creator')\n", + " elif second == mask:\n", + " features.append(mask)\n", + " else:\n", + " features.append('address')\n", + " elif first == 'PUSH32':\n", + " features.append(second)\n", + " return \" \".join(features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5266b0b7-dde8-4e1b-b2da-789399fcf69f", + "metadata": {}, + "outputs": [], + "source": 
[ + "# Prepare data for pretraining phase\n", + "# First clean and process the opcode data\n", + "pretraining_data['experiment_2_opcodes'] = pretraining_data.apply(get_exp_2_features, axis=1)\n", + "\n", + "# Files to store the data\n", + "train_file_path = '/data/forta/ethereum/text/pretraining/small_pretraining_train.txt'\n", + "val_file_path = '/data/forta/ethereum/text/pretraining/small_pretraining_val.txt'\n", + "\n", + "# Suffle data\n", + "pretraining_data = pd.concat([pretraining_data[:499], pretraining_data[5000:]])\n", + "pretraining_data = pretraining_data.sample(frac = 1)\n", + "\n", + "# Save the data to disk\n", + "training_data = pretraining_data[:499]\n", + "validation_data = pretraining_data[500:]\n", + "training_data['experiment_2_opcodes'].to_csv(train_file_path, sep='\\t', index=False)\n", + "validation_data['experiment_2_opcodes'].to_csv(val_file_path, sep='\\t', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1b33a09-eaf4-4ba5-822e-37653bbbc6d0", + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare data for finetuning phase\n", + "# Training\n", + "training_data.loc[training_data['malicious'] == False].to_csv('/data/forta/ethereum/text/finetuning/training/normal/normal.txt',\n", + " columns=['experiment_2_opcodes'], sep='\\t', index=False)\n", + "training_data.loc[training_data['malicious'] == True].to_csv('/data/forta/ethereum/text/finetuning/training/malicious/malicious.txt',\n", + " columns=['experiment_2_opcodes'], sep='\\t', index=False)\n", + "\n", + "# Validation\n", + "validation_data.loc[validation_data['malicious'] == False].to_csv('/data/forta/ethereum/text/finetuning/validation/normal/normal.txt',\n", + " columns=['experiment_2_opcodes'], sep='\\t', index=False)\n", + "validation_data.loc[validation_data['malicious'] == True].to_csv('/data/forta/ethereum/text/finetuning/validation/malicious/malicious.txt',\n", + " columns=['experiment_2_opcodes'], sep='\\t', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/analysis/ethereum_smart_contracts/pretrain_longformer.ipynb b/analysis/ethereum_smart_contracts/GPT_general_pretraining.ipynb similarity index 92% rename from analysis/ethereum_smart_contracts/pretrain_longformer.ipynb rename to analysis/ethereum_smart_contracts/GPT_general_pretraining.ipynb index 538ba38..b4d1f0d 100644 --- a/analysis/ethereum_smart_contracts/pretrain_longformer.ipynb +++ b/analysis/ethereum_smart_contracts/GPT_general_pretraining.ipynb @@ -35,6 +35,8 @@ "import math\n", "import torch\n", "import warnings\n", + "import pandas as pd\n", + "\n", "from tqdm.notebook import tqdm\n", "from ml_things import plot_dict, fix_text\n", "from transformers import (\n", @@ -44,7 +46,6 @@ " PreTrainedTokenizer,\n", " TrainingArguments,\n", " AutoConfig,\n", - " LongformerConfig,\n", " AutoTokenizer,\n", " AutoModelWithLMHead,\n", " AutoModelForCausalLM,\n", @@ -58,6 +59,7 @@ " Trainer,\n", " set_seed,\n", " )\n", + "from datasets import load_dataset\n", "\n", "# Supress deprecation warnings\n", "warnings.filterwarnings('ignore', category=DeprecationWarning)\n", @@ -169,9 +171,10 @@ " else:\n", 
" # Use config mapping if building model from scratch.\n", " model_config = CONFIG_MAPPING[args.model_type]()\n", - "\n", + " \n", " return model_config\n", "\n", + "\n", "def get_tokenizer(args: ModelDataArguments):\n", " r\"\"\" Get model tokenizer. \"\"\"\n", "\n", @@ -251,46 +254,42 @@ "model_data_args = ModelDataArguments(\n", " train_data_file='/data/forta/ethereum/text/pretraining/small_pretraining_train.txt',\n", " eval_data_file='/data/forta/ethereum/text/pretraining/small_pretraining_val.txt',\n", - " line_by_line=True, \n", + " line_by_line=False,\n", " mlm=False,\n", " whole_word_mask=False,\n", " plm_probability=float(1/6),\n", " max_span_length=5,\n", " block_size=-1,\n", " overwrite_cache=False,\n", - " model_type='allenai/longformer-base-4096',\n", - " model_config_name='allenai/longformer-base-4096',\n", - " tokenizer_name='/data/forta/ethereum/tokenizer_longformer',\n", + " model_type='gpt2',\n", + " model_config_name='gpt2',\n", + " tokenizer_name='/data/forta/ethereum/tokenizer',\n", " model_name_or_path=None,\n", " model_cache_dir='/data/forta/ethereum/cache',\n", " ignore_mismatched_sizes=True,\n", " )\n", "\n", - "print(model_data_args)\n", - "\n", "# Define arguments for training\n", "training_args = TrainingArguments(\n", " # Disable wandb\n", " report_to=\"none\",\n", - " output_dir='/data/forta/ethereum/model_longformer',\n", + " output_dir='/data/forta/ethereum/model',\n", " overwrite_output_dir=True,\n", " do_train=True, \n", " do_eval=True,\n", - " per_device_train_batch_size=4,\n", - " per_device_eval_batch_size=4,\n", - " evaluation_strategy='epoch',\n", - " logging_steps=4,\n", + " per_device_train_batch_size=10,\n", + " per_device_eval_batch_size=10,\n", + " evaluation_strategy='steps',\n", + " logging_steps=500,\n", " eval_steps = None,\n", " prediction_loss_only=True,\n", " learning_rate = 5e-5,\n", - " weight_decay=0.01,\n", + " weight_decay=0,\n", " adam_epsilon = 1e-8,\n", " max_grad_norm = 1.0,\n", " num_train_epochs = 2,\n", " save_steps = -1,\n", - " )\n", - "\n", - "print(training_args)" + " )" ] }, { @@ -408,10 +407,7 @@ "eval_dataset = get_dataset(model_data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None\n", "\n", "# Get data collator to modify data format depending on type of model used.\n", - "data_collator = get_collator(model_data_args, config, tokenizer)\n", - "\n", - "print(len(train_dataset))\n", - "print(len(eval_dataset))" + "data_collator = get_collator(model_data_args, config, tokenizer)" ] }, { @@ -470,6 +466,8 @@ "loss_history = {'train_loss':[], 'eval_loss':[]}\n", "perplexity_history = {'train_perplexity':[], 'eval_perplexity':[]}\n", "\n", + "print(trainer.state.log_history)\n", + "\n", "for log_history in trainer.state.log_history:\n", " if 'loss' in log_history.keys():\n", " loss_history['train_loss'].append(log_history['loss'])\n", @@ -506,12 +504,102 @@ "outputs": [], "source": [ "if training_args.do_eval:\n", - " eval_output = trainer.evaluate()\n", - " perplexity = math.exp(eval_output[\"eval_loss\"])\n", - " print('\\nEvaluate Perplexity: {:10,.2f}'.format(perplexity))\n", + " eval_output = trainer.evaluate()\n", + " print(eval_output[\"eval_loss\"])\n", + " perplexity = math.exp(eval_output[\"eval_loss\"])\n", + " print('\\nEvaluate Perplexity: {:10,.2f}'.format(perplexity))\n", "else:\n", - " print('No evaluation needed. No evaluation data provided, `do_eval=False`!')" + " print('No evaluation needed. 
No evaluation data provided, `do_eval=False`!')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_normal_sequences(data, output_file):\n", + " encodings = tokenizer(\"\\n\\n\".join(data), return_tensors=\"pt\")\n", + " max_length = model.config.n_positions\n", + " stride = 512\n", + " seq_len = encodings.input_ids.size(1)\n", + " \n", + " prev_end_loc = 0\n", + " normal_sequences = []\n", + " for begin_loc in tqdm(range(0, seq_len, stride)):\n", + " end_loc = min(begin_loc + max_length, seq_len)\n", + " trg_len = end_loc - prev_end_loc # may be different from stride on last loop\n", + " input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)\n", + " normal_sequences.append(input_ids[0].tolist()) \n", + " prev_end_loc = end_loc\n", + " if end_loc == seq_len:\n", + " break \n", + " normal_data = pd.DataFrame(normal_sequences)\n", + " normal_data.to_csv(output_file, sep='\\t', index=False)\n", + "\n", + "def extract_anomalous_sequences(data, output_file):\n", + " encodings = tokenizer(\"\\n\\n\".join(data), return_tensors=\"pt\")\n", + " max_length = model.config.n_positions\n", + " stride = 512\n", + " seq_len = encodings.input_ids.size(1)\n", + " anomaly_threshold = 1.2\n", + " \n", + " nlls = []\n", + " prev_end_loc = 0\n", + " anomalous_sequences = []\n", + " for begin_loc in tqdm(range(0, seq_len, stride)):\n", + " end_loc = min(begin_loc + max_length, seq_len)\n", + " trg_len = end_loc - prev_end_loc # may be different from stride on last loop\n", + " input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)\n", + " target_ids = input_ids.clone()\n", + " target_ids[:, :-trg_len] = -100\n", + " \n", + " with torch.no_grad():\n", + " outputs = model(input_ids, labels=target_ids)\n", + " \n", + " # loss is calculated using CrossEntropyLoss which averages over valid labels\n", + " # N.B. 
the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels\n", + " # to the left by 1.\n", + " neg_log_likelihood = outputs.loss\n", + " local_ppl = torch.exp(neg_log_likelihood)\n", + " if local_ppl > anomaly_threshold:\n", + " # print(\"local_ppl:\"+str(local_ppl))\n", + " # print(\"input_ids.shape:\"+str(input_ids.shape))\n", + " # anomalous_sequences.append(tokenizer.decode(input_ids[0]))\n", + " anomalous_sequences.append(input_ids[0].tolist())\n", + " \n", + " nlls.append(neg_log_likelihood)\n", + " \n", + " prev_end_loc = end_loc\n", + " if end_loc == seq_len:\n", + " break\n", + " \n", + " ppl = torch.exp(torch.stack(nlls).mean())\n", + " anomalous_data = pd.DataFrame(anomalous_sequences)\n", + " anomalous_data.to_csv(output_file, sep='\\t', index=False)\n", + "\n", + "# Load normal SC opcode files\n", + "training = load_dataset(\"text\", data_files={\"train\": \"/data/forta/ethereum/text/finetuning/training/normal/normal.txt\",\n", + " \"val\": \"/data/forta/ethereum/text/finetuning/validation/normal/normal.txt\"})\n", + "\n", + "# Load malicious SC opcode files\n", + "test = load_dataset(\"text\", data_files={\"train\": \"/data/forta/ethereum/text/finetuning/training/malicious/malicious.txt\",\n", + " \"val\": \"/data/forta/ethereum/text/finetuning/validation/malicious/malicious.txt\"})\n", + "# Extract normal SC opcode encodings\n", + "extract_normal_sequences(training[\"train\"][\"text\"], \"/data/forta/ethereum/text/finetuning/normal_training.csv\")\n", + "extract_normal_sequences(training[\"val\"][\"text\"], \"/data/forta/ethereum/text/finetuning/normal_validation.csv\")\n", + "\n", + "#Extract malicious SC opcode encodings\n", + "extract_anomalous_sequences(test[\"train\"][\"text\"], \"/data/forta/ethereum/text/finetuning/anomalous_training.csv\")\n", + "extract_anomalous_sequences(test[\"val\"][\"text\"], \"/data/forta/ethereum/text/finetuning/anomalous_validation.csv\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/analysis/ethereum_smart_contracts/tokenizer.ipynb b/analysis/ethereum_smart_contracts/GPT_tokenizer.ipynb similarity index 91% rename from analysis/ethereum_smart_contracts/tokenizer.ipynb rename to analysis/ethereum_smart_contracts/GPT_tokenizer.ipynb index 857382c..7c4130c 100644 --- a/analysis/ethereum_smart_contracts/tokenizer.ipynb +++ b/analysis/ethereum_smart_contracts/GPT_tokenizer.ipynb @@ -21,8 +21,8 @@ "from transformers import AutoTokenizer\n", "from collections import defaultdict\n", "\n", - "dataset = load_dataset(\"text\", data_files={\"train\": \"/data/forta/ethereum/text/pretraining/pretraining_train.txt\",\n", - " \"val\": \"/data/forta/ethereum/text/pretraining/pretraining_val.txt\"})" + "dataset = load_dataset(\"text\", data_files={\"train\": \"/data/forta/ethereum/text/pretraining/pretraining_train.csv\",\n", + " \"val\": \"/data/forta/ethereum/text/pretraining/pretraining_val.csv\"})" ] }, { @@ -90,7 +90,9 @@ "outputs": [], "source": [ "tokenizer.save_pretrained(\"/data/forta/ethereum/tokenizer\")\n", - "print(tokenizer)" + "print(tokenizer)\n", + "encode = old_tokenizer.tokenize(\"PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2\", return_tensors=\"pt\")\n", + "print(type(encode))" ] }, { diff --git a/analysis/ethereum_smart_contracts/pretraining_data_collection.ipynb b/analysis/ethereum_smart_contracts/GTP_pretraining_data_collection.ipynb similarity index 98% rename from 
analysis/ethereum_smart_contracts/pretraining_data_collection.ipynb rename to analysis/ethereum_smart_contracts/GTP_pretraining_data_collection.ipynb index 1783512..3c7ab65 100644 --- a/analysis/ethereum_smart_contracts/pretraining_data_collection.ipynb +++ b/analysis/ethereum_smart_contracts/GTP_pretraining_data_collection.ipynb @@ -230,13 +230,14 @@ " if not os.path.exists(processed_data_file+\"_\"+blockchain+\".csv\"):\n", " \"\"\"Collects contracts from Zettablock and its decompiled opcodes.\"\"\" \n", " chunksize = 10 ** 6\n", + " print(\"Decompiling and extracting opcodes for blockchain %s:\" % blockchain )\n", " with pd.read_csv(zettablock_data_file+\"_\"+blockchain+\".csv\", chunksize=chunksize) as contract_reader:\n", " for contracts in contract_reader:\n", " contracts['decompiled_opcodes'] = contracts['contract_code'].progress_apply(get_opcodes)\n", - " # Store data so we don't have to download it all the time\n", " contracts = contracts[(contracts['decompiled_opcodes'].notna()) & (contracts['decompiled_opcodes'] != '')]\n", " contracts.drop_duplicates('contract_address', inplace=True)\n", - " contracts.progress_apply(get_exp_2_features, axis=1)\n", + " contracts['decompiled_opcodes'] = contracts.progress_apply(get_exp_2_features, axis=1)\n", + " # Store data so we don't have to download it all the time\n", " contracts.to_csv(processed_data_file+\"_\"+blockchain+\".csv\", mode='a')\n", " else:\n", " print(\"%s already exists.\" % processed_data_file+\"_\"+blockchain+\".csv\")" diff --git a/analysis/ethereum_smart_contracts/finetune_gpt.ipynb b/analysis/ethereum_smart_contracts/finetune_gpt_old.ipynb similarity index 100% rename from analysis/ethereum_smart_contracts/finetune_gpt.ipynb rename to analysis/ethereum_smart_contracts/finetune_gpt_old.ipynb diff --git a/analysis/ethereum_smart_contracts/tokenizer_longformer.ipynb b/analysis/ethereum_smart_contracts/tokenizer_longformer.ipynb deleted file mode 100644 index 1df6d10..0000000 --- a/analysis/ethereum_smart_contracts/tokenizer_longformer.ipynb +++ /dev/null @@ -1,134 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "FPuIaYYJLjvJ" - }, - "source": [ - "# Training a new tokenizer based on smart contract disassembled code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jfwnEMBRLjvT" - }, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "from transformers import AutoTokenizer\n", - "from collections import defaultdict\n", - "\n", - "dataset = load_dataset(\"text\", data_files={\"train\": \"/data/forta/ethereum/text/pretraining/small_pretraining_train.txt\",\n", - " \"val\": \"/data/forta/ethereum/text/pretraining/small_pretraining_val.txt\"})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "max = -1\n", - "for row in dataset[\"train\"][\"text\"]:\n", - " length_of_the_messages = row.split(\" \")\n", - " max = len(length_of_the_messages) if len(length_of_the_messages) > max else max\n", - "print(\"Max number of words = \", max)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "T29hShJ1LjvX" - }, - "outputs": [], - "source": [ - "def get_training_corpus():\n", - " batch_size = 400\n", - " aux_dataset = dataset[\"train\"]\n", - " for start_idx in range(0, len(aux_dataset), batch_size):\n", - " samples = aux_dataset[start_idx : start_idx + batch_size]\n", - " yield samples[\"text\"]\n", - "\n", - "training_corpus = 
get_training_corpus()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2QDOpAR1LjvX" - }, - "outputs": [], - "source": [ - "old_tokenizer = AutoTokenizer.from_pretrained(\"allenai/longformer-base-4096\", max_length = 1024)\n", - "# print(old_tokenizer.tokenize(\"PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UoMI6s6JLjvY" - }, - "outputs": [], - "source": [ - "vocab_size = 528\n", - "tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, vocab_size)\n", - "# print(old_tokenizer.tokenize(\"PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RoHgMflNLjvZ" - }, - "outputs": [], - "source": [ - "tokenizer.save_pretrained(\"/data/forta/ethereum/tokenizer_longformer\")\n", - "print(tokenizer)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "name": "Training a new tokenizer from an old one", - "provenance": [ - { - "file_id": "https://github.com/huggingface/notebooks/blob/master/course/en/chapter6/section2.ipynb", - "timestamp": 1708376608320 - } - ] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.18" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}