From ce103c89aad81f7e31de27f7d3aa30a2ace8ffa6 Mon Sep 17 00:00:00 2001
From: Andres Gomez
Date: Fri, 26 Apr 2024 23:56:50 +0000
Subject: [PATCH] improve anomaly detection and adapt the forta bot for that
 model

---
 .gitignore                                    |   3 +
 ...ipynb => GPT_anomaly_classification.ipynb} | 375 ++++++++++++------
 .../GPT_anomaly_pretraining.ipynb             |  27 +-
 .../GPT_finetuning_data_collection.ipynb      |   2 +-
 4 files changed, 275 insertions(+), 132 deletions(-)
 rename analysis/ethereum_smart_contracts/{GPT_evaluate.ipynb => GPT_anomaly_classification.ipynb} (90%)

diff --git a/.gitignore b/.gitignore
index 0af1abe..ab9ead0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,6 @@ data
 .env*
 venv*
 *_env
+.ipynb_checkpoints/
+tikuna_model_data/
+nohup.out
diff --git a/analysis/ethereum_smart_contracts/GPT_evaluate.ipynb b/analysis/ethereum_smart_contracts/GPT_anomaly_classification.ipynb
similarity index 90%
rename from analysis/ethereum_smart_contracts/GPT_evaluate.ipynb
rename to analysis/ethereum_smart_contracts/GPT_anomaly_classification.ipynb
index b7feeca..ddfbe79 100644
--- a/analysis/ethereum_smart_contracts/GPT_evaluate.ipynb
+++ b/analysis/ethereum_smart_contracts/GPT_anomaly_classification.ipynb
@@ -18,7 +18,8 @@
     "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
     "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
     "# See the License for the specific language governing permissions and\n",
-    "# limitations under the License."
+    "# limitations under the License.\n",
+    "# Based on: https://gmihaila.github.io/tutorial_notebooks/gpt2_finetune_classification/"
   ]
  },
  {
@@ -29,98 +30,168 @@
   },
   "outputs": [],
   "source": [
+    "import io\n",
+    "import os\n",
     "import torch\n",
     "import warnings\n",
-    "import pandas as pd\n",
-    "import os\n",
-    "import io\n",
-    "import numpy\n",
+    "import itertools\n",
     "\n",
+    "from io import StringIO\n",
+    "import pandas as pd\n",
     "from tqdm.notebook import tqdm\n",
     "from torch.utils.data import Dataset, DataLoader\n",
-    "from transformers import (set_seed,\n",
-    "                          GPT2Config,\n",
-    "                          GPT2Tokenizer,\n",
-    "                          GPT2ForSequenceClassification)\n",
+    "from ml_things import plot_dict, plot_confusion_matrix, fix_text\n",
+    "from sklearn.metrics import classification_report, accuracy_score\n",
+    "from transformers import (\n",
+    "    AutoConfig,\n",
+    "    AutoTokenizer,\n",
+    "    AutoModelForCausalLM,\n",
+    "    PretrainedConfig,\n",
+    "    set_seed,\n",
+    "    )\n",
     "\n",
     "# Suppress deprecation warnings\n",
     "warnings.filterwarnings('ignore', category=DeprecationWarning)\n",
     "warnings.filterwarnings('ignore', category=FutureWarning)\n",
     "\n",
+    "# Set seed for reproducibility.\n",
+    "set_seed(4444)\n",
+    "\n",
     "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
     "\n",
-    "model_name_or_path = '/data/forta/ethereum/model_128_stride/'\n",
-    "tokenizer_name_or_path = '/data/forta/ethereum/tokenizer'\n",
-    "evaluation_data_file = \"/data/forta/ethereum/text/evaluation/malicious-eval.csv\"\n",
+    "model = None\n",
+    "tokenizer_path = '/data/forta/ethereum/tokenizer'\n",
+    "model_path = '/data/forta/ethereum/model_anomaly'\n",
     "\n",
-    "labels_ids = {'malicious': 0, 'normal': 1}\n",
-    "n_labels = len(labels_ids)\n",
+    "# Regular perplexity: 1.6968108415603638\n",
+    "mean = 1.9269466400146484\n",
+    "std_dev = 1.5235518217086792\n",
+    "delta = 3\n",
+    "anomaly_threshold = mean + delta * std_dev\n",
+    "# Original Anomaly Threshold: 4.974050521850586\n",
+    "print(\"anomaly_threshold: \"+str(anomaly_threshold))\n",
+    "stride = 128\n",
+    "max_length = 1024\n",
1024\n", + "batch_size = 1\n", "\n", - "# Set seed for reproducibility.\n", - "set_seed(4444)" + "labels_ids = {'malicious': 0, 'normal': 1}" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "EDEubgJIt23C" + }, "outputs": [], "source": [ "class SmartContractOpcodeDataset(Dataset):\n", - " r\"\"\" PyTorch Dataset class for loading data. \"\"\"\n", "\n", - " def __init__(self, file_path, use_tokenizer):\n", - " self.texts = []\n", - " self.labels = []\n", - " current_file = io.open(file_path, mode='r', encoding='utf-8')\n", - " for line in current_file:\n", - " self.texts.append(line)\n", - " self.labels.append(0)\n", - " self.n_examples = len(self.labels)\n", - " return\n", + " def __init__(self, path, use_tokenizer):\n", + " if not os.path.isdir(path):\n", + " raise ValueError('Invalid `path` variable! Needs to be a directory')\n", + " self.texts = []\n", + " self.labels = []\n", + " # Since the labels are defined by folders with data we loop \n", + " # through each label.\n", + " for label in ['normal', 'malicious']:\n", + " opcode_path = os.path.join(path, label)\n", + " # Get all files from path.\n", + " files_names = os.listdir(opcode_path)\n", + " # Go through each file and read its content.\n", + " for file_name in tqdm(files_names, desc=f'{label} files'):\n", + " file_path = os.path.join(opcode_path, file_name)\n", + " current_file = io.open(file_path, mode='r', encoding='utf-8')\n", + " file_data = current_file.read()\n", + " if file_data != \"\":\n", + " self.texts.append(file_data)\n", + " self.labels.append(label)\n", + " self.n_examples = len(self.labels)\n", "\n", - " def __len__(self):\n", - " return self.n_examples\n", + " return\n", "\n", - " def __getitem__(self, item):\n", - " return {'text':self.texts[item],\n", - " 'labels':self.labels[item]}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def extract_sequences(data, stride):\n", - " encodings = tokenizer(data, return_tensors=\"pt\")\n", - " max_length = model.config.n_positions\n", - " seq_len = encodings.input_ids.size(1)\n", + " def __len__(self):\n", + " return self.n_examples\n", " \n", - " prev_end_loc = 0\n", - " sequences = []\n", - " for begin_loc in range(0, seq_len, stride):\n", - " end_loc = min(begin_loc + max_length, seq_len)\n", - " trg_len = end_loc - prev_end_loc\n", - " input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)\n", - " list = input_ids[0].tolist()\n", - " list[0:0] = [0] * (max_length - len(list))\n", - " sequences.append(list)\n", - " prev_end_loc = end_loc\n", - " if end_loc == seq_len:\n", - " break\n", - " sequence_data = pd.DataFrame(sequences)\n", - " return torch.tensor(sequence_data.values).type(torch.long)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eval_dataset = SmartContractOpcodeDataset(file_path=evaluation_data_file, use_tokenizer=None)" + " def __getitem__(self, item):\n", + " return {'text':self.texts[item],\n", + " 'label':self.labels[item]}\n", + "\n", + "class Gpt2ClassificationCollator(object):\n", + "\n", + " def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):\n", + " # Tokenizer to be used inside the class.\n", + " self.use_tokenizer = use_tokenizer\n", + " # Check max sequence length.\n", + " self.max_sequence_len = use_tokenizer.model_max_length if max_sequence_len is None else max_sequence_len\n", + " # Label encoder used inside the class.\n", + " 
self.labels_encoder = labels_encoder\n", + "\n", + " return\n", + "\n", + " def __call__(self, sequences):\n", + " # List comprehension to read lines, strip whitespace, and convert to int\n", + " texts = []\n", + " for sequence in sequences:\n", + " for line in sequence['text'].split('\\n'):\n", + " if line != \"\":\n", + " integers = [int(float(num)) for num in line.split('\\t')]\n", + " texts.append(torch.tensor(integers, dtype=torch.int))\n", + " labels = [sequence['label'] for sequence in sequences]\n", + " labels = [self.labels_encoder[label] for label in labels]\n", + "\n", + " # We don't need to use the tokenizer since the data is already in numeric format\n", + " inputs = {'input_ids':torch.stack(texts, dim=0)}\n", + " # Update the inputs with the associated encoded labels as tensor.\n", + " inputs.update({'labels':torch.tensor(labels)})\n", + "\n", + " return inputs\n", + "\n", + "def validation(dataloader, device_):\n", + " # Use global variable for model.\n", + " global model\n", + " \n", + " # Tracking variables\n", + " predictions_labels = []\n", + " true_labels = []\n", + " #total loss for this epoch.\n", + " total_loss = 0\n", + " trg_len = 1024\n", + " \n", + " # Put the model in evaluation mode--the dropout layers behave differently\n", + " # during evaluation.\n", + " model.eval()\n", + "\n", + " # Evaluate data for one epoch\n", + " for batch in tqdm(dataloader, total=len(dataloader)):\n", + " # add original labels\n", + " true_labels += batch['labels'].numpy().flatten().tolist()\n", + " # move batch to device\n", + " batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n", + " # Collect predictions for each row\n", + " predicted_label = None\n", + " trg_len = 1024\n", + " for row in batch['input_ids']:\n", + " input_ids = row.view(1, max_length).to(device_)\n", + " target_ids = input_ids.clone()\n", + " target_ids[:, :-trg_len] = -100\n", + " trg_len = stride\n", + " # The predicted label is normal until a anomalous sequence is found\n", + " predicted_label = [labels_ids['normal']]\n", + " # Telling the model not to compute or store gradients, saving memory andbatch.items()\n", + " # speeding up validation\n", + " with torch.no_grad(): \n", + " outputs = model(**{'input_ids':input_ids}, labels=target_ids)\n", + " neg_log_likelihood = outputs.loss\n", + " local_perplexity = torch.exp(neg_log_likelihood)\n", + " if local_perplexity > anomaly_threshold:\n", + " predicted_label = [labels_ids['malicious']]\n", + " break\n", + " predictions_labels += [torch.all(torch.Tensor(predicted_label)).long()]\n", + " # Calculate the average loss over the training data.\n", + " avg_epoch_loss = total_loss / len(dataloader)\n", + " # Return all true labels and prediciton for future evaluations.\n", + " return true_labels, predictions_labels, avg_epoch_loss" ] }, { @@ -135,84 +206,138 @@ }, "outputs": [], "source": [ - "# Get model configuration.\n", - "print('Loading configuration...')\n", - "model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name_or_path,\n", - " num_labels=n_labels, local_files_only=True,\n", - " use_safetensors=True)\n", + "# Load model configuration.\n", + "print('Loading model configuration...')\n", + "model_config = AutoConfig.from_pretrained(model_path)\n", "\n", - "# Get model's tokenizer.\n", - "print('Loading tokenizer...')\n", - "tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_name_or_path,\n", - " local_files_only=True, use_safetensors=True)\n", - "# default to left padding\n", - 
"tokenizer.padding_side = \"left\"\n", - "# Define PAD Token = EOS Token = 0\n", + "# Load model tokenizer.\n", + "print('Loading model`s tokenizer...')\n", + "tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)\n", "tokenizer.pad_token = tokenizer.eos_token\n", "\n", - "# Get the actual model.\n", - "print('Loading model...')\n", - "model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,\n", - " config=model_config, local_files_only=True,\n", - " use_safetensors=True)\n", + "# Loading model.\n", + "print('Loading actual model...')\n", + "model = AutoModelForCausalLM.from_pretrained(model_path, config=model_config)\n", "\n", - "# resize model embedding to match new tokenizer\n", + "# Resize model to fit all tokens in tokenizer.\n", "model.resize_token_embeddings(len(tokenizer))\n", - "# fix model padding token id\n", - "model.config.pad_token_id = model.config.eos_token_id\n", "\n", - "# Load model to defined device.\n", - "model.to(device)\n", - "print('Model loaded to `%s`'%device)" + "# fix model padding token id\n", + "model.config.pad_token_id = model.config.eos_token_id" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 332, + "referenced_widgets": [ + "e9972d8d49074defbe6232b8ca9c3998", + "0c94315204c648a4a8427ed19abd0b02", + "b8aca6ca6b7043e2a94c43c01578ab36", + "ec1e5a789a2c4e8fbcaac513c58e04fc", + "0fde44f436cc49eebfc9627c49ff3eb2", + "48158320f750495586f815aae5e74a6e", + "72e35b75c2dd4be38396d7d2dfc5916f", + "f9e624a958a343fd8ae0d00ed09172dc", + "b638077e5aef4d9f9ae18513fe348c9d", + "691d3b5ff274407aa9e8e20c989adaf5", + "d9ff8a49c31e4b068e9577784fe37a88", + "85132b1b278b4336845b1cc519066929", + "b53a0ea4433341a0b00314627ac4fd4c", + "5f6fded3e9734d02925ed1665ada214e", + "0a8e6b854f644ac790d1b60e00559524", + "e3f731bec3534a1cab0854b9192f491e", + "97f350df143d4da3b0111180998afc4d", + "5baa034ed79c450980ced4a1f7de2374", + "6e98a119a4874424be55b571ca8fdd62", + "ec918106d85d470d83c9491c4c7e283c", + "8de61314c47647c5af0e42ba8121d7ec", + "97deaa1813e64282a3a678cf1a719706", + "ff649a06a95641038987e810e1dcf065", + "29eb9184d1ff4574991ca2e6f11a7ee0", + "1632298dbbb64696a3c18976214ed3d6", + "9671cba109ab4ac19ef02e40905fbfd4", + "82d96937e37e4590921cd4dd2dd2e058", + "6277b8b253e6475a928f737995b8af4d", + "653587d9fefb4e4b9264ba80c2ada175", + "1d27fa237e2b499a889b3fe6ecb82a6d", + "06d2b4a7c9814917a53e002e142acee7", + "5c361029c19c4e81963046dbb03515d6" + ] + }, + "id": "OlXROUWu5Osq", + "outputId": "8a4ba877-dac3-4e84-b005-bb27df05deb0" + }, "outputs": [], "source": [ - "def evaluate(dataloader, device_):\n", - " global model\n", - " # Tracking variables\n", - " predictions_labels = []\n", - " model.eval()\n", - " index = 0\n", - " normal = 0\n", - " malicious = 0\n", - " \n", - " for batch in tqdm(dataloader, total=len(dataloader)):\n", - " processed_batch = {}\n", - " label = 0\n", - " for k,v in batch.items():\n", - " if k == \"text\":\n", - " processed_batch[\"input_ids\"] = extract_sequences(v, 128).to(device_)\n", - " # sequence_predictions = []\n", - " # for row in processed_batch['input_ids']:\n", - " #sequence = {}\n", - " #sequence['input_ids'] = row.view(1, 1024)\n", - " # sequence['labels'] = batch['labels'][0]\n", - " with torch.no_grad():\n", - " outputs = model(**processed_batch)\n", - " logits = outputs.logits.detach().cpu().numpy()\n", - " predict_content = logits.argmax(axis=-1).flatten()\n", - " if not 
predict_content.all():\n", - " malicious = malicious + 1\n", - " else:\n", - " normal = normal + 1\n", - " index = index + 1\n", - " print(\"Amount of normal samples: \"+str(normal))\n", - " print(\"Amount of malicious samples: \"+str(malicious))\n", - " print(\"Proportion of malicious over total: \"+str(malicious/len(dataloader)))" + "# Create data collator to encode text and labels into numbers.\n", + "gpt2_classificaiton_collator = Gpt2ClassificationCollator(use_tokenizer=tokenizer, \n", + " labels_encoder=labels_ids, \n", + " max_sequence_len=max_length)\n", + "\n", + "print('Dealing with Validation...')\n", + "# Create pytorch dataset.\n", + "valid_dataset = SmartContractOpcodeDataset(\n", + " path='/data/forta/ethereum/text/finetuning/validation',\n", + " use_tokenizer=tokenizer\n", + " )\n", + "print('Created `valid_dataset` with %d examples!'%len(valid_dataset))\n", + "\n", + "# Move pytorch dataset into dataloader.\n", + "valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size,\n", + " shuffle=False,\n", + " collate_fn=gpt2_classificaiton_collator\n", + " )\n", + "print('Created `eval_dataloader` with %d batches!'%len(valid_dataloader))" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 618, + "referenced_widgets": [ + "88d93f5b31c14ae3b2d6a889e5caa2a4", + "c745cb7c913448f59d3c017335e3ea22", + "04f2861f3e7f4255aed47e0b0e77eecd", + "5c2faf9a7d8e48caaa6d59875ed056e1", + "99caa2e95506457d89f167abe62d9f32", + "e492f5b344714a83bb8265fa9161b52d", + "6236711e6e76467fae7f79f266a7f8ed", + "138ae3e683c04a388e3d9d03cc425fec" + ] + }, + "id": "7Sifp6ocoSng", + "outputId": "1dccfb2a-242d-47e1-ed54-a7bfaa676f97", + "scrolled": true + }, "outputs": [], "source": [ - "evaluate(eval_dataset, device)" + "# Get prediction form model on validation data. 
+    "# your test data.\n",
+    "true_labels, predictions_labels, avg_epoch_loss = validation(valid_dataloader,\n",
+    "                                                              device)\n",
+    "\n",
+    "# Create the evaluation report.\n",
+    "evaluation_report = classification_report(true_labels,\n",
+    "                                          predictions_labels,\n",
+    "                                          labels=list(labels_ids.values()),\n",
+    "                                          target_names=list(labels_ids.keys()))\n",
+    "# Show the evaluation report.\n",
+    "print(evaluation_report)\n",
+    "\n",
+    "# Plot confusion matrix.\n",
+    "plot_confusion_matrix(y_true=true_labels,\n",
+    "                      y_pred=predictions_labels, \n",
+    "                      classes=list(labels_ids.keys()),\n",
+    "                      normalize=True, \n",
+    "                      magnify=0.1,\n",
+    "                      );"
   ]
  }
 ],
diff --git a/analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb b/analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb
index 31eec25..172670f 100644
--- a/analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb
+++ b/analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb
@@ -70,7 +70,8 @@
     "\n",
     "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
     "\n",
-    "model = None"
+    "model = None\n",
+    "anomaly_validation = False"
   ]
  },
 {
@@ -463,7 +464,7 @@
   "outputs": [],
   "source": [
     "# if training_args.do_eval:\n",
-    "if False:\n",
+    "if training_args.do_eval:\n",
     "    eval_output = trainer.evaluate()\n",
     "    print(eval_output[\"eval_loss\"])\n",
     "    perplexity = math.exp(eval_output[\"eval_loss\"])\n",
@@ -490,7 +491,9 @@
     "    trg_len = end_loc - prev_end_loc\n",
     "    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)\n",
     "    list = input_ids[0].tolist()\n",
-    "    list[0:0] = [0] * (max_length - len(list))\n",
+    "    # padding to the left\n",
+    "    endoftext = 0\n",
+    "    list[0:0] = [endoftext] * (max_length - len(list))\n",
     "    normal_sequences.append(list)\n",
     "    prev_end_loc = end_loc\n",
     "    if end_loc == seq_len:\n",
@@ -568,7 +571,7 @@
     "    )\n",
     "\n",
     "print(\"Extracting normal sequences for validation...\")\n",
-    "for index in tqdm(range(0, 100)):\n",
+    "for index in tqdm(range(0, 200)):\n",
     "    extract_normal_sequences(\n",
     "        training[\"val\"][\"text\"][index],\n",
     "        \"/data/forta/ethereum/text/finetuning/validation/normal/\",\n",
     "        stride,\n",
     "        index\n",
     "    )\n",
     "\n",
+    "if anomaly_validation:\n",
+    "    print(\"Extracting malicious sequences for anomaly validation...\")\n",
+    "    for index in tqdm(range(0, len(test[\"train\"][\"text\"]))):\n",
+    "        extract_normal_sequences(\n",
+    "            test[\"train\"][\"text\"][index],\n",
+    "            \"/data/forta/ethereum/text/finetuning/validation/malicious/\",\n",
+    "            stride,\n",
+    "            index\n",
+    "        )\n",
+    "\n",
     "# Calculate threshold for extraction of malicious samples\n",
     "print(\"Calculating threshold\")\n",
     "threshold = calculate_anomaly_threshold(training[\"train\"][\"text\"][:100], stride)\n",
-    "print(\"Calculated threshold:\" % threshold)"
+    "print(\"Calculated threshold: %s\" % str(threshold))"
   ]
  },
 {
@@ -610,7 +623,9 @@
     "    local_ppl = torch.exp(neg_log_likelihood)\n",
     "    if local_ppl > anomaly_threshold:\n",
     "        list = input_ids[0].tolist()\n",
-    "        list[0:0] = [0] * (max_length - len(list))\n",
+    "        # padding to the left\n",
+    "        endoftext = 0\n",
+    "        list[0:0] = [endoftext] * (max_length - len(list))\n",
     "        anomalous_sequences.append(list)\n",
     "    \n",
     "    nlls.append(neg_log_likelihood)\n",
diff --git a/analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb b/analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb
index 03f1267..0573999 100644
--- a/analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb
+++ b/analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb
@@ -11,7 +11,7 @@
     "This notebook collects smart contract creation bytecode and decompiled opcodes for malicious contract classification. \n",
     "Benign contracts are gathered from blockchain explorers and malicious contracts from [Forta Network's labelled datasets GitHub repo](https://github.com/forta-network/labelled-datasets).\n",
     "\n",
-    "# Code provided by the Forta project"
+    "Code provided by the Forta project"
   ]
  },
 {
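
Note on the detection rule these notebooks share: a contract's opcode sequence is scored window by window with a sliding stride, and the contract is flagged as malicious as soon as any window's perplexity under the pretrained GPT model exceeds mean + delta * std_dev of the perplexity distribution measured on normal contracts. The sketch below restates that rule standalone so it can be reviewed in one place; it assumes `model` is a Hugging Face causal language model and `input_ids` is a 1 x seq_len tensor of opcode token ids, and the helper names (`window_perplexities`, `is_malicious`) are illustrative, not functions defined in the notebooks.

import torch

def window_perplexities(model, input_ids, max_length=1024, stride=128):
    # Sliding-window perplexity, mirroring the validation loop in
    # GPT_anomaly_classification.ipynb: the first window scores every token,
    # each later window only scores the tokens that are new to it.
    ppls = []
    prev_end = 0
    seq_len = input_ids.size(1)
    for begin in range(0, seq_len, stride):
        end = min(begin + max_length, seq_len)
        window = input_ids[:, begin:end]
        targets = window.clone()
        trg_len = end - prev_end
        if trg_len < window.size(1):
            targets[:, :-trg_len] = -100  # -100 is ignored by the LM loss
        with torch.no_grad():
            loss = model(input_ids=window, labels=targets).loss
        ppls.append(torch.exp(loss).item())
        prev_end = end
        if end == seq_len:
            break
    return ppls

def is_malicious(ppls, mean=1.9269466400146484, std_dev=1.5235518217086792, delta=3):
    # Same constants the classification notebook hard-codes: flag the
    # contract as soon as one window exceeds mean + delta * std_dev.
    return any(p > mean + delta * std_dev for p in ppls)

With batch_size = 1 this reproduces the notebook's per-contract prediction: the label stays 'normal' unless is_malicious(window_perplexities(model, input_ids)) returns True, in which case it becomes 'malicious'.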