update malicious smart contract detection
kuronosec committed Jun 25, 2024
1 parent 41e4399 commit d2fd9fa
Showing 7 changed files with 302 additions and 161 deletions.
104 changes: 71 additions & 33 deletions analysis/ethereum_smart_contracts/GPT_anomaly_classification.ipynb
@@ -48,6 +48,9 @@
" AutoModelForCausalLM,\n",
" PretrainedConfig,\n",
" set_seed,\n",
" GPT2ForSequenceClassification,\n",
" GPT2Config,\n",
" GPT2Tokenizer\n",
" )\n",
"\n",
"# Supress deprecation warnings\n",
@@ -64,17 +67,22 @@
"model_path = '/data/forta/ethereum/model_anomaly'\n",
"\n",
"# Regular perplexity: 1.6968108415603638\n",
"mean = 1.9269466400146484\n",
"# mean = 1.9269466400146484\n",
"mean = 1.6\n",
"std_dev = 1.5235518217086792\n",
"delta = 3\n",
"delta = 1.75\n",
"anomaly_threshold = mean + delta * std_dev\n",
"# Original Anomaly Threshold: 4.974050521850586\n",
"print(\"anomaly_threshold: \"+str(anomaly_threshold))\n",
"stride = 128\n",
"max_length = 1024\n",
"batch_size = 1\n",
"\n",
"labels_ids = {'malicious': 0, 'normal': 1}"
"labels_ids = {'malicious': torch.tensor(\n",
" [0], dtype=torch.long).to(device),\n",
" 'normal': torch.tensor(\n",
" [1], dtype=torch.long).to(device)}\n",
"labels_ids_evaluation = {'malicious': 0, 'normal': 1}"
]
},
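The hunk above tightens the detection rule: a sequence is flagged when its perplexity exceeds mean + delta * std_dev, and the commit lowers both the mean (1.93 → 1.6) and delta (3 → 1.75), making the detector more sensitive. A minimal sketch of that rule, assuming `neg_log_likelihood` is the model's mean cross-entropy over a sequence (the helper name is illustrative, not from the notebook):

```python
import torch

# Values from the updated cell above.
mean = 1.6
std_dev = 1.5235518217086792
delta = 1.75
anomaly_threshold = mean + delta * std_dev  # ~4.266

def is_anomalous(neg_log_likelihood: torch.Tensor) -> bool:
    """Flag a sequence whose perplexity exceeds the threshold.

    Perplexity is exp(mean cross-entropy); a higher value means the
    model found the sequence less predictable, hence more anomalous.
    """
    perplexity = torch.exp(neg_log_likelihood)
    return bool(perplexity > anomaly_threshold)
```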
{
@@ -135,19 +143,23 @@
" for sequence in sequences:\n",
" for line in sequence['text'].split('\\n'):\n",
" if line != \"\":\n",
" integers = [int(float(num)) for num in line.split('\\t')]\n",
" texts.append(torch.tensor(integers, dtype=torch.int))\n",
" texts.append(line)\n",
" labels = [sequence['label'] for sequence in sequences]\n",
" labels = [self.labels_encoder[label] for label in labels]\n",
"\n",
" # We don't need to use the tokenizer since the data is already in numeric format\n",
" inputs = {'input_ids':torch.stack(texts, dim=0)}\n",
" # Call tokenizer on all texts to convert into tensors of numbers with \n",
" # appropriate padding.\n",
" inputs = self.use_tokenizer(text=texts,\n",
" return_tensors=\"pt\",\n",
" padding=True,\n",
" truncation=True,\n",
" max_length=self.max_sequence_len)\n",
" # Update the inputs with the associated encoded labels as tensor.\n",
" inputs.update({'labels':torch.tensor(labels)})\n",
"\n",
" return inputs\n",
"\n",
"def validation(dataloader, device_):\n",
"def validation(dataloader, device):\n",
" # Use global variable for model.\n",
" global model\n",
" \n",
@@ -156,7 +168,6 @@
" true_labels = []\n",
" #total loss for this epoch.\n",
" total_loss = 0\n",
" trg_len = 1024\n",
" \n",
" # Put the model in evaluation mode--the dropout layers behave differently\n",
" # during evaluation.\n",
@@ -167,27 +178,45 @@
" # add original labels\n",
" true_labels += batch['labels'].numpy().flatten().tolist()\n",
" # move batch to device\n",
" batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n",
" batch = {k:v.type(torch.long).to(device) for k,v in batch.items()}\n",
" # Collect predictions for each row\n",
" predicted_label = None\n",
" trg_len = 1024\n",
" for row in batch['input_ids']:\n",
" input_ids = row.view(1, max_length).to(device_)\n",
" target_ids = input_ids.clone()\n",
" target_ids[:, :-trg_len] = -100\n",
" trg_len = stride\n",
" number_functions = batch['input_ids'].shape[0]\n",
"\n",
" for i in range(0, number_functions):\n",
" sequence = {}\n",
" sequence['input_ids'] = batch['input_ids'][i,:].view(\n",
" 1, batch['input_ids'][i,:].shape[0])\n",
" sequence['attention_mask'] = batch['attention_mask'][i,:].view(\n",
" 1, batch['attention_mask'][i,:].shape[0])\n",
" sequence['labels'] = batch['labels']\n",
" # The predicted label is normal until a anomalous sequence is found\n",
" predicted_label = [labels_ids['normal']]\n",
" predicted_label = labels_ids['normal']\n",
"\n",
" # Telling the model not to compute or store gradients, saving memory andbatch.items()\n",
" # speeding up validation\n",
" with torch.no_grad(): \n",
" outputs = model(**{'input_ids':input_ids}, labels=target_ids)\n",
" neg_log_likelihood = outputs.loss\n",
" local_perplexity = torch.exp(neg_log_likelihood)\n",
" if local_perplexity > anomaly_threshold:\n",
" predicted_label = [labels_ids['malicious']]\n",
" break\n",
" predictions_labels += [torch.all(torch.Tensor(predicted_label)).long()]\n",
" with torch.no_grad():\n",
" if sequence['labels'] == labels_ids['malicious']:\n",
" sequence['labels'] = labels_ids['normal']\n",
" outputs = model(**sequence)\n",
" neg_log_likelihood = outputs.loss\n",
" local_perplexity = torch.exp(neg_log_likelihood)\n",
"\n",
" if local_perplexity > anomaly_threshold:\n",
" print(\"local_perplexity: %s\" % local_perplexity)\n",
" predicted_label = labels_ids['malicious']\n",
" break\n",
" else:\n",
" outputs = model(**sequence)\n",
" neg_log_likelihood = outputs.loss\n",
" local_perplexity = torch.exp(neg_log_likelihood)\n",
"\n",
" if local_perplexity > anomaly_threshold:\n",
" print(\"local_perplexity: %s\" % local_perplexity)\n",
" predicted_label = labels_ids['malicious']\n",
" break\n",
"\n",
" predictions_labels += predicted_label.cpu().numpy().flatten().tolist()\n",
" # Calculate the average loss over the training data.\n",
" avg_epoch_loss = total_loss / len(dataloader)\n",
" # Return all true labels and prediciton for future evaluations.\n",
@@ -208,22 +237,25 @@
"source": [
"# Load model configuration.\n",
"print('Loading model configuration...')\n",
"model_config = AutoConfig.from_pretrained(model_path)\n",
"model_config = GPT2Config.from_pretrained(model_path)\n",
"\n",
"# Load model tokenizer.\n",
"print('Loading model`s tokenizer...')\n",
"tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)\n",
"tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"# Loading model.\n",
"print('Loading actual model...')\n",
"model = AutoModelForCausalLM.from_pretrained(model_path, config=model_config)\n",
"model = GPT2ForSequenceClassification.from_pretrained(model_path, config=model_config)\n",
"\n",
"# Resize model to fit all tokens in tokenizer.\n",
"model.resize_token_embeddings(len(tokenizer))\n",
"\n",
"# fix model padding token id\n",
"model.config.pad_token_id = model.config.eos_token_id"
"model.config.pad_token_id = model.config.eos_token_id\n",
"\n",
"# Bring model to device\n",
"model.to(device)"
]
},
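GPT-2 checkpoints ship without a padding token, so the cell above reuses the end-of-text token before batching with padding=True. A minimal standalone sketch of the same setup, assuming the stock `gpt2` checkpoint rather than the fine-tuned model at `model_path`:

```python
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 defines no pad token

model = GPT2ForSequenceClassification.from_pretrained('gpt2')
model.config.pad_token_id = model.config.eos_token_id

# Padding now works for batched inputs of unequal length.
batch = tokenizer(['function a()', 'function transfer(uint256 x)'],
                  padding=True, truncation=True, return_tensors='pt')
```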
{
@@ -322,12 +354,11 @@
"# your test data.\n",
"true_labels, predictions_labels, avg_epoch_loss = validation(valid_dataloader,\n",
" device)\n",
"\n",
"# Create the evaluation report.\n",
"evaluation_report = classification_report(true_labels,\n",
" predictions_labels,\n",
" labels=list(labels_ids.values()),\n",
" target_names=list(labels_ids.keys()))\n",
" labels=list(labels_ids_evaluation.values()),\n",
" target_names=list(labels_ids_evaluation.keys()))\n",
"# Show the evaluation report.\n",
"print(evaluation_report)\n",
"\n",
@@ -339,6 +370,13 @@
" magnify=0.1,\n",
" );"
]
},
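Because `labels_ids` now holds tensors on the device, the report switches to the plain integer mapping in `labels_ids_evaluation`. A small self-contained sketch of the same scikit-learn call, with made-up label lists purely for illustration:

```python
from sklearn.metrics import classification_report

labels_ids_evaluation = {'malicious': 0, 'normal': 1}
true_labels = [0, 1, 1, 0]          # illustrative values only
predictions_labels = [0, 1, 0, 0]

print(classification_report(true_labels,
                            predictions_labels,
                            labels=list(labels_ids_evaluation.values()),
                            target_names=list(labels_ids_evaluation.keys())))
```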
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -366,7 +404,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.9.19"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {