update malicious smart contract detection
kuronosec committed Jun 25, 2024
1 parent 41e4399 commit d2fd9fa
Showing 7 changed files with 302 additions and 161 deletions.
104 changes: 71 additions & 33 deletions analysis/ethereum_smart_contracts/GPT_anomaly_classification.ipynb
@@ -48,6 +48,9 @@
" AutoModelForCausalLM,\n",
" PretrainedConfig,\n",
" set_seed,\n",
" GPT2ForSequenceClassification,\n",
" GPT2Config,\n",
" GPT2Tokenizer\n",
" )\n",
"\n",
"# Supress deprecation warnings\n",
@@ -64,17 +67,22 @@
"model_path = '/data/forta/ethereum/model_anomaly'\n",
"\n",
"# Regular perplexity: 1.6968108415603638\n",
"mean = 1.9269466400146484\n",
"# mean = 1.9269466400146484\n",
"mean = 1.6\n",
"std_dev = 1.5235518217086792\n",
"delta = 3\n",
"delta = 1.75\n",
"anomaly_threshold = mean + delta * std_dev\n",
"# Original Anomaly Threshold: 4.974050521850586\n",
"print(\"anomaly_threshold: \"+str(anomaly_threshold))\n",
"stride = 128\n",
"max_length = 1024\n",
"batch_size = 1\n",
"\n",
"labels_ids = {'malicious': 0, 'normal': 1}"
"labels_ids = {'malicious': torch.tensor(\n",
" [0], dtype=torch.long).to(device),\n",
" 'normal': torch.tensor(\n",
" [1], dtype=torch.long).to(device)}\n",
"labels_ids_evaluation = {'malicious': 0, 'normal': 1}"
]
},
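The hunk above tightens the detection rule: a sequence is flagged when its perplexity exceeds mean + delta * std_dev, and the commit lowers both the mean (1.93 → 1.6) and delta (3 → 1.75), making the detector more sensitive. A minimal sketch of that rule, assuming `neg_log_likelihood` is the model's mean cross-entropy over a sequence (the helper name is illustrative, not from the notebook):

```python
import torch

# Values from the updated cell above.
mean = 1.6
std_dev = 1.5235518217086792
delta = 1.75
anomaly_threshold = mean + delta * std_dev  # ~4.266

def is_anomalous(neg_log_likelihood: torch.Tensor) -> bool:
    """Flag a sequence whose perplexity exceeds the threshold.

    Perplexity is exp(mean cross-entropy); a higher value means the
    model found the sequence less predictable, hence more anomalous.
    """
    perplexity = torch.exp(neg_log_likelihood)
    return bool(perplexity > anomaly_threshold)
```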
{
@@ -135,19 +143,23 @@
" for sequence in sequences:\n",
" for line in sequence['text'].split('\\n'):\n",
" if line != \"\":\n",
" integers = [int(float(num)) for num in line.split('\\t')]\n",
" texts.append(torch.tensor(integers, dtype=torch.int))\n",
" texts.append(line)\n",
" labels = [sequence['label'] for sequence in sequences]\n",
" labels = [self.labels_encoder[label] for label in labels]\n",
"\n",
" # We don't need to use the tokenizer since the data is already in numeric format\n",
" inputs = {'input_ids':torch.stack(texts, dim=0)}\n",
" # Call tokenizer on all texts to convert into tensors of numbers with \n",
" # appropriate padding.\n",
" inputs = self.use_tokenizer(text=texts,\n",
" return_tensors=\"pt\",\n",
" padding=True,\n",
" truncation=True,\n",
" max_length=self.max_sequence_len)\n",
" # Update the inputs with the associated encoded labels as tensor.\n",
" inputs.update({'labels':torch.tensor(labels)})\n",
"\n",
" return inputs\n",
"\n",
"def validation(dataloader, device_):\n",
"def validation(dataloader, device):\n",
" # Use global variable for model.\n",
" global model\n",
" \n",
@@ -156,7 +168,6 @@
" true_labels = []\n",
" #total loss for this epoch.\n",
" total_loss = 0\n",
" trg_len = 1024\n",
" \n",
" # Put the model in evaluation mode--the dropout layers behave differently\n",
" # during evaluation.\n",
@@ -167,27 +178,45 @@
" # add original labels\n",
" true_labels += batch['labels'].numpy().flatten().tolist()\n",
" # move batch to device\n",
" batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n",
" batch = {k:v.type(torch.long).to(device) for k,v in batch.items()}\n",
" # Collect predictions for each row\n",
" predicted_label = None\n",
" trg_len = 1024\n",
" for row in batch['input_ids']:\n",
" input_ids = row.view(1, max_length).to(device_)\n",
" target_ids = input_ids.clone()\n",
" target_ids[:, :-trg_len] = -100\n",
" trg_len = stride\n",
" number_functions = batch['input_ids'].shape[0]\n",
"\n",
" for i in range(0, number_functions):\n",
" sequence = {}\n",
" sequence['input_ids'] = batch['input_ids'][i,:].view(\n",
" 1, batch['input_ids'][i,:].shape[0])\n",
" sequence['attention_mask'] = batch['attention_mask'][i,:].view(\n",
" 1, batch['attention_mask'][i,:].shape[0])\n",
" sequence['labels'] = batch['labels']\n",
" # The predicted label is normal until a anomalous sequence is found\n",
" predicted_label = [labels_ids['normal']]\n",
" predicted_label = labels_ids['normal']\n",
"\n",
" # Telling the model not to compute or store gradients, saving memory andbatch.items()\n",
" # speeding up validation\n",
" with torch.no_grad(): \n",
" outputs = model(**{'input_ids':input_ids}, labels=target_ids)\n",
" neg_log_likelihood = outputs.loss\n",
" local_perplexity = torch.exp(neg_log_likelihood)\n",
" if local_perplexity > anomaly_threshold:\n",
" predicted_label = [labels_ids['malicious']]\n",
" break\n",
" predictions_labels += [torch.all(torch.Tensor(predicted_label)).long()]\n",
" with torch.no_grad():\n",
" if sequence['labels'] == labels_ids['malicious']:\n",
" sequence['labels'] = labels_ids['normal']\n",
" outputs = model(**sequence)\n",
" neg_log_likelihood = outputs.loss\n",
" local_perplexity = torch.exp(neg_log_likelihood)\n",
"\n",
" if local_perplexity > anomaly_threshold:\n",
" print(\"local_perplexity: %s\" % local_perplexity)\n",
" predicted_label = labels_ids['malicious']\n",
" break\n",
" else:\n",
" outputs = model(**sequence)\n",
" neg_log_likelihood = outputs.loss\n",
" local_perplexity = torch.exp(neg_log_likelihood)\n",
"\n",
" if local_perplexity > anomaly_threshold:\n",
" print(\"local_perplexity: %s\" % local_perplexity)\n",
" predicted_label = labels_ids['malicious']\n",
" break\n",
"\n",
" predictions_labels += predicted_label.cpu().numpy().flatten().tolist()\n",
" # Calculate the average loss over the training data.\n",
" avg_epoch_loss = total_loss / len(dataloader)\n",
" # Return all true labels and prediciton for future evaluations.\n",
@@ -208,22 +237,25 @@
"source": [
"# Load model configuration.\n",
"print('Loading model configuration...')\n",
"model_config = AutoConfig.from_pretrained(model_path)\n",
"model_config = GPT2Config.from_pretrained(model_path)\n",
"\n",
"# Load model tokenizer.\n",
"print('Loading model`s tokenizer...')\n",
"tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)\n",
"tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"# Loading model.\n",
"print('Loading actual model...')\n",
"model = AutoModelForCausalLM.from_pretrained(model_path, config=model_config)\n",
"model = GPT2ForSequenceClassification.from_pretrained(model_path, config=model_config)\n",
"\n",
"# Resize model to fit all tokens in tokenizer.\n",
"model.resize_token_embeddings(len(tokenizer))\n",
"\n",
"# fix model padding token id\n",
"model.config.pad_token_id = model.config.eos_token_id"
"model.config.pad_token_id = model.config.eos_token_id\n",
"\n",
"# Bring model to device\n",
"model.to(device)"
]
},
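GPT-2 checkpoints ship without a padding token, so the cell above reuses the end-of-text token before batching with padding=True. A minimal standalone sketch of the same setup, assuming the stock `gpt2` checkpoint rather than the fine-tuned model at `model_path`:

```python
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 defines no pad token

model = GPT2ForSequenceClassification.from_pretrained('gpt2')
model.config.pad_token_id = model.config.eos_token_id

# Padding now works for batched inputs of unequal length.
batch = tokenizer(['function a()', 'function transfer(uint256 x)'],
                  padding=True, truncation=True, return_tensors='pt')
```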
{
@@ -322,12 +354,11 @@
"# your test data.\n",
"true_labels, predictions_labels, avg_epoch_loss = validation(valid_dataloader,\n",
" device)\n",
"\n",
"# Create the evaluation report.\n",
"evaluation_report = classification_report(true_labels,\n",
" predictions_labels,\n",
" labels=list(labels_ids.values()),\n",
" target_names=list(labels_ids.keys()))\n",
" labels=list(labels_ids_evaluation.values()),\n",
" target_names=list(labels_ids_evaluation.keys()))\n",
"# Show the evaluation report.\n",
"print(evaluation_report)\n",
"\n",
@@ -339,6 +370,13 @@
" magnify=0.1,\n",
" );"
]
},
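Because `labels_ids` now holds tensors on the device, the report switches to the plain integer mapping in `labels_ids_evaluation`. A small self-contained sketch of the same scikit-learn call, with made-up label lists purely for illustration:

```python
from sklearn.metrics import classification_report

labels_ids_evaluation = {'malicious': 0, 'normal': 1}
true_labels = [0, 1, 1, 0]          # illustrative values only
predictions_labels = [0, 1, 0, 0]

print(classification_report(true_labels,
                            predictions_labels,
                            labels=list(labels_ids_evaluation.values()),
                            target_names=list(labels_ids_evaluation.keys())))
```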
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -366,7 +404,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.9.19"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {