Skip to content

Commit

Permalink
update final results for GPT2 on smart contract functions
Browse files Browse the repository at this point in the history
  • Loading branch information
kuronosec committed Jul 8, 2024
1 parent d2fd9fa commit ef17698
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 190 deletions.
2 changes: 1 addition & 1 deletion analysis/ethereum_p2p/ethereum_lstm_log_octets.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
"version": "3.9.18"
}
},
"nbformat": 4,
Expand Down
23 changes: 9 additions & 14 deletions analysis/ethereum_smart_contracts/GPT_anomaly_classification.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,6 @@
"from ml_things import plot_dict, plot_confusion_matrix, fix_text\n",
"from sklearn.metrics import classification_report, accuracy_score\n",
"from transformers import (\n",
" AutoConfig,\n",
" AutoTokenizer,\n",
" AutoModelForCausalLM,\n",
" PretrainedConfig,\n",
" set_seed,\n",
" GPT2ForSequenceClassification,\n",
Expand All @@ -66,18 +63,16 @@
"tokenizer_path = '/data/forta/ethereum/tokenizer'\n",
"model_path = '/data/forta/ethereum/model_anomaly'\n",
"\n",
"# Regular perplexity: 1.6968108415603638\n",
"# mean = 1.9269466400146484\n",
"mean = 1.6\n",
"std_dev = 1.5235518217086792\n",
"delta = 1.75\n",
"anomaly_threshold = mean + delta * std_dev\n",
"# Original Anomaly Threshold: 4.974050521850586\n",
"print(\"anomaly_threshold: \"+str(anomaly_threshold))\n",
"stride = 128\n",
"# Establish parameters\n",
"perplexity = 2.33\n",
"std_dev = 1.52\n",
"delta = 1.5\n",
"anomaly_threshold = perplexity + delta * std_dev\n",
"max_length = 1024\n",
"batch_size = 1\n",
"\n",
"print(\"anomaly_threshold: \"+str(anomaly_threshold))\n",
"\n",
"labels_ids = {'malicious': torch.tensor(\n",
" [0], dtype=torch.long).to(device),\n",
" 'normal': torch.tensor(\n",
Expand Down Expand Up @@ -320,7 +315,7 @@
"\n",
"# Move pytorch dataset into dataloader.\n",
"valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size,\n",
" shuffle=False,\n",
" shuffle=True,\n",
" collate_fn=gpt2_classificaiton_collator\n",
" )\n",
"print('Created `eval_dataloader` with %d batches!'%len(valid_dataloader))"
Expand Down Expand Up @@ -404,7 +399,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
"version": "3.9.18"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
Expand Down
179 changes: 8 additions & 171 deletions analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@
"\n",
"model = None\n",
"anomaly_validation = True\n",
"do_train=False\n",
"do_eval=False"
"do_train=True\n",
"do_eval=True"
]
},
{
Expand Down Expand Up @@ -478,181 +478,18 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def extract_normal_sequences(data, output_file, stride, index):\n",
"    \"\"\"Cut one tokenized sample into fixed-length windows and write them to a TSV file.\n",
"\n",
"    The sample is tokenized with the notebook-level `tokenizer` and sliced into\n",
"    windows of at most `model.config.n_positions` tokens that start every\n",
"    `stride` tokens. Shorter windows are left-padded with token id 0 so that\n",
"    every row has the same length.\n",
"\n",
"    Args:\n",
"        data: Sample to tokenize; it is converted with `str()` first.\n",
"        output_file: Path prefix; rows go to output_file + str(index) + \".csv\".\n",
"        stride: Step, in tokens, between the starts of consecutive windows.\n",
"        index: Sample number used to build the output file name.\n",
"    \"\"\"\n",
"    encodings = tokenizer(str(data), return_tensors=\"pt\")\n",
"    max_length = model.config.n_positions\n",
"    seq_len = encodings.input_ids.size(1)\n",
"    # NOTE(review): 0 is used as the padding id; presumably the end-of-text token -- confirm\n",
"    pad_token_id = 0\n",
"\n",
"    normal_sequences = []\n",
"    for begin_loc in range(0, seq_len, stride):\n",
"        end_loc = min(begin_loc + max_length, seq_len)\n",
"        # Only the ids are needed here, so the slice never has to leave the CPU.\n",
"        window = encodings.input_ids[0, begin_loc:end_loc].tolist()\n",
"        # Pad on the left so that every row holds exactly max_length ids.\n",
"        normal_sequences.append([pad_token_id] * (max_length - len(window)) + window)\n",
"        # The window that reaches the end of the sample is the last one.\n",
"        if end_loc == seq_len:\n",
"            break\n",
"    # Tab-separated, one window per row, without header or index column.\n",
"    normal_data = pd.DataFrame(normal_sequences)\n",
"    normal_data.to_csv(output_file+str(index)+\".csv\", sep='\\t', index=False, header=False)\n",
"\n",
"def calculate_anomaly_threshold(data, num_samples=10000, num_std=2):\n",
"    \"\"\"Estimate a perplexity cut-off from the leading samples of `data`.\n",
"\n",
"    Every sample is tokenized with the notebook-level `tokenizer` and scored by\n",
"    the notebook-level `model`; its perplexity is exp(loss). The threshold is\n",
"    the mean of those perplexities plus `num_std` standard deviations.\n",
"\n",
"    Args:\n",
"        data: Indexable, sized collection of samples (each converted with `str()`).\n",
"        num_samples: Upper bound on how many leading samples are scored.\n",
"        num_std: Number of standard deviations added to the mean.\n",
"\n",
"    Returns:\n",
"        0-dim float32 tensor holding the anomaly threshold.\n",
"\n",
"    Raises:\n",
"        ValueError: If there is no sample to score.\n",
"    \"\"\"\n",
"    max_length = model.config.n_positions\n",
"    # Never index past the end of a collection shorter than num_samples.\n",
"    sample_count = min(num_samples, len(data))\n",
"    if sample_count == 0:\n",
"        raise ValueError(\"calculate_anomaly_threshold needs at least one sample\")\n",
"\n",
"    nlls = []\n",
"    for index in tqdm(range(sample_count)):\n",
"        encodings = tokenizer(str(data[index]), return_tensors=\"pt\")\n",
"        # Samples longer than n_positions cannot be scored in one pass, so they are cut.\n",
"        input_ids = encodings.input_ids[:, :max_length].to(device)\n",
"\n",
"        with torch.no_grad():\n",
"            # Each sample is scored in a single pass, so every token is a target.\n",
"            # (The earlier version masked labels with `trg_len`, a name this\n",
"            # function never defines.)\n",
"            outputs = model(input_ids, labels=input_ids)\n",
"        nlls.append(outputs.loss)\n",
"\n",
"    nlls = torch.stack(nlls)\n",
"    # Corpus-level perplexity: exp of the mean loss over all scored samples.\n",
"    ppl = torch.exp(nlls.mean())\n",
"    print(f\"Regular perplexity: {ppl}\")\n",
"    # Per-sample perplexities, moved to the CPU as float32 for the statistics.\n",
"    perplexities = torch.exp(nlls).float().cpu()\n",
"    mean = perplexities.mean()\n",
"    std_dev = perplexities.std()\n",
"    print(f\"Mean: {mean}, Standard Deviation: {std_dev}\")\n",
"    anomaly_threshold = mean + num_std * std_dev\n",
"    print(f\"Anomaly Threshold: {anomaly_threshold}\")\n",
"    return anomaly_threshold\n",
"\n",
"if anomaly_validation:\n",
"    # Normal SC opcode files, one entry per split\n",
"    training = load_dataset(\n",
"        \"text\",\n",
"        data_files=dict(\n",
"            train=\"/data/forta/ethereum/text/pretraining/training/normal/normal.csv\",\n",
"            val=\"/data/forta/ethereum/text/pretraining/validation/normal/normal.csv\",\n",
"        ),\n",
"    )\n",
"\n",
"    # Malicious SC opcode files, one entry per split\n",
"    test = load_dataset(\n",
"        \"text\",\n",
"        data_files=dict(\n",
"            train=\"/data/forta/ethereum/text/pretraining/training/malicious/malicious.csv\",\n",
"            val=\"/data/forta/ethereum/text/pretraining/validation/malicious/malicious.csv\",\n",
"        ),\n",
"    )\n",
"\n",
"    '''# Extract normal SC opcode encodings\n",
"    print(\"Extracting normal sequences training...\")\n",
"    for index in tqdm(range(0, 500)):\n",
"        extract_normal_sequences(\n",
"            training[\"train\"][\"text\"][index],\n",
"            \"/data/forta/ethereum/text/finetuning/training/normal/\",\n",
"            stride,\n",
"            index\n",
"        )\n",
"\n",
"    print(\"Extracting normal sequences validation...\")\n",
"    for index in tqdm(range(0, 200)):\n",
"        extract_normal_sequences(\n",
"            training[\"val\"][\"text\"][index],\n",
"            \"/data/forta/ethereum/text/finetuning/validation/normal/\",\n",
"            stride,\n",
"            index\n",
"        )\n",
"\n",
"    print(\"Extracting malicious sequences anomaly validation...\")\n",
"    for index in tqdm(range(0, len(test[\"train\"][\"text\"]))):\n",
"        extract_normal_sequences(\n",
"            test[\"train\"][\"text\"][index],\n",
"            \"/data/forta/ethereum/text/finetuning/validation/malicious/\",\n",
"            stride,\n",
"            index\n",
"        )'''\n",
"\n",
"    # The threshold is computed on the normal training text only\n",
"    print(\"Calculating threshold\")\n",
"    normal_training_text = training[\"train\"][\"text\"]\n",
"    threshold = calculate_anomaly_threshold(normal_training_text)\n",
"    print(\"Calculated threshold: %s\" % str(threshold))"
]
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def extract_anomalous_sequences(data, output_file, stride, anomaly_threshold, index):\n",
"    \"\"\"Write the windows of one sample whose perplexity exceeds `anomaly_threshold`.\n",
"\n",
"    The sample is tokenized with the notebook-level `tokenizer` and scanned with\n",
"    a sliding window of at most `model.config.n_positions` tokens that advances\n",
"    by `stride`. Each window is scored by the notebook-level `model`; windows\n",
"    with exp(loss) above the threshold are left-padded with token id 0 and\n",
"    saved as tab-separated rows.\n",
"\n",
"    Args:\n",
"        data: Sample to tokenize; it is converted with `str()` first.\n",
"        output_file: Path prefix; rows go to output_file + str(index) + \".csv\".\n",
"        stride: Step, in tokens, between the starts of consecutive windows.\n",
"        anomaly_threshold: Perplexity above which a window is kept.\n",
"        index: Sample number used to build the output file name.\n",
"    \"\"\"\n",
"    encodings = tokenizer(str(data), return_tensors=\"pt\")\n",
"    max_length = model.config.n_positions\n",
"    seq_len = encodings.input_ids.size(1)\n",
"    # NOTE(review): 0 is used as the padding id; presumably the end-of-text token -- confirm\n",
"    pad_token_id = 0\n",
"\n",
"    prev_end_loc = 0\n",
"    anomalous_sequences = []\n",
"    for begin_loc in range(0, seq_len, stride):\n",
"        end_loc = min(begin_loc + max_length, seq_len)\n",
"        # Number of tokens in this window that the previous window did not cover.\n",
"        trg_len = end_loc - prev_end_loc\n",
"        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)\n",
"        target_ids = input_ids.clone()\n",
"        # Labels of -100 are left out of the loss, so only the last trg_len tokens are scored.\n",
"        target_ids[:, :-trg_len] = -100\n",
"\n",
"        with torch.no_grad():\n",
"            outputs = model(input_ids, labels=target_ids)\n",
"            local_ppl = torch.exp(outputs.loss)\n",
"\n",
"        if local_ppl > anomaly_threshold:\n",
"            window = input_ids[0].tolist()\n",
"            # Pad on the left so that every row holds exactly max_length ids.\n",
"            anomalous_sequences.append([pad_token_id] * (max_length - len(window)) + window)\n",
"\n",
"        prev_end_loc = end_loc\n",
"        # The window that reaches the end of the sample is the last one.\n",
"        if end_loc == seq_len:\n",
"            break\n",
"    # Tab-separated, one window per row, without header or index column.\n",
"    anomalous_data = pd.DataFrame(anomalous_sequences)\n",
"    anomalous_data.to_csv(output_file+str(index)+\".csv\", sep='\\t', index=False, header=False)\n",
"\n",
"if anomaly_validation:\n",
"    # Extract malicious SC opcode encodings.\n",
"    # Each text column is looked up once here instead of again on every loop iteration.\n",
"    malicious_train_texts = test[\"train\"][\"text\"]\n",
"    malicious_val_texts = test[\"val\"][\"text\"]\n",
"\n",
"    print(\"Extracting anomalous sequences trainig...\")\n",
"    for index in tqdm(range(0, len(malicious_train_texts))):\n",
"        extract_anomalous_sequences(\n",
"            malicious_train_texts[index],\n",
"            \"/data/forta/ethereum/text/finetuning/training/malicious/\",\n",
"            stride,\n",
"            threshold,\n",
"            index\n",
"        )\n",
"\n",
"    print(\"Extracting anomalous sequences validation...\")\n",
"    for index in tqdm(range(0, len(malicious_val_texts))):\n",
"        extract_anomalous_sequences(\n",
"            malicious_val_texts[index],\n",
"            \"/data/forta/ethereum/text/finetuning/validation/malicious/\",\n",
"            stride,\n",
"            threshold,\n",
"            index\n",
"        )"
]
"source": []
}
],
"metadata": {
Expand Down Expand Up @@ -681,7 +518,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
"version": "3.9.18"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
"version": "3.9.18"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
"version": "3.9.18"
}
},
"nbformat": 4,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@
"source": [
"# Call decompiler command and return functions\n",
"def get_SC_opcodes(bytecode_hex, creator):\n",
" command = \"/home/kurono/Documents/Sakundi/Software/evm-dis/build/libs/driver-py/__main__.py\"\n",
" command = \"/home/kurono/Documents/development/sakundi/evm-dis/build/libs/driver-py/__main__.py\"\n",
" result = None\n",
" try:\n",
" result = subprocess.run(['python',\n",
Expand Down Expand Up @@ -303,7 +303,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
"version": "3.9.18"
}
},
"nbformat": 4,
Expand Down

0 comments on commit ef17698

Please sign in to comment.