update final results for GPT2 on smart contract functions

edenia · Jul 8, 2024 · ef17698 · ef17698
1 parent d2fd9fa
commit ef17698
Show file tree

Hide file tree

Showing 6 changed files with 22 additions and 190 deletions.
diff --git a/analysis/ethereum_p2p/ethereum_lstm_log_octets.ipynb b/analysis/ethereum_p2p/ethereum_lstm_log_octets.ipynb
@@ -229,7 +229,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.16"
+   "version": "3.9.18"
   }
  },
  "nbformat": 4,

diff --git a/analysis/ethereum_smart_contracts/GPT_anomaly_classification.ipynb b/analysis/ethereum_smart_contracts/GPT_anomaly_classification.ipynb
@@ -43,9 +43,6 @@
     "from ml_things import plot_dict, plot_confusion_matrix, fix_text\n",
     "from sklearn.metrics import classification_report, accuracy_score\n",
     "from transformers import (\n",
-    "                          AutoConfig,\n",
-    "                          AutoTokenizer,\n",
-    "                          AutoModelForCausalLM,\n",
     "                          PretrainedConfig,\n",
     "                          set_seed,\n",
     "                          GPT2ForSequenceClassification,\n",
@@ -66,18 +63,16 @@
     "tokenizer_path = '/data/forta/ethereum/tokenizer'\n",
     "model_path = '/data/forta/ethereum/model_anomaly'\n",
     "\n",
-    "# Regular perplexity: 1.6968108415603638\n",
-    "# mean = 1.9269466400146484\n",
-    "mean = 1.6\n",
-    "std_dev = 1.5235518217086792\n",
-    "delta = 1.75\n",
-    "anomaly_threshold = mean + delta * std_dev\n",
-    "# Original Anomaly Threshold: 4.974050521850586\n",
-    "print(\"anomaly_threshold: \"+str(anomaly_threshold))\n",
-    "stride = 128\n",
+    "# Establish parameters\n",
+    "perplexity = 2.33\n",
+    "std_dev = 1.52\n",
+    "delta = 1.5\n",
+    "anomaly_threshold = perplexity + delta * std_dev\n",
     "max_length = 1024\n",
     "batch_size = 1\n",
     "\n",
+    "print(\"anomaly_threshold: \"+str(anomaly_threshold))\n",
+    "\n",
     "labels_ids = {'malicious': torch.tensor(\n",
     "    [0], dtype=torch.long).to(device),\n",
     "              'normal': torch.tensor(\n",
@@ -320,7 +315,7 @@
     "\n",
     "# Move pytorch dataset into dataloader.\n",
     "valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size,\n",
-    "                              shuffle=False,\n",
+    "                              shuffle=True,\n",
     "                              collate_fn=gpt2_classificaiton_collator\n",
     "                             )\n",
     "print('Created `eval_dataloader` with %d batches!'%len(valid_dataloader))"
@@ -404,7 +399,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.19"
+   "version": "3.9.18"
   },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {

diff --git a/analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb b/analysis/ethereum_smart_contracts/GPT_anomaly_pretraining.ipynb
@@ -68,8 +68,8 @@
     "\n",
     "model = None\n",
     "anomaly_validation = True\n",
-    "do_train=False\n",
-    "do_eval=False"
+    "do_train=True\n",
+    "do_eval=True"
    ]
   },
   {
@@ -478,181 +478,18 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [],
-   "source": [
-    "def extract_normal_sequences(data, output_file, stride, index):\n",
-    "    encodings = tokenizer(str(data), return_tensors=\"pt\")\n",
-    "    max_length = model.config.n_positions\n",
-    "    seq_len = encodings.input_ids.size(1)\n",
-    "    \n",
-    "    prev_end_loc = 0\n",
-    "    normal_sequences = []\n",
-    "    for begin_loc in range(0, seq_len, stride):\n",
-    "        end_loc = min(begin_loc + max_length, seq_len)\n",
-    "        trg_len = end_loc - prev_end_loc\n",
-    "        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)\n",
-    "        list = input_ids[0].tolist()\n",
-    "        # padding to the left\n",
-    "        endoftext = 0\n",
-    "        list[0:0] = [endoftext] * (max_length - len(list))\n",
-    "        normal_sequences.append(list)\n",
-    "        prev_end_loc = end_loc\n",
-    "        if end_loc == seq_len:\n",
-    "            break    \n",
-    "    normal_data = pd.DataFrame(normal_sequences)\n",
-    "    normal_data.to_csv(output_file+str(index)+\".csv\", sep='\\t', index=False, header=False)\n",
-    "\n",
-    "def calculate_anomaly_threshold(data):\n",
-    "    global model\n",
-    "    nlls = []\n",
-    "    perplexities = []\n",
-    "    \n",
-    "    for index in tqdm(range(0, 10000)):\n",
-    "        encodings = tokenizer(str(data[index]), return_tensors=\"pt\")\n",
-    "        max_length = model.config.n_positions\n",
-    "        seq_len = encodings.input_ids.size(1)\n",
-    "        input_ids = encodings.input_ids.to(device)\n",
-    "        target_ids = input_ids.clone()\n",
-    "        target_ids[:, :-trg_len] = -100\n",
-    "    \n",
-    "        with torch.no_grad():\n",
-    "            outputs = model(input_ids, labels=target_ids)\n",
-    "            neg_log_likelihood = outputs.loss\n",
-    "            nlls.append(neg_log_likelihood)\n",
-    "            local_ppl = torch.exp(neg_log_likelihood)\n",
-    "            perplexities.append(local_ppl)\n",
-    "    \n",
-    "    # Calculate mean and standard deviation\n",
-    "    ppl = torch.exp(torch.stack(nlls).mean())\n",
-    "    print(f\"Regular perplexity: {ppl}\")\n",
-    "    perplexities = torch.tensor(perplexities, dtype=torch.float32)\n",
-    "    mean = perplexities.mean()\n",
-    "    std_dev = perplexities.std()\n",
-    "    print(f\"Mean: {mean}, Standard Deviation: {std_dev}\")\n",
-    "    # Define anomaly threshold\n",
-    "    threshold = 2\n",
-    "    # Calculate anomaly threshold based on standard deviation and mean\n",
-    "    anomaly_threshold = mean + threshold * std_dev\n",
-    "    print(f\"Anomaly Threshold: {anomaly_threshold}\")\n",
-    "    # anomalies = perplexities[(perplexities > anomaly_threshold)]\n",
-    "    # print(f\"Anomalies: {anomalies.shape}\")\n",
-    "    return anomaly_threshold\n",
-    "\n",
-    "if anomaly_validation:\n",
-    "    # Load normal SC opcode files\n",
-    "    training = load_dataset(\"text\",\n",
-    "                            data_files={\n",
-    "                              \"train\": \"/data/forta/ethereum/text/pretraining/training/normal/normal.csv\",\n",
-    "                              \"val\": \"/data/forta/ethereum/text/pretraining/validation/normal/normal.csv\"\n",
-    "                              }\n",
-    "                           )\n",
-    "\n",
-    "    # Load malicious SC opcode files\n",
-    "    test = load_dataset(\"text\",\n",
-    "                        data_files={\n",
-    "                            \"train\": \"/data/forta/ethereum/text/pretraining/training/malicious/malicious.csv\",\n",
-    "                            \"val\": \"/data/forta/ethereum/text/pretraining/validation/malicious/malicious.csv\"\n",
-    "                            }\n",
-    "                       )\n",
-    "\n",
-    "    '''# Extract normal SC opcode encodings\n",
-    "    print(\"Extracting normal sequences training...\")\n",
-    "    for index in tqdm(range(0, 500)):\n",
-    "        extract_normal_sequences(\n",
-    "            training[\"train\"][\"text\"][index],\n",
-    "            \"/data/forta/ethereum/text/finetuning/training/normal/\",\n",
-    "            stride,\n",
-    "            index\n",
-    "        )\n",
-    "\n",
-    "    print(\"Extracting normal sequences validation...\")\n",
-    "    for index in tqdm(range(0, 200)):\n",
-    "        extract_normal_sequences(\n",
-    "            training[\"val\"][\"text\"][index],\n",
-    "            \"/data/forta/ethereum/text/finetuning/validation/normal/\",\n",
-    "            stride,\n",
-    "            index\n",
-    "        )\n",
-    "\n",
-    "    print(\"Extracting malicious sequences anomaly validation...\")\n",
-    "    for index in tqdm(range(0, len(test[\"train\"][\"text\"]))):\n",
-    "        extract_normal_sequences(\n",
-    "            test[\"train\"][\"text\"][index],\n",
-    "            \"/data/forta/ethereum/text/finetuning/validation/malicious/\",\n",
-    "            stride,\n",
-    "            index\n",
-    "        )'''\n",
-    "\n",
-    "    # Calculate threshold for extraction of malicious samples\n",
-    "    print(\"Calculating threshold\")\n",
-    "    threshold = calculate_anomaly_threshold(training[\"train\"][\"text\"])\n",
-    "    print(\"Calculated threshold: %s\" % str(threshold))"
-   ]
+   "source": []
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "def extract_anomalous_sequences(data, output_file, stride, anomaly_threshold, index):\n",
-    "    global model\n",
-    "    encodings = tokenizer(str(data), return_tensors=\"pt\")\n",
-    "    max_length = model.config.n_positions\n",
-    "    seq_len = encodings.input_ids.size(1)\n",
-    "    \n",
-    "    nlls = []\n",
-    "    prev_end_loc = 0\n",
-    "    anomalous_sequences = []\n",
-    "    for begin_loc in range(0, seq_len, stride):\n",
-    "        end_loc = min(begin_loc + max_length, seq_len)\n",
-    "        trg_len = end_loc - prev_end_loc\n",
-    "        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)\n",
-    "        target_ids = input_ids.clone()\n",
-    "        target_ids[:, :-trg_len] = -100\n",
-    "    \n",
-    "        with torch.no_grad():\n",
-    "            outputs = model(input_ids, labels=target_ids)\n",
-    "            neg_log_likelihood = outputs.loss\n",
-    "            local_ppl = torch.exp(neg_log_likelihood)\n",
-    "            if local_ppl > anomaly_threshold:\n",
-    "                list = input_ids[0].tolist()\n",
-    "                # padding to the left\n",
-    "                endoftext = 0\n",
-    "                list[0:0] = [endoftext] * (max_length - len(list))\n",
-    "                anomalous_sequences.append(list)\n",
-    "    \n",
-    "        nlls.append(neg_log_likelihood)\n",
-    "    \n",
-    "        prev_end_loc = end_loc\n",
-    "        if end_loc == seq_len:\n",
-    "            break\n",
-    "    anomalous_data = pd.DataFrame(anomalous_sequences)\n",
-    "    anomalous_data.to_csv(output_file+str(index)+\".csv\", sep='\\t', index=False, header=False)\n",
-    "\n",
-    "if anomaly_validation:\n",
-    "    #Extract malicious SC opcode encodings\n",
-    "    print(\"Extracting anomalous sequences trainig...\")\n",
-    "    for index in tqdm(range(0, len(test[\"train\"][\"text\"]))):\n",
-    "        extract_anomalous_sequences(\n",
-    "            test[\"train\"][\"text\"][index],\n",
-    "            \"/data/forta/ethereum/text/finetuning/training/malicious/\",\n",
-    "            stride,\n",
-    "            threshold,\n",
-    "            index\n",
-    "        )\n",
-    "\n",
-    "    print(\"Extracting anomalous sequences validation...\")\n",
-    "    for index in tqdm(range(0, len(test[\"val\"][\"text\"]))):\n",
-    "        extract_anomalous_sequences(\n",
-    "            test[\"val\"][\"text\"][index],\n",
-    "            \"/data/forta/ethereum/text/finetuning/validation/malicious/\",\n",
-    "            stride,\n",
-    "            threshold,\n",
-    "            index\n",
-    "        )"
-   ]
+   "source": []
   }
  ],
  "metadata": {
@@ -681,7 +518,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.19"
+   "version": "3.9.18"
   },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {

diff --git a/analysis/ethereum_smart_contracts/GPT_finetuning_classification.ipynb b/analysis/ethereum_smart_contracts/GPT_finetuning_classification.ipynb
@@ -596,7 +596,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.19"
+   "version": "3.9.18"
   },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {

diff --git a/analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb b/analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb
@@ -331,7 +331,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.19"
+   "version": "3.9.18"
   }
  },
  "nbformat": 4,

diff --git a/analysis/ethereum_smart_contracts/GPT_verified_data_preprocessing.ipynb b/analysis/ethereum_smart_contracts/GPT_verified_data_preprocessing.ipynb
@@ -147,7 +147,7 @@
    "source": [
     "# Call decompiler command and return functions\n",
     "def get_SC_opcodes(bytecode_hex, creator):\n",
-    "    command = \"/home/kurono/Documents/Sakundi/Software/evm-dis/build/libs/driver-py/__main__.py\"\n",
+    "    command = \"/home/kurono/Documents/development/sakundi/evm-dis/build/libs/driver-py/__main__.py\"\n",
     "    result = None\n",
     "    try:\n",
     "        result = subprocess.run(['python',\n",
@@ -303,7 +303,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.19"
+   "version": "3.9.18"
   }
  },
  "nbformat": 4,