Skip to content

Commit

Permalink
Adapt and improve GPT models for smart contract analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
kuronosec committed Mar 4, 2024
1 parent 727f6bb commit a1277a3
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 13 deletions.
2 changes: 1 addition & 1 deletion analysis/ethereum_smart_contracts/finetune_gpt.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
"\n",
"epochs = 4\n",
"batch_size = 32\n",
"max_length = None\n",
"max_length = 500\n",
"\n",
"# Look for gpu to use. Will use `cpu` by default if no gpu found.\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
Expand Down
35 changes: 23 additions & 12 deletions analysis/ethereum_smart_contracts/tokenizer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"id": "FPuIaYYJLjvJ"
},
"source": [
"# Training a new tokenizer from an old one"
"# Training a new tokenizer based on smart contract disassembled code"
]
},
{
Expand All @@ -21,20 +21,21 @@
"from transformers import AutoTokenizer\n",
"from collections import defaultdict\n",
"\n",
"dataset = load_dataset(\"text\", data_files={\"train\": \"/data/forta/ethereum/train.txt\",\n",
" \"val\": \"/data/forta/ethereum/val.txt\"})"
"dataset = load_dataset(\"text\", data_files={\"train\": \"/data/forta/ethereum/text/train.txt\",\n",
" \"val\": \"/data/forta/ethereum/text/val.txt\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JRIVpv8ZLjvT",
"outputId": "7cf127e1-3fcd-40f1-d82e-9d29d60c02e2"
},
"metadata": {},
"outputs": [],
"source": [
"dataset[\"train\"]"
"max = -1\n",
"for row in dataset[\"train\"][\"text\"]:\n",
" length_of_the_messages = row.split(\" \")\n",
" max = len(length_of_the_messages) if len(length_of_the_messages) > max else max\n",
"print(\"Max number of words = \", max)"
]
},
{
Expand All @@ -46,9 +47,10 @@
"outputs": [],
"source": [
"def get_training_corpus():\n",
" batch_size = 400\n",
" aux_dataset = dataset[\"train\"]\n",
" for start_idx in range(0, len(aux_dataset), 400):\n",
" samples = aux_dataset[start_idx : start_idx + 400]\n",
" for start_idx in range(0, len(aux_dataset), batch_size):\n",
" samples = aux_dataset[start_idx : start_idx + batch_size]\n",
" yield samples[\"text\"]\n",
"\n",
"training_corpus = get_training_corpus()"
Expand All @@ -74,7 +76,8 @@
},
"outputs": [],
"source": [
"tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 524)\n",
"vocab_size = 524\n",
"tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, vocab_size)\n",
"# print(old_tokenizer.tokenize(\"PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2\"))"
]
},
Expand All @@ -86,8 +89,16 @@
},
"outputs": [],
"source": [
"tokenizer.save_pretrained(\"/data/forta/ethereum/tokenizer\")"
"tokenizer.save_pretrained(\"/data/forta/ethereum/tokenizer\")\n",
"print(tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit a1277a3

Please sign in to comment.