Skip to content

Commit

Permalink
Adapt and improve GPT models for smart contract analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
kuronosec committed Mar 4, 2024
1 parent 727f6bb commit a1277a3
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 13 deletions.
2 changes: 1 addition & 1 deletion analysis/ethereum_smart_contracts/finetune_gpt.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
"\n",
"epochs = 4\n",
"batch_size = 32\n",
"max_length = None\n",
"max_length = 500\n",
"\n",
"# Look for gpu to use. Will use `cpu` by default if no gpu found.\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
Expand Down
35 changes: 23 additions & 12 deletions analysis/ethereum_smart_contracts/tokenizer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"id": "FPuIaYYJLjvJ"
},
"source": [
"# Training a new tokenizer from an old one"
"# Training a new tokenizer based on smart contract disassembled code"
]
},
{
Expand All @@ -21,20 +21,21 @@
"from transformers import AutoTokenizer\n",
"from collections import defaultdict\n",
"\n",
"dataset = load_dataset(\"text\", data_files={\"train\": \"/data/forta/ethereum/train.txt\",\n",
" \"val\": \"/data/forta/ethereum/val.txt\"})"
"dataset = load_dataset(\"text\", data_files={\"train\": \"/data/forta/ethereum/text/train.txt\",\n",
" \"val\": \"/data/forta/ethereum/text/val.txt\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JRIVpv8ZLjvT",
"outputId": "7cf127e1-3fcd-40f1-d82e-9d29d60c02e2"
},
"metadata": {},
"outputs": [],
"source": [
"dataset[\"train\"]"
"max = -1\n",
"for row in dataset[\"train\"][\"text\"]:\n",
" length_of_the_messages = row.split(\" \")\n",
" max = len(length_of_the_messages) if len(length_of_the_messages) > max else max\n",
"print(\"Max number of words = \", max)"
]
},
{
Expand All @@ -46,9 +47,10 @@
"outputs": [],
"source": [
"def get_training_corpus():\n",
" batch_size = 400\n",
" aux_dataset = dataset[\"train\"]\n",
" for start_idx in range(0, len(aux_dataset), 400):\n",
" samples = aux_dataset[start_idx : start_idx + 400]\n",
" for start_idx in range(0, len(aux_dataset), batch_size):\n",
" samples = aux_dataset[start_idx : start_idx + batch_size]\n",
" yield samples[\"text\"]\n",
"\n",
"training_corpus = get_training_corpus()"
Expand All @@ -74,7 +76,8 @@
},
"outputs": [],
"source": [
"tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 524)\n",
"vocab_size = 524\n",
"tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, vocab_size)\n",
"# print(old_tokenizer.tokenize(\"PUSH1 PUSH1 MSTORE PUSH1 CALLDATASIZE LT PUSH2\"))"
]
},
Expand All @@ -86,8 +89,16 @@
},
"outputs": [],
"source": [
"tokenizer.save_pretrained(\"/data/forta/ethereum/tokenizer\")"
"tokenizer.save_pretrained(\"/data/forta/ethereum/tokenizer\")\n",
"print(tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit a1277a3

Please sign in to comment.