Commit
add more notebooks to the smart contract analysis
kuronosec committed Mar 18, 2024
1 parent a1277a3 commit 180390d
Showing 7 changed files with 6,804 additions and 174 deletions.
4,026 changes: 4,026 additions & 0 deletions analysis/ethereum_smart_contracts/anomaly_gpt.ipynb

Large diffs are not rendered by default.

63 changes: 16 additions & 47 deletions analysis/ethereum_smart_contracts/finetune_gpt.ipynb
@@ -19,7 +19,7 @@
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# Based on: https://gmihaila.github.io/tutorial_notebooks/finetune_transformers_pytorch/"
"# Based on: https://gmihaila.github.io/tutorial_notebooks/gpt2_finetune_classification/"
]
},
{
@@ -55,10 +55,10 @@
"set_seed(4444)\n",
"\n",
"epochs = 4\n",
"batch_size = 32\n",
"max_length = 500\n",
"batch_size = 10\n",
"# Max lenght gpt2 => 1024\n",
"max_length = None\n",
"\n",
"# Look for gpu to use. Will use `cpu` by default if no gpu found.\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"\n",
"model_name_or_path = '/data/forta/ethereum/model'\n",
@@ -76,39 +76,29 @@
},
"outputs": [],
"source": [
"class SmartContractBytecodeDataset(Dataset):\n",
"class SmartContractOpcodeDataset(Dataset):\n",
" r\"\"\" PyTorch Dataset class for loading data. \"\"\"\n",
"\n",
" def __init__(self, path, use_tokenizer):\n",
"\n",
" # Check if path exists.\n",
" if not os.path.isdir(path):\n",
" # Raise error if path is invalid.\n",
" raise ValueError('Invalid `path` variable! Needs to be a directory')\n",
" self.texts = []\n",
" self.labels = []\n",
" # Since the labels are defined by folders with data we loop \n",
" # through each label.\n",
" for label in ['normal', 'malicious']:\n",
" bytecode_path = os.path.join(path, label)\n",
"\n",
" opcode_path = os.path.join(path, label)\n",
" # Get all files from path.\n",
" files_names = os.listdir(bytecode_path)\n",
" files_names = os.listdir(opcode_path)\n",
" # Go through each file and read its content.\n",
" for file_name in tqdm(files_names, desc=f'{label} files'):\n",
" file_path = os.path.join(bytecode_path, file_name)\n",
"\n",
" # Read content.\n",
" file_path = os.path.join(opcode_path, file_name)\n",
" current_file = io.open(file_path, mode='r', encoding='utf-8')\n",
" for line in current_file:\n",
" # Fix any unicode issues.\n",
" line = fix_text(line)\n",
" # Save content.\n",
" self.texts.append(line)\n",
" # Save encode labels.\n",
" self.labels.append(label)\n",
"\n",
" # Number of exmaples.\n",
" self.n_examples = len(self.labels)\n",
"\n",
" return\n",
@@ -124,7 +114,6 @@
" r\"\"\" Data Collator used for GPT2 in a classificaiton rask. \"\"\"\n",
"\n",
" def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):\n",
"\n",
" # Tokenizer to be used inside the class.\n",
" self.use_tokenizer = use_tokenizer\n",
" # Check max sequence length.\n",
@@ -135,44 +124,35 @@
" return\n",
"\n",
" def __call__(self, sequences):\n",
" # Get all texts from sequences list.\n",
" texts = [sequence['text'] for sequence in sequences]\n",
" # Get all labels from sequences list.\n",
" labels = [sequence['label'] for sequence in sequences]\n",
" # Encode all labels using label encoder.\n",
" labels = [self.labels_encoder[label] for label in labels]\n",
"\n",
" # Call tokenizer on all texts to convert into tensors of numbers with \n",
" # appropriate padding.\n",
" inputs = self.use_tokenizer(text=texts, return_tensors=\"pt\", padding=True, truncation=True, max_length=self.max_sequence_len)\n",
" inputs = self.use_tokenizer(text=texts, return_tensors=\"pt\", padding=True, truncation=True, max_length=self.max_sequence_len)\n",
" # Update the inputs with the associated encoded labels as tensor.\n",
" inputs.update({'labels':torch.tensor(labels)})\n",
"\n",
" return inputs\n",
"\n",
"def train(dataloader, optimizer_, scheduler_, device_):\n",
" r\"\"\" Train pytorch model on a single pass through the data loader. \"\"\"\n",
"\n",
" # Use global variable for model.\n",
" global model\n",
"\n",
" # Tracking variables.\n",
" predictions_labels = []\n",
" true_labels = []\n",
" # Total loss for this epoch.\n",
" total_loss = 0\n",
"\n",
" # Put the model into training mode.\n",
" model.train()\n",
"\n",
" # For each batch of training data...\n",
" for batch in tqdm(dataloader, total=len(dataloader)):\n",
"\n",
" # Add original labels - use later for evaluation.\n",
" true_labels += batch['labels'].numpy().flatten().tolist()\n",
" \n",
" true_labels += batch['labels'].numpy().flatten().tolist() \n",
" # move batch to device\n",
" batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n",
" \n",
" # Always clear any previously calculated gradients before performing a\n",
" # backward pass.\n",
" model.zero_grad()\n",
@@ -241,17 +221,13 @@
"\n",
" # Evaluate data for one epoch\n",
" for batch in tqdm(dataloader, total=len(dataloader)):\n",
"\n",
" # add original labels\n",
" true_labels += batch['labels'].numpy().flatten().tolist()\n",
"\n",
" # move batch to device\n",
" batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n",
"\n",
" # Telling the model not to compute or store gradients, saving memory and\n",
" # speeding up validation\n",
" with torch.no_grad(): \n",
"\n",
" # Forward pass, calculate logit predictions.\n",
" # This will return the logits rather than the loss because we have\n",
" # not provided labels.\n",
@@ -318,7 +294,6 @@
"\n",
"# resize model embedding to match new tokenizer\n",
"model.resize_token_embeddings(len(tokenizer))\n",
"\n",
"# fix model padding token id\n",
"model.config.pad_token_id = model.config.eos_token_id\n",
"\n",
@@ -381,19 +356,17 @@
"\n",
"print('Dealing with Train...')\n",
"# Create pytorch dataset.\n",
"train_dataset = SmartContractBytecodeDataset(path='/data/forta/ethereum/text', \n",
"train_dataset = SmartContractOpcodeDataset(path='/data/forta/ethereum/text/finetuning/training/', \n",
" use_tokenizer=tokenizer)\n",
"print('Created `train_dataset` with %d examples!'%len(train_dataset))\n",
"\n",
"# Move pytorch dataset into dataloader.\n",
"train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classificaiton_collator)\n",
"print('Created `train_dataloader` with %d batches!'%len(train_dataloader))\n",
"\n",
"print()\n",
"\n",
"print('Dealing with Validation...')\n",
"# Create pytorch dataset.\n",
"valid_dataset = SmartContractBytecodeDataset(path='/data/forta/ethereum/text', \n",
"valid_dataset = SmartContractOpcodeDataset(path='/data/forta/ethereum/text/finetuning/validation/', \n",
" use_tokenizer=tokenizer)\n",
"print('Created `valid_dataset` with %d examples!'%len(valid_dataset))\n",
"\n",
@@ -490,11 +463,9 @@
},
"outputs": [],
"source": [
"# Note: AdamW is a class from the huggingface library (as opposed to pytorch) \n",
"# I believe the 'W' stands for 'Weight Decay fix\"\n",
"optimizer = AdamW(model.parameters(),\n",
" lr = 2e-5, # default is 5e-5, our notebook had 2e-5\n",
" eps = 1e-8 # default is 1e-8.\n",
" lr = 2e-5, # default is 5e-5\n",
" eps = 1e-8 # default is 1e-8\n",
" )\n",
"\n",
"# Total number of training steps is number of batches * number of epochs.\n",
@@ -504,7 +475,7 @@
"\n",
"# Create the learning rate scheduler.\n",
"scheduler = get_linear_schedule_with_warmup(optimizer, \n",
" num_warmup_steps = 0, # Default value in run_glue.py\n",
" num_warmup_steps = 0,\n",
" num_training_steps = total_steps)\n",
"\n",
"# Store the average loss after each epoch so we can plot them.\n",
@@ -514,7 +485,6 @@
"# Loop through each epoch.\n",
"print('Epoch')\n",
"for epoch in tqdm(range(epochs)):\n",
" print()\n",
" print('Training on batches...')\n",
" # Perform one full pass over the training set.\n",
" train_labels, train_predict, train_loss = train(train_dataloader, optimizer, scheduler, device)\n",
@@ -527,7 +497,6 @@
"\n",
" # Print loss and accuracy values to see how training evolves.\n",
" print(\" train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f\"%(train_loss, val_loss, train_acc, val_acc))\n",
" print()\n",
"\n",
" # Store the loss value for plotting the learning curve.\n",
" all_loss['train_loss'].append(train_loss)\n",