Commit
add more notebooks to the smart contract analysis
kuronosec committed Mar 18, 2024
1 parent a1277a3 commit 180390d
Showing 7 changed files with 6,804 additions and 174 deletions.
4,026 changes: 4,026 additions & 0 deletions analysis/ethereum_smart_contracts/anomaly_gpt.ipynb

Large diffs are not rendered by default.

63 changes: 16 additions & 47 deletions analysis/ethereum_smart_contracts/finetune_gpt.ipynb
@@ -19,7 +19,7 @@
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# Based on: https://gmihaila.github.io/tutorial_notebooks/finetune_transformers_pytorch/"
"# Based on: https://gmihaila.github.io/tutorial_notebooks/gpt2_finetune_classification/"
]
},
{
@@ -55,10 +55,10 @@
"set_seed(4444)\n",
"\n",
"epochs = 4\n",
"batch_size = 32\n",
"max_length = 500\n",
"batch_size = 10\n",
"# Max lenght gpt2 => 1024\n",
"max_length = None\n",
"\n",
"# Look for gpu to use. Will use `cpu` by default if no gpu found.\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"\n",
"model_name_or_path = '/data/forta/ethereum/model'\n",
@@ -76,39 +76,29 @@
},
"outputs": [],
"source": [
"class SmartContractBytecodeDataset(Dataset):\n",
"class SmartContractOpcodeDataset(Dataset):\n",
" r\"\"\" PyTorch Dataset class for loading data. \"\"\"\n",
"\n",
" def __init__(self, path, use_tokenizer):\n",
"\n",
" # Check if path exists.\n",
" if not os.path.isdir(path):\n",
" # Raise error if path is invalid.\n",
" raise ValueError('Invalid `path` variable! Needs to be a directory')\n",
" self.texts = []\n",
" self.labels = []\n",
" # Since the labels are defined by folders with data we loop \n",
" # through each label.\n",
" for label in ['normal', 'malicious']:\n",
" bytecode_path = os.path.join(path, label)\n",
"\n",
" opcode_path = os.path.join(path, label)\n",
" # Get all files from path.\n",
" files_names = os.listdir(bytecode_path)\n",
" files_names = os.listdir(opcode_path)\n",
" # Go through each file and read its content.\n",
" for file_name in tqdm(files_names, desc=f'{label} files'):\n",
" file_path = os.path.join(bytecode_path, file_name)\n",
"\n",
" # Read content.\n",
" file_path = os.path.join(opcode_path, file_name)\n",
" current_file = io.open(file_path, mode='r', encoding='utf-8')\n",
" for line in current_file:\n",
" # Fix any unicode issues.\n",
" line = fix_text(line)\n",
" # Save content.\n",
" self.texts.append(line)\n",
" # Save encode labels.\n",
" self.labels.append(label)\n",
"\n",
" # Number of exmaples.\n",
" self.n_examples = len(self.labels)\n",
"\n",
" return\n",
@@ -124,7 +114,6 @@
" r\"\"\" Data Collator used for GPT2 in a classificaiton rask. \"\"\"\n",
"\n",
" def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):\n",
"\n",
" # Tokenizer to be used inside the class.\n",
" self.use_tokenizer = use_tokenizer\n",
" # Check max sequence length.\n",
@@ -135,44 +124,35 @@
" return\n",
"\n",
" def __call__(self, sequences):\n",
" # Get all texts from sequences list.\n",
" texts = [sequence['text'] for sequence in sequences]\n",
" # Get all labels from sequences list.\n",
" labels = [sequence['label'] for sequence in sequences]\n",
" # Encode all labels using label encoder.\n",
" labels = [self.labels_encoder[label] for label in labels]\n",
"\n",
" # Call tokenizer on all texts to convert into tensors of numbers with \n",
" # appropriate padding.\n",
" inputs = self.use_tokenizer(text=texts, return_tensors=\"pt\", padding=True, truncation=True, max_length=self.max_sequence_len)\n",
" inputs = self.use_tokenizer(text=texts, return_tensors=\"pt\", padding=True, truncation=True, max_length=self.max_sequence_len)\n",
" # Update the inputs with the associated encoded labels as tensor.\n",
" inputs.update({'labels':torch.tensor(labels)})\n",
"\n",
" return inputs\n",
"\n",
"def train(dataloader, optimizer_, scheduler_, device_):\n",
" r\"\"\" Train pytorch model on a single pass through the data loader. \"\"\"\n",
"\n",
" # Use global variable for model.\n",
" global model\n",
"\n",
" # Tracking variables.\n",
" predictions_labels = []\n",
" true_labels = []\n",
" # Total loss for this epoch.\n",
" total_loss = 0\n",
"\n",
" # Put the model into training mode.\n",
" model.train()\n",
"\n",
" # For each batch of training data...\n",
" for batch in tqdm(dataloader, total=len(dataloader)):\n",
"\n",
" # Add original labels - use later for evaluation.\n",
" true_labels += batch['labels'].numpy().flatten().tolist()\n",
" \n",
" true_labels += batch['labels'].numpy().flatten().tolist() \n",
" # move batch to device\n",
" batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n",
" \n",
" # Always clear any previously calculated gradients before performing a\n",
" # backward pass.\n",
" model.zero_grad()\n",
@@ -241,17 +221,13 @@
"\n",
" # Evaluate data for one epoch\n",
" for batch in tqdm(dataloader, total=len(dataloader)):\n",
"\n",
" # add original labels\n",
" true_labels += batch['labels'].numpy().flatten().tolist()\n",
"\n",
" # move batch to device\n",
" batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}\n",
"\n",
" # Telling the model not to compute or store gradients, saving memory and\n",
" # speeding up validation\n",
" with torch.no_grad(): \n",
"\n",
" # Forward pass, calculate logit predictions.\n",
" # This will return the logits rather than the loss because we have\n",
" # not provided labels.\n",
@@ -318,7 +294,6 @@
"\n",
"# resize model embedding to match new tokenizer\n",
"model.resize_token_embeddings(len(tokenizer))\n",
"\n",
"# fix model padding token id\n",
"model.config.pad_token_id = model.config.eos_token_id\n",
"\n",
@@ -381,19 +356,17 @@
"\n",
"print('Dealing with Train...')\n",
"# Create pytorch dataset.\n",
"train_dataset = SmartContractBytecodeDataset(path='/data/forta/ethereum/text', \n",
"train_dataset = SmartContractOpcodeDataset(path='/data/forta/ethereum/text/finetuning/training/', \n",
" use_tokenizer=tokenizer)\n",
"print('Created `train_dataset` with %d examples!'%len(train_dataset))\n",
"\n",
"# Move pytorch dataset into dataloader.\n",
"train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classificaiton_collator)\n",
"print('Created `train_dataloader` with %d batches!'%len(train_dataloader))\n",
"\n",
"print()\n",
"\n",
"print('Dealing with Validation...')\n",
"# Create pytorch dataset.\n",
"valid_dataset = SmartContractBytecodeDataset(path='/data/forta/ethereum/text', \n",
"valid_dataset = SmartContractOpcodeDataset(path='/data/forta/ethereum/text/finetuning/validation/', \n",
" use_tokenizer=tokenizer)\n",
"print('Created `valid_dataset` with %d examples!'%len(valid_dataset))\n",
"\n",
@@ -490,11 +463,9 @@
},
"outputs": [],
"source": [
"# Note: AdamW is a class from the huggingface library (as opposed to pytorch) \n",
"# I believe the 'W' stands for 'Weight Decay fix\"\n",
"optimizer = AdamW(model.parameters(),\n",
" lr = 2e-5, # default is 5e-5, our notebook had 2e-5\n",
" eps = 1e-8 # default is 1e-8.\n",
" lr = 2e-5, # default is 5e-5\n",
" eps = 1e-8 # default is 1e-8\n",
" )\n",
"\n",
"# Total number of training steps is number of batches * number of epochs.\n",
@@ -504,7 +475,7 @@
"\n",
"# Create the learning rate scheduler.\n",
"scheduler = get_linear_schedule_with_warmup(optimizer, \n",
" num_warmup_steps = 0, # Default value in run_glue.py\n",
" num_warmup_steps = 0,\n",
" num_training_steps = total_steps)\n",
"\n",
"# Store the average loss after each epoch so we can plot them.\n",
@@ -514,7 +485,6 @@
"# Loop through each epoch.\n",
"print('Epoch')\n",
"for epoch in tqdm(range(epochs)):\n",
" print()\n",
" print('Training on batches...')\n",
" # Perform one full pass over the training set.\n",
" train_labels, train_predict, train_loss = train(train_dataloader, optimizer, scheduler, device)\n",
@@ -527,7 +497,6 @@
"\n",
" # Print loss and accuracy values to see how training evolves.\n",
" print(\" train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f\"%(train_loss, val_loss, train_acc, val_acc))\n",
" print()\n",
"\n",
" # Store the loss value for plotting the learning curve.\n",
" all_loss['train_loss'].append(train_loss)\n",