create complete model template for the detection of malicious SC
kuronosec committed Feb 28, 2024
1 parent dd8db30 commit 727f6bb
Showing 3 changed files with 219 additions and 526 deletions.
209 changes: 43 additions & 166 deletions analysis/ethereum_smart_contracts/finetune_gpt.ipynb
@@ -19,7 +19,7 @@
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# Based on: https://gmihaila.github.io/tutorial_notebooks/pretrain_transformers_pytorch/"
"# Based on: https://gmihaila.github.io/tutorial_notebooks/finetune_transformers_pytorch/"
]
},
{
@@ -33,6 +33,7 @@
"import io\n",
"import os\n",
"import torch\n",
"import warnings\n",
"from tqdm.notebook import tqdm\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from ml_things import plot_dict, plot_confusion_matrix, fix_text\n",
@@ -46,34 +47,24 @@
" get_linear_schedule_with_warmup,\n",
" GPT2ForSequenceClassification)\n",
"\n",
"# Supress deprecation warnings\n",
"warnings.filterwarnings('ignore', category=DeprecationWarning)\n",
"warnings.filterwarnings('ignore', category=FutureWarning)\n",
"\n",
"# Set seed for reproducibility.\n",
"set_seed(123)\n",
"set_seed(4444)\n",
"\n",
"# Number of training epochs (authors recommend between 2 and 4).\n",
"epochs = 4\n",
"\n",
"# Number of batches - depending on the max sequence length and GPU memory.\n",
"# For 512 sequence length batch of 10 works without cuda memory issues.\n",
"# For small sequence length can try batch of 32 or higher.\n",
"batch_size = 32\n",
"\n",
"# Pad or truncate text sequences to a specific length\n",
"# if `None` it will use maximum sequence of word piece tokens allowed by model.\n",
"max_length = 60\n",
"max_length = None\n",
"\n",
"# Look for gpu to use. Will use `cpu` by default if no gpu found.\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"\n",
"# Name of transformers model - will use already pretrained model.\n",
"# Path of transformer model - will load your own model from local disk.\n",
"model_name_or_path = 'gpt2'\n",
"\n",
"# Dictionary of labels and their id - this will be used to convert.\n",
"# String labels to number ids.\n",
"labels_ids = {'neg': 0, 'pos': 1}\n",
"model_name_or_path = '/data/forta/ethereum/model'\n",
"tokenizer_name_or_path = '/data/forta/ethereum/tokenizer'\n",
"\n",
"# How many labels are we using in training.\n",
"# This is used to decide size of classification head.\n",
"labels_ids = {'malicious': 0, 'normal': 1}\n",
"n_labels = len(labels_ids)"
]
},
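A minimal sketch of how these settings feed model construction: n_labels sizes the classification head when the configuration is loaded (the local path is this project's assumed layout, not a published checkpoint):

    from transformers import GPT2Config

    # num_labels determines the output dimension of the classification
    # head that GPT2ForSequenceClassification stacks on top of GPT-2.
    model_config = GPT2Config.from_pretrained(
        pretrained_model_name_or_path=model_name_or_path,
        num_labels=n_labels)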
@@ -85,19 +76,8 @@
},
"outputs": [],
"source": [
"class MovieReviewsDataset(Dataset):\n",
" r\"\"\"PyTorch Dataset class for loading data.\n",
"\n",
" This is where the data parsing happens.\n",
"\n",
" This class is built with reusability in mind: it can be used as is as.\n",
"\n",
" Arguments:\n",
"\n",
" path (:obj:`str`):\n",
" Path to the data partition.\n",
"\n",
" \"\"\"\n",
"class SmartContractBytecodeDataset(Dataset):\n",
" r\"\"\" PyTorch Dataset class for loading data. \"\"\"\n",
"\n",
" def __init__(self, path, use_tokenizer):\n",
"\n",
@@ -109,82 +89,39 @@
" self.labels = []\n",
" # Since the labels are defined by folders with data we loop \n",
" # through each label.\n",
" for label in ['pos', 'neg']:\n",
" sentiment_path = os.path.join(path, label)\n",
" for label in ['normal', 'malicious']:\n",
" bytecode_path = os.path.join(path, label)\n",
"\n",
" # Get all files from path.\n",
" files_names = os.listdir(sentiment_path)#[:10] # Sample for debugging.\n",
" files_names = os.listdir(bytecode_path)\n",
" # Go through each file and read its content.\n",
" for file_name in tqdm(files_names, desc=f'{label} files'):\n",
" file_path = os.path.join(sentiment_path, file_name)\n",
" file_path = os.path.join(bytecode_path, file_name)\n",
"\n",
" # Read content.\n",
" content = io.open(file_path, mode='r', encoding='utf-8').read()\n",
" # Fix any unicode issues.\n",
" content = fix_text(content)\n",
" # Save content.\n",
" self.texts.append(content)\n",
" # Save encode labels.\n",
" self.labels.append(label)\n",
" current_file = io.open(file_path, mode='r', encoding='utf-8')\n",
" for line in current_file:\n",
" # Fix any unicode issues.\n",
" line = fix_text(line)\n",
" # Save content.\n",
" self.texts.append(line)\n",
" # Save encode labels.\n",
" self.labels.append(label)\n",
"\n",
" # Number of exmaples.\n",
" self.n_examples = len(self.labels)\n",
" \n",
"\n",
" return\n",
"\n",
" def __len__(self):\n",
" r\"\"\"When used `len` return the number of examples.\n",
"\n",
" \"\"\"\n",
" \n",
" return self.n_examples\n",
"\n",
" def __getitem__(self, item):\n",
" r\"\"\"Given an index return an example from the position.\n",
" \n",
" Arguments:\n",
"\n",
" item (:obj:`int`):\n",
" Index position to pick an example to return.\n",
"\n",
" Returns:\n",
" :obj:`Dict[str, str]`: Dictionary of inputs that contain text and \n",
" asociated labels.\n",
"\n",
" \"\"\"\n",
"\n",
" return {'text':self.texts[item],\n",
" 'label':self.labels[item]}\n",
"\n",
"\n",
"\n",
"class Gpt2ClassificationCollator(object):\n",
" r\"\"\"\n",
" Data Collator used for GPT2 in a classificaiton rask. \n",
" \n",
" It uses a given tokenizer and label encoder to convert any text and labels to numbers that \n",
" can go straight into a GPT2 model.\n",
"\n",
" This class is built with reusability in mind: it can be used as is as long\n",
" as the `dataloader` outputs a batch in dictionary format that can be passed \n",
" straight into the model - `model(**batch)`.\n",
"\n",
" Arguments:\n",
"\n",
" use_tokenizer (:obj:`transformers.tokenization_?`):\n",
" Transformer type tokenizer used to process raw text into numbers.\n",
"\n",
" labels_ids (:obj:`dict`):\n",
" Dictionary to encode any labels names into numbers. Keys map to \n",
" labels names and Values map to number associated to those labels.\n",
"\n",
" max_sequence_len (:obj:`int`, `optional`)\n",
" Value to indicate the maximum desired sequence to truncate or pad text\n",
" sequences. If no value is passed it will used maximum sequence size\n",
" supported by the tokenizer and model.\n",
"\n",
" \"\"\"\n",
" r\"\"\" Data Collator used for GPT2 in a classificaiton rask. \"\"\"\n",
"\n",
" def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):\n",
"\n",
@@ -198,21 +135,6 @@
" return\n",
"\n",
" def __call__(self, sequences):\n",
" r\"\"\"\n",
" This function allowes the class objesct to be used as a function call.\n",
" Sine the PyTorch DataLoader needs a collator function, I can use this \n",
" class as a function.\n",
"\n",
" Arguments:\n",
"\n",
" item (:obj:`list`):\n",
" List of texts and labels.\n",
"\n",
" Returns:\n",
" :obj:`Dict[str, object]`: Dictionary of inputs that feed into the model.\n",
" It holddes the statement `model(**Returned Dictionary)`.\n",
" \"\"\"\n",
"\n",
" # Get all texts from sequences list.\n",
" texts = [sequence['text'] for sequence in sequences]\n",
" # Get all labels from sequences list.\n",
@@ -227,37 +149,8 @@
"\n",
" return inputs\n",
"\n",
"\n",
"def train(dataloader, optimizer_, scheduler_, device_):\n",
" r\"\"\"\n",
" Train pytorch model on a single pass through the data loader.\n",
"\n",
" It will use the global variable `model` which is the transformer model \n",
" loaded on `_device` that we want to train on.\n",
"\n",
" This function is built with reusability in mind: it can be used as is as long\n",
" as the `dataloader` outputs a batch in dictionary format that can be passed \n",
" straight into the model - `model(**batch)`.\n",
"\n",
" Arguments:\n",
"\n",
" dataloader (:obj:`torch.utils.data.dataloader.DataLoader`):\n",
" Parsed data into batches of tensors.\n",
"\n",
" optimizer_ (:obj:`transformers.optimization.AdamW`):\n",
" Optimizer used for training.\n",
"\n",
" scheduler_ (:obj:`torch.optim.lr_scheduler.LambdaLR`):\n",
" PyTorch scheduler.\n",
"\n",
" device_ (:obj:`torch.device`):\n",
" Device used to load tensors before feeding to model.\n",
"\n",
" Returns:\n",
"\n",
" :obj:`List[List[int], List[int], float]`: List of [True Labels, Predicted\n",
" Labels, Train Average Loss].\n",
" \"\"\"\n",
" r\"\"\" Train pytorch model on a single pass through the data loader. \"\"\"\n",
"\n",
" # Use global variable for model.\n",
" global model\n",
@@ -329,32 +222,9 @@
" # Return all true labels and prediction for future evaluations.\n",
" return true_labels, predictions_labels, avg_epoch_loss\n",
"\n",
"\n",
"\n",
"def validation(dataloader, device_):\n",
" r\"\"\"Validation function to evaluate model performance on a \n",
" separate set of data.\n",
"\n",
" This function will return the true and predicted labels so we can use later\n",
" to evaluate the model's performance.\n",
"\n",
" This function is built with reusability in mind: it can be used as is as long\n",
" as the `dataloader` outputs a batch in dictionary format that can be passed \n",
" straight into the model - `model(**batch)`.\n",
"\n",
" Arguments:\n",
"\n",
" dataloader (:obj:`torch.utils.data.dataloader.DataLoader`):\n",
" Parsed data into batches of tensors.\n",
"\n",
" device_ (:obj:`torch.device`):\n",
" Device used to load tensors before feeding to model.\n",
"\n",
" Returns:\n",
" \n",
" :obj:`List[List[int], List[int], float]`: List of [True Labels, Predicted\n",
" Labels, Train Average Loss]\n",
" \"\"\"\n",
" r\"\"\" Validation function to evaluate model performance on a \n",
" separate set of data. \"\"\"\n",
"\n",
" # Use global variable for model.\n",
" global model\n",
@@ -436,13 +306,12 @@
"\n",
"# Get model's tokenizer.\n",
"print('Loading tokenizer...')\n",
"tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)\n",
"tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_name_or_path)\n",
"# default to left padding\n",
"tokenizer.padding_side = \"left\"\n",
"# Define PAD Token = EOS Token = 50256\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"\n",
"# Get the actual model.\n",
"print('Loading model...')\n",
"model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)\n",
@@ -510,10 +379,9 @@
" labels_encoder=labels_ids, \n",
" max_sequence_len=max_length)\n",
"\n",
"\n",
"print('Dealing with Train...')\n",
"# Create pytorch dataset.\n",
"train_dataset = MovieReviewsDataset(path='content/aclImdb/train', \n",
"train_dataset = SmartContractBytecodeDataset(path='/data/forta/ethereum/text', \n",
" use_tokenizer=tokenizer)\n",
"print('Created `train_dataset` with %d examples!'%len(train_dataset))\n",
"\n",
@@ -525,7 +393,7 @@
"\n",
"print('Dealing with Validation...')\n",
"# Create pytorch dataset.\n",
"valid_dataset = MovieReviewsDataset(path='content/aclImdb/test', \n",
"valid_dataset = SmartContractBytecodeDataset(path='/data/forta/ethereum/text', \n",
" use_tokenizer=tokenizer)\n",
"print('Created `valid_dataset` with %d examples!'%len(valid_dataset))\n",
"\n",
@@ -617,7 +485,8 @@
]
},
"id": "yLPCkiv-lis3",
"outputId": "047032ac-d708-480f-ebcf-9c0239f793a6"
"outputId": "047032ac-d708-480f-ebcf-9c0239f793a6",
"scrolled": true
},
"outputs": [],
"source": [
@@ -673,6 +542,13 @@
"plot_dict(all_acc, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# "
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -701,7 +577,8 @@
"true_labels, predictions_labels, avg_epoch_loss = validation(valid_dataloader, device)\n",
"\n",
"# Create the evaluation report.\n",
"evaluation_report = classification_report(true_labels, predictions_labels, labels=list(labels_ids.values()), target_names=list(labels_ids.keys()))\n",
"evaluation_report = classification_report(true_labels, predictions_labels, labels=list(labels_ids.values()),\n",
" target_names=list(labels_ids.keys()))\n",
"# Show the evaluation report.\n",
"print(evaluation_report)\n",
"\n",