create complete model template for the detection of malicious SC
kuronosec committed Feb 28, 2024
1 parent dd8db30 commit 727f6bb
Showing 3 changed files with 219 additions and 526 deletions.
209 changes: 43 additions & 166 deletions analysis/ethereum_smart_contracts/finetune_gpt.ipynb
@@ -19,7 +19,7 @@
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"# See the License for the specific language governing permissions and\n",
"# limitations under the License.\n",
"# Based on: https://gmihaila.github.io/tutorial_notebooks/pretrain_transformers_pytorch/"
"# Based on: https://gmihaila.github.io/tutorial_notebooks/finetune_transformers_pytorch/"
]
},
{
@@ -33,6 +33,7 @@
"import io\n",
"import os\n",
"import torch\n",
"import warnings\n",
"from tqdm.notebook import tqdm\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from ml_things import plot_dict, plot_confusion_matrix, fix_text\n",
@@ -46,34 +47,24 @@
" get_linear_schedule_with_warmup,\n",
" GPT2ForSequenceClassification)\n",
"\n",
"# Supress deprecation warnings\n",
"warnings.filterwarnings('ignore', category=DeprecationWarning)\n",
"warnings.filterwarnings('ignore', category=FutureWarning)\n",
"\n",
"# Set seed for reproducibility.\n",
"set_seed(123)\n",
"set_seed(4444)\n",
"\n",
"# Number of training epochs (authors recommend between 2 and 4).\n",
"epochs = 4\n",
"\n",
"# Number of batches - depending on the max sequence length and GPU memory.\n",
"# For 512 sequence length batch of 10 works without cuda memory issues.\n",
"# For small sequence length can try batch of 32 or higher.\n",
"batch_size = 32\n",
"\n",
"# Pad or truncate text sequences to a specific length\n",
"# if `None` it will use maximum sequence of word piece tokens allowed by model.\n",
"max_length = 60\n",
"max_length = None\n",
"\n",
"# Look for gpu to use. Will use `cpu` by default if no gpu found.\n",
"device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
"\n",
"# Name of transformers model - will use already pretrained model.\n",
"# Path of transformer model - will load your own model from local disk.\n",
"model_name_or_path = 'gpt2'\n",
"\n",
"# Dictionary of labels and their id - this will be used to convert.\n",
"# String labels to number ids.\n",
"labels_ids = {'neg': 0, 'pos': 1}\n",
"model_name_or_path = '/data/forta/ethereum/model'\n",
"tokenizer_name_or_path = '/data/forta/ethereum/tokenizer'\n",
"\n",
"# How many labels are we using in training.\n",
"# This is used to decide size of classification head.\n",
"labels_ids = {'malicious': 0, 'normal': 1}\n",
"n_labels = len(labels_ids)"
]
},
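A minimal sketch of how these settings feed model construction: n_labels sizes the classification head when the configuration is loaded (the local path is this project's assumed layout, not a published checkpoint):

    from transformers import GPT2Config

    # num_labels determines the output dimension of the classification
    # head that GPT2ForSequenceClassification stacks on top of GPT-2.
    model_config = GPT2Config.from_pretrained(
        pretrained_model_name_or_path=model_name_or_path,
        num_labels=n_labels)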
@@ -85,19 +76,8 @@
},
"outputs": [],
"source": [
"class MovieReviewsDataset(Dataset):\n",
" r\"\"\"PyTorch Dataset class for loading data.\n",
"\n",
" This is where the data parsing happens.\n",
"\n",
" This class is built with reusability in mind: it can be used as is as.\n",
"\n",
" Arguments:\n",
"\n",
" path (:obj:`str`):\n",
" Path to the data partition.\n",
"\n",
" \"\"\"\n",
"class SmartContractBytecodeDataset(Dataset):\n",
" r\"\"\" PyTorch Dataset class for loading data. \"\"\"\n",
"\n",
" def __init__(self, path, use_tokenizer):\n",
"\n",
@@ -109,82 +89,39 @@
" self.labels = []\n",
" # Since the labels are defined by folders with data we loop \n",
" # through each label.\n",
" for label in ['pos', 'neg']:\n",
" sentiment_path = os.path.join(path, label)\n",
" for label in ['normal', 'malicious']:\n",
" bytecode_path = os.path.join(path, label)\n",
"\n",
" # Get all files from path.\n",
" files_names = os.listdir(sentiment_path)#[:10] # Sample for debugging.\n",
" files_names = os.listdir(bytecode_path)\n",
" # Go through each file and read its content.\n",
" for file_name in tqdm(files_names, desc=f'{label} files'):\n",
" file_path = os.path.join(sentiment_path, file_name)\n",
" file_path = os.path.join(bytecode_path, file_name)\n",
"\n",
" # Read content.\n",
" content = io.open(file_path, mode='r', encoding='utf-8').read()\n",
" # Fix any unicode issues.\n",
" content = fix_text(content)\n",
" # Save content.\n",
" self.texts.append(content)\n",
" # Save encode labels.\n",
" self.labels.append(label)\n",
" current_file = io.open(file_path, mode='r', encoding='utf-8')\n",
" for line in current_file:\n",
" # Fix any unicode issues.\n",
" line = fix_text(line)\n",
" # Save content.\n",
" self.texts.append(line)\n",
" # Save encode labels.\n",
" self.labels.append(label)\n",
"\n",
" # Number of exmaples.\n",
" self.n_examples = len(self.labels)\n",
" \n",
"\n",
" return\n",
"\n",
" def __len__(self):\n",
" r\"\"\"When used `len` return the number of examples.\n",
"\n",
" \"\"\"\n",
" \n",
" return self.n_examples\n",
"\n",
" def __getitem__(self, item):\n",
" r\"\"\"Given an index return an example from the position.\n",
" \n",
" Arguments:\n",
"\n",
" item (:obj:`int`):\n",
" Index position to pick an example to return.\n",
"\n",
" Returns:\n",
" :obj:`Dict[str, str]`: Dictionary of inputs that contain text and \n",
" asociated labels.\n",
"\n",
" \"\"\"\n",
"\n",
" return {'text':self.texts[item],\n",
" 'label':self.labels[item]}\n",
"\n",
"\n",
"\n",
"class Gpt2ClassificationCollator(object):\n",
" r\"\"\"\n",
" Data Collator used for GPT2 in a classificaiton rask. \n",
" \n",
" It uses a given tokenizer and label encoder to convert any text and labels to numbers that \n",
" can go straight into a GPT2 model.\n",
"\n",
" This class is built with reusability in mind: it can be used as is as long\n",
" as the `dataloader` outputs a batch in dictionary format that can be passed \n",
" straight into the model - `model(**batch)`.\n",
"\n",
" Arguments:\n",
"\n",
" use_tokenizer (:obj:`transformers.tokenization_?`):\n",
" Transformer type tokenizer used to process raw text into numbers.\n",
"\n",
" labels_ids (:obj:`dict`):\n",
" Dictionary to encode any labels names into numbers. Keys map to \n",
" labels names and Values map to number associated to those labels.\n",
"\n",
" max_sequence_len (:obj:`int`, `optional`)\n",
" Value to indicate the maximum desired sequence to truncate or pad text\n",
" sequences. If no value is passed it will used maximum sequence size\n",
" supported by the tokenizer and model.\n",
"\n",
" \"\"\"\n",
" r\"\"\" Data Collator used for GPT2 in a classificaiton rask. \"\"\"\n",
"\n",
" def __init__(self, use_tokenizer, labels_encoder, max_sequence_len=None):\n",
"\n",
@@ -198,21 +135,6 @@
" return\n",
"\n",
" def __call__(self, sequences):\n",
" r\"\"\"\n",
" This function allowes the class objesct to be used as a function call.\n",
" Sine the PyTorch DataLoader needs a collator function, I can use this \n",
" class as a function.\n",
"\n",
" Arguments:\n",
"\n",
" item (:obj:`list`):\n",
" List of texts and labels.\n",
"\n",
" Returns:\n",
" :obj:`Dict[str, object]`: Dictionary of inputs that feed into the model.\n",
" It holddes the statement `model(**Returned Dictionary)`.\n",
" \"\"\"\n",
"\n",
" # Get all texts from sequences list.\n",
" texts = [sequence['text'] for sequence in sequences]\n",
" # Get all labels from sequences list.\n",
@@ -227,37 +149,8 @@
"\n",
" return inputs\n",
"\n",
"\n",
"def train(dataloader, optimizer_, scheduler_, device_):\n",
" r\"\"\"\n",
" Train pytorch model on a single pass through the data loader.\n",
"\n",
" It will use the global variable `model` which is the transformer model \n",
" loaded on `_device` that we want to train on.\n",
"\n",
" This function is built with reusability in mind: it can be used as is as long\n",
" as the `dataloader` outputs a batch in dictionary format that can be passed \n",
" straight into the model - `model(**batch)`.\n",
"\n",
" Arguments:\n",
"\n",
" dataloader (:obj:`torch.utils.data.dataloader.DataLoader`):\n",
" Parsed data into batches of tensors.\n",
"\n",
" optimizer_ (:obj:`transformers.optimization.AdamW`):\n",
" Optimizer used for training.\n",
"\n",
" scheduler_ (:obj:`torch.optim.lr_scheduler.LambdaLR`):\n",
" PyTorch scheduler.\n",
"\n",
" device_ (:obj:`torch.device`):\n",
" Device used to load tensors before feeding to model.\n",
"\n",
" Returns:\n",
"\n",
" :obj:`List[List[int], List[int], float]`: List of [True Labels, Predicted\n",
" Labels, Train Average Loss].\n",
" \"\"\"\n",
" r\"\"\" Train pytorch model on a single pass through the data loader. \"\"\"\n",
"\n",
" # Use global variable for model.\n",
" global model\n",
@@ -329,32 +222,9 @@
" # Return all true labels and prediction for future evaluations.\n",
" return true_labels, predictions_labels, avg_epoch_loss\n",
"\n",
"\n",
"\n",
"def validation(dataloader, device_):\n",
" r\"\"\"Validation function to evaluate model performance on a \n",
" separate set of data.\n",
"\n",
" This function will return the true and predicted labels so we can use later\n",
" to evaluate the model's performance.\n",
"\n",
" This function is built with reusability in mind: it can be used as is as long\n",
" as the `dataloader` outputs a batch in dictionary format that can be passed \n",
" straight into the model - `model(**batch)`.\n",
"\n",
" Arguments:\n",
"\n",
" dataloader (:obj:`torch.utils.data.dataloader.DataLoader`):\n",
" Parsed data into batches of tensors.\n",
"\n",
" device_ (:obj:`torch.device`):\n",
" Device used to load tensors before feeding to model.\n",
"\n",
" Returns:\n",
" \n",
" :obj:`List[List[int], List[int], float]`: List of [True Labels, Predicted\n",
" Labels, Train Average Loss]\n",
" \"\"\"\n",
" r\"\"\" Validation function to evaluate model performance on a \n",
" separate set of data. \"\"\"\n",
"\n",
" # Use global variable for model.\n",
" global model\n",
@@ -436,13 +306,12 @@
"\n",
"# Get model's tokenizer.\n",
"print('Loading tokenizer...')\n",
"tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)\n",
"tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_name_or_path)\n",
"# default to left padding\n",
"tokenizer.padding_side = \"left\"\n",
"# Define PAD Token = EOS Token = 50256\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"\n",
"# Get the actual model.\n",
"print('Loading model...')\n",
"model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)\n",
@@ -510,10 +379,9 @@
" labels_encoder=labels_ids, \n",
" max_sequence_len=max_length)\n",
"\n",
"\n",
"print('Dealing with Train...')\n",
"# Create pytorch dataset.\n",
"train_dataset = MovieReviewsDataset(path='content/aclImdb/train', \n",
"train_dataset = SmartContractBytecodeDataset(path='/data/forta/ethereum/text', \n",
" use_tokenizer=tokenizer)\n",
"print('Created `train_dataset` with %d examples!'%len(train_dataset))\n",
"\n",
@@ -525,7 +393,7 @@
"\n",
"print('Dealing with Validation...')\n",
"# Create pytorch dataset.\n",
"valid_dataset = MovieReviewsDataset(path='content/aclImdb/test', \n",
"valid_dataset = SmartContractBytecodeDataset(path='/data/forta/ethereum/text', \n",
" use_tokenizer=tokenizer)\n",
"print('Created `valid_dataset` with %d examples!'%len(valid_dataset))\n",
"\n",
@@ -617,7 +485,8 @@
]
},
"id": "yLPCkiv-lis3",
"outputId": "047032ac-d708-480f-ebcf-9c0239f793a6"
"outputId": "047032ac-d708-480f-ebcf-9c0239f793a6",
"scrolled": true
},
"outputs": [],
"source": [
@@ -673,6 +542,13 @@
"plot_dict(all_acc, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# "
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -701,7 +577,8 @@
"true_labels, predictions_labels, avg_epoch_loss = validation(valid_dataloader, device)\n",
"\n",
"# Create the evaluation report.\n",
"evaluation_report = classification_report(true_labels, predictions_labels, labels=list(labels_ids.values()), target_names=list(labels_ids.keys()))\n",
"evaluation_report = classification_report(true_labels, predictions_labels, labels=list(labels_ids.values()),\n",
" target_names=list(labels_ids.keys()))\n",
"# Show the evaluation report.\n",
"print(evaluation_report)\n",
"\n",