Skip to content

Commit

Permalink
rename and improve verified data processing
Browse files Browse the repository at this point in the history
  • Loading branch information
kuronosec committed Apr 5, 2024
1 parent 96d73be commit fb8a24d
Showing 1 changed file with 64 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,10 @@
"outputs": [],
"source": [
"COLS = ['contract_creator', 'contract_address', 'contract_name', 'decompiled_opcodes', 'malicious']\n",
"pretraining_data = pd.read_parquet('/data/forta/ethereum/text/pretraining/malicious_contract_training_dataset_final.parquet', columns=COLS)"
"pretraining_data = pd.read_parquet('/data/forta/ethereum/text/pretraining/raw/verified/verified-smart-contracts.parquet', columns=COLS)\n",
"# Train for anomaly detection (i.e. train only on normal contracts)?\n",
"anomaly_detection_training = False\n",
"only_evaluation = True"
]
},
{
Expand All @@ -51,7 +54,9 @@
"metadata": {},
"outputs": [],
"source": [
"pretraining_data['malicious'].value_counts()"
"count = pretraining_data['malicious'].value_counts()\n",
"# value_counts() orders by frequency, not by label, so look each class up by\n",
"# its label and fall back to 0 when a class is absent from the dataset.\n",
"number_normal = int(count.get(False, 0))\n",
"number_malicious = int(count.get(True, 0))"
]
},
{
Expand Down Expand Up @@ -108,20 +113,42 @@
"# Prepare data for pretraining phase\n",
"# First clean and process the opcode data\n",
"pretraining_data['experiment_2_opcodes'] = pretraining_data.apply(get_exp_2_features, axis=1)\n",
"training_data = None\n",
"validation_data = None\n",
"normal_data = None\n",
"malicious_data = None\n",
"\n",
"# Files to store the data\n",
"train_file_path = '/data/forta/ethereum/text/pretraining/small_pretraining_train.txt'\n",
"val_file_path = '/data/forta/ethereum/text/pretraining/small_pretraining_val.txt'\n",
"train_file_path = '/data/forta/ethereum/text/pretraining/small_pretraining_train.csv'\n",
"val_file_path = '/data/forta/ethereum/text/pretraining/small_pretraining_val.csv'\n",
"\n",
"# Suffle data\n",
"pretraining_data = pd.concat([pretraining_data[:499], pretraining_data[5000:]])\n",
"pretraining_data = pretraining_data.sample(frac = 1)\n",
"if only_evaluation: \n",
" evaluation_file_path = \"/data/forta/ethereum/text/evaluation/malicious-eval.csv\"\n",
" pretraining_data.loc[pretraining_data['malicious'] == True]['experiment_2_opcodes'].to_csv(\n",
" evaluation_file_path, sep=',', index=False)\n",
"elif not anomaly_detection_training:\n",
" # Number of rows of each class kept for training (90%); must be an int\n",
" # because it is used as a positional slice bound below.\n",
" percentaje_normal = int(number_normal - (number_normal*0.1))\n",
" percentaje_malicious = int(number_malicious - (number_malicious*0.1))\n",
" # Shuffle normal and malicious data when not training for anomaly detection\n",
" normal_data = pretraining_data.loc[pretraining_data['malicious'] == False]\n",
" malicious_data = pretraining_data.loc[pretraining_data['malicious'] == True]\n",
"\n",
"# Save the data to disk\n",
"training_data = pretraining_data[:499]\n",
"validation_data = pretraining_data[500:]\n",
"training_data['experiment_2_opcodes'].to_csv(train_file_path, sep='\\t', index=False)\n",
"validation_data['experiment_2_opcodes'].to_csv(val_file_path, sep='\\t', index=False)"
" normal_data = normal_data.sample(frac = 1)\n",
" malicious_data = malicious_data.sample(frac = 1)\n",
"\n",
" # Split each class at the same positional cut so no row is dropped between\n",
" # the training and validation sets; int() guards against float cut points.\n",
" training_data = pd.concat([normal_data.iloc[:int(percentaje_normal)], malicious_data.iloc[:int(percentaje_malicious)]])\n",
" validation_data = pd.concat([normal_data.iloc[int(percentaje_normal):], malicious_data.iloc[int(percentaje_malicious):]])\n",
" \n",
" training_data = training_data.sample(frac = 1)\n",
" validation_data = validation_data.sample(frac = 1)\n",
"else:\n",
" # For anomaly detection, train only on normal data\n",
" normal_data = pretraining_data.loc[pretraining_data['malicious'] == False]\n",
" malicious_data = pretraining_data.loc[pretraining_data['malicious'] == True]\n",
" normal_data = normal_data.sample(frac = 1)\n",
" malicious_data = malicious_data.sample(frac = 1)\n",
" training_data = normal_data[:number_normal]\n",
" validation_data = malicious_data"
]
},
{
Expand All @@ -131,19 +158,32 @@
"metadata": {},
"outputs": [],
"source": [
"# Prepare data for finetuning phase\n",
"# Training\n",
"training_data.loc[training_data['malicious'] == False].to_csv('/data/forta/ethereum/text/finetuning/training/normal/normal.txt',\n",
" columns=['experiment_2_opcodes'], sep='\\t', index=False)\n",
"training_data.loc[training_data['malicious'] == True].to_csv('/data/forta/ethereum/text/finetuning/training/malicious/malicious.txt',\n",
" columns=['experiment_2_opcodes'], sep='\\t', index=False)\n",
"\n",
"# Validation\n",
"validation_data.loc[validation_data['malicious'] == False].to_csv('/data/forta/ethereum/text/finetuning/validation/normal/normal.txt',\n",
" columns=['experiment_2_opcodes'], sep='\\t', index=False)\n",
"validation_data.loc[validation_data['malicious'] == True].to_csv('/data/forta/ethereum/text/finetuning/validation/malicious/malicious.txt',\n",
" columns=['experiment_2_opcodes'], sep='\\t', index=False)"
"if training_data is not None and validation_data is not None:\n",
" # Save the data to disk\n",
" training_data['experiment_2_opcodes'].to_csv(train_file_path, sep='\\t', index=False)\n",
" validation_data['experiment_2_opcodes'].to_csv(val_file_path, sep='\\t', index=False)\n",
" if not anomaly_detection_training:\n",
" # Prepare data for anomaly data selection phase\n",
" # Training\n",
" training_data.loc[training_data['malicious'] == False].to_csv('/data/forta/ethereum/text/pretraining/training/normal/normal.csv',\n",
" columns=['experiment_2_opcodes'], sep='\\t', index=False)\n",
" training_data.loc[training_data['malicious'] == True].to_csv('/data/forta/ethereum/text/pretraining/training/malicious/malicious.csv',\n",
" columns=['experiment_2_opcodes'], sep='\\t', index=False)\n",
" \n",
" # Validation\n",
" validation_data.loc[validation_data['malicious'] == False].to_csv('/data/forta/ethereum/text/pretraining/validation/normal/normal.csv',\n",
" columns=['experiment_2_opcodes'], sep='\\t', index=False)\n",
" validation_data.loc[validation_data['malicious'] == True].to_csv('/data/forta/ethereum/text/pretraining/validation/malicious/malicious.csv',\n",
" columns=['experiment_2_opcodes'], sep='\\t', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45daafe2-4110-49ad-8628-4eaf1ba49171",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit fb8a24d

Please sign in to comment.