-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
create pretraining data collection from Zettablock
- Loading branch information
Showing
1 changed file
with
313 additions
and
0 deletions.
There are no files selected for viewing
313 changes: 313 additions & 0 deletions
313
analysis/ethereum_smart_contracts/pretraining_data_collection.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,313 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "ba979f32-67fc-4dc5-a71b-be84043a87b9", | ||
"metadata": {}, | ||
"source": [ | ||
"# Pretraining - Smart Contract Training Dataset Collection\n", | ||
"\n", | ||
"This notebook collects smart contract bytecode and decompiled opcodes for normal and malicious contract classification.\n", | ||
"\n", | ||
"Pretraining contracts are gathered from Zettablock and malicious contracts from [Forta Network's labelled datasets github repo](\"https://github.com/forta-network/labelled-datasets\")." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "599dd826-f2b4-416c-b256-8eef021287a9", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import logging\n", | ||
"import pickle\n", | ||
"import os\n", | ||
"import requests\n", | ||
"import json\n", | ||
"import time\n", | ||
"\n", | ||
"from evmdasm import EvmBytecode\n", | ||
"import pandas as pd\n", | ||
"from tqdm import tqdm\n", | ||
"from web3 import Web3\n", | ||
"\n", | ||
"tqdm.pandas()\n", | ||
"# disable warning logs from evmdasm tool\n", | ||
"logging.getLogger(\"evmdasm\").setLevel(logging.CRITICAL)\n", | ||
"\n", | ||
"zettablock_data_file = \"/data/forta/ethereum/text/pretraining/zettablock_data\"\n", | ||
"processed_data_file = \"/data/forta/ethereum/text/pretraining/big_pretrain_data\"\n", | ||
"\n", | ||
"# Your Zettablock API key\n", | ||
"API_KEY = \"XXXXX-XXXX-XXXX-XXX-XXXXXXXX\"\n", | ||
"headers = {\n", | ||
" \"accept\": \"application/json\",\n", | ||
" \"content-type\": \"application/json\",\n", | ||
" # credentials\n", | ||
" \"X-API-KEY\": API_KEY\n", | ||
"}\n", | ||
"\n", | ||
"# Zettablock endpoint\n", | ||
"data_lake_query_endpoint = \"https://api.zettablock.com/api/v1/databases/AwsDataCatalog/queries\"\n", | ||
"\n", | ||
"# Configure the blockchains we are interested in\n", | ||
"blockchains = [\"ethereum_mainnet\", \"polygon_mainnet\", \"bsc_mainnet\"]\n", | ||
"\n", | ||
"# Final training and validation files\n", | ||
"train_file_path = \"/data/forta/ethereum/text/pretraining/pretraining_train.csv\"\n", | ||
"val_file_path = \"/data/forta/ethereum/text/pretraining/pretraining_val.csv\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "f62b6fa1-3333-4422-8b15-eb80b0206460", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Code taken from Zettablock tutorials\n", | ||
"# check response until success or failed is returned\n", | ||
"def get_response(queryrun_id):\n", | ||
"\timport time\n", | ||
"\ti = 1\n", | ||
"\tqueryrun_status_endpoint = f'https://api.zettablock.com/api/v1/queryruns/{queryrun_id}/status'\n", | ||
"\twhile True:\n", | ||
"\t\tres = requests.get(queryrun_status_endpoint, headers=headers)\n", | ||
"\t\tstate = json.loads(res.text)['state']\n", | ||
"\t\tif state == 'SUCCEEDED' or state == 'FAILED':\n", | ||
"\t\t\treturn state\n", | ||
"\t\ttime.sleep(i)\n", | ||
"\t\ti += 1\n", | ||
"\n", | ||
"def download_file(url: str, local_file: str, headers=None, params=None):\n", | ||
" resp = requests.get(url, stream=True, headers=headers, params=params)\n", | ||
" total = int(resp.headers.get('content-length', 0))\n", | ||
" with open(local_file, 'ab') as file, tqdm(\n", | ||
" desc=local_file,\n", | ||
" total=total,\n", | ||
" unit='iB',\n", | ||
" unit_scale=True,\n", | ||
" unit_divisor=1024,\n", | ||
" ) as bar:\n", | ||
" for data in resp.iter_content(chunk_size=1024):\n", | ||
" size = file.write(data)\n", | ||
" bar.update(size)\n", | ||
"\n", | ||
"def call_zettablock_api(query_text, blockchain):\n", | ||
" # Get Smart Contract Data from Zettablock for several blockchains\n", | ||
" query = {\"query\": query_text, \"resultCacheExpireMillis\": 86400000}\n", | ||
" \n", | ||
" # Create a query with SQL statement, and get query id\n", | ||
" res = requests.post(data_lake_query_endpoint, headers=headers, data=json.dumps(query))\n", | ||
" print(res.text)\n", | ||
" \n", | ||
" # Trigger the query by query id, and get queryrun id\n", | ||
" query_id = res.json()['id']\n", | ||
" data_lake_submission_endpoints = f'https://api.zettablock.com/api/v1/queries/{query_id}/trigger'\n", | ||
" res = requests.post(data_lake_submission_endpoints, headers=headers, data='{}')\n", | ||
" \n", | ||
" # Check status using queryrun id\n", | ||
" queryrun_id = res.json()['queryrunId']\n", | ||
" \n", | ||
" if get_response(queryrun_id) == 'SUCCEEDED':\n", | ||
" # Fetch result from queryrun id\n", | ||
" params = {'includeColumnName': 'true'}\n", | ||
" queryrun_result_endpoint = f'https://api.zettablock.com/api/v1/stream/queryruns/{queryrun_id}/result'\n", | ||
" # if the result is huge, consider using stream and write to a file\n", | ||
" download_file(queryrun_result_endpoint, zettablock_data_file+\"_\"+blockchain+\".csv\", headers=headers, params=params)\n", | ||
" else:\n", | ||
" print('query failed, please check status message for details')\n", | ||
" print(res.json())" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "73efe3ca-4665-4b2d-aff3-3a4ce62b1e62", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Code provided by the Forta team\n", | ||
"# Decompile and dissasemble the smart contract bytecode\n", | ||
"def get_opcodes(creation_bytecode) -> str:\n", | ||
" bytecode = creation_bytecode\n", | ||
" if bytecode is None:\n", | ||
" return ''\n", | ||
"\n", | ||
" try:\n", | ||
" opcodes = EvmBytecode(bytecode).disassemble()\n", | ||
" except Exception:\n", | ||
" return ''\n", | ||
" \n", | ||
" return \" \".join([str(op).strip() for op in opcodes])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "afe717dc-febf-48bb-ab9d-1957fa5477c0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Code provided by the Forta team\n", | ||
"# Filter opcodes to get the best features\n", | ||
"def get_exp_2_features(row):\n", | ||
" creator = row['contract_creator']\n", | ||
" opcodes = row['decompiled_opcodes'].split()\n", | ||
" mask = '0xffffffffffffffffffffffffffffffffffffffff'\n", | ||
" features = []\n", | ||
" for i in range(len(opcodes)-1):\n", | ||
" first = opcodes[i]\n", | ||
" second = opcodes[i+1]\n", | ||
" if not first.startswith('0x'):\n", | ||
" token = first\n", | ||
" if first.startswith('UNKNOWN') or first.startswith('INVALID'):\n", | ||
" token = first.split('_')[0]\n", | ||
" features.append(token)\n", | ||
" elif first == 'PUSH4':\n", | ||
" features.append(second)\n", | ||
" elif first == 'PUSH20':\n", | ||
" if second == creator:\n", | ||
" features.append('creator')\n", | ||
" elif second == mask:\n", | ||
" features.append(mask)\n", | ||
" else:\n", | ||
" features.append('address')\n", | ||
" elif first == 'PUSH32':\n", | ||
" features.append(second)\n", | ||
" return \" \".join(features)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a7cccb91-ceb9-4e98-bddf-3b59e3b6aca7", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Create queries for the supported blockchains\n", | ||
"# They don't have all the same available data\n", | ||
"def get_query(blockchain):\n", | ||
" query_text = \"\"\n", | ||
" if blockchain == \"ethereum_mainnet\":\n", | ||
" query_text = '''\n", | ||
" SELECT contract.address as contract_address,\n", | ||
" contract.name as contract_name,\n", | ||
" contract.creator as contract_creator,\n", | ||
" tags.name as contract_tag_name, \n", | ||
" tags.type as contract_type,\n", | ||
" contract.code as contract_code\n", | ||
" FROM {}.contracts contract LEFT JOIN {}.labels tags ON tags.address = contract.address\n", | ||
" LIMIT 30000\n", | ||
" '''.format(blockchain, blockchain)\n", | ||
" elif blockchain == \"polygon_mainnet\":\n", | ||
" query_text = '''\n", | ||
" SELECT contracts.address as contract_address,\n", | ||
" mappings.contract_name as contract_name,\n", | ||
" contracts.creator_address as contract_creator,\n", | ||
" mappings.contract_category as contract_type,\n", | ||
" contracts.bytecode as contract_code\n", | ||
" FROM polygon_mainnet.contract_creations contracts LEFT JOIN polygon_mainnet.contract_mappings mappings ON mappings.contract_address = contracts.address\n", | ||
" LIMIT 30000\n", | ||
" '''.format(blockchain, blockchain)\n", | ||
" else:\n", | ||
" query_text = '''\n", | ||
" SELECT contracts.address as contract_address,\n", | ||
" contracts.creator_address as contract_creator,\n", | ||
" contracts.bytecode as contract_code\n", | ||
" FROM bsc_mainnet.contract_creations contracts\n", | ||
" LIMIT 30000\n", | ||
" '''.format(blockchain, blockchain)\n", | ||
" return query_text\n", | ||
"\n", | ||
"# Get contracts and filter the data\n", | ||
"def get_pretrain_contracts():\n", | ||
" # Get data from Zettablock for 3 different EVM compatible blockchains\n", | ||
" for blockchain in blockchains:\n", | ||
" if not os.path.exists(zettablock_data_file+\"_\"+blockchain+\".csv\"):\n", | ||
" print(\"Dowloading data from %s...\" % (blockchain))\n", | ||
" call_zettablock_api(get_query(blockchain), blockchain)\n", | ||
" if not os.path.exists(processed_data_file+\"_\"+blockchain+\".csv\"):\n", | ||
" \"\"\"Collects contracts from Zettablock and its decompiled opcodes.\"\"\" \n", | ||
" chunksize = 10 ** 6\n", | ||
" with pd.read_csv(zettablock_data_file+\"_\"+blockchain+\".csv\", chunksize=chunksize) as contract_reader:\n", | ||
" for contracts in contract_reader:\n", | ||
" contracts['decompiled_opcodes'] = contracts['contract_code'].progress_apply(get_opcodes)\n", | ||
" # Store data so we don't have to download it all the time\n", | ||
" contracts = contracts[(contracts['decompiled_opcodes'].notna()) & (contracts['decompiled_opcodes'] != '')]\n", | ||
" contracts.drop_duplicates('contract_address', inplace=True)\n", | ||
" contracts.progress_apply(get_exp_2_features, axis=1)\n", | ||
" contracts.to_csv(processed_data_file+\"_\"+blockchain+\".csv\", mode='a')\n", | ||
" else:\n", | ||
" print(\"%s already exists.\" % processed_data_file+\"_\"+blockchain+\".csv\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "838b65f6-7238-400b-889c-ea3af72dbadb", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Actually run the collection code\n", | ||
"get_pretrain_contracts()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "cf19d9ae-1de8-4069-a093-592deff4d192", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Finally prepare data for pretraining phase\n", | ||
"pretraining_data = {}\n", | ||
"for blockchain in blockchains:\n", | ||
" # Load data from disk\n", | ||
" pretraining_data[blockchain] = pd.read_csv(processed_data_file+\"_\"+blockchain+\".csv\")\n", | ||
"\n", | ||
"# Concat data in the same pandas variable\n", | ||
"pretraining_data = pd.concat(list(pretraining_data.values()))\n", | ||
"# Suffle data so we have mixed and heterogeneos samples from all the blockchains\n", | ||
"print(pretraining_data.shape)\n", | ||
"pretraining_data = pretraining_data.sample(frac = 1)\n", | ||
"\n", | ||
"training_samples = 60000\n", | ||
"# Save the data to disk\n", | ||
"training_data = pretraining_data[:training_samples]\n", | ||
"validation_data = pretraining_data[training_samples:]\n", | ||
"training_data['decompiled_opcodes'].to_csv(train_file_path, sep='\\t', index=False)\n", | ||
"validation_data['decompiled_opcodes'].to_csv(val_file_path, sep='\\t', index=False)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "065d4f90-efd2-40ab-9782-9e41f12b5f91", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.18" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |