Skip to content

Commit

Permalink
Improve finetuning data collection with multiple blockchains
Browse files Browse the repository at this point in the history
  • Loading branch information
kuronosec committed Mar 31, 2024
1 parent b5fe5fa commit c855ec1
Showing 1 changed file with 58 additions and 113 deletions.
171 changes: 58 additions & 113 deletions analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"# Malicious Smart Contract Detection Training Dataset Collection Notebook\n",
"\n",
"This notebook collects smart contract creation bytecode and decompiled opcodes for malicious contract classification. \n",
"Benign contracts are gathered from Zettablock and malicious contracts from [Forta Network's labelled datasets github repo](\"https://github.com/forta-network/labelled-datasets\").\n",
"Benign contracts are gathered from blockchain explorers and malicious contracts from [Forta Network's labelled datasets github repo](\"https://github.com/forta-network/labelled-datasets\").\n",
"\n",
"# Code provided by the Forta project"
]
Expand All @@ -24,16 +24,32 @@
"import logging\n",
"import pickle\n",
"import os\n",
"import time\n",
"\n",
"from evmdasm import EvmBytecode\n",
"import pandas as pd\n",
"from tqdm import tqdm\n",
"import requests\n",
"from web3 import Web3\n",
"\n",
"tqdm.pandas()\n",
"# disable warning logs from evmdasm tool\n",
"logging.getLogger(\"evmdasm\").setLevel(logging.CRITICAL)"
"logging.getLogger(\"evmdasm\").setLevel(logging.CRITICAL)\n",
"\n",
"blockchains = [\"ethereum\", \"polygon\", \"bsc\"]\n",
"current_blockchain = \"ethereum\"\n",
"\n",
"ZETTABLOCK_API_KEY = \"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\"\n",
"ZETTABLOCK_URL = \"https://api.zettablock.com/api/v1/databases/realtimeDB/queries\"\n",
"EXECUTE_URL = \"https://api.zettablock.com/api/v1/queries/\"\n",
"\n",
"headers = {\n",
" \"accept\": \"application/json\",\n",
" \"X-API-KEY\": ZETTABLOCK_API_KEY,\n",
" \"content-type\": \"application/json\"\n",
"}\n",
"\n",
"TRACES = {}\n",
"CONTRACT_DATA = {}"
]
},
{
Expand All @@ -44,8 +60,13 @@
"outputs": [],
"source": [
"def get_verified_smart_contracts():\n",
" verified_smart_contracts = {}\n",
" # 5k verified SCs Downloaded from https://etherscan.io/exportData?type=open-source-contract-codes\n",
" verified_smart_contracts = pd.read_csv(\"/data/forta/ethereum/text/normal_smart_contracts.csv\")\n",
" verified_smart_contracts[\"ethereum\"] = pd.read_csv(\"/data/forta/ethereum/text/pretraining/raw/ethereum-verified.csv\")\n",
" # 5k verified SCs Downloaded from https://polygonscan.com/exportData?type=open-source-contract-codes\n",
" verified_smart_contracts[\"polygon\"] = pd.read_csv(\"/data/forta/ethereum/text/pretraining/raw/polygon-verified.csv\")\n",
" # 5k verified SCs Downloaded from https://bscscan.com/exportData?type=open-source-contract-codes\n",
" verified_smart_contracts[\"bsc\"] = pd.read_csv(\"/data/forta/ethereum/text/pretraining/raw/bsc-verified.csv\")\n",
" return verified_smart_contracts"
]
},
Expand All @@ -56,39 +77,24 @@
"metadata": {},
"outputs": [],
"source": [
"ALCHEMY_URL = \"https://eth-mainnet.g.alchemy.com/v2/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\"\n",
"w3_eth = Web3(Web3.HTTPProvider(ALCHEMY_URL))\n",
"TRACES = {}\n",
"\n",
"def get_contract_bytecode(contract):\n",
" try:\n",
" contract_address = Web3.to_checksum_address(contract[\"contract_address\"])\n",
" contract_bytecode = w3_eth.eth.get_code(contract_address)\n",
" contract_bytecode = get_contract_data_from_zettablock_API(contract[\"contract_address\"], current_blockchain)\n",
" return contract_bytecode\n",
" except Exception as e:\n",
" print(e)\n",
" return []"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0c270ac-3bfb-4216-b64c-711a1add3730",
"metadata": {},
"outputs": [],
"source": [
"def get_created_contracts(tx_hash):\n",
" if TRACES.get(tx_hash, 'error') != 'error':\n",
" return TRACES[tx_hash]\n",
" try:\n",
" trace = w3_eth.tracing.trace_transaction(tx_hash)\n",
" result = [t for t in trace if t['type'].startswith('create')]\n",
" TRACES[tx_hash] = result\n",
" return result\n",
" except Exception as e:\n",
" print(e)\n",
" TRACES[tx_hash] = 'error'\n",
" return [] "
" return []\n",
"\n",
"def get_contract_data_from_zettablock_API(contract, blockchain):\n",
" # Connect to Zettablock API\n",
" # Its free version only allows 1 request per second\n",
" time.sleep(1)\n",
" payload = {\"query\": \"SELECT bytecode FROM %s_mainnet.contract_creations WHERE address = '%s'\" % (blockchain, contract)}\n",
" response = requests.post(ZETTABLOCK_URL, json=payload, headers=headers)\n",
" id = response.json()['id']\n",
" response = requests.post(EXECUTE_URL+id+\"/execute?includeColumnName=false&includeMetadata=false\",\n",
" headers=headers)\n",
" return response.text"
]
},
{
Expand All @@ -98,23 +104,10 @@
"metadata": {},
"outputs": [],
"source": [
"def get_creation_bytecode(row) -> str:\n",
" \"\"\"Get contract creation bytecode from EVM trace.\"\"\"\n",
" tx_hash = row['contract_creation_tx']\n",
" contracts = get_created_contracts(tx_hash)\n",
" \n",
" if len(contracts) == 0:\n",
" return\n",
"\n",
" for c in contracts:\n",
" if c['result'] is not None and c['result'].get('address') == contract_addr:\n",
" return c['action'].get('init')\n",
"\n",
"def get_opcodes(creation_bytecode) -> str:\n",
" bytecode = creation_bytecode\n",
" if bytecode is None:\n",
" return ''\n",
"\n",
" try:\n",
" opcodes = EvmBytecode(bytecode).disassemble()\n",
" except Exception:\n",
Expand All @@ -131,7 +124,7 @@
"outputs": [],
"source": [
"def get_malicious_contracts() -> pd.DataFrame:\n",
" data_path = '/data/forta/ethereum/text/finetuning/malicious_data.pkl'\n",
" data_path = '/data/forta/ethereum/text/pretraining/raw/malicious-data.pkl'\n",
" malicious = None\n",
"\n",
" if os.path.exists(data_path):\n",
Expand All @@ -151,89 +144,33 @@
" return malicious"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a818966-d8e9-4ae9-8592-a93cd6031411",
"metadata": {},
"outputs": [],
"source": [
"CONTRACT_DATA = {}\n",
"ETHERSCAN_API_KEY=\"XXXXXXXXXXXXXXXX\"\n",
"\n",
"def get_contract_transactions(contracts):\n",
" \"\"\"Get contract transaction info from Etherscan.\"\"\"\n",
" \n",
" # Etherscan API can take up to 5 contract addresses at a time.\n",
" for i in range(0, len(contracts), 5):\n",
" url = f\"https://api.etherscan.io/api?module=contract&action=getcontractcreation&contractaddresses={','.join(contracts[i:i+5])}&apikey={ETHERSCAN_API_KEY}\"\n",
" resp = requests.get(url)\n",
" data = resp.json()['result']\n",
" for r in data:\n",
" contract = r['contractAddress']\n",
" CONTRACT_DATA[contract.lower()] = r"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41b00f6a-e17f-420d-9d69-f98d4a68757d",
"metadata": {},
"outputs": [],
"source": [
"def get_contract_data(row):\n",
" address = row['contract_address'].lower()\n",
" data = CONTRACT_DATA.get(address)\n",
" if data:\n",
" return data.get('contractCreator'), data.get('txHash')\n",
" \n",
" return None, None"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7cccb91-ceb9-4e98-bddf-3b59e3b6aca7",
"metadata": {},
"outputs": [],
"source": [
"verified_contracts_sql = '''\n",
"SELECT abis.address as contract_address, \n",
" abis.name as contract_name,\n",
" tags.name as contract_etherscan_label, \n",
" tags.type as contract_tag\n",
"FROM ethereum_mainnet.contracts abis LEFT JOIN ethereum_mainnet.labels tags ON tags.address = abis.address\n",
"'''\n",
"\n",
"mev_contracts_sql = '''\n",
"SELECT tags.address as contract_address, \n",
" tags.type as label_type, \n",
" tags.name as label_name \n",
"FROM ethereum_mainnet.labels tags WHERE tags.name like 'mev%'\n",
"'''\n",
"\n",
"def get_benign_contracts() -> pd.DataFrame:\n",
" data_path = '/data/forta/ethereum/text/finetuning/malicious_data.pkl'\n",
" global current_blockchain\n",
" data_path = '/data/forta/ethereum/text/pretraining/raw/benign-data.pkl'\n",
" benign = None\n",
"\n",
" if os.path.exists(data_path):\n",
" with open(data_path, \"rb\") as data_file:\n",
" benign = pickle.load(data_file)\n",
" else:\n",
" \"\"\"Collects verified and mev contracts from Zettablock and its decompiled opcodes.\"\"\"\n",
"\n",
" # verified_contracts = get_zettablock_data(verified_contracts_sql)\n",
" # mev_contracts = get_luabase_data(mev_contracts_sql)\n",
" # benign = pd.concat([verified_contracts, mev_contracts])\n",
" \"\"\"Collects verified and mev contracts and its decompiled opcodes.\"\"\"\n",
" benign = get_verified_smart_contracts()\n",
"\n",
" get_contract_transactions(list(benign.loc[:, 'contract_address']))\n",
"\n",
" benign[['contract_creator', 'contract_creation_tx']] = benign.apply(get_contract_data, axis=1, result_type='expand')\n",
" benign['creation_bytecode'] = benign.progress_apply(get_contract_bytecode, axis=1)\n",
" for blockchain in blockchains:\n",
" current_blockchain = blockchain\n",
" benign[blockchain]['contract_address'] = benign[blockchain]['contract_address'].progress_apply(str.lower)\n",
" benign[blockchain]['creation_bytecode'] = benign[blockchain].progress_apply(get_contract_bytecode, axis=1)\n",
" begign = pd.DataFrame(pd.concat([benign[\"ethereum\"], benign[\"polygon\"], benign[\"bsc\"]]))\n",
" benign = begign.reset_index(drop=True)\n",
" benign['decompiled_opcodes'] = benign['creation_bytecode'].progress_apply(get_opcodes)\n",
" # Store data so we don't have to download it all the time\n",
" benign.to_pickle('/data/forta/ethereum/text/benign_data.pkl')\n",
" benign.to_pickle(data_path)\n",
" return benign"
]
},
Expand Down Expand Up @@ -299,7 +236,7 @@
"metadata": {},
"outputs": [],
"source": [
"dataset.fillna('').to_parquet('/data/forta/ethereum/text/finetuning/malicious_contract_training_dataset_final.parquet', index=None)"
"dataset.fillna('').to_parquet('/data/forta/ethereum/text/pretraining/raw/verified-smart-contracts.parquet', index=None)"
]
},
{
Expand All @@ -311,6 +248,14 @@
"source": [
"dataset['malicious'].value_counts().plot(kind='pie', figsize=(7, 7))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "190d2967-93d9-4ecc-910d-77a29a40a340",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit c855ec1

Please sign in to comment.