Skip to content

Commit

Permalink
improve verified normal and malicious data collection
Browse files Browse the repository at this point in the history
  • Loading branch information
kuronosec committed Apr 5, 2024
1 parent c855ec1 commit 96d73be
Showing 1 changed file with 79 additions and 27 deletions.
106 changes: 79 additions & 27 deletions analysis/ethereum_smart_contracts/GPT_finetuning_data_collection.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"import pickle\n",
"import os\n",
"import time\n",
"import json\n",
"\n",
"from evmdasm import EvmBytecode\n",
"import pandas as pd\n",
Expand All @@ -38,8 +39,9 @@
"blockchains = [\"ethereum\", \"polygon\", \"bsc\"]\n",
"current_blockchain = \"ethereum\"\n",
"\n",
"ZETTABLOCK_API_KEY = \"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\"\n",
"# Read the API key from the environment so real credentials never land in version control.\n",
"ZETTABLOCK_API_KEY = os.environ.get(\"ZETTABLOCK_API_KEY\", \"\")\n",
"ZETTABLOCK_URL = \"https://api.zettablock.com/api/v1/databases/realtimeDB/queries\"\n",
"ZETTABLOCK_DATA_LAKE_ENDPOINT = \"https://api.zettablock.com/api/v1/databases/AwsDataCatalog/queries\"\n",
"EXECUTE_URL = \"https://api.zettablock.com/api/v1/queries/\"\n",
"\n",
"headers = {\n",
Expand All @@ -62,12 +64,24 @@
"def get_verified_smart_contracts():\n",
" verified_smart_contracts = {}\n",
" # 5k verified SCs Downloaded from https://etherscan.io/exportData?type=open-source-contract-codes\n",
" verified_smart_contracts[\"ethereum\"] = pd.read_csv(\"/data/forta/ethereum/text/pretraining/raw/ethereum-verified.csv\")\n",
" verified_smart_contracts[\"ethereum\"] = pd.read_csv(\"/data/forta/ethereum/text/pretraining/raw/verified/ethereum-verified.csv\")\n",
" # 5k verified SCs Downloaded from https://polygonscan.com/exportData?type=open-source-contract-codes\n",
" verified_smart_contracts[\"polygon\"] = pd.read_csv(\"/data/forta/ethereum/text/pretraining/raw/polygon-verified.csv\")\n",
" verified_smart_contracts[\"polygon\"] = pd.read_csv(\"/data/forta/ethereum/text/pretraining/raw/verified/polygon-verified.csv\")\n",
" # 5k verified SCs Downloaded from https://bscscan.com/exportData?type=open-source-contract-codes\n",
" verified_smart_contracts[\"bsc\"] = pd.read_csv(\"/data/forta/ethereum/text/pretraining/raw/bsc-verified.csv\")\n",
" return verified_smart_contracts"
" verified_smart_contracts[\"bsc\"] = pd.read_csv(\"/data/forta/ethereum/text/pretraining/raw/verified/bsc-verified.csv\")\n",
" return verified_smart_contracts\n",
"\n",
"def get_verified_malicious_smart_contracts():\n",
"    \"\"\"Load the per-blockchain CSV files of labelled malicious smart contracts.\n",
"\n",
"    The Ethereum labels come from https://github.com/forta-network/labelled-datasets;\n",
"    the data for the other blockchains was collected by Sakundi.\n",
"\n",
"    Returns:\n",
"        dict mapping \"ethereum\", \"polygon\" and \"bsc\" to the DataFrame read\n",
"        from the matching \"{blockchain}-malicious.csv\" file.\n",
"    \"\"\"\n",
"    raw_dir = \"/data/forta/ethereum/text/pretraining/raw/verified\"\n",
"    return {\n",
"        chain: pd.read_csv(f\"{raw_dir}/{chain}-malicious.csv\")\n",
"        for chain in (\"ethereum\", \"polygon\", \"bsc\")\n",
"    }"
]
},
{
Expand All @@ -79,22 +93,58 @@
"source": [
"def get_contract_bytecode(contract):\n",
" try:\n",
" contract_bytecode = get_contract_data_from_zettablock_API(contract[\"contract_address\"], current_blockchain)\n",
" return contract_bytecode\n",
" query_text = (\"SELECT bytecode FROM %s_mainnet.contract_creations WHERE address = '%s' LIMIT 1\"\n",
" % (current_blockchain, contract[\"contract_address\"]))\n",
" contract_bytecode = call_zettablock_api(query_text, current_blockchain)\n",
" if contract_bytecode is not None:\n",
" return contract_bytecode\n",
" else:\n",
" return \"\"\n",
" except Exception as e:\n",
" print(e)\n",
" return []\n",
"\n",
"def get_contract_data_from_zettablock_API(contract, blockchain):\n",
" # Connect to Zettablock API\n",
" # Its free version only allows 1 request per second\n",
" time.sleep(1)\n",
" payload = {\"query\": \"SELECT bytecode FROM %s_mainnet.contract_creations WHERE address = '%s'\" % (blockchain, contract)}\n",
" response = requests.post(ZETTABLOCK_URL, json=payload, headers=headers)\n",
" id = response.json()['id']\n",
" response = requests.post(EXECUTE_URL+id+\"/execute?includeColumnName=false&includeMetadata=false\",\n",
" headers=headers)\n",
" return response.text"
"# Adapted from the Zettablock tutorials.\n",
"def get_response(queryrun_id, max_wait_seconds=600):\n",
"    \"\"\"Poll a Zettablock query run until it reports a terminal state.\n",
"\n",
"    Args:\n",
"        queryrun_id: Identifier of the query run to watch.\n",
"        max_wait_seconds: Upper bound on the total time slept between polls.\n",
"            Pass None to poll without limit (the previous behaviour).\n",
"\n",
"    Returns:\n",
"        'SUCCEEDED' or 'FAILED', as reported by the status endpoint.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: The status endpoint answered with an error status.\n",
"        TimeoutError: No terminal state was reported within max_wait_seconds.\n",
"    \"\"\"\n",
"    queryrun_status_endpoint = f'https://api.zettablock.com/api/v1/queryruns/{queryrun_id}/status'\n",
"    delay = 1\n",
"    waited = 0\n",
"    while True:\n",
"        res = requests.get(queryrun_status_endpoint, headers=headers)\n",
"        # Fail loudly on HTTP errors instead of with a KeyError on 'state' below\n",
"        res.raise_for_status()\n",
"        state = res.json()['state']\n",
"        if state in ('SUCCEEDED', 'FAILED'):\n",
"            return state\n",
"        if max_wait_seconds is not None and waited >= max_wait_seconds:\n",
"            raise TimeoutError(f\"query run {queryrun_id} is still {state!r} after {waited}s\")\n",
"        # Linear back-off: wait one second longer after every poll\n",
"        time.sleep(delay)\n",
"        waited += delay\n",
"        delay += 1\n",
"\n",
"def call_zettablock_api(query_text, blockchain):\n",
"    \"\"\"Run a SQL query on the Zettablock data lake and return its first data row.\n",
"\n",
"    Args:\n",
"        query_text: SQL statement to execute.\n",
"        blockchain: Currently unused; kept so existing callers keep working.\n",
"\n",
"    Returns:\n",
"        The first line after the column-name header of the result, \"\" when the\n",
"        result has no such line, or None when the query run did not succeed.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: One of the API calls answered with an error status.\n",
"    \"\"\"\n",
"    # resultCacheExpireMillis: 86400000 ms = 24 hours\n",
"    query = {\"query\": query_text, \"resultCacheExpireMillis\": 86400000}\n",
"\n",
"    # Create a query from the SQL statement and get its query id\n",
"    res = requests.post(ZETTABLOCK_DATA_LAKE_ENDPOINT, headers=headers, data=json.dumps(query))\n",
"    res.raise_for_status()\n",
"    query_id = res.json()['id']\n",
"\n",
"    # Trigger the query by query id and get the queryrun id\n",
"    data_lake_submission_endpoint = f'https://api.zettablock.com/api/v1/queries/{query_id}/trigger'\n",
"    res = requests.post(data_lake_submission_endpoint, headers=headers, data='{}')\n",
"    res.raise_for_status()\n",
"    queryrun_id = res.json()['queryrunId']\n",
"\n",
"    if get_response(queryrun_id) != 'SUCCEEDED':\n",
"        print('query failed, please check status message for details')\n",
"        print(res.json())\n",
"        return None\n",
"\n",
"    # Fetch the result from the queryrun id; the first line holds the column names\n",
"    params = {'includeColumnName': 'true'}\n",
"    queryrun_result_endpoint = f'https://api.zettablock.com/api/v1/stream/queryruns/{queryrun_id}/result'\n",
"    # if the result is huge, consider using stream and write to a file\n",
"    resp = requests.get(queryrun_result_endpoint, stream=False, headers=headers, params=params)\n",
"    resp.raise_for_status()\n",
"    lines = resp.text.split(\"\\n\")\n",
"    # A result without a line after the header used to raise IndexError here\n",
"    return lines[1] if len(lines) > 1 else \"\""
]
},
{
Expand Down Expand Up @@ -124,20 +174,22 @@
"outputs": [],
"source": [
"def get_malicious_contracts() -> pd.DataFrame:\n",
" data_path = '/data/forta/ethereum/text/pretraining/raw/malicious-data.pkl'\n",
" data_path = '/data/forta/ethereum/text/pretraining/raw/verified/malicious-data.pkl'\n",
" malicious = None\n",
"\n",
" if os.path.exists(data_path):\n",
" with open(data_path, \"rb\") as data_file:\n",
" malicious = pickle.load(data_file)\n",
" else:\n",
" \"\"\"Collects malicious contracts from Forta's labelled dataset github repo and its decompiled opcodes.\"\"\"\n",
" # csv from https://github.com/forta-network/labelled-datasets\n",
" github_url = 'https://raw.githubusercontent.com/forta-network/labelled-datasets/main/labels/1/malicious_smart_contracts.csv'\n",
" malicious = pd.read_csv(github_url)\n",
" # exclude phishing hack related contracts\n",
" malicious = malicious[malicious['contract_creator_etherscan_label'] != 'phish-hack']\n",
" malicious['creation_bytecode'] = malicious.progress_apply(get_contract_bytecode, axis=1)\n",
" malicious = get_verified_malicious_smart_contracts()\n",
" for blockchain in blockchains:\n",
" current_blockchain = blockchain\n",
" if blockchain == \"ethereum\":\n",
" # exclude phishing hack related contracts\n",
" malicious[blockchain] = malicious[blockchain][malicious[blockchain]['contract_creator_etherscan_label'] != 'phish-hack']\n",
" malicious[blockchain]['contract_address'] = malicious[blockchain]['contract_address'].progress_apply(str.lower)\n",
" malicious[blockchain]['creation_bytecode'] = malicious[blockchain].progress_apply(get_contract_bytecode, axis=1)\n",
" malicious = pd.DataFrame(pd.concat([malicious[\"ethereum\"], malicious[\"polygon\"], malicious[\"bsc\"]]))\n",
" malicious['decompiled_opcodes'] = malicious['creation_bytecode'].progress_apply(get_opcodes)\n",
" # Store data so we don't have to download it all the time\n",
" malicious.to_pickle(data_path)\n",
Expand All @@ -153,7 +205,7 @@
"source": [
"def get_benign_contracts() -> pd.DataFrame:\n",
" global current_blockchain\n",
" data_path = '/data/forta/ethereum/text/pretraining/raw/benign-data.pkl'\n",
" data_path = '/data/forta/ethereum/text/pretraining/raw/verified/benign-data.pkl'\n",
" benign = None\n",
"\n",
" if os.path.exists(data_path):\n",
Expand Down Expand Up @@ -236,7 +288,7 @@
"metadata": {},
"outputs": [],
"source": [
"dataset.fillna('').to_parquet('/data/forta/ethereum/text/pretraining/raw/verified-smart-contracts.parquet', index=None)"
"dataset.fillna('').to_parquet('/data/forta/ethereum/text/pretraining/raw/verified/verified-smart-contracts.parquet', index=None)"
]
},
{
Expand Down

0 comments on commit 96d73be

Please sign in to comment.