create pretraining data collection from Zettablock

edenia · Mar 24, 2024 · 273262a · 273262a
1 parent 180390d
commit 273262a
Showing 1 changed file with 313 additions and 0 deletions.
diff --git a/analysis/ethereum_smart_contracts/pretraining_data_collection.ipynb b/analysis/ethereum_smart_contracts/pretraining_data_collection.ipynb
@@ -0,0 +1,313 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ba979f32-67fc-4dc5-a71b-be84043a87b9",
+   "metadata": {},
+   "source": [
+    "# Pretraining - Smart Contract Training Dataset Collection\n",
+    "\n",
+    "This notebook collects smart contract bytecode and decompiled opcodes for normal and malicious contract classification.\n",
+    "\n",
+    "Pretraining contracts are gathered from Zettablock and malicious contracts from [Forta Network's labelled datasets github repo](\"https://github.com/forta-network/labelled-datasets\")."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "599dd826-f2b4-416c-b256-8eef021287a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "import pickle\n",
+    "import os\n",
+    "import requests\n",
+    "import json\n",
+    "import time\n",
+    "\n",
+    "from evmdasm import EvmBytecode\n",
+    "import pandas as pd\n",
+    "from tqdm import tqdm\n",
+    "from web3 import Web3\n",
+    "\n",
+    "tqdm.pandas()\n",
+    "# disable warning logs from evmdasm tool\n",
+    "logging.getLogger(\"evmdasm\").setLevel(logging.CRITICAL)\n",
+    "\n",
+    "zettablock_data_file = \"/data/forta/ethereum/text/pretraining/zettablock_data\"\n",
+    "processed_data_file = \"/data/forta/ethereum/text/pretraining/big_pretrain_data\"\n",
+    "\n",
+    "# Your Zettablock API key\n",
+    "API_KEY = \"XXXXX-XXXX-XXXX-XXX-XXXXXXXX\"\n",
+    "headers = {\n",
+    "    \"accept\": \"application/json\",\n",
+    "    \"content-type\": \"application/json\",\n",
+    "    # credentials\n",
+    "    \"X-API-KEY\": API_KEY\n",
+    "}\n",
+    "\n",
+    "# Zettablock endpoint\n",
+    "data_lake_query_endpoint = \"https://api.zettablock.com/api/v1/databases/AwsDataCatalog/queries\"\n",
+    "\n",
+    "# Configure the blockchains we are interested in\n",
+    "blockchains = [\"ethereum_mainnet\", \"polygon_mainnet\", \"bsc_mainnet\"]\n",
+    "\n",
+    "# Final training and validation files\n",
+    "train_file_path = \"/data/forta/ethereum/text/pretraining/pretraining_train.csv\"\n",
+    "val_file_path = \"/data/forta/ethereum/text/pretraining/pretraining_val.csv\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f62b6fa1-3333-4422-8b15-eb80b0206460",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Code taken from Zettablock tutorials\n",
+    "# check response until success or failed is returned\n",
+    "def get_response(queryrun_id):\n",
+    "\timport time\n",
+    "\ti = 1\n",
+    "\tqueryrun_status_endpoint = f'https://api.zettablock.com/api/v1/queryruns/{queryrun_id}/status'\n",
+    "\twhile True:\n",
+    "\t\tres = requests.get(queryrun_status_endpoint, headers=headers)\n",
+    "\t\tstate = json.loads(res.text)['state']\n",
+    "\t\tif state == 'SUCCEEDED' or state == 'FAILED':\n",
+    "\t\t\treturn state\n",
+    "\t\ttime.sleep(i)\n",
+    "\t\ti += 1\n",
+    "\n",
+    "def download_file(url: str, local_file: str, headers=None, params=None):\n",
+    "    resp = requests.get(url, stream=True, headers=headers, params=params)\n",
+    "    total = int(resp.headers.get('content-length', 0))\n",
+    "    with open(local_file, 'ab') as file, tqdm(\n",
+    "        desc=local_file,\n",
+    "        total=total,\n",
+    "        unit='iB',\n",
+    "        unit_scale=True,\n",
+    "        unit_divisor=1024,\n",
+    "    ) as bar:\n",
+    "        for data in resp.iter_content(chunk_size=1024):\n",
+    "            size = file.write(data)\n",
+    "            bar.update(size)\n",
+    "\n",
+    "def call_zettablock_api(query_text, blockchain):\n",
+    "    # Get Smart Contract Data from Zettablock for several blockchains\n",
+    "    query = {\"query\": query_text, \"resultCacheExpireMillis\": 86400000}\n",
+    "    \n",
+    "    # Create a query with SQL statement, and get query id\n",
+    "    res = requests.post(data_lake_query_endpoint, headers=headers, data=json.dumps(query))\n",
+    "    print(res.text)\n",
+    "    \n",
+    "    # Trigger the query by query id, and get queryrun id\n",
+    "    query_id = res.json()['id']\n",
+    "    data_lake_submission_endpoints = f'https://api.zettablock.com/api/v1/queries/{query_id}/trigger'\n",
+    "    res = requests.post(data_lake_submission_endpoints, headers=headers, data='{}')\n",
+    "    \n",
+    "    # Check status using queryrun id\n",
+    "    queryrun_id = res.json()['queryrunId']\n",
+    "    \n",
+    "    if get_response(queryrun_id) == 'SUCCEEDED':\n",
+    "        # Fetch result from queryrun id\n",
+    "        params = {'includeColumnName': 'true'}\n",
+    "        queryrun_result_endpoint = f'https://api.zettablock.com/api/v1/stream/queryruns/{queryrun_id}/result'\n",
+    "        # if the result is huge, consider using stream and write to a file\n",
+    "        download_file(queryrun_result_endpoint, zettablock_data_file+\"_\"+blockchain+\".csv\", headers=headers, params=params)\n",
+    "    else:\n",
+    "        print('query failed, please check status message for details')\n",
+    "        print(res.json())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "73efe3ca-4665-4b2d-aff3-3a4ce62b1e62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Code provided by the Forta team\n",
+    "# Decompile and dissasemble the smart contract bytecode\n",
+    "def get_opcodes(creation_bytecode) -> str:\n",
+    "    bytecode = creation_bytecode\n",
+    "    if bytecode is None:\n",
+    "        return ''\n",
+    "\n",
+    "    try:\n",
+    "        opcodes = EvmBytecode(bytecode).disassemble()\n",
+    "    except Exception:\n",
+    "        return ''\n",
+    "    \n",
+    "    return \" \".join([str(op).strip() for op in opcodes])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "afe717dc-febf-48bb-ab9d-1957fa5477c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Code provided by the Forta team\n",
+    "# Filter opcodes to get the best features\n",
+    "def get_exp_2_features(row):\n",
+    "    creator = row['contract_creator']\n",
+    "    opcodes = row['decompiled_opcodes'].split()\n",
+    "    mask = '0xffffffffffffffffffffffffffffffffffffffff'\n",
+    "    features = []\n",
+    "    for i in range(len(opcodes)-1):\n",
+    "        first = opcodes[i]\n",
+    "        second = opcodes[i+1]\n",
+    "        if not first.startswith('0x'):\n",
+    "            token = first\n",
+    "            if first.startswith('UNKNOWN') or first.startswith('INVALID'):\n",
+    "                token = first.split('_')[0]\n",
+    "            features.append(token)\n",
+    "        elif first == 'PUSH4':\n",
+    "            features.append(second)\n",
+    "        elif first == 'PUSH20':\n",
+    "            if second == creator:\n",
+    "                features.append('creator')\n",
+    "            elif second == mask:\n",
+    "                features.append(mask)\n",
+    "            else:\n",
+    "                features.append('address')\n",
+    "        elif first == 'PUSH32':\n",
+    "            features.append(second)\n",
+    "    return \" \".join(features)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7cccb91-ceb9-4e98-bddf-3b59e3b6aca7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create queries for the supported blockchains\n",
+    "# They don't have all the same available data\n",
+    "def get_query(blockchain):\n",
+    "    query_text = \"\"\n",
+    "    if blockchain == \"ethereum_mainnet\":\n",
+    "        query_text = '''\n",
+    "            SELECT contract.address as contract_address,\n",
+    "                   contract.name as contract_name,\n",
+    "                   contract.creator as contract_creator,\n",
+    "                   tags.name as contract_tag_name, \n",
+    "                   tags.type as contract_type,\n",
+    "                   contract.code as contract_code\n",
+    "            FROM {}.contracts contract LEFT JOIN {}.labels tags ON tags.address = contract.address\n",
+    "            LIMIT 30000\n",
+    "        '''.format(blockchain, blockchain)\n",
+    "    elif blockchain == \"polygon_mainnet\":\n",
+    "        query_text = '''\n",
+    "            SELECT contracts.address as contract_address,\n",
+    "                   mappings.contract_name as contract_name,\n",
+    "                   contracts.creator_address as contract_creator,\n",
+    "                   mappings.contract_category as contract_type,\n",
+    "                   contracts.bytecode as contract_code\n",
+    "            FROM polygon_mainnet.contract_creations contracts LEFT JOIN polygon_mainnet.contract_mappings mappings ON mappings.contract_address = contracts.address\n",
+    "            LIMIT 30000\n",
+    "        '''.format(blockchain, blockchain)\n",
+    "    else:\n",
+    "        query_text = '''\n",
+    "            SELECT contracts.address as contract_address,\n",
+    "                   contracts.creator_address as contract_creator,\n",
+    "                   contracts.bytecode as contract_code\n",
+    "            FROM bsc_mainnet.contract_creations contracts\n",
+    "            LIMIT 30000\n",
+    "        '''.format(blockchain, blockchain)\n",
+    "    return query_text\n",
+    "\n",
+    "# Get contracts and filter the data\n",
+    "def get_pretrain_contracts():\n",
+    "    # Get data from Zettablock for 3 different EVM compatible blockchains\n",
+    "    for blockchain in blockchains:\n",
+    "        if not os.path.exists(zettablock_data_file+\"_\"+blockchain+\".csv\"):\n",
+    "            print(\"Dowloading data from %s...\" % (blockchain))\n",
+    "            call_zettablock_api(get_query(blockchain), blockchain)\n",
+    "        if not os.path.exists(processed_data_file+\"_\"+blockchain+\".csv\"):\n",
+    "            \"\"\"Collects contracts from Zettablock and its decompiled opcodes.\"\"\" \n",
+    "            chunksize = 10 ** 6\n",
+    "            with pd.read_csv(zettablock_data_file+\"_\"+blockchain+\".csv\", chunksize=chunksize) as contract_reader:\n",
+    "                for contracts in contract_reader:\n",
+    "                    contracts['decompiled_opcodes'] = contracts['contract_code'].progress_apply(get_opcodes)\n",
+    "                    # Store data so we don't have to download it all the time\n",
+    "                    contracts = contracts[(contracts['decompiled_opcodes'].notna()) & (contracts['decompiled_opcodes'] != '')]\n",
+    "                    contracts.drop_duplicates('contract_address', inplace=True)\n",
+    "                    contracts.progress_apply(get_exp_2_features, axis=1)\n",
+    "                    contracts.to_csv(processed_data_file+\"_\"+blockchain+\".csv\", mode='a')\n",
+    "        else:\n",
+    "            print(\"%s already exists.\" % processed_data_file+\"_\"+blockchain+\".csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "838b65f6-7238-400b-889c-ea3af72dbadb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Actually run the collection code\n",
+    "get_pretrain_contracts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cf19d9ae-1de8-4069-a093-592deff4d192",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Finally prepare data for pretraining phase\n",
+    "pretraining_data = {}\n",
+    "for blockchain in blockchains:\n",
+    "    # Load data from disk\n",
+    "    pretraining_data[blockchain] = pd.read_csv(processed_data_file+\"_\"+blockchain+\".csv\")\n",
+    "\n",
+    "# Concat data in the same pandas variable\n",
+    "pretraining_data = pd.concat(list(pretraining_data.values()))\n",
+    "# Suffle data so we have mixed and heterogeneos samples from all the blockchains\n",
+    "print(pretraining_data.shape)\n",
+    "pretraining_data = pretraining_data.sample(frac = 1)\n",
+    "\n",
+    "training_samples = 60000\n",
+    "# Save the data to disk\n",
+    "training_data = pretraining_data[:training_samples]\n",
+    "validation_data = pretraining_data[training_samples:]\n",
+    "training_data['decompiled_opcodes'].to_csv(train_file_path, sep='\\t', index=False)\n",
+    "validation_data['decompiled_opcodes'].to_csv(val_file_path, sep='\\t', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "065d4f90-efd2-40ab-9782-9e41f12b5f91",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}