Skip to content

Commit

Permalink
fix: use global keyword when updating global variables
Browse files Browse the repository at this point in the history
  • Loading branch information
rcstanciu committed Jun 10, 2021
1 parent 142f09d commit 70da202
Showing 1 changed file with 59 additions and 23 deletions.
82 changes: 59 additions & 23 deletions monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,34 +5,45 @@

from time import sleep

# Emit every record from DEBUG upwards, prefixed with a timestamp and level.
logging.basicConfig(
    format="[%(asctime)s] %(levelname)s:%(message)s", level=logging.DEBUG
)
# Docker client built from the environment; used to list and stop containers.
client = docker.from_env()

# Upper bound on status-fetch attempts within a single check.
MAX_RETRY_COUNT = 5
# GraphQL endpoint the node status is POSTed to.
GRAPHQL_URI = "http://node:3085/graphql"
# Pristine template for the per-sync-state counters (one key per state the
# monitor counts). Never mutate this dict; copy it instead.
INITIAL_STATUS_COUNT = {
    "SYNCED": 0,
    "CONNECTING": 0,
    "OFFLINE": 0,
    "CATCHUP": 0,
    "BOOTSTRAP": 0,
}
# Working counters, incremented on every successful status check.
# This must be a *copy*: plain assignment would bind both names to the same
# dict, so each increment would also corrupt INITIAL_STATUS_COUNT and a later
# "reset" from the template would restore nothing.
STATUS_COUNT = dict(INITIAL_STATUS_COUNT)
# Number of times the monitor loop has seen the node out of sync.
OUTOFSYNC_COUNT = 0


class NodeOutOfSyncException(Exception):
    """Signal that the node is out of sync and should be restarted.

    Raised by the status check when a sync-state counter exceeds its limit or
    the validated/unvalidated block gap is too large; the monitor loop reacts
    by restarting the node.
    """


class NodeNotReachableException(Exception):
    """Signal that the node cannot be reached yet and the monitor should wait.

    The monitor loop catches this separately from the out-of-sync case.
    """


def check_mina_node_status():
"""
Fetch Mina node status using the GraphQL client.
"""
logging.debug("Fetching node status")
global MAX_RETRY_COUNT
global GRAPHQL_URI
global STATUS_COUNT

retry_count = 0

while retry_count < MAX_RETRY_COUNT:
Expand All @@ -54,9 +65,14 @@ def check_mina_node_status():
}
"""

# Fetch node status using the GraphQL API
# Fetch node status using the GraphQL API
try:
r = requests.post(GRAPHQL_URI, json={'query': query}, headers={'Content-Type': 'application/json'}, timeout=60)
r = requests.post(
GRAPHQL_URI,
json={"query": query},
headers={"Content-Type": "application/json"},
timeout=60,
)
except requests.exceptions.ConnectionError:
# Node is not reachable.
# Raise NodeOutOfSyncException in order to skip a few syncs
Expand All @@ -66,44 +82,54 @@ def check_mina_node_status():
# Check response status
if r.status_code == 200:
logging.debug("Status fetched successfully")
response = r.json()['data']['daemonStatus']
response = r.json()["data"]["daemonStatus"]
logging.debug(response)

# Node sync status
sync_status = response['syncStatus']
sync_status = response["syncStatus"]
# Node uptime (in seconds)
uptime = response['uptimeSecs']
uptime = response["uptimeSecs"]
# Blockchain length
blockchain_length = response['blockchainLength']
blockchain_length = response["blockchainLength"]
# Highest block
highest_block = response['highestBlockLengthReceived']
highest_block = response["highestBlockLengthReceived"]
# Highest unvalidated block
highest_unvalidated_block = response['highestUnvalidatedBlockLengthReceived']
highest_unvalidated_block = response[
"highestUnvalidatedBlockLengthReceived"
]
# Compute difference between unvalidated and validated blocks
blocks_validated_diff = highest_unvalidated_block - highest_block

# Increment status count
STATUS_COUNT[sync_status] += 1
logging.debug(STATUS_COUNT)

if STATUS_COUNT['CONNECTING'] > 60:
logging.error("Node has been too long in the CONNECTING state. (more than 5 minutes")
if STATUS_COUNT["CONNECTING"] > 60:
logging.error(
"Node has been too long in the CONNECTING state. (more than 5 minutes"
)
raise NodeOutOfSyncException()

if STATUS_COUNT['CATCHUP'] > 540:
logging.debug("Node has been too long in the CATHUP state (more than 45 minutes).")
if STATUS_COUNT["CATCHUP"] > 540:
logging.debug(
"Node has been too long in the CATHUP state (more than 45 minutes)."
)
raise NodeOutOfSyncException()

if STATUS_COUNT['BOOTSTRAP'] > 240:
logging.error("Node has been too long in the BOOTSTRAP state (more than 20 minutes).")
if STATUS_COUNT["BOOTSTRAP"] > 240:
logging.error(
"Node has been too long in the BOOTSTRAP state (more than 20 minutes)."
)
raise NodeOutOfSyncException()

if sync_status == 'BOOTSTRAP':
if sync_status == "BOOTSTRAP":
logging.debug("Node is bootstrapping...")
return

if blocks_validated_diff > 2:
logging.error("Difference between highest validated block and highest unvalidated block. (delta > 2)")
logging.error(
"Difference between highest validated block and highest unvalidated block. (delta > 2)"
)
raise NodeOutOfSyncException()

logging.info("Node is synced.")
Expand All @@ -116,27 +142,37 @@ def check_mina_node_status():
# Raise NodeOutOfSyncException in order to restart the node
raise NodeOutOfSyncException()


def restart_node():
    """Stop the Mina container and reset the sync-state counters.

    Walks the containers reported by the Docker client and stops the first
    one named ``node`` or ``sidecar``. This function only *stops* the
    container; presumably a restart policy brings it back up -- confirm
    against the deployment configuration.

    Afterwards ``STATUS_COUNT`` is rebound to a fresh dict with every counter
    at zero, so the restarted node gets a full grace period in each state.
    """
    logging.debug("Restarting node")

    # Only STATUS_COUNT is rebound here; client and INITIAL_STATUS_COUNT are
    # merely read, which needs no ``global`` declaration.
    global STATUS_COUNT

    for item in client.containers.list():
        if item.name in ("node", "sidecar"):
            item.stop()
            # Only the first matching container is stopped.
            break

    # Build a brand-new zeroed dict from the template's keys. Assigning
    # INITIAL_STATUS_COUNT itself would alias the two names, so the template
    # would accumulate counts and the "reset" would never clear anything.
    STATUS_COUNT = dict.fromkeys(INITIAL_STATUS_COUNT, 0)


def start_monitor():
"""Main event loop"""
logging.info("mina-monitor started")

global OUTOFSYNC_COUNT

while True:
try:
check_mina_node_status()
except NodeOutOfSyncException:
OUTOFSYNC_COUNT += 1
logging.error("Node is out of sync. (OUTOFSYNC_COUNT={})".format(OUTOFSYNC_COUNT))
logging.error(
"Node is out of sync. (OUTOFSYNC_COUNT={})".format(OUTOFSYNC_COUNT)
)
restart_node()
sleep(30)
except NodeNotReachableException:
Expand All @@ -145,6 +181,6 @@ def start_monitor():
finally:
sleep(5)

# Start the monitoring loop only when run as a script, not when imported.
if __name__ == "__main__":
    start_monitor()

0 comments on commit 70da202

Please sign in to comment.