Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TARDIS-3783-Analyze and Improve Short Polling Implementation #16

Merged
merged 1 commit into from
Sep 6, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions opsgenie_sdk/api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,9 @@ def __deserialize_model(self, data, klass):
value = data[klass.attribute_map[attr]]
kwargs[attr] = self.__deserialize(value, attr_type)

if hasattr(klass, "url") and hasattr(klass, "id"):
kwargs["api_client"] = self

instance = klass(**kwargs)

if hasattr(instance, 'get_real_child_model'):
Expand Down
111 changes: 39 additions & 72 deletions opsgenie_sdk/models/success_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@
import six

import time
import json
import certifi
import urllib3
import random

import opsgenie_sdk
from opsgenie_sdk.models.success_data import SuccessData # noqa: F401,E501
Expand Down Expand Up @@ -56,7 +54,7 @@ class SuccessResponse(object):
}
attribute_map['url'] = 'url'

def __init__(self, request_id=None, took=0.0, result=None, data=None, url=None): # noqa: E501
def __init__(self, request_id=None, took=0.0, result=None, data=None, url=None, api_client=None): # noqa: E501
"""SuccessResponse - a model defined in OpenAPI""" # noqa: E501

self._request_id = None
Expand All @@ -72,13 +70,11 @@ def __init__(self, request_id=None, took=0.0, result=None, data=None, url=None):
if data is not None:
self.data = data

if url is not None:
self.url = url
else:
self.url = None
self.url = url
self.api_client = api_client

self._id = None
self.conf = opsgenie_sdk.configuration.Configuration()
self.conf = api_client.configuration
self.logger = self.conf.logger["package_logger"]
@property
def request_id(self):
Expand Down Expand Up @@ -190,96 +186,67 @@ def url(self, url):
self._url = url

def retrieve_result(self):
conf = self.conf
api_key = conf.api_key_prefix.get('Authorization') + ' ' + conf.api_key.get('Authorization')
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
headers = {
'Authorization': api_key
}

request_url = self._build_request_url_(conf.host)

attempt_count = 0
MAX_NUMBER_OF_RETRIES = conf.short_polling_max_retries
domain_api_client = self._get_domain_api_client_()
MAX_NUMBER_OF_RETRIES = self.conf.short_polling_max_retries
condition = True
response = None
attempt_count = 0

while condition:
if attempt_count > 0:
time.sleep(2 * attempt_count * 0.2)
sleep_time = random.uniform(0, (0.2 * 2 ** attempt_count))
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So polling delay is now starting at 0.1 and doubling on every attempt, with a uniformly distributed jitter of +/-100%

Combined with the short_polling_max_retries default value of 10 (and assuming the off-by-one bug I noted gets fixed) that gives an expected maximum aggregate sleep time of 51 seconds, and a worst-case of 102 seconds.

This plus any delays in calling get_[incident_]request_status would be the maximum blocking time for any client code calling retrieve_result(), directly or indirectly.

Seems reasonable...

time.sleep(sleep_time)

try:
self.logger.debug(
str(attempt_count + 1) + ' attempt to retrieve result with request_id: ' + str(self.request_id))
response = http.request(method='GET', url=request_url, headers=headers)
except ApiException as err:
print("Exception when calling success_response->retrieve_result: %s\n" % err)

response_headers = response.headers
if response_headers is not None:
rate_limit_state = response_headers.get('X-RateLimit-State')
status_code = response.status

if rate_limit_state == 'THROTTLED':
should_retry = True
self.logger.debug('Should retry because X-RateLimit-State is THROTTLED')
elif status_code == 429:
should_retry = True
self.logger.debug('Should retry because Status is 429')
elif 500 <= status_code <= 599:
should_retry = True
self.logger.debug('Should retry because Status is ' + status_code)
else:
should_retry = False
response_body = response.data
response_body_decoded = json.loads(response_body)
response_body_decoded_data = response_body_decoded.get('data')
if response_body_decoded_data is not None:
self._id = response_body_decoded_data.get('alertId')
if self._id is None:
self._id = response_body_decoded_data.get('incidentId')

attempt_count += 1

if should_retry and attempt_count < MAX_NUMBER_OF_RETRIES:
condition = True
else:
condition = False
response = domain_api_client.get_incident_request_status(request_id=self.request_id)
except AttributeError:
response = domain_api_client.get_request_status(request_id=self.request_id)

if response.data.is_success is True:
should_retry = False

if hasattr(response.data, "alert_id"):
self._id = response.data.alert_id
elif hasattr(response.data, "incident_id"):
self._id = response.data.incident_id
else:
should_retry = True

if should_retry and attempt_count < MAX_NUMBER_OF_RETRIES:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Off-by-one error: If incrementing attempt_count before this comparison it should be attempt_count <= MAX_NUMBER_OF_RETRIES.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. I will fix it right away.

condition = True
else:
condition = False

attempt_count += 1

return response

def retrieve_resulting_action(self):
conf = self.conf
api_key = conf.api_key_prefix.get('Authorization') + ' ' + conf.api_key.get('Authorization')
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
headers = {
'Authorization': api_key
}

if self._id is None:
self.retrieve_result()

request_url = self._build_request_url_(conf.host)
domain_api_client = self._get_domain_api_client_()

try:
response = http.request(method='GET', url=request_url, headers=headers)
action = domain_api_client.get_alert(identifier=self._id)
except AttributeError:
action = domain_api_client.get_incident(identifier=self._id)

return response
except ApiException as err:
print("Exception when calling success_response->retrieve_result: %s\n" % err)
return action

def _build_request_url_(self, host):
def _get_domain_api_client_(self):
if 'alerts' in self.url:
return host + '/v2/alerts/requests/' + self.request_id
return opsgenie_sdk.AlertApi(api_client=self.api_client)
elif 'incident' in self.url:
return host + '/v1/incidents/requests/' + self.request_id
return opsgenie_sdk.IncidentApi(api_client=self.api_client)
else:
raise ApiException(reason='Short polling is not currently supported for this domain')

@property
def id(self):
if self._id is None:
self.retrieve_result()

return self._id

def to_dict(self):
Expand Down
1 change: 1 addition & 0 deletions opsgenie_sdk/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ def request(self, method, url, query_params=None, headers=None,
exception = exceptions.build_exception(response=r)
raise exception

self.retries = -1
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tracking retry count here seems a bit hacky and error-prone. You've fixed one problem here but I think there are more lurking. For example, what happens when all retries are exhausted? This method raises and doesn't reset retries to -1 (and it has no way to know that retries have been exhausted in order to do the reset). So I think retry_count is going to be wrong until after the next successful request?

Seem like retry counts should be maintained at the level where the retrying is being done, and passed in somehow?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, I agree. In this particular pull request, I was more focused towards improving the short polling implementation but then I realized that the retry_count wasn't being reset in the previous implementation. So, from the top of my head, I just reset it there. I am gonna dig a little deeper into it. Thanks for the heads up :))

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Upon further investigation, I realized that this particular retry_count is only used for metrics' publishing and not for the actual retry policy of the SDK. The number of times the SDK retries an action is managed by the retry library and is independent of this variable. I have added this information to an already existent internal issue about improving the retry logic of the SDK and hopefully the implementation will be improved very soon

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's fixed in the following pull request: #17

return r

def GET(self, url, headers=None, query_params=None, _preload_content=True,
Expand Down
10 changes: 7 additions & 3 deletions samples/alert.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import getopt
import json
import sys

import opsgenie_sdk
Expand Down Expand Up @@ -150,12 +149,17 @@ def main(argv):
print('Create Alert:')
response = alert.create_alert()

alert_id = response.id

print()
print('Retrieve Result:')
result = response.retrieve_result()
print(json.loads(result.data))
print(result)

alert_id = response.id
print()
print('Retrieve Resulting Action:')
alert1 = response.retrieve_resulting_action()
print(alert1)

print()
print('Get Alert:')
Expand Down
9 changes: 6 additions & 3 deletions samples/incident.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import getopt
import json
import sys

import opsgenie_sdk
from opsgenie_sdk.metrics.observer import Observer
from opsgenie_sdk.rest import ApiException


class MetricObserver(Observer):
def notify(self, publisher):
print("{}: '{}' has now metric data = {}".format(type(self).__name__,
Expand Down Expand Up @@ -126,7 +124,12 @@ def main(argv):
print()
print('Retrieve Result:')
result = response.retrieve_result()
print(json.loads(result.data))
print(result)

print()
print('Retrieve Resulting Action:')
incident1 = response.retrieve_resulting_action()
print(incident1)

incident_id = response.id

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
# prerequisite: setuptools
# http://pypi.python.org/pypi/setuptools

REQUIRES = ["urllib3 >= 1.15", "six >= 1.10", "certifi", "python-dateutil", "numpy >= 1.16.3", "retry >= 0.9.2"]
REQUIRES = ["urllib3 >= 1.15", "six >= 1.10", "certifi", "python-dateutil", "numpy >= 1.16.3", "retry >= 0.9.2", "setuptools >= 21.0.0"]

with open("README.md", "r") as fh:
long_description = fh.read()
Expand Down
3 changes: 3 additions & 0 deletions templates/api_client.mustache
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,9 @@ class ApiClient(object):
value = data[klass.attribute_map[attr]]
kwargs[attr] = self.__deserialize(value, attr_type)

if hasattr(klass, "url") and hasattr(klass, "id"):
kwargs["api_client"] = self

instance = klass(**kwargs)

if hasattr(instance, 'get_real_child_model'):
Expand Down
Loading