mirror of
https://github.com/ipxe/ipxe.git
synced 2026-04-22 06:02:04 +02:00
[cloud] Retry all Alibaba Cloud API calls
Experimentation suggests Alibaba Cloud API calls are extremely unreliable, with a failure rate around 1%. It is therefore necessary to allow for retrying basically every API call.

Some API calls (e.g. DescribeImages or ModifyImageAttribute) are naturally idempotent and so safe to retry. Some non-idempotent API calls (e.g. CopyImage) support explicit idempotence tokens. The remaining API calls may simply fail on a retry, if the original request happened to succeed but failed to return a response.

We could write convoluted retry logic around the non-idempotent calls, but this would substantially increase the complexity of the already unnecessarily complex code. For now, we assume that retrying non-idempotent requests is probably more likely to fix transient failures than to cause additional problems.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
This commit is contained in:
parent
d146b28b50
commit
fa856e784c
@@ -86,6 +86,29 @@ IPXE_STORAGE_PREFIX = 'ipxe-upload-temp-'
|
||||
POLL_INTERVAL_SEC = 5
|
||||
POLL_MAX_RETRIES = 100
|
||||
|
||||
# Experimentation suggests Alibaba Cloud API calls are extremely
|
||||
# unreliable, with a failure rate around 1%. It is therefore
|
||||
# necessary to allow for retrying basically every API call.
|
||||
#
|
||||
# Some API calls (e.g. DescribeImages or ModifyImageAttribute) are
|
||||
# naturally idempotent and so safe to retry. Some non-idempotent API
|
||||
# calls (e.g. CopyImage) support explicit idempotence tokens. The
|
||||
# remaining API calls may simply fail on a retry, if the original
|
||||
# request happened to succeed but failed to return a response.
|
||||
#
|
||||
# We could write convoluted retry logic around the non-idempotent
|
||||
# calls, but this would substantially increase the complexity of the
|
||||
# already unnecessarily complex code. For now, we assume that
|
||||
# retrying non-idempotent requests is probably more likely to fix
|
||||
# transient failures than to cause additional problems.
|
||||
#
|
||||
RUNTIME_OPTS = util.models.RuntimeOptions(
|
||||
autoretry=True,
|
||||
max_attempts=5,
|
||||
connect_timeout=10000,
|
||||
read_timeout=120000,
|
||||
)
|
||||
|
||||
# For regions in mainland China, the Chinese state censorship laws
|
||||
# prohibit direct access to OSS bucket contents.
|
||||
#
|
||||
@@ -227,13 +250,13 @@ def delete_temp_function(clients, func):
|
||||
"""Remove temporary function"""
|
||||
logger.info("delete function %s %s" % (clients.region, func))
|
||||
assert func.startswith(IPXE_STORAGE_PREFIX)
|
||||
clients.fc.delete_function(func)
|
||||
clients.fc.delete_function_with_options(func, {}, RUNTIME_OPTS)
|
||||
|
||||
def create_temp_function(clients, role):
|
||||
"""Create temporary function (and remove any stale temporary functions)"""
|
||||
req = fc.models.ListFunctionsRequest(prefix=IPXE_STORAGE_PREFIX)
|
||||
try:
|
||||
rsp = clients.fc.list_functions(req)
|
||||
rsp = clients.fc.list_functions_with_options(req, {}, RUNTIME_OPTS)
|
||||
except openapi.client.UnretryableException:
|
||||
# AliCloud provides no other way to detect non-working regions
|
||||
return None
|
||||
@@ -259,7 +282,7 @@ def create_temp_function(clients, role):
|
||||
timeout=FC_TIMEOUT_SEC,
|
||||
)
|
||||
req = fc.models.CreateFunctionRequest(body=body)
|
||||
rsp = clients.fc.create_function(req)
|
||||
rsp = clients.fc.create_function_with_options(req, {}, RUNTIME_OPTS)
|
||||
logger.info("create function %s %s" % (clients.region, func))
|
||||
return func
|
||||
|
||||
@@ -271,13 +294,7 @@ def call_temp_function(clients, func, payload):
|
||||
)
|
||||
body = json.dumps(payload)
|
||||
req = fc.models.InvokeFunctionRequest(body=body)
|
||||
run = util.models.RuntimeOptions(
|
||||
autoretry=True,
|
||||
max_attempts=5,
|
||||
connect_timeout=10000,
|
||||
read_timeout=120000,
|
||||
)
|
||||
rsp = clients.fc.invoke_function_with_options(func, req, hdr, run)
|
||||
rsp = clients.fc.invoke_function_with_options(func, req, hdr, RUNTIME_OPTS)
|
||||
log = base64.b64decode(rsp.headers.get('x-fc-log-result', b'')).decode()
|
||||
if rsp.status_code != http.HTTPStatus.OK:
|
||||
raise RuntimeError(rsp)
|
||||
@@ -368,7 +385,7 @@ def delete_image(clients, name):
|
||||
image_name=name,
|
||||
image_owner_alias='self',
|
||||
)
|
||||
rsp = clients.ecs.describe_images(req)
|
||||
rsp = clients.ecs.describe_images_with_options(req, RUNTIME_OPTS)
|
||||
for image in rsp.body.images.image or ():
|
||||
logger.info("delete image %s %s (%s)" %
|
||||
(clients.region, image.image_name, image.image_id))
|
||||
@@ -378,12 +395,14 @@ def delete_image(clients, name):
|
||||
image_id=image.image_id,
|
||||
is_public=False,
|
||||
)
|
||||
rsp = clients.ecs.modify_image_share_permission(req)
|
||||
rsp = clients.ecs.modify_image_share_permission_with_options(
|
||||
req, RUNTIME_OPTS
|
||||
)
|
||||
req = ecs.models.DeleteImageRequest(
|
||||
region_id=clients.region,
|
||||
image_id=image.image_id
|
||||
)
|
||||
rsp = clients.ecs.delete_image(req)
|
||||
rsp = clients.ecs.delete_image_with_options(req, RUNTIME_OPTS)
|
||||
|
||||
def wait_for_task(clients, task_id):
|
||||
"""Wait for task to complete"""
|
||||
@@ -394,7 +413,10 @@ def wait_for_task(clients, task_id):
|
||||
region_id=clients.region,
|
||||
task_ids=task_id,
|
||||
)
|
||||
rsp = clients.ecs.describe_tasks(req)
|
||||
try:
|
||||
rsp = clients.ecs.describe_tasks_with_options(req, RUNTIME_OPTS)
|
||||
except openapi.client.UnretryableException:
|
||||
continue
|
||||
assert len(rsp.body.task_set.task) == 1
|
||||
assert rsp.body.task_set.task[0].task_id == task_id
|
||||
status = rsp.body.task_set.task[0].task_status
|
||||
@@ -412,7 +434,10 @@ def wait_for_image(clients, image_id):
|
||||
region_id=clients.region,
|
||||
image_id=image_id,
|
||||
)
|
||||
rsp = clients.ecs.describe_images(req)
|
||||
try:
|
||||
rsp = clients.ecs.describe_images_with_options(req, RUNTIME_OPTS)
|
||||
except openapi.client.UnretryableException:
|
||||
continue
|
||||
if len(rsp.body.images.image):
|
||||
assert len(rsp.body.images.image) == 1
|
||||
assert rsp.body.images.image[0].image_id == image_id
|
||||
@@ -437,8 +462,9 @@ def import_image(clients, image, bucket):
|
||||
architecture=image.arch,
|
||||
boot_mode=image.mode,
|
||||
disk_device_mapping=[disk],
|
||||
client_token=str(uuid4()),
|
||||
)
|
||||
rsp = clients.ecs.import_image(req)
|
||||
rsp = clients.ecs.import_image_with_options(req, RUNTIME_OPTS)
|
||||
image_id = rsp.body.image_id
|
||||
task_id = rsp.body.task_id
|
||||
wait_for_task(clients, task_id)
|
||||
@@ -456,8 +482,9 @@ def copy_image(clients, image, image_id, censored):
|
||||
image_id=image_id,
|
||||
destination_region_id=censored.region,
|
||||
destination_image_name=image.name,
|
||||
client_token=str(uuid4()),
|
||||
)
|
||||
rsp = clients.ecs.copy_image(req)
|
||||
rsp = clients.ecs.copy_image_with_options(req, RUNTIME_OPTS)
|
||||
copy_id = rsp.body.image_id
|
||||
wait_for_image(censored, copy_id)
|
||||
logger.info("image %s %s (%s)" % (censored.region, image.name, copy_id))
|
||||
@@ -471,14 +498,16 @@ def finalise_image(clients, image, image_id):
|
||||
image_id=image_id,
|
||||
image_family=image.family,
|
||||
)
|
||||
rsp = clients.ecs.modify_image_attribute(req)
|
||||
rsp = clients.ecs.modify_image_attribute_with_options(req, RUNTIME_OPTS)
|
||||
if image.public:
|
||||
req = ecs.models.ModifyImageSharePermissionRequest(
|
||||
region_id=clients.region,
|
||||
image_id=image_id,
|
||||
is_public=True,
|
||||
)
|
||||
rsp = clients.ecs.modify_image_share_permission(req)
|
||||
rsp = clients.ecs.modify_image_share_permission_with_options(
|
||||
req, RUNTIME_OPTS
|
||||
)
|
||||
|
||||
# Parse command-line arguments
|
||||
parser = argparse.ArgumentParser(description="Import Alibaba Cloud image")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user