[cloud] Retry all Alibaba Cloud API calls

Experimentation suggests Alibaba Cloud API calls are extremely
unreliable, with a failure rate around 1%.  It is therefore necessary
to allow for retrying basically every API call.

Some API calls (e.g. DescribeImages or ModifyImageAttribute) are
naturally idempotent and so safe to retry.  Some non-idempotent API
calls (e.g. CopyImage) support explicit idempotence tokens.  The
remaining API calls may simply fail on a retry, if the original
request happened to succeed but failed to return a response.

We could write convoluted retry logic around the non-idempotent calls,
but this would substantially increase the complexity of the already
unnecessarily complex code.  For now, we assume that retrying
non-idempotent requests is probably more likely to fix transient
failures than to cause additional problems.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
This commit is contained in:
Michael Brown 2026-04-21 16:31:52 +01:00
parent d146b28b50
commit fa856e784c

View File

@@ -86,6 +86,29 @@ IPXE_STORAGE_PREFIX = 'ipxe-upload-temp-'
POLL_INTERVAL_SEC = 5
POLL_MAX_RETRIES = 100
# Experimentation suggests Alibaba Cloud API calls are extremely
# unreliable, with a failure rate around 1%. It is therefore
# necessary to allow for retrying basically every API call.
#
# Some API calls (e.g. DescribeImages or ModifyImageAttribute) are
# naturally idempotent and so safe to retry. Some non-idempotent API
# calls (e.g. CopyImage) support explicit idempotence tokens. The
# remaining API calls may simply fail on a retry, if the original
# request happened to succeed but failed to return a response.
#
# We could write convoluted retry logic around the non-idempotent
# calls, but this would substantially increase the complexity of the
# already unnecessarily complex code. For now, we assume that
# retrying non-idempotent requests is probably more likely to fix
# transient failures than to cause additional problems.
#
# Shared runtime options passed to the *_with_options() API calls in
# this file, so that each request uses the same retry and timeout
# settings.
RUNTIME_OPTS = util.models.RuntimeOptions(
# NOTE(review): presumably enables the SDK's built-in retry, with
# max_attempts counting total tries rather than extra retries - confirm
autoretry=True,
max_attempts=5,
# NOTE(review): timeouts are presumably in milliseconds (10s connect,
# 120s read) - confirm against the RuntimeOptions documentation
connect_timeout=10000,
read_timeout=120000,
)
# For regions in mainland China, the Chinese state censorship laws
# prohibit direct access to OSS bucket contents.
#
@@ -227,13 +250,13 @@ def delete_temp_function(clients, func):
"""Remove temporary function"""
logger.info("delete function %s %s" % (clients.region, func))
assert func.startswith(IPXE_STORAGE_PREFIX)
clients.fc.delete_function(func)
clients.fc.delete_function_with_options(func, {}, RUNTIME_OPTS)
def create_temp_function(clients, role):
"""Create temporary function (and remove any stale temporary functions)"""
req = fc.models.ListFunctionsRequest(prefix=IPXE_STORAGE_PREFIX)
try:
rsp = clients.fc.list_functions(req)
rsp = clients.fc.list_functions_with_options(req, {}, RUNTIME_OPTS)
except openapi.client.UnretryableException:
# AliCloud provides no other way to detect non-working regions
return None
@@ -259,7 +282,7 @@ def create_temp_function(clients, role):
timeout=FC_TIMEOUT_SEC,
)
req = fc.models.CreateFunctionRequest(body=body)
rsp = clients.fc.create_function(req)
rsp = clients.fc.create_function_with_options(req, {}, RUNTIME_OPTS)
logger.info("create function %s %s" % (clients.region, func))
return func
@@ -271,13 +294,7 @@ def call_temp_function(clients, func, payload):
)
body = json.dumps(payload)
req = fc.models.InvokeFunctionRequest(body=body)
run = util.models.RuntimeOptions(
autoretry=True,
max_attempts=5,
connect_timeout=10000,
read_timeout=120000,
)
rsp = clients.fc.invoke_function_with_options(func, req, hdr, run)
rsp = clients.fc.invoke_function_with_options(func, req, hdr, RUNTIME_OPTS)
log = base64.b64decode(rsp.headers.get('x-fc-log-result', b'')).decode()
if rsp.status_code != http.HTTPStatus.OK:
raise RuntimeError(rsp)
@@ -368,7 +385,7 @@ def delete_image(clients, name):
image_name=name,
image_owner_alias='self',
)
rsp = clients.ecs.describe_images(req)
rsp = clients.ecs.describe_images_with_options(req, RUNTIME_OPTS)
for image in rsp.body.images.image or ():
logger.info("delete image %s %s (%s)" %
(clients.region, image.image_name, image.image_id))
@@ -378,12 +395,14 @@ def delete_image(clients, name):
image_id=image.image_id,
is_public=False,
)
rsp = clients.ecs.modify_image_share_permission(req)
rsp = clients.ecs.modify_image_share_permission_with_options(
req, RUNTIME_OPTS
)
req = ecs.models.DeleteImageRequest(
region_id=clients.region,
image_id=image.image_id
)
rsp = clients.ecs.delete_image(req)
rsp = clients.ecs.delete_image_with_options(req, RUNTIME_OPTS)
def wait_for_task(clients, task_id):
"""Wait for task to complete"""
@@ -394,7 +413,10 @@ def wait_for_task(clients, task_id):
region_id=clients.region,
task_ids=task_id,
)
rsp = clients.ecs.describe_tasks(req)
try:
rsp = clients.ecs.describe_tasks_with_options(req, RUNTIME_OPTS)
except openapi.client.UnretryableException:
continue
assert len(rsp.body.task_set.task) == 1
assert rsp.body.task_set.task[0].task_id == task_id
status = rsp.body.task_set.task[0].task_status
@@ -412,7 +434,10 @@ def wait_for_image(clients, image_id):
region_id=clients.region,
image_id=image_id,
)
rsp = clients.ecs.describe_images(req)
try:
rsp = clients.ecs.describe_images_with_options(req, RUNTIME_OPTS)
except openapi.client.UnretryableException:
continue
if len(rsp.body.images.image):
assert len(rsp.body.images.image) == 1
assert rsp.body.images.image[0].image_id == image_id
@@ -437,8 +462,9 @@ def import_image(clients, image, bucket):
architecture=image.arch,
boot_mode=image.mode,
disk_device_mapping=[disk],
client_token=str(uuid4()),
)
rsp = clients.ecs.import_image(req)
rsp = clients.ecs.import_image_with_options(req, RUNTIME_OPTS)
image_id = rsp.body.image_id
task_id = rsp.body.task_id
wait_for_task(clients, task_id)
@@ -456,8 +482,9 @@ def copy_image(clients, image, image_id, censored):
image_id=image_id,
destination_region_id=censored.region,
destination_image_name=image.name,
client_token=str(uuid4()),
)
rsp = clients.ecs.copy_image(req)
rsp = clients.ecs.copy_image_with_options(req, RUNTIME_OPTS)
copy_id = rsp.body.image_id
wait_for_image(censored, copy_id)
logger.info("image %s %s (%s)" % (censored.region, image.name, copy_id))
@@ -471,14 +498,16 @@ def finalise_image(clients, image, image_id):
image_id=image_id,
image_family=image.family,
)
rsp = clients.ecs.modify_image_attribute(req)
rsp = clients.ecs.modify_image_attribute_with_options(req, RUNTIME_OPTS)
if image.public:
req = ecs.models.ModifyImageSharePermissionRequest(
region_id=clients.region,
image_id=image_id,
is_public=True,
)
rsp = clients.ecs.modify_image_share_permission(req)
rsp = clients.ecs.modify_image_share_permission_with_options(
req, RUNTIME_OPTS
)
# Parse command-line arguments
parser = argparse.ArgumentParser(description="Import Alibaba Cloud image")