From 82175b92ffbbc7f01549ec5d04c4ffede6793020 Mon Sep 17 00:00:00 2001 From: Taddes Date: Thu, 14 May 2026 19:52:56 -0400 Subject: [PATCH] failure uids as part of mock service for tests --- tools/tokenserver/test_purge_old_records.py | 95 +++++++++++++++++++-- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/tools/tokenserver/test_purge_old_records.py b/tools/tokenserver/test_purge_old_records.py index a21ef036..ba7e1230 100644 --- a/tools/tokenserver/test_purge_old_records.py +++ b/tools/tokenserver/test_purge_old_records.py @@ -15,11 +15,20 @@ from database import Database from purge_old_records import purge_old_records -def _make_service_app(service_requests): - """Return a WSGI app that records each request into the given list.""" +def _make_service_app(service_requests, failure_uids): + """Return a WSGI app that records each request into the given list. + + DELETE requests against `/1.5/{uid}` where `uid` is in `failure_uids` + respond with HTTP 500, simulating a broken storage node response. This + drives the per-row failure path in purge_old_records (STOR-70). + """ def _service_app(environ, start_response): service_requests.append(environ) + match = re.match(r"^/1\.5/(\d+)", environ.get("PATH_INFO", "")) + if match and int(match.group(1)) in failure_uids: + start_response("500 Internal Server Error", []) + return "" start_response("200 OK", []) return "" @@ -32,17 +41,24 @@ def mock_service_server(): Module scope is justified: the server is expensive to start (OS port allocation + thread) and is stateless between tests — the requests list - is cleared in each per-test fixture's teardown. + and failure_uids set are cleared in each per-test fixture's teardown. """ service_requests = [] - server = make_server("localhost", 0, _make_service_app(service_requests)) + failure_uids = set() + server = make_server( + "localhost", 0, _make_service_app(service_requests, failure_uids) + ) server.RequestHandlerClass.log_request = lambda *a: None host, port = server.server_address service_node = f"http://{host}:{port}" thread = threading.Thread(target=server.serve_forever) thread.daemon = True thread.start() - yield {"node": service_node, "requests": service_requests} + yield { + "node": service_node, + "requests": service_requests, + "failure_uids": failure_uids, + } server.shutdown() thread.join() @@ -56,7 +72,10 @@ def mock_spanner_server(mock_service_server): test class behaviour where both servers appended to the same list. """ service_requests = mock_service_server["requests"] - server = make_server("localhost", 0, _make_service_app(service_requests)) + failure_uids = set() + server = make_server( + "localhost", 0, _make_service_app(service_requests, failure_uids) + ) server.RequestHandlerClass.log_request = lambda *a: None host, port = server.server_address spanner_node = f"http://{host}:{port}" @@ -64,7 +83,11 @@ def mock_spanner_server(mock_service_server): thread = threading.Thread(target=server.serve_forever) thread.daemon = True thread.start() - yield {"node": spanner_node, "downed_node": downed_node} + yield { + "node": spanner_node, + "downed_node": downed_node, + "failure_uids": failure_uids, + } server.shutdown() thread.join() @@ -83,6 +106,7 @@ def purge_db(mock_service_server): database._execute_sql("DELETE FROM nodes").close() database._execute_sql("DELETE FROM services").close() del mock_service_server["requests"][:] + mock_service_server["failure_uids"].clear() database.close() @@ -102,6 +126,8 @@ def migration_db(mock_service_server, mock_spanner_server): database._execute_sql("DELETE FROM nodes").close() database._execute_sql("DELETE FROM services").close() del mock_service_server["requests"][:] + mock_service_server["failure_uids"].clear() + mock_spanner_server["failure_uids"].clear() database.close() @@ -332,6 +358,61 @@ def test_purging_override_with_migrated_password_change( assert len(service_requests) == 2 +def test_failed_service_delete_does_not_abort_batch(purge_db, mock_service_server): + """A row whose service delete fails should not abort the rest of the batch. + + Regression for STOR-70: a 5xx response from the storage node previously + propagated to the outer try/except, returning False and leaving the rest + of the batch unprocessed. The failing row's replaced_at is now bumped to + defer the retry, and the batch continues with the remaining rows. + """ + database = purge_db + failure_uids = mock_service_server["failure_uids"] + node_secret = "SECRET" + + # Two old user records share one batch. Assertions below are independent + # of processing order — the failing row must be bumped and the healthy + # row purged regardless of which is seen first. + healthy_email = "healthy@mozilla.com" + healthy = database.allocate_user(healthy_email, client_state="aa", generation=1) + database.update_user(healthy, client_state="bb", generation=2) + broken_email = "broken@mozilla.com" + broken = database.allocate_user(broken_email, client_state="aa", generation=1) + database.update_user(broken, client_state="bb", generation=2) + + broken_old = next( + r for r in database.get_user_records(broken_email) if r.replaced_at + ) + failure_uids.add(broken_old.uid) + original_replaced_at = broken_old.replaced_at + + class _RecordingMetrics: + def __init__(self): + self.calls = [] + + def incr(self, label, tags=None): + self.calls.append((label, tags)) + + metrics = _RecordingMetrics() + assert purge_old_records(node_secret, grace_period=0, metrics=metrics) is True + + # The healthy user's old record was purged in the same batch. + healthy_records = list(database.get_user_records(healthy_email)) + assert len(healthy_records) == 1 + assert healthy_records[0].replaced_at is None + + # The broken user's old record remains and its replaced_at moved forward. + broken_records = list(database.get_user_records(broken_email)) + assert len(broken_records) == 2 + bumped = next(r for r in broken_records if r.uid == broken_old.uid) + assert bumped.replaced_at > original_replaced_at + + # The failure was reported with a stable error-class tag. + failure_calls = [c for c in metrics.calls if c[0] == "row_failure"] + assert len(failure_calls) == 1 + assert failure_calls[0][1] == {"reason": "http_5xx"} + + @pytest.mark.migration_records def test_purging_override_null_keys_changed_at( migration_db, mock_service_server, mock_spanner_server