failure uids as part of mock service for tests

2026-05-17 02:26:54 +02:00 · 2026-05-14 19:52:56 -04:00 · 2026-05-14 19:52:56 -04:00 · 82175b92ff
commit 82175b92ff
parent 5d5cdf587c
1 changed files with 88 additions and 7 deletions
--- a/tools/tokenserver/test_purge_old_records.py
+++ b/tools/tokenserver/test_purge_old_records.py
@ -15,11 +15,20 @@ from database import Database
 from purge_old_records import purge_old_records


-def _make_service_app(service_requests):
-    """Return a WSGI app that records each request into the given list."""
+def _make_service_app(service_requests, failure_uids):
+    """Return a WSGI app that records each request into the given list.
+
+    DELETE requests against `/1.5/{uid}` where `uid` is in `failure_uids`
+    respond with HTTP 500, simulating a broken storage node response. This
+    drives the per-row failure path in purge_old_records (STOR-70).
+    """

    def _service_app(environ, start_response):
        service_requests.append(environ)
+        match = re.match(r"^/1\.5/(\d+)", environ.get("PATH_INFO", ""))
+        if match and int(match.group(1)) in failure_uids:
+            start_response("500 Internal Server Error", [])
+            return ""
        start_response("200 OK", [])
        return ""

@ -32,17 +41,24 @@ def mock_service_server():

    Module scope is justified: the server is expensive to start (OS port
    allocation + thread) and is stateless between tests — the requests list
-    is cleared in each per-test fixture's teardown.
+    and failure_uids set are cleared in each per-test fixture's teardown.
    """
    service_requests = []
-    server = make_server("localhost", 0, _make_service_app(service_requests))
+    failure_uids = set()
+    server = make_server(
+        "localhost", 0, _make_service_app(service_requests, failure_uids)
+    )
    server.RequestHandlerClass.log_request = lambda *a: None
    host, port = server.server_address
    service_node = f"http://{host}:{port}"
    thread = threading.Thread(target=server.serve_forever)
    thread.daemon = True
    thread.start()
-    yield {"node": service_node, "requests": service_requests}
+    yield {
+        "node": service_node,
+        "requests": service_requests,
+        "failure_uids": failure_uids,
+    }
    server.shutdown()
    thread.join()

@ -56,7 +72,10 @@ def mock_spanner_server(mock_service_server):
    test class behaviour where both servers appended to the same list.
    """
    service_requests = mock_service_server["requests"]
-    server = make_server("localhost", 0, _make_service_app(service_requests))
+    failure_uids = set()
+    server = make_server(
+        "localhost", 0, _make_service_app(service_requests, failure_uids)
+    )
    server.RequestHandlerClass.log_request = lambda *a: None
    host, port = server.server_address
    spanner_node = f"http://{host}:{port}"
@ -64,7 +83,11 @@ def mock_spanner_server(mock_service_server):
    thread = threading.Thread(target=server.serve_forever)
    thread.daemon = True
    thread.start()
-    yield {"node": spanner_node, "downed_node": downed_node}
+    yield {
+        "node": spanner_node,
+        "downed_node": downed_node,
+        "failure_uids": failure_uids,
+    }
    server.shutdown()
    thread.join()

@ -83,6 +106,7 @@ def purge_db(mock_service_server):
    database._execute_sql("DELETE FROM nodes").close()
    database._execute_sql("DELETE FROM services").close()
    del mock_service_server["requests"][:]
+    mock_service_server["failure_uids"].clear()
    database.close()


@ -102,6 +126,8 @@ def migration_db(mock_service_server, mock_spanner_server):
    database._execute_sql("DELETE FROM nodes").close()
    database._execute_sql("DELETE FROM services").close()
    del mock_service_server["requests"][:]
+    mock_service_server["failure_uids"].clear()
+    mock_spanner_server["failure_uids"].clear()
    database.close()


@ -332,6 +358,61 @@ def test_purging_override_with_migrated_password_change(
    assert len(service_requests) == 2


+def test_failed_service_delete_does_not_abort_batch(purge_db, mock_service_server):
+    """A row whose service delete fails should not abort the rest of the batch.
+
+    Regression for STOR-70: a 5xx response from the storage node previously
+    propagated to the outer try/except, returning False and leaving the rest
+    of the batch unprocessed. The failing row's replaced_at is now bumped to
+    defer the retry, and the batch continues with the remaining rows.
+    """
+    database = purge_db
+    failure_uids = mock_service_server["failure_uids"]
+    node_secret = "SECRET"
+
+    # Two old user records share one batch. Assertions below are independent
+    # of processing order — the failing row must be bumped and the healthy
+    # row purged regardless of which is seen first.
+    healthy_email = "healthy@mozilla.com"
+    healthy = database.allocate_user(healthy_email, client_state="aa", generation=1)
+    database.update_user(healthy, client_state="bb", generation=2)
+    broken_email = "broken@mozilla.com"
+    broken = database.allocate_user(broken_email, client_state="aa", generation=1)
+    database.update_user(broken, client_state="bb", generation=2)
+
+    broken_old = next(
+        r for r in database.get_user_records(broken_email) if r.replaced_at
+    )
+    failure_uids.add(broken_old.uid)
+    original_replaced_at = broken_old.replaced_at
+
+    class _RecordingMetrics:
+        def __init__(self):
+            self.calls = []
+
+        def incr(self, label, tags=None):
+            self.calls.append((label, tags))
+
+    metrics = _RecordingMetrics()
+    assert purge_old_records(node_secret, grace_period=0, metrics=metrics) is True
+
+    # The healthy user's old record was purged in the same batch.
+    healthy_records = list(database.get_user_records(healthy_email))
+    assert len(healthy_records) == 1
+    assert healthy_records[0].replaced_at is None
+
+    # The broken user's old record remains and its replaced_at moved forward.
+    broken_records = list(database.get_user_records(broken_email))
+    assert len(broken_records) == 2
+    bumped = next(r for r in broken_records if r.uid == broken_old.uid)
+    assert bumped.replaced_at > original_replaced_at
+
+    # The failure was reported with a stable error-class tag.
+    failure_calls = [c for c in metrics.calls if c[0] == "row_failure"]
+    assert len(failure_calls) == 1
+    assert failure_calls[0][1] == {"reason": "http_5xx"}
+
+
@pytest.mark.migration_records
 def test_purging_override_null_keys_changed_at(
    migration_db, mock_service_server, mock_spanner_server