From 19922f6bd353325bbe8a337b9d64c87644dd4434 Mon Sep 17 00:00:00 2001 From: Mathijs Miermans Date: Thu, 9 Jan 2025 15:02:53 +0100 Subject: [PATCH] [MC-1608] Allow engagement to have a corpus_item_id (#734) * [MC-1608] Allow engagement to have a corpus_item_id * Fix lint error * Retry docker-image-build * Retry Docker build (3) --- .../engagement_backends/gcs_engagement.py | 2 +- .../engagement_backends/protocol.py | 13 +++++++--- .../test_gcs_engagement.py | 26 ++++++++++++++----- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/merino/curated_recommendations/engagement_backends/gcs_engagement.py b/merino/curated_recommendations/engagement_backends/gcs_engagement.py index fec3d1472..ec18bab9c 100644 --- a/merino/curated_recommendations/engagement_backends/gcs_engagement.py +++ b/merino/curated_recommendations/engagement_backends/gcs_engagement.py @@ -14,7 +14,7 @@ logger = logging.getLogger(__name__) -EngagementKeyType = tuple[str, str | None] # Keyed on (scheduled_corpus_item_id, region) +EngagementKeyType = tuple[str | None, str | None] # Keyed on (scheduled_corpus_item_id, region) EngagementCacheType = dict[EngagementKeyType, Engagement] diff --git a/merino/curated_recommendations/engagement_backends/protocol.py b/merino/curated_recommendations/engagement_backends/protocol.py index a740a72fb..bf77c941c 100644 --- a/merino/curated_recommendations/engagement_backends/protocol.py +++ b/merino/curated_recommendations/engagement_backends/protocol.py @@ -7,14 +7,21 @@ class Engagement(BaseModel): """Represents the engagement data from the last 24 hours for a scheduled corpus item. - Engagement is aggregated over a rolling 24-hour window by scheduled_corpus_item_id, a unique ID - for the URL, scheduled date, and surface (e.g., "New Tab en-US"). It's expected to be updated + Engagement is aggregated over a rolling 24-hour window by two different ids. + - scheduled_corpus_item_id, a unique ID for the URL, scheduled date, and surface (New Tab en-US) + - corpus_item_id, a unique ID only for the URL + + Initially, each item had a scheduled_corpus_item_id, but in 2025Q1 we will start recommending + some items that only have a corpus_item_id. + + It's expected to be updated periodically with a delay of < 30 minutes from when clicks or impressions happen in the client. Additionally, engagement data is aggregated for the regions US, CA, IE, GB, DE, AU, and CH. If region is None this means engagement data is aggregated across all regions. """ - scheduled_corpus_item_id: str + scheduled_corpus_item_id: str | None = None + corpus_item_id: str | None = None # Fx 135 started emitting corpus_item_id, so it may be null. region: str | None = None # If region is None, then engagement is across all regions. click_count: int impression_count: int diff --git a/tests/integration/api/v1/curated_recommendations/engagement_backends/test_gcs_engagement.py b/tests/integration/api/v1/curated_recommendations/engagement_backends/test_gcs_engagement.py index 9f47b2454..b4acca402 100644 --- a/tests/integration/api/v1/curated_recommendations/engagement_backends/test_gcs_engagement.py +++ b/tests/integration/api/v1/curated_recommendations/engagement_backends/test_gcs_engagement.py @@ -78,6 +78,7 @@ def blob(gcs_bucket): [ {"scheduled_corpus_item_id": "1A", "click_count": 30, "impression_count": 300}, {"scheduled_corpus_item_id": "6A", "click_count": 40, "impression_count": 400}, + # Some records have a region { "scheduled_corpus_item_id": "1A", "region": "US", @@ -90,6 +91,19 @@ def blob(gcs_bucket): "click_count": 4, "impression_count": 9, }, + # Some records have a corpus_item_id + { + "corpus_item_id": "C1", + "click_count": 50, + "impression_count": 100, + }, + { + "scheduled_corpus_item_id": "7A", + "corpus_item_id": "C1", + "region": "US", + "click_count": 1, + "impression_count": 5, + }, ], ) @@ -205,12 +219,12 @@ async def test_gcs_engagement_metrics(gcs_storage_client, gcs_bucket, metrics_cl # Verify the metrics are recorded correctly metrics_client.gauge.assert_any_call("recommendation.engagement.size", value=blob.size) - metrics_client.gauge.assert_any_call("recommendation.engagement.global.count", value=2) - metrics_client.gauge.assert_any_call("recommendation.engagement.global.clicks", value=70) - metrics_client.gauge.assert_any_call("recommendation.engagement.global.impressions", value=700) - metrics_client.gauge.assert_any_call("recommendation.engagement.us.count", value=2) - metrics_client.gauge.assert_any_call("recommendation.engagement.us.clicks", value=7) - metrics_client.gauge.assert_any_call("recommendation.engagement.us.impressions", value=18) + metrics_client.gauge.assert_any_call("recommendation.engagement.global.count", value=3) + metrics_client.gauge.assert_any_call("recommendation.engagement.global.clicks", value=120) + metrics_client.gauge.assert_any_call("recommendation.engagement.global.impressions", value=800) + metrics_client.gauge.assert_any_call("recommendation.engagement.us.count", value=3) + metrics_client.gauge.assert_any_call("recommendation.engagement.us.clicks", value=8) + metrics_client.gauge.assert_any_call("recommendation.engagement.us.impressions", value=23) metrics_client.timeit.assert_any_call("recommendation.engagement.update.timing") # Check the last_updated gauge value shows that the engagement was updated just now.