Skip to content

Commit

Permalink
[MC-1608] Allow engagement to have a corpus_item_id (#734)
Browse files Browse the repository at this point in the history
* [MC-1608] Allow engagement to have a corpus_item_id

* Fix lint error

* Retry docker-image-build

* Retry Docker build (3)
  • Loading branch information
mmiermans authored Jan 9, 2025
1 parent 1e43a39 commit 19922f6
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

logger = logging.getLogger(__name__)

EngagementKeyType = tuple[str, str | None] # Keyed on (scheduled_corpus_item_id, region)
EngagementKeyType = tuple[str | None, str | None] # Keyed on (scheduled_corpus_item_id, region)
EngagementCacheType = dict[EngagementKeyType, Engagement]


Expand Down
13 changes: 10 additions & 3 deletions merino/curated_recommendations/engagement_backends/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,21 @@
class Engagement(BaseModel):
"""Represents the engagement data from the last 24 hours for a scheduled corpus item.
Engagement is aggregated over a rolling 24-hour window by scheduled_corpus_item_id, a unique ID
for the URL, scheduled date, and surface (e.g., "New Tab en-US"). It's expected to be updated
Engagement is aggregated over a rolling 24-hour window by two different ids.
- scheduled_corpus_item_id, a unique ID for the URL, scheduled date, and surface (New Tab en-US)
- corpus_item_id, a unique ID only for the URL
Initially, each item had a scheduled_corpus_item_id, but in 2025Q1 we will start recommending
some items that only have a corpus_item_id.
It's expected to be updated
periodically with a delay of < 30 minutes from when clicks or impressions happen in the client.
Additionally, engagement data is aggregated for the regions US, CA, IE, GB, DE, AU, and CH.
If region is None this means engagement data is aggregated across all regions.
"""

scheduled_corpus_item_id: str
scheduled_corpus_item_id: str | None = None
corpus_item_id: str | None = None # Fx 135 started emitting corpus_item_id, so it may be null.
region: str | None = None # If region is None, then engagement is across all regions.
click_count: int
impression_count: int
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def blob(gcs_bucket):
[
{"scheduled_corpus_item_id": "1A", "click_count": 30, "impression_count": 300},
{"scheduled_corpus_item_id": "6A", "click_count": 40, "impression_count": 400},
# Some records have a region
{
"scheduled_corpus_item_id": "1A",
"region": "US",
Expand All @@ -90,6 +91,19 @@ def blob(gcs_bucket):
"click_count": 4,
"impression_count": 9,
},
# Some records have a corpus_item_id
{
"corpus_item_id": "C1",
"click_count": 50,
"impression_count": 100,
},
{
"scheduled_corpus_item_id": "7A",
"corpus_item_id": "C1",
"region": "US",
"click_count": 1,
"impression_count": 5,
},
],
)

Expand Down Expand Up @@ -205,12 +219,12 @@ async def test_gcs_engagement_metrics(gcs_storage_client, gcs_bucket, metrics_cl

# Verify the metrics are recorded correctly
metrics_client.gauge.assert_any_call("recommendation.engagement.size", value=blob.size)
metrics_client.gauge.assert_any_call("recommendation.engagement.global.count", value=2)
metrics_client.gauge.assert_any_call("recommendation.engagement.global.clicks", value=70)
metrics_client.gauge.assert_any_call("recommendation.engagement.global.impressions", value=700)
metrics_client.gauge.assert_any_call("recommendation.engagement.us.count", value=2)
metrics_client.gauge.assert_any_call("recommendation.engagement.us.clicks", value=7)
metrics_client.gauge.assert_any_call("recommendation.engagement.us.impressions", value=18)
metrics_client.gauge.assert_any_call("recommendation.engagement.global.count", value=3)
metrics_client.gauge.assert_any_call("recommendation.engagement.global.clicks", value=120)
metrics_client.gauge.assert_any_call("recommendation.engagement.global.impressions", value=800)
metrics_client.gauge.assert_any_call("recommendation.engagement.us.count", value=3)
metrics_client.gauge.assert_any_call("recommendation.engagement.us.clicks", value=8)
metrics_client.gauge.assert_any_call("recommendation.engagement.us.impressions", value=23)
metrics_client.timeit.assert_any_call("recommendation.engagement.update.timing")

# Check the last_updated gauge value shows that the engagement was updated just now.
Expand Down

0 comments on commit 19922f6

Please sign in to comment.