From c0340f2918c7906e31370407a57a87f207e1c730 Mon Sep 17 00:00:00 2001
From: Mathieu Leplatre <mathieu@mozilla.com>
Date: Fri, 19 Apr 2024 10:55:41 +0200
Subject: [PATCH] Add attachment integrity check (#1418)

* Add attachment integrity check

* Update baselines
---
 .bandit.baseline                              |  49 ++------
 .secrets.baseline                             |  22 +++-
 .../remotesettings/attachments_integrity.py   |  68 +++++++++++
 .../test_attachments_integrity.py             | 115 ++++++++++++++++++
 4 files changed, 213 insertions(+), 41 deletions(-)
 create mode 100644 checks/remotesettings/attachments_integrity.py
 create mode 100644 tests/checks/remotesettings/test_attachments_integrity.py

diff --git a/.bandit.baseline b/.bandit.baseline
index ec97d74a..6fea09fe 100644
--- a/.bandit.baseline
+++ b/.bandit.baseline
@@ -1,14 +1,14 @@
 {
   "errors": [],
-  "generated_at": "2023-05-25T13:28:03Z",
+  "generated_at": "2024-04-18T11:21:08Z",
   "metrics": {
     "_totals": {
-      "CONFIDENCE.HIGH": 3,
+      "CONFIDENCE.HIGH": 2,
       "CONFIDENCE.LOW": 0,
       "CONFIDENCE.MEDIUM": 0,
       "CONFIDENCE.UNDEFINED": 0,
       "SEVERITY.HIGH": 0,
-      "SEVERITY.LOW": 3,
+      "SEVERITY.LOW": 2,
       "SEVERITY.MEDIUM": 0,
       "SEVERITY.UNDEFINED": 0,
       "loc": 883,
@@ -42,15 +42,15 @@
       "skipped_tests": 0
     },
     "telescope/app.py": {
-      "CONFIDENCE.HIGH": 3,
+      "CONFIDENCE.HIGH": 2,
       "CONFIDENCE.LOW": 0,
       "CONFIDENCE.MEDIUM": 0,
       "CONFIDENCE.UNDEFINED": 0,
       "SEVERITY.HIGH": 0,
-      "SEVERITY.LOW": 3,
+      "SEVERITY.LOW": 2,
       "SEVERITY.MEDIUM": 0,
       "SEVERITY.UNDEFINED": 0,
-      "loc": 380,
+      "loc": 382,
       "nosec": 0,
       "skipped_tests": 0
     },
@@ -63,7 +63,7 @@
       "SEVERITY.LOW": 0,
       "SEVERITY.MEDIUM": 0,
       "SEVERITY.UNDEFINED": 0,
-      "loc": 89,
+      "loc": 87,
       "nosec": 1,
       "skipped_tests": 0
     },
@@ -124,35 +124,12 @@
       "line_range": [
         6
       ],
-      "more_info": "https://bandit.readthedocs.io/en/1.7.5/blacklists/blacklist_imports.html#b404-import-subprocess",
+      "more_info": "https://bandit.readthedocs.io/en/1.7.8/blacklists/blacklist_imports.html#b404-import-subprocess",
       "test_id": "B404",
       "test_name": "blacklist"
     },
     {
-      "code": "218     # Check that `curl` has HTTP2 and HTTP3 for `checks.core.http_versions`\n219     curl_cmd = subprocess.run(\n220         [\"curl\", \"--version\"],\n221         capture_output=True,\n222     )\n223     output = curl_cmd.stdout.strip().decode()\n",
-      "col_offset": 15,
-      "end_col_offset": 5,
-      "filename": "telescope/app.py",
-      "issue_confidence": "HIGH",
-      "issue_cwe": {
-        "id": 78,
-        "link": "https://cwe.mitre.org/data/definitions/78.html"
-      },
-      "issue_severity": "LOW",
-      "issue_text": "Starting a process with a partial executable path",
-      "line_number": 219,
-      "line_range": [
-        219,
-        220,
-        221,
-        222
-      ],
-      "more_info": "https://bandit.readthedocs.io/en/1.7.5/plugins/b607_start_process_with_partial_path.html",
-      "test_id": "B607",
-      "test_name": "start_process_with_partial_path"
-    },
-    {
-      "code": "218     # Check that `curl` has HTTP2 and HTTP3 for `checks.core.http_versions`\n219     curl_cmd = subprocess.run(\n220         [\"curl\", \"--version\"],\n221         capture_output=True,\n222     )\n223     output = curl_cmd.stdout.strip().decode()\n",
+      "code": "219     # Check that `curl` has HTTP2 and HTTP3 for `checks.core.http_versions`\n220     curl_cmd = subprocess.run(\n221         [config.CURL_BINARY_PATH, \"--version\"],\n222         capture_output=True,\n223     )\n224     output = curl_cmd.stdout.strip().decode()\n",
       "col_offset": 15,
       "end_col_offset": 5,
       "filename": "telescope/app.py",
@@ -163,14 +140,14 @@
       },
       "issue_severity": "LOW",
       "issue_text": "subprocess call - check for execution of untrusted input.",
-      "line_number": 219,
+      "line_number": 220,
       "line_range": [
-        219,
         220,
         221,
-        222
+        222,
+        223
       ],
-      "more_info": "https://bandit.readthedocs.io/en/1.7.5/plugins/b603_subprocess_without_shell_equals_true.html",
+      "more_info": "https://bandit.readthedocs.io/en/1.7.8/plugins/b603_subprocess_without_shell_equals_true.html",
       "test_id": "B603",
       "test_name": "subprocess_without_shell_equals_true"
     }
diff --git a/.secrets.baseline b/.secrets.baseline
index 46f17034..5e9b514c 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -75,10 +75,6 @@
     {
       "path": "detect_secrets.filters.allowlist.is_line_allowlisted"
     },
-    {
-      "path": "detect_secrets.filters.common.is_baseline_file",
-      "filename": ".secrets.baseline"
-    },
     {
       "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
       "min_level": 2
@@ -137,6 +133,22 @@
         "line_number": 23
       }
     ],
+    "tests/checks/remotesettings/test_attachments_integrity.py": [
+      {
+        "type": "Hex High Entropy String",
+        "filename": "tests/checks/remotesettings/test_attachments_integrity.py",
+        "hashed_secret": "263e1869fb57deeb75844cf85d55f0d03a6019fc",
+        "is_verified": false,
+        "line_number": 31
+      },
+      {
+        "type": "Hex High Entropy String",
+        "filename": "tests/checks/remotesettings/test_attachments_integrity.py",
+        "hashed_secret": "fdf34abe05071170ce2ec082a18aa762ec454ae3",
+        "is_verified": false,
+        "line_number": 39
+      }
+    ],
     "tests/checks/remotesettings/test_public_suffix_list.py": [
       {
         "type": "Hex High Entropy String",
@@ -147,5 +159,5 @@
       }
     ]
   },
-  "generated_at": "2023-09-19T13:14:09Z"
+  "generated_at": "2024-04-18T11:22:16Z"
 }
diff --git a/checks/remotesettings/attachments_integrity.py b/checks/remotesettings/attachments_integrity.py
new file mode 100644
index 00000000..a8ec2352
--- /dev/null
+++ b/checks/remotesettings/attachments_integrity.py
@@ -0,0 +1,68 @@
+"""
+Every attachment in every collection has the right size and hash.
+
+The URLs of invalid attachments is returned along with the number of checked records.
+"""
+
+import hashlib
+
+import aiohttp
+
+from telescope.typings import CheckResult
+from telescope.utils import ClientSession, run_parallel
+
+from .utils import KintoClient
+
+
+async def test_attachment(session, attachment):
+    url = attachment["location"]
+    try:
+        async with session.get(url) as response:
+            binary = await response.read()
+    except aiohttp.client_exceptions.ClientError as exc:
+        return {"url": url, "error": str(exc)}, False
+
+    if (bz := len(binary)) != (az := attachment["size"]):
+        return {"url": url, "error": f"size differ ({bz}!={az})"}, False
+
+    h = hashlib.sha256(binary)
+    if (bh := h.hexdigest()) != (ah := attachment["hash"]):
+        return {"url": url, "error": f"hash differ ({bh}!={ah})"}, False
+
+    return {}, True
+
+
+async def run(server: str) -> CheckResult:
+    client = KintoClient(server_url=server)
+
+    info = await client.server_info()
+    base_url = info["capabilities"]["attachments"]["base_url"]
+
+    # Fetch collections records in parallel.
+    entries = await client.get_monitor_changes()
+    futures = [
+        client.get_records(
+            bucket=entry["bucket"],
+            collection=entry["collection"],
+            _expected=entry["last_modified"],
+        )
+        for entry in entries
+        if "preview" not in entry["bucket"]
+    ]
+    results = await run_parallel(*futures)
+
+    # For each record that has an attachment, check the attachment content.
+    attachments = []
+    for entry, records in zip(entries, results):
+        for record in records:
+            if "attachment" not in record:
+                continue
+            attachment = record["attachment"]
+            attachment["location"] = base_url + attachment["location"]
+            attachments.append(attachment)
+
+    async with ClientSession() as session:
+        futures = [test_attachment(session, attachment) for attachment in attachments]
+        results = await run_parallel(*futures)
+    bad = [result for result, success in results if not success]
+    return len(bad) == 0, {"bad": bad, "checked": len(attachments)}
diff --git a/tests/checks/remotesettings/test_attachments_integrity.py b/tests/checks/remotesettings/test_attachments_integrity.py
new file mode 100644
index 00000000..f629324f
--- /dev/null
+++ b/tests/checks/remotesettings/test_attachments_integrity.py
@@ -0,0 +1,115 @@
+from checks.remotesettings.attachments_integrity import run
+
+
+RECORDS_URL = "/buckets/{}/collections/{}/records"
+
+
+async def test_positive(mock_responses, mock_aioresponses):
+    server_url = "http://fake.local/v1"
+    mock_responses.get(
+        server_url + "/",
+        payload={"capabilities": {"attachments": {"base_url": "http://cdn/"}}},
+    )
+    changes_url = server_url + RECORDS_URL.format("monitor", "changes")
+    mock_responses.get(
+        changes_url,
+        payload={
+            "data": [
+                {"id": "abc", "bucket": "bid", "collection": "cid", "last_modified": 42}
+            ]
+        },
+    )
+    records_url = server_url + RECORDS_URL.format("bid", "cid") + "?_expected=42"
+    mock_responses.get(
+        records_url,
+        payload={
+            "data": [
+                {
+                    "id": "abc",
+                    "attachment": {
+                        "size": 5,
+                        "hash": "ed968e840d10d2d313a870bc131a4e2c311d7ad09bdf32b3418147221f51a6e2",
+                        "location": "file1.jpg",
+                    },
+                },
+                {
+                    "id": "efg",
+                    "attachment": {
+                        "size": 10,
+                        "hash": "bf2cb58a68f684d95a3b78ef8f661c9a4e5b09e82cc8f9cc88cce90528caeb27",
+                        "location": "file2.jpg",
+                    },
+                },
+                {"id": "ijk"},
+            ]
+        },
+    )
+    mock_aioresponses.get("http://cdn/file1.jpg", body=b"a" * 5)
+    mock_aioresponses.get("http://cdn/file2.jpg", body=b"a" * 10)
+
+    status, data = await run(server_url)
+
+    # assert status is True
+    assert data == {"bad": [], "checked": 2}
+
+
+async def test_negative(mock_responses, mock_aioresponses):
+    server_url = "http://fake.local/v1"
+    mock_responses.get(
+        server_url + "/",
+        payload={"capabilities": {"attachments": {"base_url": "http://cdn/"}}},
+    )
+    changes_url = server_url + RECORDS_URL.format("monitor", "changes")
+    mock_responses.get(
+        changes_url,
+        payload={
+            "data": [
+                {"id": "abc", "bucket": "bid", "collection": "cid", "last_modified": 42}
+            ]
+        },
+    )
+    records_url = server_url + RECORDS_URL.format("bid", "cid") + "?_expected=42"
+    mock_responses.get(
+        records_url,
+        payload={
+            "data": [
+                {
+                    "id": "abc",
+                    "attachment": {
+                        "size": 7,
+                        "hash": "ed968e840d10d2d313a870bc131a4e2c311d7ad09bdf32b3418147221f51a6e2",
+                        "location": "file1.jpg",
+                    },
+                },
+                {"id": "efg", "attachment": {"location": "missing.jpg"}},
+                {
+                    "id": "ijk",
+                    "attachment": {"size": 10, "hash": "foo", "location": "file2.jpg"},
+                },
+                {"id": "lmn"},
+            ]
+        },
+    )
+    mock_aioresponses.get("http://cdn/file1.jpg", body=b"a" * 5)
+    mock_aioresponses.get("http://cdn/file2.jpg", body=b"a" * 10)
+
+    status, data = await run(server_url)
+
+    assert status is False
+    assert data == {
+        "bad": [
+            {
+                "error": "size differ (5!=7)",
+                "url": "http://cdn/file1.jpg",
+            },
+            {
+                "error": "Connection refused: GET http://cdn/missing.jpg",
+                "url": "http://cdn/missing.jpg",
+            },
+            {
+                "error": "hash differ (bf2cb58a68f684d95a3b78ef8f661c9a4e5b09e82cc8f9cc88cce90528caeb27!=foo)",
+                "url": "http://cdn/file2.jpg",
+            },
+        ],
+        "checked": 3,
+    }