mime: fully implement "determine_computed_mime" algorithm

There are some (very) slight deviations from the spec, and this also uncovered another spec bug (very cool): whatwg/mimesniff#169
simonwuelker · Apr 16, 2023 · e4e21a6 · e4e21a6
1 parent 03a41fa
commit e4e21a6
Show file tree

Hide file tree

Showing 3 changed files with 313 additions and 8 deletions.
diff --git a/util/mime/src/resource.rs b/util/mime/src/resource.rs
@@ -2,7 +2,10 @@
 
 use std::str::FromStr;
 
-use crate::{sniff, MIMEType};
+use crate::{
+    sniff::{self, identify_audio_or_video_type, identify_image_type},
+    sniff_tables, MIMEType,
+};
 
 use http::request::HTTPError;
 use url::URL;
@@ -123,8 +126,12 @@ impl ResourceMetadata {
         no_sniff: NoSniff,
         resource_data: &[u8],
     ) -> Self {
-        let computed_mime_type =
-            determine_computed_mimetype(supplied_mime_type.as_ref(), no_sniff, resource_data);
+        let computed_mime_type = determine_computed_mimetype(
+            supplied_mime_type.as_ref(),
+            no_sniff,
+            check_for_apache_bug,
+            resource_data,
+        );
 
         Self {
             supplied_mime_type,
@@ -139,7 +146,8 @@ impl ResourceMetadata {
 pub fn determine_computed_mimetype(
     supplied_mime_type: Option<&MIMEType>,
     no_sniff: NoSniff,
-    resource_data: &[u8],
+    check_for_apache_bug: CheckForApacheBug,
+    resource_header: &[u8],
 ) -> MIMEType {
     // 1. If the supplied MIME type is undefined or if the supplied MIME type’s essence is "unknown/unknown", "application/unknown", or "*/*",
     // execute the rules for identifying an unknown MIME type with the sniff-scriptable flag equal to the inverse of the no-sniff flag and abort these steps.
@@ -156,7 +164,7 @@ pub fn determine_computed_mimetype(
             NoSniff::No => SniffScriptable::Yes,
         };
 
-        return sniff::identify_unknown_mime_type(resource_data, sniff_scriptable);
+        return sniff::identify_unknown_mime_type(resource_header, sniff_scriptable);
     }
 
     // 2. If the no-sniff flag is set, the computed MIME type is the supplied MIME type.
@@ -165,5 +173,296 @@ pub fn determine_computed_mimetype(
         return supplied_mime_type.unwrap().clone();
     }
 
-    todo!()
+    // 3. If the check-for-apache-bug flag is set, execute the rules for distinguishing if a resource is text or binary and abort these steps.
+    if check_for_apache_bug == CheckForApacheBug::Yes {
+        // https://mimesniff.spec.whatwg.org/#rules-for-text-or-binary
+
+        // 1. Let length be the number of bytes in the resource header.
+        let length = resource_header.len();
+
+        // 2. If length is greater than or equal to 2 and the first 2 bytes of the resource header
+        // are equal to 0xFE 0xFF (UTF-16BE BOM) or 0xFF 0xFE (UTF-16LE BOM), the computed MIME type is "text/plain".
+        if length >= 2
+            && (resource_header[..2] == [0xFE, 0xFF] || resource_header[..2] == [0xFF, 0xFE])
+        {
+            return MIMEType::new("text", "plain");
+        }
+
+        // 3. If length is greater than or equal to 3 and the first 3 bytes of the resource header are equal to 0xEF 0xBB 0xBF (UTF-8 BOM), the computed MIME type is "text/plain".
+        if length >= 3 && resource_header[..3] == [0xEF, 0xBB, 0xBF] {
+            return MIMEType::new("text", "plain");
+        }
+
+        // 4. If the resource header contains no binary data bytes, the computed MIME type is "text/plain".
+        if resource_header
+            .iter()
+            .all(|&byte| !sniff::is_binary_data_byte(byte))
+        {
+            return MIMEType::new("text", "plain");
+        }
+
+        // 5. The computed MIME type is "application/octet-stream".
+        return MIMEType::new("application", "octet-stream");
+    }
+
+    // NOTE: the following spec steps only apply if there is a supplied mime type
+    if let Some(supplied_mime_type) = supplied_mime_type {
+        // 4. If the supplied MIME type is an XML MIME type, the computed MIME type is the supplied MIME type.
+        if supplied_mime_type.is_xml() {
+            return supplied_mime_type.clone();
+        }
+
+        // 5. If the supplied MIME type’s essence is "text/html", execute the rules for distinguishing if a resource is a feed or HTML and abort these steps.
+        if supplied_mime_type.essence() == "text/html" {
+            // https://mimesniff.spec.whatwg.org/#rules-for-distinguishing-if-a-resource-is-a-feed-or-html
+
+            // 1. Let sequence be the resource header, where sequence[s] is byte s in sequence and sequence[0] is the first byte in sequence.
+            let sequence = resource_header;
+
+            // 2. Let length be the number of bytes in sequence.
+            let length = sequence.len();
+
+            // 3. Initialize s to 0.
+            let mut s = 0;
+
+            // 4. If length is greater than or equal to 3 and the three bytes from sequence[0] to sequence[2] are equal to 0xEF 0xBB 0xBF (UTF-8 BOM), increment s by 3.
+            if sequence.starts_with(&[0xEF, 0xBB, 0xBF]) {
+                s += 3;
+            }
+
+            // 5. While s is less than length, continuously loop through these steps:
+            'outer_loop: while s < length {
+                // 1. Enter loop L:
+                'L: loop {
+                    match sequence.get(s) {
+                        None => {
+                            // 1. If sequence[s] is undefined, the computed MIME type is the supplied MIME type.
+                            // Abort these steps.
+                            return supplied_mime_type.clone();
+                        },
+                        Some(0x3C) => {
+                            // 2. If sequence[s] is equal to 0x3C ("<"), increment s by 1 and exit loop L.
+                            s += 1;
+                            break 'L;
+                        },
+                        Some(byte) if !sniff_tables::WHITESPACE.contains(byte) => {
+                            // 3. If sequence[s] is not a whitespace byte, the computed MIME type is the supplied MIME type.
+                            // Abort these steps.
+                            return supplied_mime_type.clone();
+                        },
+                        Some(_) => {},
+                    }
+
+                    // 4. Increment s by 1.
+                    s += 1;
+                }
+
+                // 2. Enter loop L:
+                // NOTE: this seems to be a spec bug, there's no way for the loop to run more than once.
+                // See https://github.com/whatwg/mimesniff/issues/169
+                {
+                    // 1. If sequence[s] is undefined, the computed MIME type is the supplied MIME type.
+                    // Abort these steps.
+                    if length <= s {
+                        return supplied_mime_type.clone();
+                    }
+
+                    // 2. If length is greater than or equal to s + 3 and the three bytes from sequence[s] to sequence[s + 2] are equal to 0x21 0x2D 0x2D ("!--"),
+                    // increment s by 3 and enter loop M:
+                    if sequence[s..].starts_with(b"!--") {
+                        s += 3;
+
+                        loop {
+                            // 1. If sequence[s] is undefined, the computed MIME type is the supplied MIME type.
+                            // Abort these steps.
+                            if length <= s {
+                                return supplied_mime_type.clone();
+                            }
+
+                            // 2. If length is greater than or equal to s + 3 and the three bytes from sequence[s] to sequence[s + 2]
+                            // are equal to 0x2D 0x2D 0x3E ("-->"), increment s by 3 and exit loops M and L.
+                            if sequence[s..].starts_with(b"-->") {
+                                s += 3;
+
+                                // NOTE: we don't have L but since the L is the last step of outer_loop we can continue
+                                continue 'outer_loop;
+                            }
+
+                            // 3. Increment s by 1.
+                            s += 1;
+                        }
+                    }
+
+                    // 3. If length is greater than or equal to s + 1 and sequence[s] is equal to 0x21 ("!"), increment s by 1 and enter loop M:
+                    if sequence[s..].starts_with(b"!") {
+                        s += 1;
+
+                        loop {
+                            // 1. If sequence[s] is undefined, the computed MIME type is the supplied MIME type.
+                            // Abort these steps.
+                            if length <= s {
+                                return supplied_mime_type.clone();
+                            }
+
+                            // 2. If length is greater than or equal to s + 1 and sequence[s] is equal to 0x3E (">"), increment s by 1 and exit loops M and L.
+                            if sequence[s..].starts_with(b">") {
+                                s += 1;
+
+                                // NOTE: we don't have L but since the L is the last step of outer_loop we can continue
+                                continue 'outer_loop;
+                            }
+
+                            // 3. Increment s by 1.
+                            s += 1;
+                        }
+                    }
+
+                    // 4. If length is greater than or equal to s + 1 and sequence[s] is equal to 0x3F ("?"), increment s by 1 and enter loop M:
+                    if sequence[s..].starts_with(b"?") {
+                        s += 1;
+
+                        loop {
+                            // 1. If sequence[s] is undefined, the computed MIME type is the supplied MIME type.
+                            // Abort these steps.
+                            if length <= s {
+                                return supplied_mime_type.clone();
+                            }
+
+                            // 2. If length is greater than or equal to s + 2 and the two bytes from sequence[s] to sequence[s + 1] are equal to 0x3F 0x3E ("?>"), increment s by 2 and exit loops M and L.
+                            if sequence[s..].starts_with(b"?>") {
+                                s += 2;
+
+                                // NOTE: we don't have L but since the L is the last step of outer_loop we can continue
+                                continue 'outer_loop;
+                            }
+
+                            // 3. Increment s by 1.
+                            s += 1;
+                        }
+                    }
+
+                    // 5. If length is greater than or equal to s + 3 and the three bytes from sequence[s] to sequence[s + 2]
+                    // are equal to 0x72 0x73 0x73 ("rss"), the computed MIME type is "application/rss+xml".
+                    // Abort these steps.
+                    if sequence[s..].starts_with(b"rss") {
+                        return MIMEType::new("application", "rss+xml");
+                    }
+
+                    // 6. If length is greater than or equal to s + 4 and the four bytes from sequence[s] to sequence[s + 3] are equal to 0x66 0x65 0x65 0x64 ("feed"), the computed MIME type is "application/atom+xml".
+                    // Abort these steps.
+                    if sequence[s..].starts_with(b"feed") {
+                        return MIMEType::new("application", "atom+xml");
+                    }
+
+                    // 7. If length is greater than or equal to s + 7 and the seven bytes from sequence[s] to sequence[s + 6] are equal to 0x72 0x64 0x66 0x3A 0x52 0x44 0x46 ("rdf:RDF"), increment s by 7 and enter loop M:
+                    if sequence[s..].starts_with(b"rdf:RDF") {
+                        s += 7;
+
+                        loop {
+                            // 1. If sequence[s] is undefined, the computed MIME type is the supplied MIME type.
+                            // Abort these steps.
+                            if length <= s {
+                                return supplied_mime_type.clone();
+                            }
+
+                            // 2.  If length is greater than or equal to s + 24 and the twenty-four bytes from sequence[s] to sequence[s + 23]
+                            // are equal to 0x68 0x74 0x74 0x70 0x3A 0x2F 0x2F 0x70 0x75 0x72 0x6C 0x2E 0x6F 0x72 0x67 0x2F 0x72 0x73 0x73 0x2F 0x31 0x2E 0x30 0x2F
+                            // ("http://purl.org/rss/1.0/"), increment s by 24 and enter loop N:
+                            if sequence[s..].starts_with(b"http://purl.org/rss/1.0/") {
+                                s += 24;
+
+                                loop {
+                                    // 1. If sequence[s] is undefined, the computed MIME type is the supplied MIME type.
+                                    // Abort these steps.
+                                    if length <= s {
+                                        return supplied_mime_type.clone();
+                                    }
+
+                                    // 2. If length is greater than or equal to s + 43 and the forty-three bytes from sequence[s] to sequence[s + 42]
+                                    // are equal to "http://www.w3.org/1999/02/22-rdf-syntax-ns#", the computed MIME type is "application/rss+xml".
+                                    // Abort these steps.
+                                    if sequence[s..]
+                                        .starts_with(b"http://www.w3.org/1999/02/22-rdf-syntax-ns#")
+                                    {
+                                        return MIMEType::new("application", "rss+xml");
+                                    }
+
+                                    // 3. Increment s by 1.
+                                    s += 1;
+                                }
+                            }
+
+                            // 3. If length is greater than or equal to s + 24 and the twenty-four bytes from sequence[s] to sequence[s + 23] are equal to
+                            // "http://www.w3.org/1999/02/22-rdf-syntax-ns#", increment s by 24 and enter loop N:
+                            if sequence[s..]
+                                .starts_with(b"http://www.w3.org/1999/02/22-rdf-syntax-ns#")
+                            {
+                                s += 24;
+
+                                loop {
+                                    // 1. If sequence[s] is undefined, the computed MIME type is the supplied MIME type.
+                                    // Abort these steps.
+                                    if length <= s {
+                                        return supplied_mime_type.clone();
+                                    }
+
+                                    // 2. If length is greater than or equal to s + 43 and the forty-three bytes from sequence[s] to sequence[s + 42] are
+                                    // equal to "http://purl.org/rss/1.0/", the computed MIME type is "application/rss+xml". Abort these steps.
+                                    // NOTE: the comparison must start at the current offset s, not at the beginning of the resource header.
+                                    if sequence[s..].starts_with(b"http://purl.org/rss/1.0/") {
+                                        return MIMEType::new("application", "rss+xml");
+                                    }
+
+                                    // 3. Increment s by 1.
+                                    s += 1;
+                                }
+                            }
+
+                            // 4. Increment s by 1.
+                            s += 1;
+                        }
+                    }
+
+                    // 8. The computed MIME type is the supplied MIME type.
+                    // Abort these steps.
+                    return supplied_mime_type.clone();
+                }
+            }
+
+            // 6. The computed MIME type is the supplied MIME type.
+            return supplied_mime_type.clone();
+        }
+
+        // 6. If the supplied MIME type is an image MIME type supported by the user agent, let matched-type be the result of
+        // executing the image type pattern matching algorithm with the resource header as the byte sequence to be matched.
+
+        // 7. If matched-type is not undefined, the computed MIME type is matched-type.
+        // Abort these steps.
+
+        // NOTE: let's just act like we support all image types, I don't see any harm in that.
+        if supplied_mime_type.is_image() {
+            if let Some(matched_mime_type) = identify_image_type(resource_header) {
+                return matched_mime_type;
+            }
+        }
+
+        // 8.  If the supplied MIME type is an audio or video MIME type supported by the user agent, let matched-type be the result of
+        // executing the audio or video type pattern matching algorithm with the resource header as the byte sequence to be matched.
+
+        // 9. If matched-type is not undefined, the computed MIME type is matched-type.
+        // Abort these steps.
+        // NOTE: let's just act like we support all audio/video types, I don't see any harm in that.
+        if supplied_mime_type.is_audio_or_video() {
+            if let Some(matched_mime_type) = identify_audio_or_video_type(resource_header) {
+                return matched_mime_type;
+            }
+        }
+    }
+
+    // 10. The computed MIME type is the supplied MIME type.
+    //
+    // NOTE: step 2 above already unwraps the supplied MIME type, so it is expected to be
+    // present here. "application/octet-stream" ("whatever it is, I don't understand it")
+    // is kept only as a non-panicking fallback should the supplied type ever be undefined.
+    supplied_mime_type.cloned().unwrap_or_else(|| MIMEType::new("application", "octet-stream"))
 }
diff --git a/util/mime/src/sniff.rs b/util/mime/src/sniff.rs
@@ -45,7 +45,7 @@ pub fn identify_unknown_mime_type(
     // 9. If resource’s resource header contains no binary data bytes, return "text/plain".
     if resource_header
         .iter()
-        .all(|byte| !matches!(byte, 0x00..=0x08 | 0x0B | 0x0E..=0x1A | 0x1C..=0x1F))
+        .all(|&byte| !is_binary_data_byte(byte))
     {
         return MIMEType::new("text", "plain");
     }
@@ -367,3 +367,9 @@ fn compute_mp3_frame_size(version: u8, bitrate: u32, freq: u32, pad: bool) -> us
     // 4. Return size.
     size as usize
 }
+
+/// Returns `true` if `byte` is a [binary data byte](https://mimesniff.spec.whatwg.org/#binary-data-byte).
+#[inline]
+pub(crate) fn is_binary_data_byte(byte: u8) -> bool {
+    matches!(byte, 0x00..=0x08 | 0x0B | 0x0E..=0x1A | 0x1C..=0x1F)
+}
diff --git a/util/mime/src/sniff_tables.rs b/util/mime/src/sniff_tables.rs
@@ -5,7 +5,7 @@ use std::str::FromStr;
 use crate::MIMEType;
 
 /// <https://mimesniff.spec.whatwg.org/#whitespace-byte>
-const WHITESPACE: &[u8] = &[0x09, 0x0A, 0x0C, 0x0D, 0x20];
+pub const WHITESPACE: &[u8] = &[0x09, 0x0A, 0x0C, 0x0D, 0x20];
 
 pub const SCRIPTABLE_MIME_TYPES_TABLE: MIMESniffTable<36> = MIMESniffTable([
     // The case-insensitive string "<!DOCTYPE HTML" followed by a tag-terminating byte.