http-types 2.3.0

use std::fmt;

use super::{Mime, ParamKind, ParamName, ParamValue};

/// Parse a string into a mime type.
///
/// Follows the [WHATWG MIME parsing algorithm](https://mimesniff.spec.whatwg.org/#parsing-a-mime-type);
/// the numbered comments in the body refer to the steps of that algorithm.
///
/// The type, subtype and parameter names are ASCII-lowercased, while parameter values keep their
/// case. Parameters with an invalid name or value are skipped rather than rejected, and only the
/// first occurrence of a parameter name is kept.
///
/// # Errors
///
/// Fails when the type or the subtype is missing, empty, or contains a character that is not an
/// HTTP token code point.
pub(crate) fn parse(input: &str) -> crate::Result<Mime> {
    // 1. Strip leading and trailing HTTP whitespace.
    let input = input.trim_matches(is_http_whitespace_char);

    // 3. Everything up to the first '/' is the type.
    let (basetype, input) = collect_code_point_sequence_char(input, '/');

    // 4.
    crate::ensure!(!basetype.is_empty(), "MIME type should not be empty");
    crate::ensure!(
        basetype.chars().all(is_http_token_code_point),
        "MIME type should only contain valid HTTP token code points"
    );

    // 5. Nothing left means no '/' was found at all.
    crate::ensure!(!input.is_empty(), "MIME must contain a sub type");

    // 6. Skip the '/' (a single byte, so slicing by one is safe).
    let input = &input[1..];

    // 7. Everything up to the first ';' is the subtype.
    let (subtype, input) = collect_code_point_sequence_char(input, ';');

    // 8.
    let subtype = subtype.trim_end_matches(is_http_whitespace_char);

    // 9.
    crate::ensure!(!subtype.is_empty(), "MIME sub type should not be empty");
    crate::ensure!(
        subtype.chars().all(is_http_token_code_point),
        "MIME sub type should only contain valid HTTP token code points"
    );

    // 10.
    let basetype = basetype.to_ascii_lowercase();
    let subtype = subtype.to_ascii_lowercase();
    // Stays `None` until the first valid parameter is found.
    let mut params = None;

    // 11. At the top of every iteration `input` is either empty or starts with ';'.
    let mut input = input;
    while !input.is_empty() {
        // 1. Skip the ';'.
        input = &input[1..];

        // 2.
        input = input.trim_start_matches(is_http_whitespace_char);

        // 3.
        let (parameter_name, new_input) =
            collect_code_point_sequence_slice(input, &[';', '='] as &[char]);
        input = new_input;

        // 4.
        let parameter_name = parameter_name.to_ascii_lowercase();

        // 6. The input ended after a name without a value.
        if input.is_empty() {
            break;
        }
        // 5. A name without '=' is not a parameter; move on to the next one.
        if input.starts_with(';') {
            continue;
        }
        // Otherwise the delimiter is '=': skip it.
        input = &input[1..];

        let parameter_value = if input.starts_with('"') {
            // 8.
            // implementation of https://fetch.spec.whatwg.org/#collect-an-http-quoted-string
            let (parameter_value, new_input) = collect_http_quoted_string(input);
            // Anything between the closing quote and the next ';' is discarded.
            let (_, new_input) = collect_code_point_sequence_char(new_input, ';');
            input = new_input;
            parameter_value
        } else {
            // 9.
            let (parameter_value, new_input) = collect_code_point_sequence_char(input, ';');
            input = new_input;
            let parameter_value = parameter_value.trim_end_matches(is_http_whitespace_char);
            if parameter_value.is_empty() {
                continue;
            }
            parameter_value.to_owned()
        };

        // 10. Keep the parameter only if both parts are valid and the name is new.
        if !parameter_name.is_empty()
            && parameter_name.chars().all(is_http_token_code_point)
            && parameter_value
                .chars()
                .all(is_http_quoted_string_token_code_point)
        {
            let params = params.get_or_insert_with(Vec::new);
            let name = ParamName(parameter_name.into());
            let value = ParamValue(parameter_value.into());
            if !params.iter().any(|(k, _)| k == &name) {
                params.push((name, value));
            }
        }
    }

    Ok(Mime {
        essence: format!("{}/{}", &basetype, &subtype),
        basetype,
        subtype,
        params: params.map(ParamKind::Vec),
        static_essence: None,
        static_basetype: None,
        static_subtype: None,
    })
}

/// Validates [HTTP token code points](https://mimesniff.spec.whatwg.org/#http-token-code-point)
///
/// Accepts ASCII letters, ASCII digits and the punctuation characters listed in the spec.
fn is_http_token_code_point(c: char) -> bool {
    // The non-alphanumeric code points that may appear in a token.
    const TOKEN_PUNCTUATION: &str = "!#$%&'*+-.^_`|~";

    c.is_ascii_alphanumeric() || TOKEN_PUNCTUATION.contains(c)
}

/// Validates [HTTP quoted-string token code points](https://mimesniff.spec.whatwg.org/#http-quoted-string-token-code-point)
///
/// Accepts a tab, U+0020 through U+007E, and U+0080 through U+00FF.
fn is_http_quoted_string_token_code_point(c: char) -> bool {
    let code = u32::from(c);
    code == 0x09 || (0x20..=0x7E).contains(&code) || (0x80..=0xFF).contains(&code)
}

/// Is a [HTTP whitespace](https://fetch.spec.whatwg.org/#http-whitespace)
///
/// Only line feed, carriage return, tab and space count.
fn is_http_whitespace_char(c: char) -> bool {
    c == '\n' || c == '\r' || c == '\t' || c == ' '
}

/// [code point sequence collection](https://infra.spec.whatwg.org/#collect-a-sequence-of-code-points)
///
/// Splits `input` in front of the first occurrence of `delimiter`. The second half starts with
/// the delimiter, or is empty when the delimiter does not occur.
fn collect_code_point_sequence_char(input: &str, delimiter: char) -> (&str, &str) {
    let boundary = match input.find(delimiter) {
        Some(index) => index,
        None => input.len(),
    };
    (&input[..boundary], &input[boundary..])
}

/// [code point sequence collection](https://infra.spec.whatwg.org/#collect-a-sequence-of-code-points)
///
/// Splits `input` in front of the first character contained in `delimiter`. The second half
/// starts with that character, or is empty when none of the delimiters occur.
fn collect_code_point_sequence_slice<'a>(input: &'a str, delimiter: &[char]) -> (&'a str, &'a str) {
    let boundary = input
        .char_indices()
        .find(|(_, c)| delimiter.contains(c))
        .map_or(input.len(), |(index, _)| index);
    input.split_at(boundary)
}

/// [HTTP quoted string collection](https://fetch.spec.whatwg.org/#collect-an-http-quoted-string)
///
/// Returns the contents of the quoted string with backslash escapes resolved, together with the
/// input that remains after it. The opening and closing quotes are not part of the value. If the
/// input ends before a closing quote is found, everything up to the end is taken as the value and
/// the remaining input is empty. A lone backslash at the very end is kept as a literal backslash.
///
/// Assumes that the first char is '"'
///
/// # Panics
///
/// Panics when `input` is empty or its first character is longer than one byte, because the
/// opening quote is skipped by slicing off one byte.
fn collect_http_quoted_string(mut input: &str) -> (String, &str) {
    // 2.
    let mut value = String::new();
    // 4. Skip the opening quote.
    input = &input[1..];
    // 5.
    loop {
        // 1. Copy everything up to the next quote or backslash. The remaining input is advanced
        // right away so the caller never sees text that already went into the value, even when
        // the closing quote is missing.
        let stop = input.find(&['"', '\\'][..]).unwrap_or(input.len());
        let (literal, rest) = input.split_at(stop);
        value.push_str(literal);
        input = rest;

        let mut chars = input.chars();
        match chars.next() {
            // 2. The input ended without a closing quote.
            None => break,
            // 5. A backslash escapes the character that follows it.
            Some('\\') => match chars.next() {
                Some(escaped) => {
                    value.push(escaped);
                    input = chars.as_str();
                }
                None => {
                    // A trailing backslash has nothing to escape; keep it literally.
                    value.push('\\');
                    input = chars.as_str();
                    break;
                }
            },
            // 6. The closing quote ends the string.
            Some(_) => {
                input = chars.as_str();
                break;
            }
        }
    }
    (value, input)
}

/// Implementation of [WHATWG MIME serialization algorithm](https://mimesniff.spec.whatwg.org/#serializing-a-mime-type)
///
/// Writes the essence followed by every parameter as `;name=value`. A value that is empty or
/// contains anything other than HTTP token code points is wrapped in double quotes, with `"` and
/// `\` escaped by a preceding backslash.
pub(crate) fn format(mime_type: &Mime, f: &mut fmt::Formatter<'_>) -> fmt::Result {
    // Prefer the static essence when one is set.
    match mime_type.static_essence {
        Some(essence) => write!(f, "{}", essence)?,
        None => write!(f, "{}", &mime_type.essence)?,
    }

    let params = match &mime_type.params {
        Some(params) => params,
        None => return Ok(()),
    };

    match params {
        ParamKind::Utf8 => write!(f, ";charset=utf-8")?,
        ParamKind::Vec(entries) => {
            for (name, value) in entries {
                let is_plain_token =
                    !value.0.is_empty() && value.0.chars().all(is_http_token_code_point);
                if is_plain_token {
                    write!(f, ";{}={}", name, value)?;
                    continue;
                }

                // Quote the value, prefixing every '"' and '\' with a backslash.
                let escaped = value
                    .0
                    .chars()
                    .flat_map(|c| {
                        let state = match c {
                            '"' | '\\' => EscapeMimeValueState::Backslash(c),
                            other => EscapeMimeValueState::Char(other),
                        };
                        EscapeMimeValue { state }
                    })
                    .collect::<String>();
                write!(f, ";{}=\"{}\"", name, escaped)?;
            }
        }
    }

    Ok(())
}

/// Iterator over the characters needed to write a single character of a quoted parameter value:
/// either the character itself, or a backslash followed by the character.
struct EscapeMimeValue {
    state: EscapeMimeValueState,
}

/// What an [`EscapeMimeValue`] still has to yield.
#[derive(Clone, Debug)]
enum EscapeMimeValueState {
    /// Nothing left.
    Done,
    /// Only the character itself is left.
    Char(char),
    /// A backslash and then the character are left.
    Backslash(char),
}

impl Iterator for EscapeMimeValue {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        // Decide what to emit now and which state comes afterwards.
        let (emitted, following) = match self.state {
            EscapeMimeValueState::Done => return None,
            EscapeMimeValueState::Char(c) => (c, EscapeMimeValueState::Done),
            EscapeMimeValueState::Backslash(c) => ('\\', EscapeMimeValueState::Char(c)),
        };
        self.state = following;
        Some(emitted)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // The number of remaining characters is always known exactly.
        let remaining = match self.state {
            EscapeMimeValueState::Done => 0,
            EscapeMimeValueState::Char(_) => 1,
            EscapeMimeValueState::Backslash(_) => 2,
        };
        (remaining, Some(remaining))
    }
}

#[test]
fn test() {
    // Inputs that must parse as text/html, paired with the charset parameter to check (if any).
    // A trailing ';' is technically an invalid mime, but it is allowed anyway.
    let accepted: [(&str, Option<&str>); 4] = [
        ("text/html", None),
        ("text/html;", None),
        ("text/html; charset=utf-8", Some("utf-8")),
        ("text/html; charset=utf-8;", Some("utf-8")),
    ];
    for &(input, charset) in accepted.iter() {
        let mime = parse(input).unwrap();
        assert_eq!(mime.basetype(), "text");
        assert_eq!(mime.subtype(), "html");
        if let Some(expected) = charset {
            assert_eq!(mime.param("charset").unwrap(), expected);
        }
    }

    // A missing or empty subtype is rejected.
    for &input in ["text", "text/", "t/"].iter() {
        assert!(parse(input).is_err());
    }

    // Single-character type and subtype are fine.
    assert!(parse("t/h").is_ok());
}

/// Web Platform tests for MIME type parsing
/// From https://github.com/web-platform-tests/wpt/blob/master/mimesniff/mime-types/resources/mime-types.json
///
/// Each case parses an input and compares the re-serialized result with the expected string, or
/// checks that parsing fails.
#[test]
fn whatwag_tests() {
    // Parses `input` and checks its serialization.
    fn assert_parse(input: &str, expected: &str) {
        let actual = parse(input).unwrap();
        assert_eq!(actual.to_string(), expected);
    }

    // Checks that `input` is rejected.
    fn assert_fails(input: &str) {
        assert!(parse(input).is_err());
    }

    // Same as `assert_parse`; the expected encoding is accepted but not verified yet.
    fn assert_parse_and_encoding(
        input: &str,
        expected: &str,
        _encoding: impl Into<Option<&'static str>>,
    ) {
        //TODO: check encoding
        assert_parse(input, expected);
    }

    // Basics
    assert_parse_and_encoding("text/html;charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("TEXT/HTML;CHARSET=GBK", "text/html;charset=GBK", "GBK");

    // Legacy comment syntax
    assert_parse_and_encoding("text/html;charset=gbk(", "text/html;charset=\"gbk(\"", None);
    assert_parse_and_encoding(
        "text/html;x=(;charset=gbk",
        "text/html;x=\"(\";charset=gbk",
        "GBK",
    );

    // Duplicate parameter
    assert_parse_and_encoding(
        "text/html;charset=gbk;charset=windows-1255",
        "text/html;charset=gbk",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=();charset=GBK",
        "text/html;charset=\"()\"",
        None,
    );

    // Spaces
    assert_parse_and_encoding("text/html;charset =gbk", "text/html", None);
    assert_parse_and_encoding("text/html ;charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html; charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;charset= gbk",
        "text/html;charset=\" gbk\"",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset= \"gbk\"",
        "text/html;charset=\" \\\"gbk\\\"\"",
        None,
    );

    // 0x0B and 0x0C
    assert_parse_and_encoding("text/html;charset=\u{000B}gbk", "text/html", None);
    assert_parse_and_encoding("text/html;charset=\u{000C}gbk", "text/html", None);
    assert_parse_and_encoding("text/html;\u{000B}charset=gbk", "text/html", None);
    assert_parse_and_encoding("text/html;\u{000C}charset=gbk", "text/html", None);

    // Single quotes are a token, not a delimiter
    assert_parse_and_encoding("text/html;charset='gbk'", "text/html;charset='gbk'", None);
    assert_parse_and_encoding("text/html;charset='gbk", "text/html;charset='gbk", None);
    assert_parse_and_encoding("text/html;charset=gbk'", "text/html;charset=gbk'", None);
    assert_parse_and_encoding(
        "text/html;charset=';charset=GBK",
        "text/html;charset='",
        None,
    );

    // Invalid parameters
    assert_parse_and_encoding("text/html;test;charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;test=;charset=gbk",
        "text/html;charset=gbk",
        "GBK",
    );
    assert_parse_and_encoding("text/html;';charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html;\";charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html ; ; charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html;;;;charset=gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;charset= \"\u{007F};charset=GBK",
        "text/html;charset=GBK",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=\"\u{007F};charset=foo\";charset=GBK",
        "text/html;charset=GBK",
        "GBK",
    );

    // Double quotes
    assert_parse_and_encoding("text/html;charset=\"gbk\"", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding("text/html;charset=\"gbk", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;charset=gbk\"",
        "text/html;charset=\"gbk\\\"\"",
        None,
    );
    assert_parse_and_encoding(
        "text/html;charset=\" gbk\"",
        "text/html;charset=\" gbk\"",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=\"gbk \"",
        "text/html;charset=\"gbk \"",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=\"\\ gbk\"",
        "text/html;charset=\" gbk\"",
        "GBK",
    );
    assert_parse_and_encoding(
        "text/html;charset=\"\\g\\b\\k\"",
        "text/html;charset=gbk",
        "GBK",
    );
    assert_parse_and_encoding("text/html;charset=\"gbk\"x", "text/html;charset=gbk", "GBK");
    assert_parse_and_encoding(
        "text/html;charset=\"\";charset=GBK",
        "text/html;charset=\"\"",
        None,
    );
    assert_parse_and_encoding(
        "text/html;charset=\";charset=GBK",
        "text/html;charset=\";charset=GBK\"",
        None,
    );

    // Unexpected code points
    assert_parse_and_encoding(
        "text/html;charset={gbk}",
        "text/html;charset=\"{gbk}\"",
        None,
    );

    // Parameter name longer than 127
    assert_parse_and_encoding("text/html;0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789=x;charset=gbk", "text/html;0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789=x;charset=gbk", "GBK");

    // type/subtype longer than 127
    assert_parse("0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789/0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789", "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789/0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789");

    // Valid
    assert_parse("!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz/!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz;!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz=!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", "!#$%&'*+-.^_`|~0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz/!#$%&'*+-.^_`|~0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz;!#$%&'*+-.^_`|~0123456789abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz=!#$%&'*+-.^_`|~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");

    // Every code point that is valid inside a quoted value must round-trip unchanged, so the
    // input doubles as the expected output. The literal is written once; each trailing `\`
    // continues it on the next line without adding any character to the string.
    let all_quoted_code_points = "x/x;x=\"\t !\\\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\
        \u{0080}\u{0081}\u{0082}\u{0083}\u{0084}\u{0085}\u{0086}\u{0087}\u{0088}\u{0089}\u{008A}\u{008B}\u{008C}\u{008D}\u{008E}\u{008F}\
        \u{0090}\u{0091}\u{0092}\u{0093}\u{0094}\u{0095}\u{0096}\u{0097}\u{0098}\u{0099}\u{009A}\u{009B}\u{009C}\u{009D}\u{009E}\u{009F}\
        \u{00A0}\u{00A1}\u{00A2}\u{00A3}\u{00A4}\u{00A5}\u{00A6}\u{00A7}\u{00A8}\u{00A9}\u{00AA}\u{00AB}\u{00AC}\u{00AD}\u{00AE}\u{00AF}\
        \u{00B0}\u{00B1}\u{00B2}\u{00B3}\u{00B4}\u{00B5}\u{00B6}\u{00B7}\u{00B8}\u{00B9}\u{00BA}\u{00BB}\u{00BC}\u{00BD}\u{00BE}\u{00BF}\
        \u{00C0}\u{00C1}\u{00C2}\u{00C3}\u{00C4}\u{00C5}\u{00C6}\u{00C7}\u{00C8}\u{00C9}\u{00CA}\u{00CB}\u{00CC}\u{00CD}\u{00CE}\u{00CF}\
        \u{00D0}\u{00D1}\u{00D2}\u{00D3}\u{00D4}\u{00D5}\u{00D6}\u{00D7}\u{00D8}\u{00D9}\u{00DA}\u{00DB}\u{00DC}\u{00DD}\u{00DE}\u{00DF}\
        \u{00E0}\u{00E1}\u{00E2}\u{00E3}\u{00E4}\u{00E5}\u{00E6}\u{00E7}\u{00E8}\u{00E9}\u{00EA}\u{00EB}\u{00EC}\u{00ED}\u{00EE}\u{00EF}\
        \u{00F0}\u{00F1}\u{00F2}\u{00F3}\u{00F4}\u{00F5}\u{00F6}\u{00F7}\u{00F8}\u{00F9}\u{00FA}\u{00FB}\u{00FC}\u{00FD}\u{00FE}\u{00FF}\"";
    assert_parse(all_quoted_code_points, all_quoted_code_points);

    // End-of-file handling
    assert_parse("x/x;test", "x/x");
    assert_parse("x/x;test=\"\\", "x/x;test=\"\\\\\"");

    // Whitespace (not handled by generated-mime-types.json or above)
    assert_parse("x/x;x= ", "x/x");
    assert_parse("x/x;x=\t", "x/x");
    assert_parse("x/x\n\r\t ;x=x", "x/x;x=x");
    assert_parse("\n\r\t x/x;x=x\n\r\t ", "x/x;x=x");
    assert_parse("x/x;\n\r\t x=x\n\r\t ;x=y", "x/x;x=x");

    // Latin1
    assert_parse_and_encoding(
        "text/html;test=\u{00FF};charset=gbk",
        "text/html;test=\"\u{00FF}\";charset=gbk",
        "GBK",
    );

    // >Latin1
    assert_parse("x/x;test=\u{FFFD};x=x", "x/x;x=x");

    // Failure
    assert_fails("\u{000B}x/x");
    assert_fails("\u{000C}x/x");
    assert_fails("x/x\u{000B}");
    assert_fails("x/x\u{000C}");
    assert_fails("");
    assert_fails("\t");
    assert_fails("/");
    assert_fails("bogus");
    assert_fails("bogus/");
    assert_fails("bogus/ ");
    assert_fails("bogus/bogus/;");
    assert_fails("</>");
    assert_fails("(/)");
    assert_fails("ÿ/ÿ");
    assert_fails("text/html(;doesnot=matter");
    assert_fails("{/}");
    assert_fails("\u{0100}/\u{0100}");
    assert_fails("text /html");
    assert_fails("text/ html");
    assert_fails("\"text/html\"");
}