#[cfg(test)]
#[cfg(feature = "std")]
mod test;
mod token;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use winnow::stream::AsBStr as _;
use winnow::stream::ContainsToken as _;
use winnow::stream::FindSlice as _;
use winnow::stream::Location;
use winnow::stream::Stream as _;
use crate::Span;
pub use token::Token;
pub use token::TokenKind;
/// A lexer over an input string, yielding [`Token`]s via [`Iterator`] and
/// ending with a single [`TokenKind::Eof`] token.
pub struct Lexer<'i> {
// Remaining input; tracks byte offsets so token spans can be built.
stream: Stream<'i>,
// Set once the EOF token has been emitted, so iteration then returns `None`.
eof: bool,
}
impl<'i> Lexer<'i> {
    /// Creates a lexer over `input`, skipping a leading UTF-8 BOM if present.
    ///
    /// The BOM is skipped by advancing the stream (not by re-slicing the
    /// input), so byte offsets in later spans stay relative to `input`.
    pub(crate) fn new(input: &'i str) -> Self {
        let mut stream = Stream::new(input);
        if input.as_bytes().starts_with(BOM) {
            let bom_len = BOM.len();
            #[cfg(feature = "unsafe")]
            unsafe {
                // SAFETY: `input` starts with the full BOM, so `bom_len` is in
                // bounds and lands on a char boundary.
                stream.next_slice_unchecked(bom_len)
            };
            #[cfg(not(feature = "unsafe"))]
            stream.next_slice(bom_len);
        }
        Lexer {
            stream,
            eof: false,
        }
    }

    /// Drains the lexer into a `Vec` of tokens (including the trailing EOF).
    #[cfg(feature = "alloc")]
    pub fn into_vec(self) -> Vec<Token> {
        #![allow(unused_qualifications)]
        // Cap the pre-allocation so `capacity * size_of::<Token>()` cannot
        // overflow, even for pathological input lengths.
        let max_tokens = usize::MAX / core::mem::size_of::<Token>();
        let capacity = core::cmp::min(self.stream.len(), max_tokens);
        let mut tokens = Vec::with_capacity(capacity);
        tokens.extend(self);
        tokens
    }
}
impl Iterator for Lexer<'_> {
    type Item = Token;

    /// Produces the next token, emitting one zero-width EOF token when the
    /// input runs out and `None` thereafter.
    fn next(&mut self) -> Option<Self::Item> {
        match self.stream.as_bstr().first() {
            Some(&byte) => Some(process_token(byte, &mut self.stream)),
            None if self.eof => None,
            None => {
                // First time the input is exhausted: emit EOF exactly once.
                self.eof = true;
                let pos = self.stream.current_token_start();
                Some(Token::new(TokenKind::Eof, Span::new_unchecked(pos, pos)))
            }
        }
    }
}
/// UTF-8 byte-order mark; skipped at the start of input by `Lexer::new`.
const BOM: &[u8] = b"\xEF\xBB\xBF";
/// Input stream type: a string slice that tracks byte offsets for span
/// construction.
pub(crate) type Stream<'i> = winnow::stream::LocatingSlice<&'i str>;
/// Dispatches on the peeked byte to the lexing routine for that token kind.
///
/// `peek_byte` must be the first byte of `stream` (the caller has already
/// peeked it without consuming); the chosen routine consumes the token's
/// bytes from `stream` and returns the finished token.
fn process_token(peek_byte: u8, stream: &mut Stream<'_>) -> Token {
    match peek_byte {
        // Single-byte punctuation tokens.
        b'.' => lex_ascii_char(stream, TokenKind::Dot),
        b'=' => lex_ascii_char(stream, TokenKind::Equals),
        b',' => lex_ascii_char(stream, TokenKind::Comma),
        b'[' => lex_ascii_char(stream, TokenKind::LeftSquareBracket),
        b']' => lex_ascii_char(stream, TokenKind::RightSquareBracket),
        b'{' => lex_ascii_char(stream, TokenKind::LeftCurlyBracket),
        b'}' => lex_ascii_char(stream, TokenKind::RightCurlyBracket),
        // A run of spaces/tabs becomes one whitespace token.
        b' ' | b'\t' => lex_whitespace(stream),
        b'#' => lex_comment(stream),
        b'\r' => lex_crlf(stream),
        b'\n' => lex_ascii_char(stream, TokenKind::Newline),
        // Literal strings: `'''` opens the multi-line form.
        b'\'' if stream.starts_with(ML_LITERAL_STRING_DELIM) => lex_ml_literal_string(stream),
        b'\'' => lex_literal_string(stream),
        // Basic strings: `"""` opens the multi-line form.
        b'"' if stream.starts_with(ML_BASIC_STRING_DELIM) => lex_ml_basic_string(stream),
        b'"' => lex_basic_string(stream),
        // Everything else begins a bare atom.
        _ => lex_atom(stream),
    }
}
/// Consumes exactly one byte from the stream and tags it with `kind`.
fn lex_ascii_char(stream: &mut Stream<'_>, kind: TokenKind) -> Token {
    debug_assert!(!stream.is_empty());
    let span_start = stream.current_token_start();
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: the stream is non-empty, and callers only dispatch ASCII
        // bytes here, so advancing by one stays on a char boundary.
        stream.next_slice_unchecked(1)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(1);
    let span_end = stream.previous_token_end();
    Token::new(kind, Span::new_unchecked(span_start, span_end))
}
/// Consumes a maximal run of space/tab bytes as one whitespace token.
fn lex_whitespace(stream: &mut Stream<'_>) -> Token {
    debug_assert!(!stream.is_empty());
    let span_start = stream.current_token_start();
    // Length of the whitespace run; the whole remainder if nothing else follows.
    let run_len = match stream.as_bstr().offset_for(|b| !WSCHAR.contains_token(b)) {
        Some(n) => n,
        None => stream.eof_offset(),
    };
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: `run_len` is an offset returned for this slice and covers
        // only ASCII space/tab bytes, so it is in bounds on a char boundary.
        stream.next_slice_unchecked(run_len)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(run_len);
    let span = Span::new_unchecked(span_start, stream.previous_token_end());
    Token::new(TokenKind::Whitespace, span)
}
pub(crate) const WSCHAR: (u8, u8) = (b' ', b'\t');
/// Consumes a comment: everything up to (but not including) the next CR or
/// LF, or to end of input.
fn lex_comment(stream: &mut Stream<'_>) -> Token {
    let span_start = stream.current_token_start();
    // The line terminator is left in the stream for the next token.
    let len = match stream.as_bytes().find_slice((b'\r', b'\n')) {
        Some(found) => found.start,
        None => stream.eof_offset(),
    };
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: `len` is either EOF or the start of an ASCII line
        // terminator found in this slice, so it is a valid char boundary.
        stream.next_slice_unchecked(len)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(len);
    let span = Span::new_unchecked(span_start, stream.previous_token_end());
    Token::new(TokenKind::Comment, span)
}
pub(crate) const COMMENT_START_SYMBOL: u8 = b'#';
/// Consumes a `\r` — plus a directly following `\n` when present — as one
/// newline token. A lone `\r` still yields a newline token of length one.
fn lex_crlf(stream: &mut Stream<'_>) -> Token {
    let span_start = stream.current_token_start();
    let len = if stream.as_bstr().get(1) == Some(&b'\n') {
        "\r\n".len()
    } else {
        '\r'.len_utf8()
    };
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: the stream starts with `\r`, and `len` is 2 only when the
        // next byte is `\n`; both are ASCII, so the offset is a char boundary.
        stream.next_slice_unchecked(len)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(len);
    let span = Span::new_unchecked(span_start, stream.previous_token_end());
    Token::new(TokenKind::Newline, span)
}
/// Lexes a single-line literal string (`'...'`) with the stream positioned on
/// the opening apostrophe.
///
/// The closing apostrophe is included in the span. A bare `\n` or end of
/// input also stops the scan (leaving the token unterminated); the newline is
/// not consumed.
fn lex_literal_string(stream: &mut Stream<'_>) -> Token {
    let span_start = stream.current_token_start();
    // Step past the opening apostrophe.
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: the caller dispatched on a leading `'`, an ASCII byte.
        stream.next_slice_unchecked(1)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(1);
    // Whichever of `'` / `\n` comes first decides where the token ends.
    let len = match stream.as_bstr().find_slice((APOSTROPHE, b'\n')) {
        Some(found) if stream.as_bstr()[found.start] == APOSTROPHE => found.end,
        Some(found) => found.start,
        None => stream.eof_offset(),
    };
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: `len` is EOF or an offset of an ASCII byte found in this
        // slice, so it is in bounds on a char boundary.
        stream.next_slice_unchecked(len)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(len);
    let span = Span::new_unchecked(span_start, stream.previous_token_end());
    Token::new(TokenKind::LiteralString, span)
}
pub(crate) const APOSTROPHE: u8 = b'\'';
/// Lexes a multi-line literal string (`'''...'''`) with the stream positioned
/// on the opening `'''`.
///
/// After the closing `'''`, up to two extra `'` bytes are consumed so that
/// content ending in quotes (`''''` / `'''''` terminators) stays inside the
/// token. With no closing delimiter, the rest of the input is consumed.
fn lex_ml_literal_string(stream: &mut Stream<'_>) -> Token {
let start = stream.current_token_start();
// Step past the opening `'''` delimiter.
let offset = ML_LITERAL_STRING_DELIM.len();
#[cfg(feature = "unsafe")] unsafe {
// SAFETY: the caller checked `starts_with(ML_LITERAL_STRING_DELIM)`, so
// `offset` is in bounds and the delimiter is ASCII (char boundary).
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
// Find the closing `'''`; consume through its end, or to EOF if absent.
let offset = match stream.as_bstr().find_slice(ML_LITERAL_STRING_DELIM) {
Some(span) => span.end,
None => stream.eof_offset(),
};
#[cfg(feature = "unsafe")]
unsafe {
// SAFETY: `offset` is EOF or the end of an ASCII delimiter found in
// this slice, so it is in bounds on a char boundary.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
// Up to two trailing apostrophes belong to the string (content ending in
// `'` or `''` immediately before the closing delimiter).
if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
let offset = 1;
#[cfg(feature = "unsafe")] unsafe {
// SAFETY: the next byte was just peeked and is ASCII `'`.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
let offset = 1;
#[cfg(feature = "unsafe")]
unsafe {
// SAFETY: same as above — peeked ASCII `'`.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
}
}
let end = stream.previous_token_end();
let span = Span::new_unchecked(start, end);
Token::new(TokenKind::MlLiteralString, span)
}
pub(crate) const ML_LITERAL_STRING_DELIM: &str = "'''";
/// Lexes a single-line basic string (`"..."`) with the stream positioned on
/// the opening quotation mark.
///
/// The closing quote is included in the span. A `\` followed by `\` or `"`
/// is consumed as an escape so `\"` cannot terminate the string. Scanning
/// also stops at a bare `\n` (excluded from the span, leaving the token
/// unterminated) or at end of input.
fn lex_basic_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();
    // Step past the opening quote.
    let offset = 1;
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: the caller dispatched on a leading `"`, an ASCII byte.
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);
    loop {
        // The next interesting byte is a quote, an escape, or a newline.
        match stream.as_bstr().find_slice((QUOTATION_MARK, ESCAPE, b'\n')) {
            Some(span) => {
                let found = stream.as_bstr()[span.start];
                if found == QUOTATION_MARK {
                    // Closing quote: consume it and finish the token.
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    break;
                } else if found == ESCAPE {
                    // Consume up to and including the backslash.
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    // If the escaped byte is `\` or `"`, consume it too so it
                    // cannot be misread as a terminator on the next pass.
                    let peek = stream.as_bstr().peek_token();
                    match peek {
                        Some(ESCAPE) | Some(QUOTATION_MARK) => {
                            let offset = 1;
                            // Note: the original had this `#[cfg]` attribute
                            // duplicated; once is sufficient (and matches
                            // `lex_ml_basic_string`).
                            #[cfg(feature = "unsafe")]
                            unsafe {
                                stream.next_slice_unchecked(offset)
                            };
                            #[cfg(not(feature = "unsafe"))]
                            stream.next_slice(offset);
                        }
                        _ => {}
                    }
                    continue;
                } else if found == b'\n' {
                    // Unterminated string: stop just before the newline.
                    let offset = span.start;
                    #[cfg(feature = "unsafe")]
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    break;
                } else {
                    // `find_slice` only matches one of the three bytes above.
                    unreachable!("found `{found}`");
                }
            }
            None => {
                // No terminator at all: consume the rest of the input.
                stream.finish();
                break;
            }
        }
    }
    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::BasicString, span)
}
/// Delimiter byte for basic strings (`"`).
pub(crate) const QUOTATION_MARK: u8 = b'"';
/// Escape byte (`\`) used inside basic strings.
pub(crate) const ESCAPE: u8 = b'\\';
/// Lexes a multi-line basic string (`"""..."""`) with the stream positioned
/// on the opening `"""`.
///
/// A `\` followed by `\` or `"` is consumed as an escape so `\"""` cannot end
/// the string early. After the closing `"""`, up to two extra `"` bytes are
/// consumed so content ending in quotes (`""""` / `"""""` terminators) stays
/// inside the token. With no closing delimiter, the rest of the input is
/// consumed. Unlike `lex_basic_string`, newlines do not stop the scan.
fn lex_ml_basic_string(stream: &mut Stream<'_>) -> Token {
let start = stream.current_token_start();
// Step past the opening `"""` delimiter.
let offset = ML_BASIC_STRING_DELIM.len();
#[cfg(feature = "unsafe")] unsafe {
// SAFETY: the caller checked `starts_with(ML_BASIC_STRING_DELIM)`, so
// `offset` is in bounds and the delimiter is ASCII (char boundary).
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
loop {
// The next interesting position is a closing `"""` or an escape `\`.
match stream.as_bstr().find_slice((ML_BASIC_STRING_DELIM, "\\")) {
Some(span) => {
let found = stream.as_bstr()[span.start];
if found == QUOTATION_MARK {
// Closing `"""`: consume through its end and stop.
let offset = span.end;
#[cfg(feature = "unsafe")]
unsafe {
// SAFETY: `span.end` bounds an ASCII match in this slice.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
break;
} else if found == ESCAPE {
// Consume up to and including the backslash.
let offset = span.end;
#[cfg(feature = "unsafe")]
unsafe {
// SAFETY: `span.end` bounds an ASCII match in this slice.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
// If the escaped byte is `\` or `"`, consume it too so it cannot
// be misread as part of a terminator on the next pass.
let peek = stream.as_bstr().peek_token();
match peek {
Some(ESCAPE) | Some(QUOTATION_MARK) => {
let offset = 1; #[cfg(feature = "unsafe")]
unsafe {
// SAFETY: the next byte was just peeked and is ASCII.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
}
_ => {}
}
continue;
} else {
// `find_slice` only matches `"""` or `\`, whose first bytes are
// handled above.
unreachable!("found `{found}`");
}
}
None => {
// No terminator at all: consume the rest of the input.
stream.finish();
break;
}
}
}
// Up to two trailing quotes belong to the string (content ending in `"` or
// `""` immediately before the closing delimiter).
if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
let offset = 1;
#[cfg(feature = "unsafe")]
unsafe {
// SAFETY: the next byte was just peeked and is ASCII `"`.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
let offset = 1;
#[cfg(feature = "unsafe")]
unsafe {
// SAFETY: same as above — peeked ASCII `"`.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
}
}
let end = stream.previous_token_end();
let span = Span::new_unchecked(start, end);
Token::new(TokenKind::MlBasicString, span)
}
pub(crate) const ML_BASIC_STRING_DELIM: &str = "\"\"\"";
/// Consumes a maximal run of bytes that cannot start any other token kind
/// (punctuation, whitespace, comment, newline, or string delimiters other
/// than `'`/`"` mid-run), yielding it as a single atom token.
fn lex_atom(stream: &mut Stream<'_>) -> Token {
    // Bytes that end an atom because they begin a different token.
    const TOKEN_START: &[u8] = b".=,[]{} \t#\r\n";
    let span_start = stream.current_token_start();
    let len = match stream.as_bstr().offset_for(|b| TOKEN_START.contains_token(b)) {
        Some(n) => n,
        None => stream.eof_offset(),
    };
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: `len` is EOF or the offset of an ASCII byte found in this
        // slice, so it is in bounds on a char boundary.
        stream.next_slice_unchecked(len)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(len);
    let span = Span::new_unchecked(span_start, stream.previous_token_end());
    Token::new(TokenKind::Atom, span)
}