#[cfg(test)]
#[cfg(feature = "std")]
mod test;
mod token;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
use winnow::stream::AsBStr as _;
use winnow::stream::ContainsToken as _;
use winnow::stream::FindSlice as _;
use winnow::stream::Location;
use winnow::stream::Stream as _;
use crate::Span;
pub use token::Token;
pub use token::TokenKind;
/// A lexer over an input string, yielding [`Token`]s via [`Iterator`] and
/// ending with a single [`TokenKind::Eof`] token.
pub struct Lexer<'i> {
// Remaining input; tracks byte offsets so token spans can be built.
stream: Stream<'i>,
// Set once the EOF token has been emitted, so iteration then returns `None`.
eof: bool,
}
impl<'i> Lexer<'i> {
    /// Creates a lexer over `input`, skipping a leading UTF-8 BOM if present.
    ///
    /// The BOM is skipped by advancing the stream (not by re-slicing the
    /// input), so byte offsets in later spans stay relative to `input`.
    pub(crate) fn new(input: &'i str) -> Self {
        let mut stream = Stream::new(input);
        if input.as_bytes().starts_with(BOM) {
            let bom_len = BOM.len();
            #[cfg(feature = "unsafe")]
            unsafe {
                // SAFETY: `input` starts with the full BOM, so `bom_len` is in
                // bounds and lands on a char boundary.
                stream.next_slice_unchecked(bom_len)
            };
            #[cfg(not(feature = "unsafe"))]
            stream.next_slice(bom_len);
        }
        Lexer {
            stream,
            eof: false,
        }
    }

    /// Drains the lexer into a `Vec` of tokens (including the trailing EOF).
    #[cfg(feature = "alloc")]
    pub fn into_vec(self) -> Vec<Token> {
        #![allow(unused_qualifications)]
        // Cap the pre-allocation so `capacity * size_of::<Token>()` cannot
        // overflow, even for pathological input lengths.
        let max_tokens = usize::MAX / core::mem::size_of::<Token>();
        let capacity = core::cmp::min(self.stream.len(), max_tokens);
        let mut tokens = Vec::with_capacity(capacity);
        tokens.extend(self);
        tokens
    }
}
impl Iterator for Lexer<'_> {
    type Item = Token;

    /// Produces the next token, emitting one zero-width EOF token when the
    /// input runs out and `None` thereafter.
    fn next(&mut self) -> Option<Self::Item> {
        match self.stream.as_bstr().first() {
            Some(&byte) => Some(process_token(byte, &mut self.stream)),
            None if self.eof => None,
            None => {
                // First time the input is exhausted: emit EOF exactly once.
                self.eof = true;
                let pos = self.stream.current_token_start();
                Some(Token::new(TokenKind::Eof, Span::new_unchecked(pos, pos)))
            }
        }
    }
}
/// UTF-8 byte-order mark; skipped at the start of input by `Lexer::new`.
const BOM: &[u8] = b"\xEF\xBB\xBF";
/// Input stream type: a string slice that tracks byte offsets for span
/// construction.
pub(crate) type Stream<'i> = winnow::stream::LocatingSlice<&'i str>;
/// Dispatches on the peeked byte to the lexing routine for that token kind.
///
/// `peek_byte` must be the first byte of `stream` (the caller has already
/// peeked it without consuming); the chosen routine consumes the token's
/// bytes from `stream` and returns the finished token.
fn process_token(peek_byte: u8, stream: &mut Stream<'_>) -> Token {
    match peek_byte {
        // Single-byte punctuation tokens.
        b'.' => lex_ascii_char(stream, TokenKind::Dot),
        b'=' => lex_ascii_char(stream, TokenKind::Equals),
        b',' => lex_ascii_char(stream, TokenKind::Comma),
        b'[' => lex_ascii_char(stream, TokenKind::LeftSquareBracket),
        b']' => lex_ascii_char(stream, TokenKind::RightSquareBracket),
        b'{' => lex_ascii_char(stream, TokenKind::LeftCurlyBracket),
        b'}' => lex_ascii_char(stream, TokenKind::RightCurlyBracket),
        // A run of spaces/tabs becomes one whitespace token.
        b' ' | b'\t' => lex_whitespace(stream),
        b'#' => lex_comment(stream),
        b'\r' => lex_crlf(stream),
        b'\n' => lex_ascii_char(stream, TokenKind::Newline),
        // Literal strings: `'''` opens the multi-line form.
        b'\'' if stream.starts_with(ML_LITERAL_STRING_DELIM) => lex_ml_literal_string(stream),
        b'\'' => lex_literal_string(stream),
        // Basic strings: `"""` opens the multi-line form.
        b'"' if stream.starts_with(ML_BASIC_STRING_DELIM) => lex_ml_basic_string(stream),
        b'"' => lex_basic_string(stream),
        // Everything else begins a bare atom.
        _ => lex_atom(stream),
    }
}
/// Consumes exactly one byte from the stream and tags it with `kind`.
fn lex_ascii_char(stream: &mut Stream<'_>, kind: TokenKind) -> Token {
    debug_assert!(!stream.is_empty());
    let span_start = stream.current_token_start();
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: the stream is non-empty, and callers only dispatch ASCII
        // bytes here, so advancing by one stays on a char boundary.
        stream.next_slice_unchecked(1)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(1);
    let span_end = stream.previous_token_end();
    Token::new(kind, Span::new_unchecked(span_start, span_end))
}
/// Consumes a maximal run of space/tab bytes as one whitespace token.
fn lex_whitespace(stream: &mut Stream<'_>) -> Token {
    debug_assert!(!stream.is_empty());
    let span_start = stream.current_token_start();
    // Length of the whitespace run; the whole remainder if nothing else follows.
    let run_len = match stream.as_bstr().offset_for(|b| !WSCHAR.contains_token(b)) {
        Some(n) => n,
        None => stream.eof_offset(),
    };
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: `run_len` is an offset returned for this slice and covers
        // only ASCII space/tab bytes, so it is in bounds on a char boundary.
        stream.next_slice_unchecked(run_len)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(run_len);
    let span = Span::new_unchecked(span_start, stream.previous_token_end());
    Token::new(TokenKind::Whitespace, span)
}
pub(crate) const WSCHAR: (u8, u8) = (b' ', b'\t');
/// Consumes a comment: everything up to (but not including) the next CR or
/// LF, or to end of input.
fn lex_comment(stream: &mut Stream<'_>) -> Token {
    let span_start = stream.current_token_start();
    // The line terminator is left in the stream for the next token.
    let len = match stream.as_bytes().find_slice((b'\r', b'\n')) {
        Some(found) => found.start,
        None => stream.eof_offset(),
    };
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: `len` is either EOF or the start of an ASCII line
        // terminator found in this slice, so it is a valid char boundary.
        stream.next_slice_unchecked(len)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(len);
    let span = Span::new_unchecked(span_start, stream.previous_token_end());
    Token::new(TokenKind::Comment, span)
}
pub(crate) const COMMENT_START_SYMBOL: u8 = b'#';
/// Consumes a `\r` — plus a directly following `\n` when present — as one
/// newline token. A lone `\r` still yields a newline token of length one.
fn lex_crlf(stream: &mut Stream<'_>) -> Token {
    let span_start = stream.current_token_start();
    let len = if stream.as_bstr().get(1) == Some(&b'\n') {
        "\r\n".len()
    } else {
        '\r'.len_utf8()
    };
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: the stream starts with `\r`, and `len` is 2 only when the
        // next byte is `\n`; both are ASCII, so the offset is a char boundary.
        stream.next_slice_unchecked(len)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(len);
    let span = Span::new_unchecked(span_start, stream.previous_token_end());
    Token::new(TokenKind::Newline, span)
}
/// Lexes a single-line literal string (`'...'`) with the stream positioned on
/// the opening apostrophe.
///
/// The closing apostrophe is included in the span. A bare `\n` or end of
/// input also stops the scan (leaving the token unterminated); the newline is
/// not consumed.
fn lex_literal_string(stream: &mut Stream<'_>) -> Token {
    let span_start = stream.current_token_start();
    // Step past the opening apostrophe.
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: the caller dispatched on a leading `'`, an ASCII byte.
        stream.next_slice_unchecked(1)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(1);
    // Whichever of `'` / `\n` comes first decides where the token ends.
    let len = match stream.as_bstr().find_slice((APOSTROPHE, b'\n')) {
        Some(found) if stream.as_bstr()[found.start] == APOSTROPHE => found.end,
        Some(found) => found.start,
        None => stream.eof_offset(),
    };
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: `len` is EOF or an offset of an ASCII byte found in this
        // slice, so it is in bounds on a char boundary.
        stream.next_slice_unchecked(len)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(len);
    let span = Span::new_unchecked(span_start, stream.previous_token_end());
    Token::new(TokenKind::LiteralString, span)
}
pub(crate) const APOSTROPHE: u8 = b'\'';
/// Lexes a multi-line literal string (`'''...'''`) with the stream positioned
/// on the opening `'''`.
///
/// After the closing `'''`, up to two extra `'` bytes are consumed so that
/// content ending in quotes (`''''` / `'''''` terminators) stays inside the
/// token. With no closing delimiter, the rest of the input is consumed.
fn lex_ml_literal_string(stream: &mut Stream<'_>) -> Token {
let start = stream.current_token_start();
// Step past the opening `'''` delimiter.
let offset = ML_LITERAL_STRING_DELIM.len();
#[cfg(feature = "unsafe")] unsafe {
// SAFETY: the caller checked `starts_with(ML_LITERAL_STRING_DELIM)`, so
// `offset` is in bounds and the delimiter is ASCII (char boundary).
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
// Find the closing `'''`; consume through its end, or to EOF if absent.
let offset = match stream.as_bstr().find_slice(ML_LITERAL_STRING_DELIM) {
Some(span) => span.end,
None => stream.eof_offset(),
};
#[cfg(feature = "unsafe")]
unsafe {
// SAFETY: `offset` is EOF or the end of an ASCII delimiter found in
// this slice, so it is in bounds on a char boundary.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
// Up to two trailing apostrophes belong to the string (content ending in
// `'` or `''` immediately before the closing delimiter).
if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
let offset = 1;
#[cfg(feature = "unsafe")] unsafe {
// SAFETY: the next byte was just peeked and is ASCII `'`.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
if stream.as_bstr().peek_token() == Some(APOSTROPHE) {
let offset = 1;
#[cfg(feature = "unsafe")]
unsafe {
// SAFETY: same as above — peeked ASCII `'`.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
}
}
let end = stream.previous_token_end();
let span = Span::new_unchecked(start, end);
Token::new(TokenKind::MlLiteralString, span)
}
pub(crate) const ML_LITERAL_STRING_DELIM: &str = "'''";
/// Lexes a single-line basic string (`"..."`) with the stream positioned on
/// the opening quotation mark.
///
/// The closing quote is included in the span. A `\` followed by `\` or `"`
/// is consumed as an escape so `\"` cannot terminate the string. Scanning
/// also stops at a bare `\n` (excluded from the span, leaving the token
/// unterminated) or at end of input.
fn lex_basic_string(stream: &mut Stream<'_>) -> Token {
    let start = stream.current_token_start();
    // Step past the opening quote.
    let offset = 1;
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: the caller dispatched on a leading `"`, an ASCII byte.
        stream.next_slice_unchecked(offset)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(offset);
    loop {
        // The next interesting byte is a quote, an escape, or a newline.
        match stream.as_bstr().find_slice((QUOTATION_MARK, ESCAPE, b'\n')) {
            Some(span) => {
                let found = stream.as_bstr()[span.start];
                if found == QUOTATION_MARK {
                    // Closing quote: consume it and finish the token.
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    break;
                } else if found == ESCAPE {
                    // Consume up to and including the backslash.
                    let offset = span.end;
                    #[cfg(feature = "unsafe")]
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    // If the escaped byte is `\` or `"`, consume it too so it
                    // cannot be misread as a terminator on the next pass.
                    let peek = stream.as_bstr().peek_token();
                    match peek {
                        Some(ESCAPE) | Some(QUOTATION_MARK) => {
                            let offset = 1;
                            // Note: the original had this `#[cfg]` attribute
                            // duplicated; once is sufficient (and matches
                            // `lex_ml_basic_string`).
                            #[cfg(feature = "unsafe")]
                            unsafe {
                                stream.next_slice_unchecked(offset)
                            };
                            #[cfg(not(feature = "unsafe"))]
                            stream.next_slice(offset);
                        }
                        _ => {}
                    }
                    continue;
                } else if found == b'\n' {
                    // Unterminated string: stop just before the newline.
                    let offset = span.start;
                    #[cfg(feature = "unsafe")]
                    unsafe {
                        stream.next_slice_unchecked(offset)
                    };
                    #[cfg(not(feature = "unsafe"))]
                    stream.next_slice(offset);
                    break;
                } else {
                    // `find_slice` only matches one of the three bytes above.
                    unreachable!("found `{found}`");
                }
            }
            None => {
                // No terminator at all: consume the rest of the input.
                stream.finish();
                break;
            }
        }
    }
    let end = stream.previous_token_end();
    let span = Span::new_unchecked(start, end);
    Token::new(TokenKind::BasicString, span)
}
/// Delimiter byte for basic strings (`"`).
pub(crate) const QUOTATION_MARK: u8 = b'"';
/// Escape byte (`\`) used inside basic strings.
pub(crate) const ESCAPE: u8 = b'\\';
/// Lexes a multi-line basic string (`"""..."""`) with the stream positioned
/// on the opening `"""`.
///
/// A `\` followed by `\` or `"` is consumed as an escape so `\"""` cannot end
/// the string early. After the closing `"""`, up to two extra `"` bytes are
/// consumed so content ending in quotes (`""""` / `"""""` terminators) stays
/// inside the token. With no closing delimiter, the rest of the input is
/// consumed. Unlike `lex_basic_string`, newlines do not stop the scan.
fn lex_ml_basic_string(stream: &mut Stream<'_>) -> Token {
let start = stream.current_token_start();
// Step past the opening `"""` delimiter.
let offset = ML_BASIC_STRING_DELIM.len();
#[cfg(feature = "unsafe")] unsafe {
// SAFETY: the caller checked `starts_with(ML_BASIC_STRING_DELIM)`, so
// `offset` is in bounds and the delimiter is ASCII (char boundary).
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
loop {
// The next interesting position is a closing `"""` or an escape `\`.
match stream.as_bstr().find_slice((ML_BASIC_STRING_DELIM, "\\")) {
Some(span) => {
let found = stream.as_bstr()[span.start];
if found == QUOTATION_MARK {
// Closing `"""`: consume through its end and stop.
let offset = span.end;
#[cfg(feature = "unsafe")]
unsafe {
// SAFETY: `span.end` bounds an ASCII match in this slice.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
break;
} else if found == ESCAPE {
// Consume up to and including the backslash.
let offset = span.end;
#[cfg(feature = "unsafe")]
unsafe {
// SAFETY: `span.end` bounds an ASCII match in this slice.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
// If the escaped byte is `\` or `"`, consume it too so it cannot
// be misread as part of a terminator on the next pass.
let peek = stream.as_bstr().peek_token();
match peek {
Some(ESCAPE) | Some(QUOTATION_MARK) => {
let offset = 1; #[cfg(feature = "unsafe")]
unsafe {
// SAFETY: the next byte was just peeked and is ASCII.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
}
_ => {}
}
continue;
} else {
// `find_slice` only matches `"""` or `\`, whose first bytes are
// handled above.
unreachable!("found `{found}`");
}
}
None => {
// No terminator at all: consume the rest of the input.
stream.finish();
break;
}
}
}
// Up to two trailing quotes belong to the string (content ending in `"` or
// `""` immediately before the closing delimiter).
if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
let offset = 1;
#[cfg(feature = "unsafe")]
unsafe {
// SAFETY: the next byte was just peeked and is ASCII `"`.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
if stream.as_bstr().peek_token() == Some(QUOTATION_MARK) {
let offset = 1;
#[cfg(feature = "unsafe")]
unsafe {
// SAFETY: same as above — peeked ASCII `"`.
stream.next_slice_unchecked(offset)
};
#[cfg(not(feature = "unsafe"))]
stream.next_slice(offset);
}
}
let end = stream.previous_token_end();
let span = Span::new_unchecked(start, end);
Token::new(TokenKind::MlBasicString, span)
}
pub(crate) const ML_BASIC_STRING_DELIM: &str = "\"\"\"";
/// Consumes a maximal run of bytes that cannot start any other token kind
/// (punctuation, whitespace, comment, newline, or string delimiters other
/// than `'`/`"` mid-run), yielding it as a single atom token.
fn lex_atom(stream: &mut Stream<'_>) -> Token {
    // Bytes that end an atom because they begin a different token.
    const TOKEN_START: &[u8] = b".=,[]{} \t#\r\n";
    let span_start = stream.current_token_start();
    let len = match stream.as_bstr().offset_for(|b| TOKEN_START.contains_token(b)) {
        Some(n) => n,
        None => stream.eof_offset(),
    };
    #[cfg(feature = "unsafe")]
    unsafe {
        // SAFETY: `len` is EOF or the offset of an ASCII byte found in this
        // slice, so it is in bounds on a char boundary.
        stream.next_slice_unchecked(len)
    };
    #[cfg(not(feature = "unsafe"))]
    stream.next_slice(len);
    let span = Span::new_unchecked(span_start, stream.previous_token_end());
    Token::new(TokenKind::Atom, span)
}