@ -1,9 +1,10 @@
use memchr ::memchr2 ;
use crate ::toolkit ::error ::{ SyntaxError , TextPosition } ;
use memchr ::{ memchr2 , memchr2_iter } ;
use std ::borrow ::Cow ;
use std ::cmp ::min ;
use std ::cmp ::min ;
use std ::error ::Error ;
use std ::fmt ;
use std ::io ::{ self , Read } ;
use std ::io ::{ self , Read } ;
use std ::ops ::{ Range , RangeInclusive } ;
use std ::ops ::{ Range , RangeInclusive } ;
use std ::str ;
#[ cfg(feature = " async-tokio " ) ]
#[ cfg(feature = " async-tokio " ) ]
use tokio ::io ::{ AsyncRead , AsyncReadExt } ;
use tokio ::io ::{ AsyncRead , AsyncReadExt } ;
@ -22,14 +23,14 @@ pub trait TokenRecognizer {
}
}
/// Error produced by a token recognizer when the input cannot be tokenized.
pub struct TokenRecognizerError {
    /// Byte range of the faulty input, expressed relative to the start of the
    /// slice that was handed to the recognizer (the lexer later rebases it onto
    /// its own buffer to build a line/column location).
    pub location: Range<usize>,
    /// Human-readable description of the problem.
    pub message: String,
}
impl < S : Into < String > > From < ( Range < usize > , S ) > for TokenRecognizerError {
impl < S : Into < String > > From < ( Range < usize > , S ) > for TokenRecognizerError {
fn from ( ( posi tion, message ) : ( Range < usize > , S ) ) -> Self {
fn from ( ( loca tion, message ) : ( Range < usize > , S ) ) -> Self {
Self {
Self {
posi tion,
loca tion,
message : message . into ( ) ,
message : message . into ( ) ,
}
}
}
}
@ -37,34 +38,37 @@ impl<S: Into<String>> From<(Range<usize>, S)> for TokenRecognizerError {
#[ allow(clippy::range_plus_one) ]
#[ allow(clippy::range_plus_one) ]
impl < S : Into < String > > From < ( RangeInclusive < usize > , S ) > for TokenRecognizerError {
impl < S : Into < String > > From < ( RangeInclusive < usize > , S ) > for TokenRecognizerError {
fn from ( ( posi tion, message ) : ( RangeInclusive < usize > , S ) ) -> Self {
fn from ( ( loca tion, message ) : ( RangeInclusive < usize > , S ) ) -> Self {
( * posi tion. start ( ) .. * posi tion. end ( ) + 1 , message ) . into ( )
( * loca tion. start ( ) .. * loca tion. end ( ) + 1 , message ) . into ( )
}
}
}
}
impl < S : Into < String > > From < ( usize , S ) > for TokenRecognizerError {
impl < S : Into < String > > From < ( usize , S ) > for TokenRecognizerError {
fn from ( ( posi tion, message ) : ( usize , S ) ) -> Self {
fn from ( ( loca tion, message ) : ( usize , S ) ) -> Self {
( position ..= posi tion, message ) . into ( )
( location ..= loca tion, message ) . into ( )
}
}
}
}
/// A recognized token paired with the range it covers.
///
/// In the revision of `read_next` that returns this type, `position` is built as
/// `old_position..self.position`, i.e. the lexer's running byte counter before
/// and after the token was consumed.
pub struct TokenWithPosition < T > {
/// The token produced by the recognizer.
pub token : T ,
/// Half-open range of the lexer position counter spanned by the token.
pub position : Range < usize > ,
}
pub struct Lexer < R : TokenRecognizer > {
pub struct Lexer < R : TokenRecognizer > {
parser : R ,
parser : R ,
data : Vec < u8 > ,
data : Vec < u8 > ,
start : usize ,
position : Position ,
previous_position : Position , // Lexer position before the last emitted token
is_ending : bool ,
is_ending : bool ,
position : usize ,
min_buffer_size : usize ,
min_buffer_size : usize ,
max_buffer_size : usize ,
max_buffer_size : usize ,
is_line_jump_whitespace : bool ,
is_line_jump_whitespace : bool ,
line_comment_start : Option < & ' static [ u8 ] > ,
line_comment_start : Option < & ' static [ u8 ] > ,
}
}
/// Snapshot of where the lexer stands, both inside the current buffer and in
/// the overall input.
#[ derive(Clone, Copy) ]
struct Position {
// Index in the buffer of the first byte of the current line; columns are
// computed from the bytes between this index and `buffer_offset`.
line_start_buffer_offset : usize ,
// Index in the buffer of the next byte to process.
buffer_offset : usize ,
// Number of bytes consumed since the lexer was created (starts at 0).
global_offset : u64 ,
// Number of line jumps seen since the lexer was created (starts at 0).
global_line : u64 ,
}
impl < R : TokenRecognizer > Lexer < R > {
impl < R : TokenRecognizer > Lexer < R > {
pub fn new (
pub fn new (
parser : R ,
parser : R ,
@ -76,9 +80,19 @@ impl<R: TokenRecognizer> Lexer<R> {
Self {
Self {
parser ,
parser ,
data : Vec ::new ( ) ,
data : Vec ::new ( ) ,
start : 0 ,
position : Position {
line_start_buffer_offset : 0 ,
buffer_offset : 0 ,
global_offset : 0 ,
global_line : 0 ,
} ,
previous_position : Position {
line_start_buffer_offset : 0 ,
buffer_offset : 0 ,
global_offset : 0 ,
global_line : 0 ,
} ,
is_ending : false ,
is_ending : false ,
position : 0 ,
min_buffer_size ,
min_buffer_size ,
max_buffer_size ,
max_buffer_size ,
is_line_jump_whitespace ,
is_line_jump_whitespace ,
@ -148,24 +162,43 @@ impl<R: TokenRecognizer> Lexer<R> {
Ok ( ( ) )
Ok ( ( ) )
}
}
pub fn read_next (
#[ allow(clippy::unwrap_in_result) ]
& mut self ,
pub fn read_next ( & mut self , options : & R ::Options ) -> Option < Result < R ::Token < ' _ > , SyntaxError > > {
options : & R ::Options ,
) -> Option < Result < TokenWithPosition < R ::Token < ' _ > > , LexerError > > {
self . skip_whitespaces_and_comments ( ) ? ;
self . skip_whitespaces_and_comments ( ) ? ;
let Some ( ( consumed , result ) ) =
self . previous_position = self . position ;
self . parser
let Some ( ( consumed , result ) ) = self . parser . recognize_next_token (
. recognize_next_token ( & self . data [ self . start .. ] , self . is_ending , options )
& self . data [ self . position . buffer_offset .. ] ,
else {
self . is_ending ,
options ,
) else {
return if self . is_ending {
return if self . is_ending {
if self . start = = self . data . len ( ) {
if self . po si tion . buffe r_offse t = = self . data . len ( ) {
None // We have finished
None // We have finished
} else {
} else {
let error = LexerError {
let ( new_line_jumps , new_line_start ) =
position : self . position .. self . position + ( self . data . len ( ) - self . start ) ,
Self ::find_number_of_line_jumps_and_start_of_last_line (
& self . data [ self . position . buffer_offset .. ] ,
) ;
if new_line_jumps > 0 {
self . position . line_start_buffer_offset =
self . position . buffer_offset + new_line_start ;
}
self . position . global_offset + =
u64 ::try_from ( self . data . len ( ) - self . position . buffer_offset ) . unwrap ( ) ;
self . position . buffer_offset = self . data . len ( ) ;
self . position . global_line + = new_line_jumps ;
let new_position = TextPosition {
line : self . position . global_line ,
column : Self ::column_from_bytes (
& self . data [ self . position . line_start_buffer_offset .. ] ,
) ,
offset : self . position . global_offset ,
} ;
let error = SyntaxError {
location : new_position .. new_position ,
message : "Unexpected end of file" . into ( ) ,
message : "Unexpected end of file" . into ( ) ,
} ;
} ;
self . start = self . data . len ( ) ; // We consume everything
self . po si tion . buffe r_offse t = self . data . len ( ) ; // We consume everything
Some ( Err ( error ) )
Some ( Err ( error ) )
}
}
} else {
} else {
@ -177,44 +210,119 @@ impl<R: TokenRecognizer> Lexer<R> {
"The lexer must consume at least one byte each time"
"The lexer must consume at least one byte each time"
) ;
) ;
debug_assert! (
debug_assert! (
self . sta rt + consumed < = self . data . len ( ) ,
self . po si tion . buffe r_offse t + consumed < = self . data . len ( ) ,
"The lexer tried to consumed {consumed} bytes but only {} bytes are readable" ,
"The lexer tried to consumed {consumed} bytes but only {} bytes are readable" ,
self . data . len ( ) - self . start
self . data . len ( ) - self . position . buffer_offset
) ;
let ( new_line_jumps , new_line_start ) =
Self ::find_number_of_line_jumps_and_start_of_last_line (
& self . data [ self . position . buffer_offset .. self . position . buffer_offset + consumed ] ,
) ;
) ;
let old_position = self . position ;
if new_line_jumps > 0 {
self . start + = consumed ;
self . position . line_start_buffer_offset = self . position . buffer_offset + new_line_start ;
self . position + = consumed ;
}
Some ( match result {
self . position . buffer_offset + = consumed ;
Ok ( token ) = > Ok ( TokenWithPosition {
self . position . global_offset + = u64 ::try_from ( consumed ) . unwrap ( ) ;
token ,
self . position . global_line + = new_line_jumps ;
position : old_position .. self . position ,
Some ( result . map_err ( | e | SyntaxError {
} ) ,
location : self . location_from_buffer_offset_range ( e . location ) ,
Err ( e ) = > Err ( LexerError {
position : e . position . start + self . position .. e . position . end + self . position ,
message : e . message ,
message : e . message ,
} ) ,
} ) )
} )
}
/// Converts a byte range relative to the start of the last lexed token into a
/// range of line/column/offset text positions.
///
/// Both ends are resolved the same way: the bytes between the token start and
/// the requested offset are scanned for line jumps to find the line number and
/// the start of the line, then the column is counted from that line start.
///
/// Panics if the range reaches past the buffered data.
pub fn location_from_buffer_offset_range(
    &self,
    offset_range: Range<usize>,
) -> Range<TextPosition> {
    let base = self.previous_position;
    let resolve = |relative_offset: usize| {
        let absolute_offset = base.buffer_offset + relative_offset;
        let (extra_line_jumps, last_line_start) =
            Self::find_number_of_line_jumps_and_start_of_last_line(
                &self.data[base.buffer_offset..absolute_offset],
            );
        // Without any line jump in between we are still on the token's first line
        let line_start = if extra_line_jumps > 0 {
            last_line_start + base.buffer_offset
        } else {
            base.line_start_buffer_offset
        };
        TextPosition {
            line: base.global_line + extra_line_jumps,
            column: Self::column_from_bytes(&self.data[line_start..absolute_offset]),
            offset: base.global_offset + u64::try_from(relative_offset).unwrap(),
        }
    };
    resolve(offset_range.start)..resolve(offset_range.end)
}
pub fn last_token_location ( & self ) -> Range < TextPosition > {
TextPosition {
line : self . previous_position . global_line ,
column : Self ::column_from_bytes (
& self . data [ self . previous_position . line_start_buffer_offset
.. self . previous_position . buffer_offset ] ,
) ,
offset : self . previous_position . global_offset ,
} .. TextPosition {
line : self . position . global_line ,
column : Self ::column_from_bytes (
& self . data [ self . position . line_start_buffer_offset .. self . position . buffer_offset ] ,
) ,
offset : self . position . global_offset ,
}
}
pub fn last_token_source ( & self ) -> Cow < ' _ , str > {
String ::from_utf8_lossy (
& self . data [ self . previous_position . buffer_offset .. self . position . buffer_offset ] ,
)
}
}
pub fn is_end ( & self ) -> bool {
pub fn is_end ( & self ) -> bool {
self . is_ending & & self . data . len ( ) = = self . start
self . is_ending & & self . data . len ( ) = = self . po si tion . buffe r_offse t
}
}
#[ allow(clippy::unwrap_in_result) ]
fn skip_whitespaces_and_comments ( & mut self ) -> Option < ( ) > {
fn skip_whitespaces_and_comments ( & mut self ) -> Option < ( ) > {
loop {
loop {
self . skip_whitespaces ( ) ;
self . skip_whitespaces ( ) ? ;
let buf = & self . data [ self . start .. ] ;
let buf = & self . data [ self . po si tion . buffe r_offse t.. ] ;
if let Some ( line_comment_start ) = self . line_comment_start {
if let Some ( line_comment_start ) = self . line_comment_start {
if buf . starts_with ( line_comment_start ) {
if buf . starts_with ( line_comment_start ) {
// Comment
// Comment
if let Some ( end ) = memchr2 ( b'\r' , b'\n' , & buf [ line_comment_start . len ( ) .. ] ) {
if let Some ( end ) = memchr2 ( b'\r' , b'\n' , & buf [ line_comment_start . len ( ) .. ] ) {
self . start + = end + line_comment_start . len ( ) ;
let mut end_position = line_comment_start . len ( ) + end ;
self . position + = end + line_comment_start . len ( ) ;
if buf . get ( end_position ) . copied ( ) = = Some ( b'\r' ) {
// We look for \n for Windows line end style
if let Some ( c ) = buf . get ( end_position + 1 ) {
if * c = = b'\n' {
end_position + = 1 ;
}
} else if ! self . is_ending {
return None ; // We need to read more
}
}
let comment_size = end_position + 1 ;
self . position . buffer_offset + = comment_size ;
self . position . line_start_buffer_offset = self . position . buffer_offset ;
self . position . global_offset + = u64 ::try_from ( comment_size ) . unwrap ( ) ;
self . position . global_line + = 1 ;
continue ;
continue ;
}
}
if self . is_ending {
if self . is_ending {
self . start = self . data . len ( ) ; // EOF
self . po si tion . buffe r_offse t = self . data . len ( ) ; // EOF
return Some ( ( ) ) ;
return Some ( ( ) ) ;
}
}
return None ; // We need more data
return None ; // We need more data
@ -224,80 +332,98 @@ impl<R: TokenRecognizer> Lexer<R> {
}
}
}
}
fn skip_whitespaces ( & mut self ) {
fn skip_whitespaces ( & mut self ) -> Option < ( ) > {
if self . is_line_jump_whitespace {
if self . is_line_jump_whitespace {
for ( i , c ) in self . data [ self . start .. ] . iter ( ) . enumerate ( ) {
let mut i = self . position . buffer_offset ;
if ! matches! ( c , b' ' | b'\t' | b'\r' | b'\n' ) {
while let Some ( c ) = self . data . get ( i ) {
self . start + = i ;
match c {
self . position + = i ;
b' ' | b'\t' = > {
return ;
self . position . buffer_offset + = 1 ;
}
self . position . global_offset + = 1 ;
}
b'\r' = > {
// We look for \n for Windows line end style
let mut increment : u8 = 1 ;
if let Some ( c ) = self . data . get ( i + 1 ) {
if * c = = b'\n' {
increment + = 1 ;
i + = 1 ;
}
} else if ! self . is_ending {
return None ; // We need to read more
}
self . position . buffer_offset + = usize ::from ( increment ) ;
self . position . line_start_buffer_offset = self . position . buffer_offset ;
self . position . global_offset + = u64 ::from ( increment ) ;
self . position . global_line + = 1 ;
}
b'\n' = > {
self . position . buffer_offset + = 1 ;
self . position . line_start_buffer_offset = self . position . buffer_offset ;
self . position . global_offset + = 1 ;
self . position . global_line + = 1 ;
}
_ = > return Some ( ( ) ) ,
}
i + = 1 ;
//TODO: SIMD
//TODO: SIMD
}
}
} else {
} else {
for ( i , c ) in self . data [ self . start .. ] . iter ( ) . enumerate ( ) {
for c in & self . data [ self . position . buffer_offset .. ] {
if ! matches! ( c , b' ' | b'\t' ) {
if matches! ( c , b' ' | b'\t' ) {
self . start + = i ;
self . position . buffer_offset + = 1 ;
self . position + = i ;
self . position . global_offset + = 1 ;
return ;
} else {
return Some ( ( ) ) ;
}
}
//TODO: SIMD
//TODO: SIMD
}
}
}
}
// We only have whitespaces
Some ( ( ) )
self . position + = self . data . len ( ) - self . start ;
self . start = self . data . len ( ) ;
}
}
fn shrink_data ( & mut self ) {
fn shrink_data ( & mut self ) {
if self . start > 0 {
if self . position . line_start_buffer_offset > 0 {
self . data . copy_within ( self . start .. , 0 ) ;
self . data
self . data . truncate ( self . data . len ( ) - self . start ) ;
. copy_within ( self . position . line_start_buffer_offset .. , 0 ) ;
self . start = 0 ;
self . data
. truncate ( self . data . len ( ) - self . position . line_start_buffer_offset ) ;
self . position . buffer_offset - = self . position . line_start_buffer_offset ;
self . position . line_start_buffer_offset = 0 ;
self . previous_position = self . position ;
}
}
}
}
}
/// Error returned by the lexer: a byte range together with a message.
#[ derive(Debug) ]
pub struct LexerError {
// Half-open byte range the error applies to.
position : Range < usize > ,
// Human-readable description of the error.
message : String ,
}
impl LexerError {
fn find_number_of_line_jumps_and_start_of_last_line ( bytes : & [ u8 ] ) -> ( u64 , usize ) {
pub fn position ( & self ) -> Range < usize > {
let mut num_of_jumps = 0 ;
self . position . clone ( )
let mut last_jump_pos = 0 ;
let mut previous_cr = 0 ;
for pos in memchr2_iter ( b'\r' , b'\n' , bytes ) {
if bytes [ pos ] = = b'\r' {
previous_cr = pos ;
num_of_jumps + = 1 ;
last_jump_pos = pos + 1 ;
} else {
if previous_cr < pos - 1 {
// We count \r\n as a single line jump
num_of_jumps + = 1 ;
}
}
last_jump_pos = pos + 1 ;
pub fn message ( & self ) -> & str {
& self . message
}
}
pub fn into_message ( self ) -> String {
self . message
}
}
}
( num_of_jumps , last_jump_pos )
}
impl fmt ::Display for LexerError {
fn column_from_bytes ( bytes : & [ u8 ] ) -> u64 {
fn fmt ( & self , f : & mut fmt ::Formatter < ' _ > ) -> fmt ::Result {
match str ::from_utf8 ( bytes ) {
if self . position . start + 1 = = self . position . end {
Ok ( s ) = > u64 ::try_from ( s . chars ( ) . count ( ) ) . unwrap ( ) ,
write! (
Err ( e ) = > {
f ,
if e . valid_up_to ( ) = = 0 {
"Lexer error at byte {}: {}" ,
0
self . position . start , self . message
)
} else {
} else {
write! (
Self ::column_from_bytes ( & bytes [ .. e . valid_up_to ( ) ] )
f ,
}
"Lexer error between bytes {} and {}: {}" ,
self . position . start , self . position . end , self . message
)
}
}
}
}
}
/// Makes `LexerError` usable as a standard error type.
impl Error for LexerError {
// Exposes the message through `Error::description`.
// NOTE(review): `description` is deprecated in the standard library in favour
// of `Display`; kept as-is since callers may still rely on it.
fn description ( & self ) -> & str {
self . message ( )
}
}
}
}