Simplifies OxTTL lexer buffer management

pull/583/head
Authored by Tpt 1 year ago, committed by Thomas Tanon
parent 4f7445104a
commit afaabf6110
1 changed file: lib/oxttl/src/toolkit/lexer.rs (83 changed lines)
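
The commit removes the lexer's separate `end` cursor: the unconsumed input is now just `data[start..]`, with `Vec::len()` marking the end of valid bytes, and the renamed `shrink_data` unconditionally compacts the already-consumed prefix before new input is appended. As a rough illustration of that bookkeeping, here is a minimal standalone sketch; the `Buf` type and its `push`/`consume`/`unread` methods are invented for the example and are not the oxttl API.

    // Illustrative sketch only: track unconsumed bytes as data[start..],
    // so no separate `end` index is needed as long as the Vec is kept
    // truncated to the valid data.
    struct Buf {
        data: Vec<u8>,
        start: usize, // first unconsumed byte; everything before it is already lexed
    }

    impl Buf {
        fn new() -> Self {
            Self { data: Vec::new(), start: 0 }
        }

        /// Drop the consumed prefix so `start` goes back to 0 (same idea as `shrink_data`).
        fn shrink(&mut self) {
            if self.start > 0 {
                self.data.copy_within(self.start.., 0);
                self.data.truncate(self.data.len() - self.start);
                self.start = 0;
            }
        }

        /// Append new input; the unread region stays `data[start..]`.
        fn push(&mut self, bytes: &[u8]) {
            self.shrink();
            self.data.extend_from_slice(bytes);
        }

        /// Pretend the lexer consumed `n` bytes.
        fn consume(&mut self, n: usize) {
            self.start += n;
        }

        fn unread(&self) -> &[u8] {
            &self.data[self.start..]
        }
    }

    fn main() {
        let mut buf = Buf::new();
        buf.push(b"PREFIX ex: <http://example.com/> ");
        buf.consume(7); // e.g. after the "PREFIX " keyword has been tokenized
        buf.push(b"ex:s ex:p ex:o .");
        assert!(buf.unread().starts_with(b"ex:"));
        println!("{} unread bytes remain", buf.unread().len());
    }

Dropping `end` removes any risk of the two cursors drifting apart, at the cost of truncating the `Vec` after every read so that `len()` always marks the end of valid data.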

--- a/lib/oxttl/src/toolkit/lexer.rs
+++ b/lib/oxttl/src/toolkit/lexer.rs
@@ -1,4 +1,5 @@
 use memchr::memchr2;
+use std::cmp::min;
 use std::error::Error;
 use std::fmt;
 use std::io::{self, Read};
@@ -56,7 +57,6 @@ pub struct Lexer<R: TokenRecognizer> {
     parser: R,
     data: Vec<u8>,
     start: usize,
-    end: usize,
     is_ending: bool,
     position: usize,
     min_buffer_size: usize,
@@ -77,7 +77,6 @@ impl<R: TokenRecognizer> Lexer<R> {
             parser,
             data: Vec::new(),
             start: 0,
-            end: 0,
             is_ending: false,
             position: 0,
             min_buffer_size,
@@ -88,10 +87,8 @@ impl<R: TokenRecognizer> Lexer<R> {
     }

     pub fn extend_from_slice(&mut self, other: &[u8]) {
-        self.shrink_if_useful();
-        self.data.truncate(self.end);
+        self.shrink_data();
         self.data.extend_from_slice(other);
-        self.end = self.data.len();
     }

     #[inline]
@@ -100,26 +97,25 @@ impl<R: TokenRecognizer> Lexer<R> {
     }

     pub fn extend_from_read(&mut self, read: &mut impl Read) -> io::Result<()> {
-        self.shrink_if_useful();
-        let min_end = self.end + self.min_buffer_size;
-        if min_end > self.max_buffer_size {
+        self.shrink_data();
+        if self.data.len() == self.max_buffer_size {
             return Err(io::Error::new(
                 io::ErrorKind::OutOfMemory,
                 format!(
-                    "The buffer maximal size is {} < {min_end}",
+                    "Reached the buffer maximal size of {}",
                     self.max_buffer_size
                 ),
             ));
         }
-        if self.data.len() < min_end {
-            self.data.resize(min_end, 0);
-        }
+        let min_end = min(self.data.len() + self.min_buffer_size, self.max_buffer_size);
+        let new_start = self.data.len();
+        self.data.resize(min_end, 0);
         if self.data.len() < self.data.capacity() {
             // We keep extending to have as much space as available without reallocation
             self.data.resize(self.data.capacity(), 0);
         }
-        let read = read.read(&mut self.data[self.end..])?;
-        self.end += read;
+        let read = read.read(&mut self.data[new_start..])?;
+        self.data.truncate(new_start + read);
         self.is_ending = read == 0;
         Ok(())
     }
@@ -129,26 +125,25 @@ impl<R: TokenRecognizer> Lexer<R> {
         &mut self,
         read: &mut (impl AsyncRead + Unpin),
     ) -> io::Result<()> {
-        self.shrink_if_useful();
-        let min_end = self.end + self.min_buffer_size;
-        if min_end > self.max_buffer_size {
+        self.shrink_data();
+        if self.data.len() == self.max_buffer_size {
             return Err(io::Error::new(
                 io::ErrorKind::OutOfMemory,
                 format!(
-                    "The buffer maximal size is {} < {min_end}",
+                    "Reached the buffer maximal size of {}",
                     self.max_buffer_size
                 ),
             ));
         }
-        if self.data.len() < min_end {
-            self.data.resize(min_end, 0);
-        }
+        let min_end = min(self.data.len() + self.min_buffer_size, self.max_buffer_size);
+        let new_start = self.data.len();
+        self.data.resize(min_end, 0);
         if self.data.len() < self.data.capacity() {
             // We keep extending to have as much space as available without reallocation
             self.data.resize(self.data.capacity(), 0);
         }
-        let read = read.read(&mut self.data[self.end..]).await?;
-        self.end += read;
+        let read = read.read(&mut self.data[new_start..]).await?;
+        self.data.truncate(new_start + read);
         self.is_ending = read == 0;
         Ok(())
     }
@@ -158,22 +153,21 @@ impl<R: TokenRecognizer> Lexer<R> {
         options: &R::Options,
     ) -> Option<Result<TokenWithPosition<R::Token<'_>>, LexerError>> {
         self.skip_whitespaces_and_comments()?;
-        let (consumed, result) = if let Some(r) = self.parser.recognize_next_token(
-            &self.data[self.start..self.end],
-            self.is_ending,
-            options,
-        ) {
+        let (consumed, result) = if let Some(r) =
+            self.parser
+                .recognize_next_token(&self.data[self.start..], self.is_ending, options)
+        {
             r
         } else {
             return if self.is_ending {
-                if self.start == self.end {
+                if self.start == self.data.len() {
                     None // We have finished
                 } else {
                     let error = LexerError {
-                        position: self.position..self.position + (self.end - self.start),
+                        position: self.position..self.position + (self.data.len() - self.start),
                         message: "Unexpected end of file".into(),
                     };
-                    self.end = self.start; // We consume everything
+                    self.start = self.data.len(); // We consume everything
                     Some(Err(error))
                 }
             } else {
@@ -185,9 +179,9 @@ impl<R: TokenRecognizer> Lexer<R> {
             "The lexer must consume at least one byte each time"
         );
         debug_assert!(
-            self.start + consumed <= self.end,
+            self.start + consumed <= self.data.len(),
             "The lexer tried to consumed {consumed} bytes but only {} bytes are readable",
-            self.end - self.start
+            self.data.len() - self.start
         );
         let old_position = self.position;
         self.start += consumed;
@@ -205,14 +199,14 @@ impl<R: TokenRecognizer> Lexer<R> {
     }

     pub fn is_end(&self) -> bool {
-        self.is_ending && self.end == self.start
+        self.is_ending && self.data.len() == self.start
     }

     fn skip_whitespaces_and_comments(&mut self) -> Option<()> {
         loop {
             self.skip_whitespaces();

-            let buf = &self.data[self.start..self.end];
+            let buf = &self.data[self.start..];
             if let Some(line_comment_start) = self.line_comment_start {
                 if buf.starts_with(line_comment_start) {
                     // Comment
@@ -222,7 +216,7 @@ impl<R: TokenRecognizer> Lexer<R> {
                         continue;
                     }
                     if self.is_ending {
-                        self.end = self.start; // EOF
+                        self.start = self.data.len(); // EOF
                         return Some(());
                     }
                     return None; // We need more data
@@ -234,7 +228,7 @@ impl<R: TokenRecognizer> Lexer<R> {

     fn skip_whitespaces(&mut self) {
         if self.is_line_jump_whitespace {
-            for (i, c) in self.data[self.start..self.end].iter().enumerate() {
+            for (i, c) in self.data[self.start..].iter().enumerate() {
                 if !matches!(c, b' ' | b'\t' | b'\r' | b'\n') {
                     self.start += i;
                     self.position += i;
@@ -243,7 +237,7 @@ impl<R: TokenRecognizer> Lexer<R> {
                 //TODO: SIMD
             }
         } else {
-            for (i, c) in self.data[self.start..self.end].iter().enumerate() {
+            for (i, c) in self.data[self.start..].iter().enumerate() {
                 if !matches!(c, b' ' | b'\t') {
                     self.start += i;
                     self.position += i;
@@ -253,15 +247,14 @@ impl<R: TokenRecognizer> Lexer<R> {
             }
         }
         // We only have whitespaces
-        self.position += self.end - self.start;
-        self.end = self.start;
+        self.position += self.data.len() - self.start;
+        self.start = self.data.len();
     }

-    fn shrink_if_useful(&mut self) {
-        if self.start * 2 > self.data.len() {
-            // We have read more than half of the buffer, let's move the data to the beginning
-            self.data.copy_within(self.start..self.end, 0);
-            self.end -= self.start;
+    fn shrink_data(&mut self) {
+        if self.start > 0 {
+            self.data.copy_within(self.start.., 0);
+            self.data.truncate(self.data.len() - self.start);
             self.start = 0;
         }
     }
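
On the read path (`extend_from_read` and its Tokio variant above), the same idea shows up as a resize-read-truncate cycle on the `Vec`: grow with zeros up to at most `max_buffer_size` (and up to the existing capacity), read into the new tail, then truncate back to what was actually filled so that `len()` again marks the end of valid data. Below is a standalone sketch of that pattern; the `MIN_CHUNK`/`MAX_BUFFER` constants and the `fill_from_read` helper are hypothetical names for the example, not the real `Lexer` fields or methods.

    use std::cmp::min;
    use std::io::{self, Read};

    // Hypothetical sizing constants for the sketch (not oxttl's defaults).
    const MIN_CHUNK: usize = 4096;
    const MAX_BUFFER: usize = 4096 * 4096;

    /// Grow `data` with zeros, read into the new tail, then truncate to the bytes
    /// actually written. Returns the number of bytes read (0 means end of input).
    fn fill_from_read(data: &mut Vec<u8>, read: &mut impl Read) -> io::Result<usize> {
        if data.len() == MAX_BUFFER {
            return Err(io::Error::new(
                io::ErrorKind::OutOfMemory,
                format!("Reached the buffer maximal size of {}", MAX_BUFFER),
            ));
        }
        let new_start = data.len();
        data.resize(min(new_start + MIN_CHUNK, MAX_BUFFER), 0);
        if data.len() < data.capacity() {
            // Use all already-allocated space to avoid an extra reallocation.
            let cap = data.capacity();
            data.resize(cap, 0);
        }
        let n = read.read(&mut data[new_start..])?;
        data.truncate(new_start + n); // drop the zero padding that was never filled
        Ok(n)
    }

    fn main() -> io::Result<()> {
        let mut input: &[u8] = b"@prefix ex: <http://example.com/> .";
        let mut data = Vec::new();
        let n = fill_from_read(&mut data, &mut input)?;
        assert_eq!(data, b"@prefix ex: <http://example.com/> .");
        println!("read {n} bytes");
        Ok(())
    }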
