From afaabf6110c10121613e89daf10ab7bdb2c4242c Mon Sep 17 00:00:00 2001
From: Tpt
Date: Fri, 4 Aug 2023 18:22:49 +0200
Subject: [PATCH] Simplifies OxTTL lexer buffer management

---
 lib/oxttl/src/toolkit/lexer.rs | 83 ++++++++++++++++------------------
 1 file changed, 38 insertions(+), 45 deletions(-)

diff --git a/lib/oxttl/src/toolkit/lexer.rs b/lib/oxttl/src/toolkit/lexer.rs
index 664e35fe..5b980f94 100644
--- a/lib/oxttl/src/toolkit/lexer.rs
+++ b/lib/oxttl/src/toolkit/lexer.rs
@@ -1,4 +1,5 @@
 use memchr::memchr2;
+use std::cmp::min;
 use std::error::Error;
 use std::fmt;
 use std::io::{self, Read};
@@ -56,7 +57,6 @@ pub struct Lexer<R: TokenRecognizer> {
     parser: R,
     data: Vec<u8>,
     start: usize,
-    end: usize,
     is_ending: bool,
     position: usize,
     min_buffer_size: usize,
@@ -77,7 +77,6 @@ impl<R: TokenRecognizer> Lexer<R> {
             parser,
             data: Vec::new(),
             start: 0,
-            end: 0,
             is_ending: false,
             position: 0,
             min_buffer_size,
@@ -88,10 +87,8 @@ impl<R: TokenRecognizer> Lexer<R> {
     }
 
     pub fn extend_from_slice(&mut self, other: &[u8]) {
-        self.shrink_if_useful();
-        self.data.truncate(self.end);
+        self.shrink_data();
         self.data.extend_from_slice(other);
-        self.end = self.data.len();
     }
 
     #[inline]
@@ -100,26 +97,25 @@ impl<R: TokenRecognizer> Lexer<R> {
     }
 
     pub fn extend_from_read(&mut self, read: &mut impl Read) -> io::Result<()> {
-        self.shrink_if_useful();
-        let min_end = self.end + self.min_buffer_size;
-        if min_end > self.max_buffer_size {
+        self.shrink_data();
+        if self.data.len() == self.max_buffer_size {
             return Err(io::Error::new(
                 io::ErrorKind::OutOfMemory,
                 format!(
-                    "The buffer maximal size is {} < {min_end}",
+                    "Reached the buffer maximal size of {}",
                     self.max_buffer_size
                 ),
             ));
         }
-        if self.data.len() < min_end {
-            self.data.resize(min_end, 0);
-        }
+        let min_end = min(self.data.len() + self.min_buffer_size, self.max_buffer_size);
+        let new_start = self.data.len();
+        self.data.resize(min_end, 0);
         if self.data.len() < self.data.capacity() {
             // We keep extending to have as much space as available without reallocation
             self.data.resize(self.data.capacity(), 0);
         }
-        let read = read.read(&mut self.data[self.end..])?;
-        self.end += read;
+        let read = read.read(&mut self.data[new_start..])?;
+        self.data.truncate(new_start + read);
         self.is_ending = read == 0;
         Ok(())
     }
@@ -129,26 +125,25 @@ impl<R: TokenRecognizer> Lexer<R> {
         &mut self,
         read: &mut (impl AsyncRead + Unpin),
     ) -> io::Result<()> {
-        self.shrink_if_useful();
-        let min_end = self.end + self.min_buffer_size;
-        if min_end > self.max_buffer_size {
+        self.shrink_data();
+        if self.data.len() == self.max_buffer_size {
             return Err(io::Error::new(
                 io::ErrorKind::OutOfMemory,
                 format!(
-                    "The buffer maximal size is {} < {min_end}",
+                    "Reached the buffer maximal size of {}",
                     self.max_buffer_size
                 ),
             ));
         }
-        if self.data.len() < min_end {
-            self.data.resize(min_end, 0);
-        }
+        let min_end = min(self.data.len() + self.min_buffer_size, self.max_buffer_size);
+        let new_start = self.data.len();
+        self.data.resize(min_end, 0);
         if self.data.len() < self.data.capacity() {
             // We keep extending to have as much space as available without reallocation
             self.data.resize(self.data.capacity(), 0);
         }
-        let read = read.read(&mut self.data[self.end..]).await?;
-        self.end += read;
+        let read = read.read(&mut self.data[new_start..]).await?;
+        self.data.truncate(new_start + read);
         self.is_ending = read == 0;
         Ok(())
     }
@@ -158,22 +153,21 @@ impl<R: TokenRecognizer> Lexer<R> {
         options: &R::Options,
     ) -> Option<Result<R::Token<'_>, LexerError>> {
         self.skip_whitespaces_and_comments()?;
-        let (consumed, result) = if let Some(r) = self.parser.recognize_next_token(
-            &self.data[self.start..self.end],
-            self.is_ending,
-            options,
-        ) {
+        let (consumed, result) = if let Some(r) =
+            self.parser
+                .recognize_next_token(&self.data[self.start..], self.is_ending, options)
+        {
             r
         } else {
             return if self.is_ending {
-                if self.start == self.end {
+                if self.start == self.data.len() {
                     None // We have finished
                 } else {
                     let error = LexerError {
-                        position: self.position..self.position + (self.end - self.start),
+                        position: self.position..self.position + (self.data.len() - self.start),
                         message: "Unexpected end of file".into(),
                     };
-                    self.end = self.start; // We consume everything
+                    self.start = self.data.len(); // We consume everything
                     Some(Err(error))
                 }
             } else {
@@ -185,9 +179,9 @@
             "The lexer must consume at least one byte each time"
         );
         debug_assert!(
-            self.start + consumed <= self.end,
+            self.start + consumed <= self.data.len(),
             "The lexer tried to consumed {consumed} bytes but only {} bytes are readable",
-            self.end - self.start
+            self.data.len() - self.start
         );
         let old_position = self.position;
         self.start += consumed;
@@ -205,14 +199,14 @@
     }
 
     pub fn is_end(&self) -> bool {
-        self.is_ending && self.end == self.start
+        self.is_ending && self.data.len() == self.start
     }
 
     fn skip_whitespaces_and_comments(&mut self) -> Option<()> {
         loop {
             self.skip_whitespaces();
 
-            let buf = &self.data[self.start..self.end];
+            let buf = &self.data[self.start..];
             if let Some(line_comment_start) = self.line_comment_start {
                 if buf.starts_with(line_comment_start) {
                     // Comment
@@ -222,7 +216,7 @@
                     continue;
                 }
                 if self.is_ending {
-                    self.end = self.start; // EOF
+                    self.start = self.data.len(); // EOF
                     return Some(());
                 }
                 return None; // We need more data
@@ -234,7 +228,7 @@
 
     fn skip_whitespaces(&mut self) {
         if self.is_line_jump_whitespace {
-            for (i, c) in self.data[self.start..self.end].iter().enumerate() {
+            for (i, c) in self.data[self.start..].iter().enumerate() {
                 if !matches!(c, b' ' | b'\t' | b'\r' | b'\n') {
                     self.start += i;
                     self.position += i;
@@ -243,7 +237,7 @@
                 //TODO: SIMD
             }
         } else {
-            for (i, c) in self.data[self.start..self.end].iter().enumerate() {
+            for (i, c) in self.data[self.start..].iter().enumerate() {
                 if !matches!(c, b' ' | b'\t') {
                     self.start += i;
                     self.position += i;
@@ -253,15 +247,14 @@
             }
         }
         // We only have whitespaces
-        self.position += self.end - self.start;
-        self.end = self.start;
+        self.position += self.data.len() - self.start;
+        self.start = self.data.len();
     }
 
-    fn shrink_if_useful(&mut self) {
-        if self.start * 2 > self.data.len() {
-            // We have read more than half of the buffer, let's move the data to the beginning
-            self.data.copy_within(self.start..self.end, 0);
-            self.end -= self.start;
+    fn shrink_data(&mut self) {
+        if self.start > 0 {
+            self.data.copy_within(self.start.., 0);
+            self.data.truncate(self.data.len() - self.start);
             self.start = 0;
         }
     }
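Not part of the commit above, but as a reading aid: a minimal, self-contained sketch of the buffer convention the patch switches to, where the removed end field is replaced by data.len() itself and shrink_data() compacts already-consumed bytes before every refill. MiniBuffer, its pending() helper and the sizes used in main are made-up illustration; shrink_data mirrors the patched method, and extend_from_read follows the same flow minus the extend-to-capacity optimization.

use std::cmp::min;
use std::io::{self, Read};

// Hypothetical stand-in for the lexer's buffer state: `data.len()` is the end
// of the readable bytes, `start` is how many of them were already consumed.
struct MiniBuffer {
    data: Vec<u8>,
    start: usize,
    min_buffer_size: usize,
    max_buffer_size: usize,
}

impl MiniBuffer {
    // Bytes that have been read from the input but not consumed yet.
    fn pending(&self) -> &[u8] {
        &self.data[self.start..]
    }

    // Moves the unconsumed bytes to the front so the space occupied by
    // already-consumed bytes can be reused (same logic as the patched shrink_data).
    fn shrink_data(&mut self) {
        if self.start > 0 {
            self.data.copy_within(self.start.., 0);
            self.data.truncate(self.data.len() - self.start);
            self.start = 0;
        }
    }

    // Grows the buffer by at least min_buffer_size (but never past
    // max_buffer_size), reads into the new tail and truncates back to the bytes
    // actually read, so data.len() always marks the end of valid data.
    fn extend_from_read(&mut self, read: &mut impl Read) -> io::Result<usize> {
        self.shrink_data();
        if self.data.len() == self.max_buffer_size {
            return Err(io::Error::new(
                io::ErrorKind::OutOfMemory,
                format!("Reached the buffer maximal size of {}", self.max_buffer_size),
            ));
        }
        let min_end = min(self.data.len() + self.min_buffer_size, self.max_buffer_size);
        let new_start = self.data.len();
        self.data.resize(min_end, 0);
        let read = read.read(&mut self.data[new_start..])?;
        self.data.truncate(new_start + read);
        Ok(read)
    }
}

fn main() -> io::Result<()> {
    let mut buffer = MiniBuffer {
        data: Vec::new(),
        start: 0,
        min_buffer_size: 8,
        max_buffer_size: 64,
    };
    // A &[u8] implements Read, so it can stand in for a file or socket here.
    let mut input: &[u8] = b"PREFIX ex: <http://example.com/>";
    while buffer.extend_from_read(&mut input)? > 0 {}
    // Pretend a tokenizer consumed the first 7 bytes ("PREFIX ") ...
    buffer.start = 7;
    // ... then compacting keeps only the unconsumed bytes in the buffer.
    buffer.shrink_data();
    assert_eq!(buffer.pending(), &b"ex: <http://example.com/>"[..]);
    Ok(())
}

Compared with the previous shrink_if_useful, which only compacted once more than half of the buffer had been consumed and therefore had to keep start and end in sync, always compacting on refill lets data.len() act as the single source of truth for where the buffered bytes end.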