From 13c3515d7b1b537dd08ef0e4391afa3e347d14f9 Mon Sep 17 00:00:00 2001
From: Tpt
Date: Sat, 2 Sep 2023 17:12:43 +0200
Subject: [PATCH] OxTTL: return file position in errors

---
 fuzz/fuzz_targets/nquads.rs                   |  10 +-
 fuzz/fuzz_targets/trig.rs                     |  10 +-
 lib/oxrdfio/src/error.rs                      |  34 +-
 lib/oxrdfxml/src/error.rs                     |  12 -
 lib/oxrdfxml/src/parser.rs                    |   6 +-
 lib/oxttl/src/lexer.rs                        |   6 +-
 lib/oxttl/src/lib.rs                          |   2 +-
 lib/oxttl/src/line_formats.rs                 |   8 +-
 lib/oxttl/src/n3.rs                           |   4 +-
 lib/oxttl/src/terse.rs                        |  20 +-
 lib/oxttl/src/toolkit/error.rs                | 132 +++++++
 lib/oxttl/src/toolkit/lexer.rs                | 344 ++++++++++++------
 lib/oxttl/src/toolkit/mod.rs                  |   8 +-
 lib/oxttl/src/toolkit/parser.rs               | 142 +-------
 .../parser-error/invalid_iri.nt               |   2 +
 .../parser-error/invalid_iri_comment.nt       |   2 +
 .../parser-error/invalid_iri_comment_crlf.nt  |   2 +
 .../parser-error/invalid_iri_crlf.nt          |   2 +
 .../parser-error/invalid_iri_error.txt        |   1 +
 .../parser-error/invalid_predicate.nt         |   2 +
 .../parser-error/invalid_predicate_error.txt  |   1 +
 .../parser-error/invalid_string_escape.nt     |   1 +
 .../invalid_string_escape_error.txt           |   1 +
 .../oxigraph-tests/parser-error/manifest.ttl  |  66 ++++
 .../parser-error/unexpected_eof.nt            |   2 +
 .../parser-error/unexpected_eof_crlf.nt       |   2 +
 .../parser-error/unexpected_eof_error.txt     |   1 +
 testsuite/src/parser_evaluator.rs             |  21 +-
 testsuite/tests/oxigraph.rs                   |   8 +
 29 files changed, 552 insertions(+), 300 deletions(-)
 create mode 100644 lib/oxttl/src/toolkit/error.rs
 create mode 100644 testsuite/oxigraph-tests/parser-error/invalid_iri.nt
 create mode 100644 testsuite/oxigraph-tests/parser-error/invalid_iri_comment.nt
 create mode 100644 testsuite/oxigraph-tests/parser-error/invalid_iri_comment_crlf.nt
 create mode 100644 testsuite/oxigraph-tests/parser-error/invalid_iri_crlf.nt
 create mode 100644 testsuite/oxigraph-tests/parser-error/invalid_iri_error.txt
 create mode 100644 testsuite/oxigraph-tests/parser-error/invalid_predicate.nt
 create mode 100644 testsuite/oxigraph-tests/parser-error/invalid_predicate_error.txt
 create mode 100644 testsuite/oxigraph-tests/parser-error/invalid_string_escape.nt
 create mode 100644 testsuite/oxigraph-tests/parser-error/invalid_string_escape_error.txt
 create mode 100644 testsuite/oxigraph-tests/parser-error/manifest.ttl
 create mode 100644 testsuite/oxigraph-tests/parser-error/unexpected_eof.nt
 create mode 100644 testsuite/oxigraph-tests/parser-error/unexpected_eof_crlf.nt
 create mode 100644 testsuite/oxigraph-tests/parser-error/unexpected_eof_error.txt

diff --git a/fuzz/fuzz_targets/nquads.rs b/fuzz/fuzz_targets/nquads.rs
index a7de4913..c964e229 100644
--- a/fuzz/fuzz_targets/nquads.rs
+++ b/fuzz/fuzz_targets/nquads.rs
@@ -2,9 +2,9 @@
 use libfuzzer_sys::fuzz_target;
 use oxrdf::Quad;
-use oxttl::{NQuadsParser, NQuadsSerializer, SyntaxError};
+use oxttl::{NQuadsParser, NQuadsSerializer};
 
-fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<SyntaxError>) {
+fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<String>) {
     let mut quads = Vec::new();
     let mut errors = Vec::new();
     let mut parser = NQuadsParser::new().with_quoted_triples().parse();
@@ -13,7 +13,7 @@ fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<Sy
             match result {
                 Ok(quad) => quads.push(quad),
-                Err(error) => errors.push(error),
+                Err(error) => errors.push(error.to_string()),
             }
         }
     }
@@ -21,7 +21,7 @@ fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<Sy
         match result {
             Ok(quad) => quads.push(quad),
-            Err(error) => errors.push(error),
+            Err(error) => errors.push(error.to_string()),
         }
     }
     assert!(parser.is_end());
@@ -39,7 +39,7 @@ fuzz_target!(|data: &[u8]| {
         .collect::<Vec<_>>()
         .as_slice()]);
     assert_eq!(quads, quads_without_split);
-    assert_eq!(errors.len(), errors_without_split.len());
+    assert_eq!(errors, errors_without_split);
 
     // We serialize
     let mut writer = NQuadsSerializer::new().serialize_to_write(Vec::new());
diff --git a/fuzz/fuzz_targets/trig.rs b/fuzz/fuzz_targets/trig.rs
index 6a930a97..c0713e69 100644
--- a/fuzz/fuzz_targets/trig.rs
+++ b/fuzz/fuzz_targets/trig.rs
@@ -2,9 +2,9 @@
 use libfuzzer_sys::fuzz_target;
 use oxrdf::{Dataset, GraphName, Quad, Subject, Term, Triple};
-use oxttl::{SyntaxError, TriGParser, TriGSerializer};
+use oxttl::{TriGParser, TriGSerializer};
 
-fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<SyntaxError>) {
+fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<String>) {
     let mut quads = Vec::new();
     let mut errors = Vec::new();
     let mut parser = TriGParser::new()
@@ -17,7 +17,7 @@ fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<Sy
             match result {
                 Ok(quad) => quads.push(quad),
-                Err(error) => errors.push(error),
+                Err(error) => errors.push(error.to_string()),
             }
         }
     }
@@ -25,7 +25,7 @@ fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<Sy
         match result {
             Ok(quad) => quads.push(quad),
-            Err(error) => errors.push(error),
+            Err(error) => errors.push(error.to_string()),
         }
     }
     assert!(parser.is_end());
@@ -96,7 +96,7 @@ fuzz_target!(|data: &[u8]| {
             String::from_utf8_lossy(&serialize_quads(&quads_without_split))
         );
     }
-    assert_eq!(errors.len(), errors_without_split.len());
+    assert_eq!(errors, errors_without_split);
 
     // We serialize
     let new_serialization = serialize_quads(&quads);

diff --git a/lib/oxrdfio/src/error.rs b/lib/oxrdfio/src/error.rs
index ac8173a7..235ba1b7 100644
--- a/lib/oxrdfio/src/error.rs
+++ b/lib/oxrdfio/src/error.rs
@@ -1,4 +1,5 @@
 use std::error::Error;
+use std::ops::Range;
 use std::{fmt, io};
 
 /// Error returned during RDF format parsing.
@@ -110,10 +111,33 @@ pub struct SyntaxError {
 enum SyntaxErrorKind {
     Turtle(oxttl::SyntaxError),
     RdfXml(oxrdfxml::SyntaxError),
-    Msg { msg: &'static str },
 }
 
+impl SyntaxError {
+    /// The location of the error inside the file.
+    #[inline]
+    pub fn location(&self) -> Option<Range<TextPosition>> {
+        match &self.inner {
+            SyntaxErrorKind::Turtle(e) => {
+                let location = e.location();
+                Some(
+                    TextPosition {
+                        line: location.start.line,
+                        column: location.start.column,
+                        offset: location.start.offset,
+                    }..TextPosition {
+                        line: location.end.line,
+                        column: location.end.column,
+                        offset: location.end.offset,
+                    },
+                )
+            }
+            SyntaxErrorKind::RdfXml(_) | SyntaxErrorKind::Msg { .. } => None,
+        }
+    }
+}
+
 impl fmt::Display for SyntaxError {
     #[inline]
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@@ -146,3 +170,11 @@ impl From<ParseError> for io::Error {
     }
 }
+
+/// A position in a text, i.e. a `line` number starting from 0, a `column` number starting from 0 (in number of code points), and a global file `offset` starting from 0 (in number of bytes).
+#[derive(Eq, PartialEq, Debug, Clone, Copy)]
+pub struct TextPosition {
+    pub line: u64,
+    pub column: u64,
+    pub offset: u64,
+}
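The `location()` accessor above is the only public API addition to oxrdfio in this patch. A minimal consumer-side sketch of how the optional positions might be rendered for diagnostics; the helper name and output format are illustrative, not part of the patch:

    use oxrdfio::SyntaxError;
    use std::ops::Range;

    // Hypothetical helper: render the optional error location for diagnostics.
    // TextPosition lines and columns are 0-based, hence the + 1 for display.
    fn describe_location(error: &SyntaxError) -> String {
        match error.location() {
            Some(Range { start, end }) => format!(
                "from line {} column {} to line {} column {} (bytes {}..{})",
                start.line + 1,
                start.column + 1,
                end.line + 1,
                end.column + 1,
                start.offset,
                end.offset
            ),
            // RDF/XML and ad-hoc message errors carry no location yet.
            None => "at an unknown position".to_owned(),
        }
    }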
diff --git a/lib/oxrdfxml/src/error.rs b/lib/oxrdfxml/src/error.rs
index fd561be6..cb9eb9c4 100644
--- a/lib/oxrdfxml/src/error.rs
+++ b/lib/oxrdfxml/src/error.rs
@@ -72,15 +72,6 @@ impl From<quick_xml::Error> for ParseError {
     }
 }
 
-impl From<quick_xml::events::attributes::AttrError> for ParseError {
-    #[inline]
-    fn from(error: quick_xml::events::attributes::AttrError) -> Self {
-        Self::Syntax(SyntaxError {
-            inner: SyntaxErrorKind::XmlAttribute(error),
-        })
-    }
-}
-
 /// An error in the syntax of the parsed file.
 #[derive(Debug)]
 pub struct SyntaxError {
@@ -90,7 +81,6 @@ pub struct SyntaxError {
 #[derive(Debug)]
 pub enum SyntaxErrorKind {
     Xml(quick_xml::Error),
-    XmlAttribute(quick_xml::events::attributes::AttrError),
     InvalidIri {
         iri: String,
         error: IriParseError,
@@ -119,7 +109,6 @@ impl fmt::Display for SyntaxError {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match &self.inner {
             SyntaxErrorKind::Xml(error) => error.fmt(f),
-            SyntaxErrorKind::XmlAttribute(error) => error.fmt(f),
             SyntaxErrorKind::InvalidIri { iri, error } => {
                 write!(f, "error while parsing IRI '{iri}': {error}")
             }
@@ -136,7 +125,6 @@ impl Error for SyntaxError {
     fn source(&self) -> Option<&(dyn Error + 'static)> {
         match &self.inner {
             SyntaxErrorKind::Xml(error) => Some(error),
-            SyntaxErrorKind::XmlAttribute(error) => Some(error),
             SyntaxErrorKind::InvalidIri { error, .. } => Some(error),
             SyntaxErrorKind::InvalidLanguageTag { error, .. } => Some(error),
             SyntaxErrorKind::Msg { .. } => None,

diff --git a/lib/oxrdfxml/src/parser.rs b/lib/oxrdfxml/src/parser.rs
index 22983350..dcd216a4 100644
--- a/lib/oxrdfxml/src/parser.rs
+++ b/lib/oxrdfxml/src/parser.rs
@@ -8,7 +8,7 @@ use quick_xml::escape::unescape_with;
 use quick_xml::events::attributes::Attribute;
 use quick_xml::events::*;
 use quick_xml::name::{LocalName, QName, ResolveResult};
-use quick_xml::{NsReader, Writer};
+use quick_xml::{Error, NsReader, Writer};
 use std::collections::{HashMap, HashSet};
 use std::io::{BufReader, Read};
 use std::str;
@@ -515,7 +515,7 @@ impl RdfXmlReader {
                     .to_string(),
                 );
                 for attr in event.attributes() {
-                    clean_event.push_attribute(attr?);
+                    clean_event.push_attribute(attr.map_err(Error::InvalidAttr)?);
                 }
                 writer.write_event(Event::Start(clean_event))?;
                 self.in_literal_depth += 1;
@@ -544,7 +544,7 @@ impl RdfXmlReader {
 
         let mut type_attr = None;
         for attribute in event.attributes() {
-            let attribute = attribute?;
+            let attribute = attribute.map_err(Error::InvalidAttr)?;
             if attribute.key.as_ref().starts_with(b"xml") {
                 if attribute.key.as_ref() == b"xml:lang" {
                     let tag = self.convert_attribute(&attribute)?;

diff --git a/lib/oxttl/src/lexer.rs b/lib/oxttl/src/lexer.rs
index f12f3b25..65dba56e 100644
--- a/lib/oxttl/src/lexer.rs
+++ b/lib/oxttl/src/lexer.rs
@@ -266,7 +266,7 @@ impl N3Lexer {
                         ));
                     }
                 }
-                Err(e) => return Some((e.position.end, Err(e))),
+                Err(e) => return Some((e.location.end, Err(e))),
             }
         } else if is_ending {
             while data[..i].ends_with(b".") {
@@ -447,7 +447,7 @@ impl N3Lexer {
                         return Some((i, Ok((buffer, might_be_invalid_iri))));
                     }
                 }
-                Err(e) => return Some((e.position.end, Err(e))),
+                Err(e) => return Some((e.location.end, Err(e))),
             }
         } else if is_ending {
             let buffer = if let Some(mut buffer) = buffer {
@@ -515,7 +515,7 @@ impl N3Lexer {
                     }
                     i += consumed;
                 }
-                Err(e) => return Some((e.position.end, Err(e))),
+                Err(e) => return Some((e.location.end, Err(e))),
             }
         }
     }

diff --git a/lib/oxttl/src/lib.rs b/lib/oxttl/src/lib.rs
index ac96515e..0e04e243 100644
--- a/lib/oxttl/src/lib.rs
+++ b/lib/oxttl/src/lib.rs
@@ -17,7 +17,7 @@ pub mod turtle;
 pub use crate::n3::N3Parser;
 pub use crate::nquads::{NQuadsParser, NQuadsSerializer};
 pub use crate::ntriples::{NTriplesParser, NTriplesSerializer};
-pub use crate::toolkit::{ParseError, SyntaxError};
+pub use crate::toolkit::{ParseError, SyntaxError, TextPosition};
 pub use crate::trig::{TriGParser, TriGSerializer};
 pub use crate::turtle::{TurtleParser, TurtleSerializer};
diff --git a/lib/oxttl/src/line_formats.rs b/lib/oxttl/src/line_formats.rs
index f95e56f3..1b4c31e6 100644
--- a/lib/oxttl/src/line_formats.rs
+++ b/lib/oxttl/src/line_formats.rs
@@ -76,7 +76,7 @@ impl RuleRecognizer for NQuadsRecognizer {
                 }
                 _ => self.error(
                     errors,
-                    format!("The subject of a triple should be an IRI or a blank node, {token:?} found"),
+                    "The subject of a triple should be an IRI or a blank node, TOKEN found",
                 ),
             },
             NQuadsState::ExpectPredicate => match token {
@@ -88,7 +88,7 @@ impl RuleRecognizer for NQuadsRecognizer {
                 }
                 _ => self.error(
                     errors,
-                    format!("The predicate of a triple should be an IRI, {token:?} found"),
+                    "The predicate of a triple should be an IRI, TOKEN found",
                 ),
             },
             NQuadsState::ExpectedObject => match token {
@@ -118,7 +118,7 @@ impl RuleRecognizer for NQuadsRecognizer {
                 }
                 _ => self.error(
                     errors,
-                    format!("The object of a triple should be an IRI, a blank node or a literal, {token:?} found"),
+                    "The object of a triple should be an IRI, a blank node or a literal, TOKEN found",
                 ),
             },
             NQuadsState::ExpectLiteralAnnotationOrGraphNameOrDot { value } => match token {
@@ -159,7 +159,7 @@ impl RuleRecognizer for NQuadsRecognizer {
                         .push(NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple);
                     self
                 }
-                _ => self.error(errors, format!("A literal datatype must be an IRI, found {token:?}")),
+                _ => self.error(errors, "A literal datatype must be an IRI, found TOKEN"),
             },
             NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple => {
                 if self.stack.is_empty() {

diff --git a/lib/oxttl/src/n3.rs b/lib/oxttl/src/n3.rs
index 46274ecf..84e36235 100644
--- a/lib/oxttl/src/n3.rs
+++ b/lib/oxttl/src/n3.rs
@@ -836,7 +836,7 @@ impl RuleRecognizer for N3Recognizer {
                         self.stack.push(N3State::FormulaContent);
                         self
                     }
-                    _ => self.error(errors, format!("This is not a valid RDF value: {token:?}"))
+                    _ => self.error(errors, "TOKEN is not a valid RDF value")
                 }
             }
             N3State::PropertyListMiddle => match token {
@@ -950,7 +950,7 @@ impl RuleRecognizer for N3Recognizer {
                     Err(e) => self.error(errors, e)
                 }
                 _ => {
-                    self.error(errors, format!("Expecting a datatype IRI after '^^, found {token:?}")).recognize_next(token, results, errors)
+                    self.error(errors, "Expecting a datatype IRI after '^^', found TOKEN").recognize_next(token, results, errors)
                 }
             }
         }

diff --git a/lib/oxttl/src/terse.rs b/lib/oxttl/src/terse.rs
index ecd24d4a..f99aaf4d 100644
--- a/lib/oxttl/src/terse.rs
+++ b/lib/oxttl/src/terse.rs
@@ -167,7 +167,7 @@ impl RuleRecognizer for TriGRecognizer {
                     self
                 }
                 _ => {
-                    self.error(errors, format!("The token {token:?} is not a valid subject or graph name"))
+                    self.error(errors, "TOKEN is not a valid subject or graph name")
                 }
             }
             TriGState::WrappedGraphOrPredicateObjectList { term } => {
@@ -317,7 +317,7 @@ impl RuleRecognizer for TriGRecognizer {
                     self
                 }
                 _ => {
-                    self.error(errors, format!("The token {token:?} is not a valid RDF subject"))
+                    self.error(errors, "TOKEN is not a valid RDF subject")
                 }
             },
             TriGState::TriplesBlankNodePropertyListCurrent => if token == N3Token::Punctuation("]") {
@@ -350,7 +350,7 @@ impl RuleRecognizer for TriGRecognizer {
                     self
                 }
                 _ => {
-                    self.error(errors, format!("The token {token:?} is not a valid graph name"))
+                    self.error(errors, "TOKEN is not a valid graph name")
                 }
             }
             TriGState::GraphNameAnonEnd => if token == N3Token::Punctuation("]") {
@@ -456,7 +456,7 @@ impl RuleRecognizer for TriGRecognizer {
                     Err(e) => self.error(errors, e)
                 }
                 _ => {
-                    self.error(errors, format!("The token {token:?} is not a valid predicate"))
+                    self.error(errors, "TOKEN is not a valid predicate")
                 }
             }
             // [12] object ::= iri | BlankNode | collection | blankNodePropertyList | literal | quotedTriple
@@ -536,7 +536,7 @@ impl RuleRecognizer for TriGRecognizer {
                     self
                 }
                 _ => {
-                    self.error(errors, format!("This is not a valid RDF object: {token:?}"))
+                    self.error(errors, "TOKEN is not a valid RDF object")
                 }
             }
@@ -637,7 +637,7 @@ impl RuleRecognizer for TriGRecognizer {
                     Err(e) => self.error(errors, e)
                 }
                 _ => {
-                    self.error(errors, format!("Expecting a datatype IRI after '^^, found {token:?}")).recognize_next(token, results, errors)
+                    self.error(errors, "Expecting a datatype IRI after '^^', found TOKEN").recognize_next(token, results, errors)
                 }
             }
         }
@@ -653,7 +653,7 @@ impl RuleRecognizer for TriGRecognizer {
                 if token == N3Token::Punctuation(">>") {
                     self
                 } else {
-                    self.error(errors, format!("Expecting '>>' to close a quoted triple, found {token:?}"))
+                    self.error(errors, "Expecting '>>' to close a quoted triple, found TOKEN")
                 }
             }
             #[cfg(feature = "rdf-star")]
@@ -670,7 +670,7 @@ impl RuleRecognizer for TriGRecognizer {
                 if token == N3Token::Punctuation(">>") {
                     self
                 } else {
-                    self.error(errors, format!("Expecting '>>' to close a quoted triple, found {token:?}"))
+                    self.error(errors, "Expecting '>>' to close a quoted triple, found TOKEN")
                 }
             }
             // [28t] qtSubject ::= iri | BlankNode | quotedTriple
@@ -703,7 +703,7 @@ impl RuleRecognizer for TriGRecognizer {
                     self.stack.push(TriGState::QuotedSubject);
                     self
                 }
-                _ => self.error(errors, format!("This is not a valid RDF quoted triple subject: {token:?}"))
+                _ => self.error(errors, "TOKEN is not a valid RDF quoted triple subject")
             }
             // [29t] qtObject ::= iri | BlankNode | literal | quotedTriple
             #[cfg(feature = "rdf-star")]
@@ -759,7 +759,7 @@ impl RuleRecognizer for TriGRecognizer {
                     self.stack.push(TriGState::QuotedSubject);
                     self
                 }
-                _ => self.error(errors, format!("This is not a valid RDF quoted triple object: {token:?}"))
+                _ => self.error(errors, "TOKEN is not a valid RDF quoted triple object")
             }
             #[cfg(feature = "rdf-star")]
             TriGState::QuotedAnonEnd => if token == N3Token::Punctuation("]") {

diff --git a/lib/oxttl/src/toolkit/error.rs b/lib/oxttl/src/toolkit/error.rs
new file mode 100644
index 00000000..df50b950
--- /dev/null
+++ b/lib/oxttl/src/toolkit/error.rs
@@ -0,0 +1,132 @@
+use std::error::Error;
+use std::ops::Range;
+use std::{fmt, io};
+
+/// A position in a text, i.e. a `line` number starting from 0, a `column` number starting from 0 (in number of code points), and a global file `offset` starting from 0 (in number of bytes).
+#[derive(Eq, PartialEq, Debug, Clone, Copy)]
+pub struct TextPosition {
+    pub line: u64,
+    pub column: u64,
+    pub offset: u64,
+}
+
+/// An error in the syntax of the parsed file.
+///
+/// It is composed of a message and a position range in the input.
+#[derive(Debug)]
+pub struct SyntaxError {
+    pub(super) location: Range<TextPosition>,
+    pub(super) message: String,
+}
+
+impl SyntaxError {
+    /// The location of the error inside the file.
+    #[inline]
+    pub fn location(&self) -> Range<TextPosition> {
+        self.location.clone()
+    }
+
+    /// The error message.
+    #[inline]
+    pub fn message(&self) -> &str {
+        &self.message
+    }
+}
+
+impl fmt::Display for SyntaxError {
+    #[inline]
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if self.location.start.offset + 1 >= self.location.end.offset {
+            write!(
+                f,
+                "Parser error at line {} column {}: {}",
+                self.location.start.line + 1,
+                self.location.start.column + 1,
+                self.message
+            )
+        } else if self.location.start.line == self.location.end.line {
+            write!(
+                f,
+                "Parser error at line {} between columns {} and {}: {}",
+                self.location.start.line + 1,
+                self.location.start.column + 1,
+                self.location.end.column + 1,
+                self.message
+            )
+        } else {
+            write!(
+                f,
+                "Parser error between line {} column {} and line {} column {}: {}",
+                self.location.start.line + 1,
+                self.location.start.column + 1,
+                self.location.end.line + 1,
+                self.location.end.column + 1,
+                self.message
+            )
+        }
+    }
+}
+
+impl Error for SyntaxError {}
+
+impl From<SyntaxError> for io::Error {
+    #[inline]
+    fn from(error: SyntaxError) -> Self {
+        io::Error::new(io::ErrorKind::InvalidData, error)
+    }
+}
+
+/// A parsing error.
+///
+/// It is the union of [`SyntaxError`] and [`std::io::Error`].
+#[derive(Debug)]
+pub enum ParseError {
+    /// I/O error during parsing (file not found...).
+    Io(io::Error),
+    /// An error in the file syntax.
+    Syntax(SyntaxError),
+}
+
+impl fmt::Display for ParseError {
+    #[inline]
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::Io(e) => e.fmt(f),
+            Self::Syntax(e) => e.fmt(f),
+        }
+    }
+}
+
+impl Error for ParseError {
+    #[inline]
+    fn source(&self) -> Option<&(dyn Error + 'static)> {
+        Some(match self {
+            Self::Io(e) => e,
+            Self::Syntax(e) => e,
+        })
+    }
+}
+
+impl From<SyntaxError> for ParseError {
+    #[inline]
+    fn from(error: SyntaxError) -> Self {
+        Self::Syntax(error)
+    }
+}
+
+impl From<io::Error> for ParseError {
+    #[inline]
+    fn from(error: io::Error) -> Self {
+        Self::Io(error)
+    }
+}
+
+impl From<ParseError> for io::Error {
+    #[inline]
+    fn from(error: ParseError) -> Self {
+        match error {
+            ParseError::Syntax(e) => e.into(),
+            ParseError::Io(e) => e,
+        }
+    }
+}
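Columns count code points while offsets count bytes, so the two diverge on any non-ASCII line. A quick standalone illustration of the convention documented above (plain std, not part of the patch):

    fn main() {
        let line = "héllo"; // 'é' is one code point but two bytes in UTF-8
        let byte_offset = line.find('o').unwrap(); // what `offset` advances by
        let column = line[..byte_offset].chars().count(); // what `column` advances by
        assert_eq!(byte_offset, 5);
        assert_eq!(column, 4);
    }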
diff --git a/lib/oxttl/src/toolkit/lexer.rs b/lib/oxttl/src/toolkit/lexer.rs
index 34c1c01e..0f7373c2 100644
--- a/lib/oxttl/src/toolkit/lexer.rs
+++ b/lib/oxttl/src/toolkit/lexer.rs
@@ -1,9 +1,10 @@
-use memchr::memchr2;
+use crate::toolkit::error::{SyntaxError, TextPosition};
+use memchr::{memchr2, memchr2_iter};
+use std::borrow::Cow;
 use std::cmp::min;
-use std::error::Error;
-use std::fmt;
 use std::io::{self, Read};
 use std::ops::{Range, RangeInclusive};
+use std::str;
 #[cfg(feature = "async-tokio")]
 use tokio::io::{AsyncRead, AsyncReadExt};
@@ -22,14 +23,14 @@ pub trait TokenRecognizer {
 }
 
 pub struct TokenRecognizerError {
-    pub position: Range<usize>,
+    pub location: Range<usize>,
     pub message: String,
 }
 
 impl<S: Into<String>> From<(Range<usize>, S)> for TokenRecognizerError {
-    fn from((position, message): (Range<usize>, S)) -> Self {
+    fn from((location, message): (Range<usize>, S)) -> Self {
         Self {
-            position,
+            location,
             message: message.into(),
         }
     }
 }
 
@@ -37,34 +38,37 @@ impl<S: Into<String>> From<(Range<usize>, S)> for TokenRecognizerError {
 #[allow(clippy::range_plus_one)]
 impl<S: Into<String>> From<(RangeInclusive<usize>, S)> for TokenRecognizerError {
-    fn from((position, message): (RangeInclusive<usize>, S)) -> Self {
-        (*position.start()..*position.end() + 1, message).into()
+    fn from((location, message): (RangeInclusive<usize>, S)) -> Self {
+        (*location.start()..*location.end() + 1, message).into()
     }
 }
 
 impl<S: Into<String>> From<(usize, S)> for TokenRecognizerError {
-    fn from((position, message): (usize, S)) -> Self {
-        (position..=position, message).into()
+    fn from((location, message): (usize, S)) -> Self {
+        (location..=location, message).into()
     }
 }
 
-pub struct TokenWithPosition<T> {
-    pub token: T,
-    pub position: Range<usize>,
-}
-
 pub struct Lexer<R: TokenRecognizer> {
     parser: R,
     data: Vec<u8>,
-    start: usize,
+    position: Position,
+    previous_position: Position, // Lexer position before the last emitted token
     is_ending: bool,
-    position: usize,
     min_buffer_size: usize,
     max_buffer_size: usize,
     is_line_jump_whitespace: bool,
     line_comment_start: Option<&'static [u8]>,
 }
 
+#[derive(Clone, Copy)]
+struct Position {
+    line_start_buffer_offset: usize,
+    buffer_offset: usize,
+    global_offset: u64,
+    global_line: u64,
+}
+
 impl<R: TokenRecognizer> Lexer<R> {
     pub fn new(
         parser: R,
@@ -76,9 +80,19 @@ impl<R: TokenRecognizer> Lexer<R> {
         Self {
             parser,
             data: Vec::new(),
-            start: 0,
+            position: Position {
+                line_start_buffer_offset: 0,
+                buffer_offset: 0,
+                global_offset: 0,
+                global_line: 0,
+            },
+            previous_position: Position {
+                line_start_buffer_offset: 0,
+                buffer_offset: 0,
+                global_offset: 0,
+                global_line: 0,
+            },
             is_ending: false,
-            position: 0,
             min_buffer_size,
             max_buffer_size,
             is_line_jump_whitespace,
@@ -148,24 +162,43 @@ impl<R: TokenRecognizer> Lexer<R> {
         Ok(())
     }
 
-    pub fn read_next(
-        &mut self,
-        options: &R::Options,
-    ) -> Option<Result<TokenWithPosition<R::Token<'_>>, LexerError>> {
+    #[allow(clippy::unwrap_in_result)]
+    pub fn read_next(&mut self, options: &R::Options) -> Option<Result<R::Token<'_>, SyntaxError>> {
         self.skip_whitespaces_and_comments()?;
-        let Some((consumed, result)) =
-            self.parser
-                .recognize_next_token(&self.data[self.start..], self.is_ending, options)
-        else {
+        self.previous_position = self.position;
+        let Some((consumed, result)) = self.parser.recognize_next_token(
+            &self.data[self.position.buffer_offset..],
+            self.is_ending,
+            options,
+        ) else {
             return if self.is_ending {
-                if self.start == self.data.len() {
+                if self.position.buffer_offset == self.data.len() {
                     None // We have finished
                 } else {
-                    let error = LexerError {
-                        position: self.position..self.position + (self.data.len() - self.start),
+                    let (new_line_jumps, new_line_start) =
+                        Self::find_number_of_line_jumps_and_start_of_last_line(
+                            &self.data[self.position.buffer_offset..],
+                        );
+                    if new_line_jumps > 0 {
+                        self.position.line_start_buffer_offset =
+                            self.position.buffer_offset + new_line_start;
+                    }
+                    self.position.global_offset +=
+                        u64::try_from(self.data.len() - self.position.buffer_offset).unwrap();
+                    self.position.buffer_offset = self.data.len();
+                    self.position.global_line += new_line_jumps;
+                    let new_position = TextPosition {
+                        line: self.position.global_line,
+                        column: Self::column_from_bytes(
+                            &self.data[self.position.line_start_buffer_offset..],
+                        ),
+                        offset: self.position.global_offset,
+                    };
+                    let error = SyntaxError {
+                        location: new_position..new_position,
                         message: "Unexpected end of file".into(),
                     };
-                    self.start = self.data.len(); // We consume everything
+                    self.position.buffer_offset = self.data.len(); // We consume everything
                     Some(Err(error))
                 }
             } else {
@@ -177,44 +210,119 @@ impl<R: TokenRecognizer> Lexer<R> {
             "The lexer must consume at least one byte each time"
         );
         debug_assert!(
-            self.start + consumed <= self.data.len(),
+            self.position.buffer_offset + consumed <= self.data.len(),
             "The lexer tried to consume {consumed} bytes but only {} bytes are readable",
-            self.data.len() - self.start
+            self.data.len() - self.position.buffer_offset
         );
-        let old_position = self.position;
-        self.start += consumed;
-        self.position += consumed;
-        Some(match result {
-            Ok(token) => Ok(TokenWithPosition {
-                token,
-                position: old_position..self.position,
-            }),
-            Err(e) => Err(LexerError {
-                position: e.position.start + self.position..e.position.end + self.position,
-                message: e.message,
-            }),
-        })
+        let (new_line_jumps, new_line_start) =
+            Self::find_number_of_line_jumps_and_start_of_last_line(
+                &self.data[self.position.buffer_offset..self.position.buffer_offset + consumed],
+            );
+        if new_line_jumps > 0 {
+            self.position.line_start_buffer_offset = self.position.buffer_offset + new_line_start;
+        }
+        self.position.buffer_offset += consumed;
+        self.position.global_offset += u64::try_from(consumed).unwrap();
+        self.position.global_line += new_line_jumps;
+        Some(result.map_err(|e| SyntaxError {
+            location: self.location_from_buffer_offset_range(e.location),
+            message: e.message,
+        }))
+    }
+
+    pub fn location_from_buffer_offset_range(
+        &self,
+        offset_range: Range<usize>,
+    ) -> Range<TextPosition> {
+        let start_offset = self.previous_position.buffer_offset + offset_range.start;
+        let (start_extra_line_jumps, start_line_start) =
+            Self::find_number_of_line_jumps_and_start_of_last_line(
+                &self.data[self.previous_position.buffer_offset..start_offset],
+            );
+        let start_line_start = if start_extra_line_jumps > 0 {
+            start_line_start + self.previous_position.buffer_offset
+        } else {
+            self.previous_position.line_start_buffer_offset
+        };
+        let end_offset = self.previous_position.buffer_offset + offset_range.end;
+        let (end_extra_line_jumps, end_line_start) =
+            Self::find_number_of_line_jumps_and_start_of_last_line(
+                &self.data[self.previous_position.buffer_offset..end_offset],
+            );
+        let end_line_start = if end_extra_line_jumps > 0 {
+            end_line_start + self.previous_position.buffer_offset
+        } else {
+            self.previous_position.line_start_buffer_offset
+        };
+        TextPosition {
+            line: self.previous_position.global_line + start_extra_line_jumps,
+            column: Self::column_from_bytes(&self.data[start_line_start..start_offset]),
+            offset: self.previous_position.global_offset
+                + u64::try_from(offset_range.start).unwrap(),
+        }..TextPosition {
+            line: self.previous_position.global_line + end_extra_line_jumps,
+            column: Self::column_from_bytes(&self.data[end_line_start..end_offset]),
+            offset: self.previous_position.global_offset + u64::try_from(offset_range.end).unwrap(),
+        }
+    }
+
+    pub fn last_token_location(&self) -> Range<TextPosition> {
+        TextPosition {
+            line: self.previous_position.global_line,
+            column: Self::column_from_bytes(
+                &self.data[self.previous_position.line_start_buffer_offset
+                    ..self.previous_position.buffer_offset],
+            ),
+            offset: self.previous_position.global_offset,
+        }..TextPosition {
+            line: self.position.global_line,
+            column: Self::column_from_bytes(
+                &self.data[self.position.line_start_buffer_offset..self.position.buffer_offset],
+            ),
+            offset: self.position.global_offset,
+        }
+    }
+
+    pub fn last_token_source(&self) -> Cow<'_, str> {
+        String::from_utf8_lossy(
+            &self.data[self.previous_position.buffer_offset..self.position.buffer_offset],
+        )
     }
 
     pub fn is_end(&self) -> bool {
-        self.is_ending && self.data.len() == self.start
+        self.is_ending && self.data.len() == self.position.buffer_offset
     }
 
+    #[allow(clippy::unwrap_in_result)]
     fn skip_whitespaces_and_comments(&mut self) -> Option<()> {
         loop {
-            self.skip_whitespaces();
+            self.skip_whitespaces()?;
 
-            let buf = &self.data[self.start..];
+            let buf = &self.data[self.position.buffer_offset..];
             if let Some(line_comment_start) = self.line_comment_start {
                 if buf.starts_with(line_comment_start) {
                     // Comment
                     if let Some(end) = memchr2(b'\r', b'\n', &buf[line_comment_start.len()..]) {
-                        self.start += end + line_comment_start.len();
-                        self.position += end + line_comment_start.len();
+                        let mut end_position = line_comment_start.len() + end;
+                        if buf.get(end_position).copied() == Some(b'\r') {
+                            // We look for \n for Windows line end style
+                            if let Some(c) = buf.get(end_position + 1) {
+                                if *c == b'\n' {
+                                    end_position += 1;
+                                }
+                            } else if !self.is_ending {
+                                return None; // We need to read more
+                            }
+                        }
+                        let comment_size = end_position + 1;
+                        self.position.buffer_offset += comment_size;
+                        self.position.line_start_buffer_offset = self.position.buffer_offset;
+                        self.position.global_offset += u64::try_from(comment_size).unwrap();
+                        self.position.global_line += 1;
                         continue;
                     }
                     if self.is_ending {
-                        self.start = self.data.len(); // EOF
+                        self.position.buffer_offset = self.data.len(); // EOF
                         return Some(());
                    }
                     return None; // We need more data
@@ -224,80 +332,98 @@ impl<R: TokenRecognizer> Lexer<R> {
         }
     }
 
-    fn skip_whitespaces(&mut self) {
+    fn skip_whitespaces(&mut self) -> Option<()> {
         if self.is_line_jump_whitespace {
-            for (i, c) in self.data[self.start..].iter().enumerate() {
-                if !matches!(c, b' ' | b'\t' | b'\r' | b'\n') {
-                    self.start += i;
-                    self.position += i;
-                    return;
+            let mut i = self.position.buffer_offset;
+            while let Some(c) = self.data.get(i) {
+                match c {
+                    b' ' | b'\t' => {
+                        self.position.buffer_offset += 1;
+                        self.position.global_offset += 1;
+                    }
+                    b'\r' => {
+                        // We look for \n for Windows line end style
+                        let mut increment: u8 = 1;
+                        if let Some(c) = self.data.get(i + 1) {
+                            if *c == b'\n' {
+                                increment += 1;
+                                i += 1;
+                            }
+                        } else if !self.is_ending {
+                            return None; // We need to read more
+                        }
+                        self.position.buffer_offset += usize::from(increment);
+                        self.position.line_start_buffer_offset = self.position.buffer_offset;
+                        self.position.global_offset += u64::from(increment);
+                        self.position.global_line += 1;
+                    }
+                    b'\n' => {
+                        self.position.buffer_offset += 1;
+                        self.position.line_start_buffer_offset = self.position.buffer_offset;
+                        self.position.global_offset += 1;
+                        self.position.global_line += 1;
+                    }
+                    _ => return Some(()),
                 }
+                i += 1; //TODO: SIMD
             }
         } else {
-            for (i, c) in self.data[self.start..].iter().enumerate() {
-                if !matches!(c, b' ' | b'\t') {
-                    self.start += i;
-                    self.position += i;
-                    return;
+            for c in &self.data[self.position.buffer_offset..] {
+                if matches!(c, b' ' | b'\t') {
+                    self.position.buffer_offset += 1;
+                    self.position.global_offset += 1;
+                } else {
+                    return Some(());
                 }
+                //TODO: SIMD
             }
         }
-        // We only have whitespaces
-        self.position += self.data.len() - self.start;
-        self.start = self.data.len();
+        Some(())
     }
 
     fn shrink_data(&mut self) {
-        if self.start > 0 {
-            self.data.copy_within(self.start.., 0);
-            self.data.truncate(self.data.len() - self.start);
-            self.start = 0;
+        if self.position.line_start_buffer_offset > 0 {
+            self.data
+                .copy_within(self.position.line_start_buffer_offset.., 0);
+            self.data
+                .truncate(self.data.len() - self.position.line_start_buffer_offset);
+            self.position.buffer_offset -= self.position.line_start_buffer_offset;
+            self.position.line_start_buffer_offset = 0;
+            self.previous_position = self.position;
         }
     }
 
-#[derive(Debug)]
-pub struct LexerError {
-    position: Range<usize>,
-    message: String,
-}
-
-impl LexerError {
-    pub fn position(&self) -> Range<usize> {
-        self.position.clone()
-    }
-
-    pub fn message(&self) -> &str {
-        &self.message
-    }
-
-    pub fn into_message(self) -> String {
-        self.message
-    }
-}
-
-impl fmt::Display for LexerError {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        if self.position.start + 1 == self.position.end {
-            write!(
-                f,
-                "Lexer error at byte {}: {}",
-                self.position.start, self.message
-            )
-        } else {
-            write!(
-                f,
-                "Lexer error between bytes {} and {}: {}",
-                self.position.start, self.position.end, self.message
-            )
-        }
-    }
-}
-
-impl Error for LexerError {
-    fn description(&self) -> &str {
-        self.message()
+    fn find_number_of_line_jumps_and_start_of_last_line(bytes: &[u8]) -> (u64, usize) {
+        let mut num_of_jumps = 0;
+        let mut last_jump_pos = 0;
+        let mut previous_cr = usize::MAX;
+        for pos in memchr2_iter(b'\r', b'\n', bytes) {
+            if bytes[pos] == b'\r' {
+                previous_cr = pos;
+                num_of_jumps += 1;
+                last_jump_pos = pos + 1;
+            } else {
+                if pos == 0 || previous_cr != pos - 1 {
+                    // We count \r\n as a single line jump
+                    num_of_jumps += 1;
+                }
+                last_jump_pos = pos + 1;
+            }
+        }
+        (num_of_jumps, last_jump_pos)
+    }
+
+    fn column_from_bytes(bytes: &[u8]) -> u64 {
+        match str::from_utf8(bytes) {
+            Ok(s) => u64::try_from(s.chars().count()).unwrap(),
+            Err(e) => {
+                if e.valid_up_to() == 0 {
+                    0
+                } else {
+                    Self::column_from_bytes(&bytes[..e.valid_up_to()])
+                }
+            }
+        }
     }
 }
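A test-style sketch of the line-jump bookkeeping above. Here `find` abbreviates the private associated function `find_number_of_line_jumps_and_start_of_last_line`, so this would only run as a unit test inside the lexer module:

    // Returned pair: (number of line jumps, byte offset where the last line starts)
    assert_eq!(find(b"abc"), (0, 0)); // no jump: the last line starts at offset 0
    assert_eq!(find(b"a\nb"), (1, 2));
    assert_eq!(find(b"a\r\nb"), (1, 3)); // \r\n counts as a single jump
    assert_eq!(find(b"a\r\rb"), (2, 3)); // two bare \r are two jumps

This is what keeps `global_line` consistent between the Unix and Windows line-ending test fixtures below, which share a single expected error message.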
diff --git a/lib/oxttl/src/toolkit/mod.rs b/lib/oxttl/src/toolkit/mod.rs
index 300b9c2c..cc8e3624 100644
--- a/lib/oxttl/src/toolkit/mod.rs
+++ b/lib/oxttl/src/toolkit/mod.rs
@@ -2,12 +2,12 @@
 //!
 //! Provides the basic code to write plain Rust lexers and parsers able to read files chunk by chunk.
 
+mod error;
 mod lexer;
 mod parser;
 
-pub use self::lexer::{Lexer, LexerError, TokenRecognizer, TokenRecognizerError};
+pub use self::error::{ParseError, SyntaxError, TextPosition};
+pub use self::lexer::{Lexer, TokenRecognizer, TokenRecognizerError};
 #[cfg(feature = "async-tokio")]
 pub use self::parser::FromTokioAsyncReadIterator;
-pub use self::parser::{
-    FromReadIterator, ParseError, Parser, RuleRecognizer, RuleRecognizerError, SyntaxError,
-};
+pub use self::parser::{FromReadIterator, Parser, RuleRecognizer, RuleRecognizerError};

diff --git a/lib/oxttl/src/toolkit/parser.rs b/lib/oxttl/src/toolkit/parser.rs
index 38419477..7a9ba8bf 100644
--- a/lib/oxttl/src/toolkit/parser.rs
+++ b/lib/oxttl/src/toolkit/parser.rs
@@ -1,9 +1,6 @@
-use crate::toolkit::lexer::TokenWithPosition;
-use crate::toolkit::{Lexer, LexerError, TokenRecognizer};
-use std::error::Error;
+use crate::toolkit::error::{ParseError, SyntaxError};
+use crate::toolkit::lexer::{Lexer, TokenRecognizer};
 use std::io::Read;
-use std::ops::Range;
-use std::{fmt, io};
 #[cfg(feature = "async-tokio")]
 use tokio::io::AsyncRead;
@@ -42,7 +39,6 @@ pub struct Parser<RR: RuleRecognizer> {
     state: Option<RR>,
     results: Vec<RR::Output>,
     errors: Vec<RuleRecognizerError>,
-    position: Range<usize>,
     default_lexer_options: <RR::TokenRecognizer as TokenRecognizer>::Options,
 }
 
@@ -53,7 +49,6 @@ impl<RR: RuleRecognizer> Parser<RR> {
             state: Some(recognizer),
             results: vec![],
             errors: vec![],
-            position: 0..0,
             default_lexer_options: <RR::TokenRecognizer as TokenRecognizer>::Options::default(),
         }
     }
@@ -76,8 +71,10 @@ impl<RR: RuleRecognizer> Parser<RR> {
         loop {
            if let Some(error) = self.errors.pop() {
                 return Some(Err(SyntaxError {
-                    position: self.position.clone(),
-                    message: error.message,
+                    location: self.lexer.last_token_location(),
+                    message: error
+                        .message
+                        .replace("TOKEN", &self.lexer.last_token_source()),
                 }));
             }
             if let Some(result) = self.results.pop() {
@@ -89,8 +86,7 @@ impl<RR: RuleRecognizer> Parser<RR> {
                     .map_or(&self.default_lexer_options, |p| p.lexer_options()),
             ) {
                 match result {
-                    Ok(TokenWithPosition { token, position }) => {
-                        self.position = position;
+                    Ok(token) => {
                         self.state = self.state.take().map(|state| {
                             state.recognize_next(token, &mut self.results, &mut self.errors)
                         });
                     }
                     Err(e) => {
                         self.state = self.state.take().map(RR::error_recovery_state);
-                        return Some(Err(e.into()));
+                        return Some(Err(e));
                     }
                 }
             }
@@ -126,128 +122,6 @@ impl<RR: RuleRecognizer> Parser<RR> {
     }
 }
 
-/// An error in the syntax of the parsed file.
-///
-/// It is composed of a message and a byte range in the input.
-#[derive(Debug)]
-pub struct SyntaxError {
-    position: Range<usize>,
-    message: String,
-}
-
-impl SyntaxError {
-    /// The invalid byte range in the input.
-    #[inline]
-    pub fn position(&self) -> Range<usize> {
-        self.position.clone()
-    }
-
-    /// The error message.
-    #[inline]
-    pub fn message(&self) -> &str {
-        &self.message
-    }
-
-    /// Converts this error to an error message.
-    #[inline]
-    pub fn into_message(self) -> String {
-        self.message
-    }
-}
-
-impl fmt::Display for SyntaxError {
-    #[inline]
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        if self.position.start + 1 == self.position.end {
-            write!(
-                f,
-                "Parser error at byte {}: {}",
-                self.position.start, self.message
-            )
-        } else {
-            write!(
-                f,
-                "Parser error between bytes {} and {}: {}",
-                self.position.start, self.position.end, self.message
-            )
-        }
-    }
-}
-
-impl Error for SyntaxError {}
-
-impl From<SyntaxError> for io::Error {
-    #[inline]
-    fn from(error: SyntaxError) -> Self {
-        io::Error::new(io::ErrorKind::InvalidData, error)
-    }
-}
-
-impl From<LexerError> for SyntaxError {
-    #[inline]
-    fn from(e: LexerError) -> Self {
-        Self {
-            position: e.position(),
-            message: e.into_message(),
-        }
-    }
-}
-
-/// A parsing error.
-///
-/// It is the union of [`SyntaxError`] and [`std::io::Error`].
-#[derive(Debug)]
-pub enum ParseError {
-    /// I/O error during parsing (file not found...).
-    Io(io::Error),
-    /// An error in the file syntax.
-    Syntax(SyntaxError),
-}
-
-impl fmt::Display for ParseError {
-    #[inline]
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            Self::Io(e) => e.fmt(f),
-            Self::Syntax(e) => e.fmt(f),
-        }
-    }
-}
-
-impl Error for ParseError {
-    #[inline]
-    fn source(&self) -> Option<&(dyn Error + 'static)> {
-        Some(match self {
-            Self::Io(e) => e,
-            Self::Syntax(e) => e,
-        })
-    }
-}
-
-impl From<SyntaxError> for ParseError {
-    #[inline]
-    fn from(error: SyntaxError) -> Self {
-        Self::Syntax(error)
-    }
-}
-
-impl From<io::Error> for ParseError {
-    #[inline]
-    fn from(error: io::Error) -> Self {
-        Self::Io(error)
-    }
-}
-
-impl From<ParseError> for io::Error {
-    #[inline]
-    fn from(error: ParseError) -> Self {
-        match error {
-            ParseError::Syntax(e) => e.into(),
-            ParseError::Io(e) => e,
-        }
-    }
-}
-
 pub struct FromReadIterator<R: Read, RR: RuleRecognizer> {
     read: R,
     parser: Parser<RR>,
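The `replace("TOKEN", ...)` call above is what expands the placeholder messages introduced in line_formats.rs, n3.rs and terse.rs into concrete diagnostics: the recognizers now push static templates, and the parser fills in the source text of the last token. A sketch of the expansion with a hypothetical offending token:

    // What a recognizer pushes vs. what the user sees, for a literal "p"
    // read where an IRI predicate was expected:
    let template = "TOKEN is not a valid predicate";
    let last_token_source = "\"p\""; // as returned by Lexer::last_token_source()
    assert_eq!(
        template.replace("TOKEN", last_token_source),
        "\"p\" is not a valid predicate"
    );

Pushing static templates instead of `format!` strings also avoids allocating a message for every recovered-from error during parsing.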
diff --git a/testsuite/oxigraph-tests/parser-error/invalid_iri.nt b/testsuite/oxigraph-tests/parser-error/invalid_iri.nt
new file mode 100644
index 00000000..021c7911
--- /dev/null
+++ b/testsuite/oxigraph-tests/parser-error/invalid_iri.nt
@@ -0,0 +1,2 @@
+ .
+ .
diff --git a/testsuite/oxigraph-tests/parser-error/invalid_iri_comment.nt b/testsuite/oxigraph-tests/parser-error/invalid_iri_comment.nt
new file mode 100644
index 00000000..7c8d2120
--- /dev/null
+++ b/testsuite/oxigraph-tests/parser-error/invalid_iri_comment.nt
@@ -0,0 +1,2 @@
+ . # foo
+ .
diff --git a/testsuite/oxigraph-tests/parser-error/invalid_iri_comment_crlf.nt b/testsuite/oxigraph-tests/parser-error/invalid_iri_comment_crlf.nt
new file mode 100644
index 00000000..7c8d2120
--- /dev/null
+++ b/testsuite/oxigraph-tests/parser-error/invalid_iri_comment_crlf.nt
@@ -0,0 +1,2 @@
+ . # foo
+ .
diff --git a/testsuite/oxigraph-tests/parser-error/invalid_iri_crlf.nt b/testsuite/oxigraph-tests/parser-error/invalid_iri_crlf.nt
new file mode 100644
index 00000000..021c7911
--- /dev/null
+++ b/testsuite/oxigraph-tests/parser-error/invalid_iri_crlf.nt
@@ -0,0 +1,2 @@
+ .
+ .
diff --git a/testsuite/oxigraph-tests/parser-error/invalid_iri_error.txt b/testsuite/oxigraph-tests/parser-error/invalid_iri_error.txt
new file mode 100644
index 00000000..26729063
--- /dev/null
+++ b/testsuite/oxigraph-tests/parser-error/invalid_iri_error.txt
@@ -0,0 +1 @@
+Parser error at line 2 between columns 24 and 36: Invalid IRI code point ' '
\ No newline at end of file
diff --git a/testsuite/oxigraph-tests/parser-error/invalid_predicate.nt b/testsuite/oxigraph-tests/parser-error/invalid_predicate.nt
new file mode 100644
index 00000000..63e6fd7a
--- /dev/null
+++ b/testsuite/oxigraph-tests/parser-error/invalid_predicate.nt
@@ -0,0 +1,2 @@
+ .
+ "p" .
diff --git a/testsuite/oxigraph-tests/parser-error/invalid_predicate_error.txt b/testsuite/oxigraph-tests/parser-error/invalid_predicate_error.txt
new file mode 100644
index 00000000..469dd19f
--- /dev/null
+++ b/testsuite/oxigraph-tests/parser-error/invalid_predicate_error.txt
@@ -0,0 +1 @@
+Parser error at line 2 between columns 24 and 27: "p" is not a valid predicate
\ No newline at end of file
diff --git a/testsuite/oxigraph-tests/parser-error/invalid_string_escape.nt b/testsuite/oxigraph-tests/parser-error/invalid_string_escape.nt
new file mode 100644
index 00000000..8a625fce
--- /dev/null
+++ b/testsuite/oxigraph-tests/parser-error/invalid_string_escape.nt
@@ -0,0 +1 @@
+ "fooé \a baré" .
diff --git a/testsuite/oxigraph-tests/parser-error/invalid_string_escape_error.txt b/testsuite/oxigraph-tests/parser-error/invalid_string_escape_error.txt
new file mode 100644
index 00000000..f5e45857
--- /dev/null
+++ b/testsuite/oxigraph-tests/parser-error/invalid_string_escape_error.txt
@@ -0,0 +1 @@
+Parser error at line 1 between columns 53 and 55: Unexpected escape character '\a'
\ No newline at end of file
diff --git a/testsuite/oxigraph-tests/parser-error/manifest.ttl b/testsuite/oxigraph-tests/parser-error/manifest.ttl
new file mode 100644
index 00000000..86159b38
--- /dev/null
+++ b/testsuite/oxigraph-tests/parser-error/manifest.ttl
@@ -0,0 +1,66 @@
+@prefix mf: <http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#> .
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix rdft: <http://www.w3.org/ns/rdftest#> .
+
+<>
+    rdf:type mf:Manifest ;
+    rdfs:comment "Oxigraph parser error test cases" ;
+    mf:entries (
+        <#invalid_iri>
+        <#invalid_iri_crlf>
+        <#invalid_iri_comment>
+        <#invalid_iri_comment_crlf>
+        <#invalid_string_escape>
+        <#unexpected_eof>
+        <#unexpected_eof_crlf>
+        <#invalid_predicate>
+    ) .
+
+<#invalid_iri>
+    rdf:type rdft:TestTurtleNegativeSyntax ;
+    mf:name "bad IRI" ;
+    mf:action <invalid_iri.nt> ;
+    mf:result <invalid_iri_error.txt> .
+
+<#invalid_iri_crlf>
+    rdf:type rdft:TestTurtleNegativeSyntax ;
+    mf:name "bad IRI" ;
+    mf:action <invalid_iri_crlf.nt> ;
+    mf:result <invalid_iri_error.txt> .
+
+<#invalid_iri_comment>
+    rdf:type rdft:TestTurtleNegativeSyntax ;
+    mf:name "bad IRI" ;
+    mf:action <invalid_iri_comment.nt> ;
+    mf:result <invalid_iri_error.txt> .
+
+<#invalid_iri_comment_crlf>
+    rdf:type rdft:TestTurtleNegativeSyntax ;
+    mf:name "bad IRI" ;
+    mf:action <invalid_iri_comment_crlf.nt> ;
+    mf:result <invalid_iri_error.txt> .
+
+<#invalid_string_escape>
+    rdf:type rdft:TestTurtleNegativeSyntax ;
+    mf:name "bad string escape" ;
+    mf:action <invalid_string_escape.nt> ;
+    mf:result <invalid_string_escape_error.txt> .
+
+<#unexpected_eof>
+    rdf:type rdft:TestTurtleNegativeSyntax ;
+    mf:name "unexpected end of file" ;
+    mf:action <unexpected_eof.nt> ;
+    mf:result <unexpected_eof_error.txt> .
+
+<#unexpected_eof_crlf>
+    rdf:type rdft:TestTurtleNegativeSyntax ;
+    mf:name "unexpected end of file" ;
+    mf:action <unexpected_eof_crlf.nt> ;
+    mf:result <unexpected_eof_error.txt> .
+
+<#invalid_predicate>
+    rdf:type rdft:TestTurtleNegativeSyntax ;
+    mf:name "invalid predicate" ;
+    mf:action <invalid_predicate.nt> ;
+    mf:result <invalid_predicate_error.txt> .
diff --git a/testsuite/oxigraph-tests/parser-error/unexpected_eof.nt b/testsuite/oxigraph-tests/parser-error/unexpected_eof.nt
new file mode 100644
index 00000000..8c0a4ca2
--- /dev/null
+++ b/testsuite/oxigraph-tests/parser-error/unexpected_eof.nt
@@ -0,0 +1,2 @@
+ Result<()> {
         .action
         .as_deref()
         .ok_or_else(|| anyhow!("No action found"))?;
-    ensure!(
-        load_dataset(action, format, false).is_err(),
-        "File parsed without errors even if it should not"
-    );
+    let Err(error) = load_dataset(action, format, false) else {
+        bail!("File parsed without errors even though it should not");
+    };
+    if let Some(result) = &test.result {
+        let expected = read_file_to_string(result)?;
+        ensure!(
+            expected == error.to_string(),
+            "Unexpected error message:\n{}",
+            format_diff(&expected, &error.to_string(), "message")
+        );
+    }
     Ok(())
 }

diff --git a/testsuite/tests/oxigraph.rs b/testsuite/tests/oxigraph.rs
index b76e5a2a..238b57c7 100644
--- a/testsuite/tests/oxigraph.rs
+++ b/testsuite/tests/oxigraph.rs
@@ -20,6 +20,14 @@ fn oxigraph_parser_recovery_testsuite() -> Result<()> {
     )
 }
 
+#[test]
+fn oxigraph_parser_error_testsuite() -> Result<()> {
+    check_testsuite(
+        "https://github.com/oxigraph/oxigraph/tests/parser-error/manifest.ttl",
+        &[],
+    )
+}
+
 #[test]
 fn oxigraph_sparql_testsuite() -> Result<()> {
     check_testsuite(
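End to end, the new positions are observable from any of the oxttl parsers. A minimal sketch, assuming this crate version's `parse_read` reader API; the printed message text is illustrative:

    use oxttl::{NTriplesParser, ParseError};

    fn main() {
        // A literal where an IRI predicate is expected, mirroring the
        // invalid_predicate fixture above:
        let data = b"<http://example.com/s> \"p\" <http://example.com/o> .\n";
        for result in NTriplesParser::new().parse_read(data.as_slice()) {
            if let Err(ParseError::Syntax(e)) = result {
                // Human-readable position via Display...
                eprintln!("{e}");
                // ...or structured access via the new location() accessor,
                // e.g. for an editor integration:
                let location = e.location();
                eprintln!(
                    "line {}, columns {} to {}",
                    location.start.line + 1,
                    location.start.column + 1,
                    location.end.column + 1
                );
            }
        }
    }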