OxTTL: return file position in errors

pull/622/head
Tpt authored 1 year ago; committed by Thomas Tanon
parent 8193cac86d
commit 13c3515d7b
  1. fuzz/fuzz_targets/nquads.rs (10)
  2. fuzz/fuzz_targets/trig.rs (10)
  3. lib/oxrdfio/src/error.rs (34)
  4. lib/oxrdfxml/src/error.rs (12)
  5. lib/oxrdfxml/src/parser.rs (6)
  6. lib/oxttl/src/lexer.rs (6)
  7. lib/oxttl/src/lib.rs (2)
  8. lib/oxttl/src/line_formats.rs (8)
  9. lib/oxttl/src/n3.rs (4)
  10. lib/oxttl/src/terse.rs (20)
  11. lib/oxttl/src/toolkit/error.rs (132)
  12. lib/oxttl/src/toolkit/lexer.rs (336)
  13. lib/oxttl/src/toolkit/mod.rs (8)
  14. lib/oxttl/src/toolkit/parser.rs (142)
  15. testsuite/oxigraph-tests/parser-error/invalid_iri.nt (2)
  16. testsuite/oxigraph-tests/parser-error/invalid_iri_comment.nt (2)
  17. testsuite/oxigraph-tests/parser-error/invalid_iri_comment_crlf.nt (2)
  18. testsuite/oxigraph-tests/parser-error/invalid_iri_crlf.nt (2)
  19. testsuite/oxigraph-tests/parser-error/invalid_iri_error.txt (1)
  20. testsuite/oxigraph-tests/parser-error/invalid_predicate.nt (2)
  21. testsuite/oxigraph-tests/parser-error/invalid_predicate_error.txt (1)
  22. testsuite/oxigraph-tests/parser-error/invalid_string_escape.nt (1)
  23. testsuite/oxigraph-tests/parser-error/invalid_string_escape_error.txt (1)
  24. testsuite/oxigraph-tests/parser-error/manifest.ttl (66)
  25. testsuite/oxigraph-tests/parser-error/unexpected_eof.nt (2)
  26. testsuite/oxigraph-tests/parser-error/unexpected_eof_crlf.nt (2)
  27. testsuite/oxigraph-tests/parser-error/unexpected_eof_error.txt (1)
  28. testsuite/src/parser_evaluator.rs (17)
  29. testsuite/tests/oxigraph.rs (8)
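
The net effect of this change is that oxttl syntax errors now carry a `Range<TextPosition>` (0-based line, column in code points, and byte offset) instead of a plain byte range. A minimal consumer sketch, assuming the crate's `parse_read` reader API (not shown in this diff) and reusing the invalid IRI from the new test fixtures:

```rust
use oxttl::{NTriplesParser, ParseError};

fn main() {
    let data = b"<http://example.com/s> <http:// /p> <http://example.com/o> .\n";
    for result in NTriplesParser::new().parse_read(&data[..]) {
        if let Err(ParseError::Syntax(e)) = result {
            let location = e.location(); // Range<TextPosition>, all 0-based
            eprintln!(
                "syntax error at line {} column {} (byte {}): {}",
                location.start.line + 1,
                location.start.column + 1,
                location.start.offset,
                e.message()
            );
        }
    }
}
```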

@ -2,9 +2,9 @@
use libfuzzer_sys::fuzz_target;
use oxrdf::Quad;
use oxttl::{NQuadsParser, NQuadsSerializer, SyntaxError};
use oxttl::{NQuadsParser, NQuadsSerializer};
fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<SyntaxError>) {
fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<String>) {
let mut quads = Vec::new();
let mut errors = Vec::new();
let mut parser = NQuadsParser::new().with_quoted_triples().parse();
@ -13,7 +13,7 @@ fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<Synt
while let Some(result) = parser.read_next() {
match result {
Ok(quad) => quads.push(quad),
Err(error) => errors.push(error),
Err(error) => errors.push(error.to_string()),
}
}
}
@ -21,7 +21,7 @@ fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<Synt
while let Some(result) = parser.read_next() {
match result {
Ok(quad) => quads.push(quad),
Err(error) => errors.push(error),
Err(error) => errors.push(error.to_string()),
}
}
assert!(parser.is_end());
@ -39,7 +39,7 @@ fuzz_target!(|data: &[u8]| {
.collect::<Vec<_>>()
.as_slice()]);
assert_eq!(quads, quads_without_split);
assert_eq!(errors.len(), errors_without_split.len());
assert_eq!(errors, errors_without_split);
// We serialize
let mut writer = NQuadsSerializer::new().serialize_to_write(Vec::new());
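
Comparing the rendered error strings, rather than just their count, makes the fuzzer check that reported positions are identical no matter how the input is chunked. A condensed sketch of that invariant, assuming the same low-level push API the fuzz target uses (`parse`, `extend_from_slice`, `end`, `read_next`):

```rust
use oxttl::NQuadsParser;

fn error_messages(chunks: &[&[u8]]) -> Vec<String> {
    let mut errors = Vec::new();
    let mut parser = NQuadsParser::new().parse();
    for chunk in chunks {
        parser.extend_from_slice(chunk);
        while let Some(result) = parser.read_next() {
            if let Err(e) = result {
                errors.push(e.to_string());
            }
        }
    }
    parser.end();
    while let Some(result) = parser.read_next() {
        if let Err(e) = result {
            errors.push(e.to_string());
        }
    }
    errors
}

fn main() {
    let data = b"<http://example.com/s> <http:// /p> <http://example.com/o> .\n";
    // Positions must not depend on where the chunk boundaries fall
    assert_eq!(
        error_messages(&[&data[..10], &data[10..]]),
        error_messages(&[&data[..]])
    );
}
```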

@ -2,9 +2,9 @@
use libfuzzer_sys::fuzz_target;
use oxrdf::{Dataset, GraphName, Quad, Subject, Term, Triple};
use oxttl::{SyntaxError, TriGParser, TriGSerializer};
use oxttl::{TriGParser, TriGSerializer};
fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<SyntaxError>) {
fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<String>) {
let mut quads = Vec::new();
let mut errors = Vec::new();
let mut parser = TriGParser::new()
@ -17,7 +17,7 @@ fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<Synt
while let Some(result) = parser.read_next() {
match result {
Ok(quad) => quads.push(quad),
Err(error) => errors.push(error),
Err(error) => errors.push(error.to_string()),
}
}
}
@ -25,7 +25,7 @@ fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<Synt
while let Some(result) = parser.read_next() {
match result {
Ok(quad) => quads.push(quad),
Err(error) => errors.push(error),
Err(error) => errors.push(error.to_string()),
}
}
assert!(parser.is_end());
@ -96,7 +96,7 @@ fuzz_target!(|data: &[u8]| {
String::from_utf8_lossy(&serialize_quads(&quads_without_split))
);
}
assert_eq!(errors.len(), errors_without_split.len());
assert_eq!(errors, errors_without_split);
// We serialize
let new_serialization = serialize_quads(&quads);

@ -1,4 +1,5 @@
use std::error::Error;
use std::ops::Range;
use std::{fmt, io};
/// Error returned during RDF format parsing.
@ -110,10 +111,33 @@ pub struct SyntaxError {
enum SyntaxErrorKind {
Turtle(oxttl::SyntaxError),
RdfXml(oxrdfxml::SyntaxError),
Msg { msg: &'static str },
}
impl SyntaxError {
/// The location of the error inside the file.
#[inline]
pub fn location(&self) -> Option<Range<TextPosition>> {
match &self.inner {
SyntaxErrorKind::Turtle(e) => {
let location = e.location();
Some(
TextPosition {
line: location.start.line,
column: location.start.column,
offset: location.start.offset,
}..TextPosition {
line: location.end.line,
column: location.end.column,
offset: location.end.offset,
},
)
}
SyntaxErrorKind::RdfXml(_) | SyntaxErrorKind::Msg { .. } => None,
}
}
}
impl fmt::Display for SyntaxError {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@ -146,3 +170,11 @@ impl From<SyntaxError> for io::Error {
}
}
}
/// A position in a text, i.e. a `line` number starting from 0, a `column` number starting from 0 (in number of code points), and a global file `offset` starting from 0 (in number of bytes).
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub struct TextPosition {
pub line: u64,
pub column: u64,
pub offset: u64,
}
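
Because only the oxttl-based parsers track positions so far, the oxrdfio wrapper exposes the location as an `Option` and RDF/XML errors yield `None`. A small sketch, assuming `SyntaxError` is re-exported at the oxrdfio crate root:

```rust
use oxrdfio::SyntaxError;

// Render an error with its position when the underlying parser provides one
fn describe(error: &SyntaxError) -> String {
    match error.location() {
        Some(location) => format!(
            "{error} at line {} column {}",
            location.start.line + 1,
            location.start.column + 1
        ),
        None => error.to_string(), // e.g. RDF/XML errors carry no position yet
    }
}
```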

@ -72,15 +72,6 @@ impl From<quick_xml::Error> for ParseError {
}
}
impl From<quick_xml::events::attributes::AttrError> for ParseError {
#[inline]
fn from(error: quick_xml::events::attributes::AttrError) -> Self {
Self::Syntax(SyntaxError {
inner: SyntaxErrorKind::XmlAttribute(error),
})
}
}
/// An error in the syntax of the parsed file.
#[derive(Debug)]
pub struct SyntaxError {
@ -90,7 +81,6 @@ pub struct SyntaxError {
#[derive(Debug)]
pub enum SyntaxErrorKind {
Xml(quick_xml::Error),
XmlAttribute(quick_xml::events::attributes::AttrError),
InvalidIri {
iri: String,
error: IriParseError,
@ -119,7 +109,6 @@ impl fmt::Display for SyntaxError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match &self.inner {
SyntaxErrorKind::Xml(error) => error.fmt(f),
SyntaxErrorKind::XmlAttribute(error) => error.fmt(f),
SyntaxErrorKind::InvalidIri { iri, error } => {
write!(f, "error while parsing IRI '{iri}': {error}")
}
@ -136,7 +125,6 @@ impl Error for SyntaxError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
match &self.inner {
SyntaxErrorKind::Xml(error) => Some(error),
SyntaxErrorKind::XmlAttribute(error) => Some(error),
SyntaxErrorKind::InvalidIri { error, .. } => Some(error),
SyntaxErrorKind::InvalidLanguageTag { error, .. } => Some(error),
SyntaxErrorKind::Msg { .. } => None,

@ -8,7 +8,7 @@ use quick_xml::escape::unescape_with;
use quick_xml::events::attributes::Attribute;
use quick_xml::events::*;
use quick_xml::name::{LocalName, QName, ResolveResult};
use quick_xml::{NsReader, Writer};
use quick_xml::{Error, NsReader, Writer};
use std::collections::{HashMap, HashSet};
use std::io::{BufReader, Read};
use std::str;
@ -515,7 +515,7 @@ impl<R> RdfXmlReader<R> {
.to_string(),
);
for attr in event.attributes() {
clean_event.push_attribute(attr?);
clean_event.push_attribute(attr.map_err(Error::InvalidAttr)?);
}
writer.write_event(Event::Start(clean_event))?;
self.in_literal_depth += 1;
@ -544,7 +544,7 @@ impl<R> RdfXmlReader<R> {
let mut type_attr = None;
for attribute in event.attributes() {
let attribute = attribute?;
let attribute = attribute.map_err(Error::InvalidAttr)?;
if attribute.key.as_ref().starts_with(b"xml") {
if attribute.key.as_ref() == b"xml:lang" {
let tag = self.convert_attribute(&attribute)?;
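
With the dedicated `XmlAttribute` variant gone, attribute errors are folded into `quick_xml::Error` before they reach `SyntaxErrorKind::Xml`, so the existing `From<quick_xml::Error>` impl covers both. The mapping in isolation (assuming quick_xml's `Error::InvalidAttr` variant, as used above):

```rust
use quick_xml::events::attributes::AttrError;

// Fold an attribute error into the general quick_xml error type,
// mirroring the .map_err(Error::InvalidAttr) calls above
fn to_xml_error(error: AttrError) -> quick_xml::Error {
    quick_xml::Error::InvalidAttr(error)
}
```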

@ -266,7 +266,7 @@ impl N3Lexer {
));
}
}
Err(e) => return Some((e.position.end, Err(e))),
Err(e) => return Some((e.location.end, Err(e))),
}
} else if is_ending {
while data[..i].ends_with(b".") {
@ -447,7 +447,7 @@ impl N3Lexer {
return Some((i, Ok((buffer, might_be_invalid_iri))));
}
}
Err(e) => return Some((e.position.end, Err(e))),
Err(e) => return Some((e.location.end, Err(e))),
}
} else if is_ending {
let buffer = if let Some(mut buffer) = buffer {
@ -515,7 +515,7 @@ impl N3Lexer {
}
i += consumed;
}
Err(e) => return Some((e.position.end, Err(e))),
Err(e) => return Some((e.location.end, Err(e))),
}
}
}

@ -17,7 +17,7 @@ pub mod turtle;
pub use crate::n3::N3Parser;
pub use crate::nquads::{NQuadsParser, NQuadsSerializer};
pub use crate::ntriples::{NTriplesParser, NTriplesSerializer};
pub use crate::toolkit::{ParseError, SyntaxError};
pub use crate::toolkit::{ParseError, SyntaxError, TextPosition};
pub use crate::trig::{TriGParser, TriGSerializer};
pub use crate::turtle::{TurtleParser, TurtleSerializer};

@ -76,7 +76,7 @@ impl RuleRecognizer for NQuadsRecognizer {
}
_ => self.error(
errors,
format!("The subject of a triple should be an IRI or a blank node, {token:?} found"),
"The subject of a triple should be an IRI or a blank node, TOKEN found",
),
},
NQuadsState::ExpectPredicate => match token {
@ -88,7 +88,7 @@ impl RuleRecognizer for NQuadsRecognizer {
}
_ => self.error(
errors,
format!("The predicate of a triple should be an IRI, {token:?} found"),
"The predicate of a triple should be an IRI, TOKEN found",
),
},
NQuadsState::ExpectedObject => match token {
@ -118,7 +118,7 @@ impl RuleRecognizer for NQuadsRecognizer {
}
_ => self.error(
errors,
format!("The object of a triple should be an IRI, a blank node or a literal, {token:?} found"),
"The object of a triple should be an IRI, a blank node or a literal, TOKEN found",
),
},
NQuadsState::ExpectLiteralAnnotationOrGraphNameOrDot { value } => match token {
@ -159,7 +159,7 @@ impl RuleRecognizer for NQuadsRecognizer {
.push(NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple);
self
}
_ => self.error(errors, format!("A literal datatype must be an IRI, found {token:?}")),
_ => self.error(errors, "A literal datatype must be an IRI, found TOKEN"),
},
NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple => {
if self.stack.is_empty() {

@ -836,7 +836,7 @@ impl RuleRecognizer for N3Recognizer {
self.stack.push(N3State::FormulaContent);
self
}
_ => self.error(errors, format!("This is not a valid RDF value: {token:?}"))
_ => self.error(errors, "TOKEN is not a valid RDF value")
}
}
N3State::PropertyListMiddle => match token {
@ -950,7 +950,7 @@ impl RuleRecognizer for N3Recognizer {
Err(e) => self.error(errors, e)
}
_ => {
self.error(errors, format!("Expecting a datatype IRI after '^^, found {token:?}")).recognize_next(token, results, errors)
self.error(errors, "Expecting a datatype IRI after '^^, found TOKEN").recognize_next(token, results, errors)
}
}
}

@ -167,7 +167,7 @@ impl RuleRecognizer for TriGRecognizer {
self
}
_ => {
self.error(errors, format!("The token {token:?} is not a valid subject or graph name"))
self.error(errors, "TOKEN is not a valid subject or graph name")
}
}
TriGState::WrappedGraphOrPredicateObjectList { term } => {
@ -317,7 +317,7 @@ impl RuleRecognizer for TriGRecognizer {
self
}
_ => {
self.error(errors, format!("The token {token:?} is not a valid RDF subject"))
self.error(errors, "TOKEN is not a valid RDF subject")
}
},
TriGState::TriplesBlankNodePropertyListCurrent => if token == N3Token::Punctuation("]") {
@ -350,7 +350,7 @@ impl RuleRecognizer for TriGRecognizer {
self
}
_ => {
self.error(errors, format!("The token {token:?} is not a valid graph name"))
self.error(errors, "TOKEN is not a valid graph name")
}
}
TriGState::GraphNameAnonEnd => if token == N3Token::Punctuation("]") {
@ -456,7 +456,7 @@ impl RuleRecognizer for TriGRecognizer {
Err(e) => self.error(errors, e)
}
_ => {
self.error(errors, format!("The token {token:?} is not a valid predicate"))
self.error(errors, "TOKEN is not a valid predicate")
}
}
// [12] object ::= iri | BlankNode | collection | blankNodePropertyList | literal | quotedTriple
@ -536,7 +536,7 @@ impl RuleRecognizer for TriGRecognizer {
self
}
_ => {
self.error(errors, format!("This is not a valid RDF object: {token:?}"))
self.error(errors, "TOKEN is not a valid RDF object")
}
}
@ -637,7 +637,7 @@ impl RuleRecognizer for TriGRecognizer {
Err(e) => self.error(errors, e)
}
_ => {
self.error(errors, format!("Expecting a datatype IRI after '^^, found {token:?}")).recognize_next(token, results, errors)
self.error(errors, "Expecting a datatype IRI after ^^, found TOKEN").recognize_next(token, results, errors)
}
}
}
@ -653,7 +653,7 @@ impl RuleRecognizer for TriGRecognizer {
if token == N3Token::Punctuation(">>") {
self
} else {
self.error(errors, format!("Expecting '>>' to close a quoted triple, found {token:?}"))
self.error(errors, "Expecting '>>' to close a quoted triple, found TOKEN")
}
}
#[cfg(feature = "rdf-star")]
@ -670,7 +670,7 @@ impl RuleRecognizer for TriGRecognizer {
if token == N3Token::Punctuation(">>") {
self
} else {
self.error(errors, format!("Expecting '>>' to close a quoted triple, found {token:?}"))
self.error(errors, "Expecting '>>' to close a quoted triple, found TOKEN")
}
}
// [28t] qtSubject ::= iri | BlankNode | quotedTriple
@ -703,7 +703,7 @@ impl RuleRecognizer for TriGRecognizer {
self.stack.push(TriGState::QuotedSubject);
self
}
_ => self.error(errors, format!("This is not a valid RDF quoted triple subject: {token:?}"))
_ => self.error(errors, "TOKEN is not a valid RDF quoted triple subject: TOKEN")
}
// [29t] qtObject ::= iri | BlankNode | literal | quotedTriple
#[cfg(feature = "rdf-star")]
@ -759,7 +759,7 @@ impl RuleRecognizer for TriGRecognizer {
self.stack.push(TriGState::QuotedSubject);
self
}
_ => self.error(errors, format!("This is not a valid RDF quoted triple object: {token:?}"))
_ => self.error(errors, "TOKEN is not a valid RDF quoted triple object")
}
#[cfg(feature = "rdf-star")]
TriGState::QuotedAnonEnd => if token == N3Token::Punctuation("]") {

@ -0,0 +1,132 @@
use std::error::Error;
use std::ops::Range;
use std::{fmt, io};
/// A position in a text, i.e. a `line` number starting from 0, a `column` number starting from 0 (in number of code points), and a global file `offset` starting from 0 (in number of bytes).
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub struct TextPosition {
pub line: u64,
pub column: u64,
pub offset: u64,
}
/// An error in the syntax of the parsed file.
///
/// It is composed of a message and a byte range in the input.
#[derive(Debug)]
pub struct SyntaxError {
pub(super) location: Range<TextPosition>,
pub(super) message: String,
}
impl SyntaxError {
/// The location of the error inside the file.
#[inline]
pub fn location(&self) -> Range<TextPosition> {
self.location.clone()
}
/// The error message.
#[inline]
pub fn message(&self) -> &str {
&self.message
}
}
impl fmt::Display for SyntaxError {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.location.start.offset + 1 >= self.location.end.offset {
write!(
f,
"Parser error at line {} column {}: {}",
self.location.start.line + 1,
self.location.start.column + 1,
self.message
)
} else if self.location.start.line == self.location.end.line {
write!(
f,
"Parser error between at line {} between columns {} and column {}: {}",
self.location.start.line + 1,
self.location.start.column + 1,
self.location.end.column + 1,
self.message
)
} else {
write!(
f,
"Parser error between line {} column {} and line {} column {}: {}",
self.location.start.line + 1,
self.location.start.column + 1,
self.location.end.line + 1,
self.location.end.column + 1,
self.message
)
}
}
}
impl Error for SyntaxError {}
impl From<SyntaxError> for io::Error {
#[inline]
fn from(error: SyntaxError) -> Self {
io::Error::new(io::ErrorKind::InvalidData, error)
}
}
/// A parsing error.
///
/// It is the union of [`SyntaxError`] and [`std::io::Error`].
#[derive(Debug)]
pub enum ParseError {
/// I/O error during parsing (file not found...).
Io(io::Error),
/// An error in the file syntax.
Syntax(SyntaxError),
}
impl fmt::Display for ParseError {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Io(e) => e.fmt(f),
Self::Syntax(e) => e.fmt(f),
}
}
}
impl Error for ParseError {
#[inline]
fn source(&self) -> Option<&(dyn Error + 'static)> {
Some(match self {
Self::Io(e) => e,
Self::Syntax(e) => e,
})
}
}
impl From<SyntaxError> for ParseError {
#[inline]
fn from(error: SyntaxError) -> Self {
Self::Syntax(error)
}
}
impl From<io::Error> for ParseError {
#[inline]
fn from(error: io::Error) -> Self {
Self::Io(error)
}
}
impl From<ParseError> for io::Error {
#[inline]
fn from(error: ParseError) -> Self {
match error {
ParseError::Syntax(e) => e.into(),
ParseError::Io(e) => e,
}
}
}
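
The `From<SyntaxError> for io::Error` impl keeps these errors usable from `?`-based `io` code. A minimal sketch, again assuming the low-level push API used by the fuzz targets:

```rust
use oxttl::NQuadsParser;
use std::io;

// Validate a complete N-Quads document, surfacing the first syntax error
// as an io::Error with ErrorKind::InvalidData
fn validate(data: &[u8]) -> io::Result<()> {
    let mut parser = NQuadsParser::new().parse();
    parser.extend_from_slice(data);
    parser.end();
    while let Some(result) = parser.read_next() {
        if let Err(e) = result {
            return Err(e.into()); // SyntaxError -> io::Error
        }
    }
    Ok(())
}
```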

@ -1,9 +1,10 @@
use memchr::memchr2;
use crate::toolkit::error::{SyntaxError, TextPosition};
use memchr::{memchr2, memchr2_iter};
use std::borrow::Cow;
use std::cmp::min;
use std::error::Error;
use std::fmt;
use std::io::{self, Read};
use std::ops::{Range, RangeInclusive};
use std::str;
#[cfg(feature = "async-tokio")]
use tokio::io::{AsyncRead, AsyncReadExt};
@ -22,14 +23,14 @@ pub trait TokenRecognizer {
}
pub struct TokenRecognizerError {
pub position: Range<usize>,
pub location: Range<usize>,
pub message: String,
}
impl<S: Into<String>> From<(Range<usize>, S)> for TokenRecognizerError {
fn from((position, message): (Range<usize>, S)) -> Self {
fn from((location, message): (Range<usize>, S)) -> Self {
Self {
position,
location,
message: message.into(),
}
}
@ -37,34 +38,37 @@ impl<S: Into<String>> From<(Range<usize>, S)> for TokenRecognizerError {
#[allow(clippy::range_plus_one)]
impl<S: Into<String>> From<(RangeInclusive<usize>, S)> for TokenRecognizerError {
fn from((position, message): (RangeInclusive<usize>, S)) -> Self {
(*position.start()..*position.end() + 1, message).into()
fn from((location, message): (RangeInclusive<usize>, S)) -> Self {
(*location.start()..*location.end() + 1, message).into()
}
}
impl<S: Into<String>> From<(usize, S)> for TokenRecognizerError {
fn from((position, message): (usize, S)) -> Self {
(position..=position, message).into()
fn from((location, message): (usize, S)) -> Self {
(location..=location, message).into()
}
}
pub struct TokenWithPosition<T> {
pub token: T,
pub position: Range<usize>,
}
pub struct Lexer<R: TokenRecognizer> {
parser: R,
data: Vec<u8>,
start: usize,
position: Position,
previous_position: Position, // Lexer position before the last emitted token
is_ending: bool,
position: usize,
min_buffer_size: usize,
max_buffer_size: usize,
is_line_jump_whitespace: bool,
line_comment_start: Option<&'static [u8]>,
}
#[derive(Clone, Copy)]
struct Position {
line_start_buffer_offset: usize,
buffer_offset: usize,
global_offset: u64,
global_line: u64,
}
impl<R: TokenRecognizer> Lexer<R> {
pub fn new(
parser: R,
@ -76,9 +80,19 @@ impl<R: TokenRecognizer> Lexer<R> {
Self {
parser,
data: Vec::new(),
start: 0,
position: Position {
line_start_buffer_offset: 0,
buffer_offset: 0,
global_offset: 0,
global_line: 0,
},
previous_position: Position {
line_start_buffer_offset: 0,
buffer_offset: 0,
global_offset: 0,
global_line: 0,
},
is_ending: false,
position: 0,
min_buffer_size,
max_buffer_size,
is_line_jump_whitespace,
@ -148,24 +162,43 @@ impl<R: TokenRecognizer> Lexer<R> {
Ok(())
}
pub fn read_next(
&mut self,
options: &R::Options,
) -> Option<Result<TokenWithPosition<R::Token<'_>>, LexerError>> {
#[allow(clippy::unwrap_in_result)]
pub fn read_next(&mut self, options: &R::Options) -> Option<Result<R::Token<'_>, SyntaxError>> {
self.skip_whitespaces_and_comments()?;
let Some((consumed, result)) =
self.parser
.recognize_next_token(&self.data[self.start..], self.is_ending, options)
else {
self.previous_position = self.position;
let Some((consumed, result)) = self.parser.recognize_next_token(
&self.data[self.position.buffer_offset..],
self.is_ending,
options,
) else {
return if self.is_ending {
if self.start == self.data.len() {
if self.position.buffer_offset == self.data.len() {
None // We have finished
} else {
let error = LexerError {
position: self.position..self.position + (self.data.len() - self.start),
let (new_line_jumps, new_line_start) =
Self::find_number_of_line_jumps_and_start_of_last_line(
&self.data[self.position.buffer_offset..],
);
if new_line_jumps > 0 {
self.position.line_start_buffer_offset =
self.position.buffer_offset + new_line_start;
}
self.position.global_offset +=
u64::try_from(self.data.len() - self.position.buffer_offset).unwrap();
self.position.buffer_offset = self.data.len();
self.position.global_line += new_line_jumps;
let new_position = TextPosition {
line: self.position.global_line,
column: Self::column_from_bytes(
&self.data[self.position.line_start_buffer_offset..],
),
offset: self.position.global_offset,
};
let error = SyntaxError {
location: new_position..new_position,
message: "Unexpected end of file".into(),
};
self.start = self.data.len(); // We consume everything
self.position.buffer_offset = self.data.len(); // We consume everything
Some(Err(error))
}
} else {
@ -177,44 +210,119 @@ impl<R: TokenRecognizer> Lexer<R> {
"The lexer must consume at least one byte each time"
);
debug_assert!(
self.start + consumed <= self.data.len(),
self.position.buffer_offset + consumed <= self.data.len(),
"The lexer tried to consumed {consumed} bytes but only {} bytes are readable",
self.data.len() - self.start
self.data.len() - self.position.buffer_offset
);
let (new_line_jumps, new_line_start) =
Self::find_number_of_line_jumps_and_start_of_last_line(
&self.data[self.position.buffer_offset..self.position.buffer_offset + consumed],
);
let old_position = self.position;
self.start += consumed;
self.position += consumed;
Some(match result {
Ok(token) => Ok(TokenWithPosition {
token,
position: old_position..self.position,
}),
Err(e) => Err(LexerError {
position: e.position.start + self.position..e.position.end + self.position,
if new_line_jumps > 0 {
self.position.line_start_buffer_offset = self.position.buffer_offset + new_line_start;
}
self.position.buffer_offset += consumed;
self.position.global_offset += u64::try_from(consumed).unwrap();
self.position.global_line += new_line_jumps;
Some(result.map_err(|e| SyntaxError {
location: self.location_from_buffer_offset_range(e.location),
message: e.message,
}),
})
}))
}
pub fn location_from_buffer_offset_range(
&self,
offset_range: Range<usize>,
) -> Range<TextPosition> {
let start_offset = self.previous_position.buffer_offset + offset_range.start;
let (start_extra_line_jumps, start_line_start) =
Self::find_number_of_line_jumps_and_start_of_last_line(
&self.data[self.previous_position.buffer_offset..start_offset],
);
let start_line_start = if start_extra_line_jumps > 0 {
start_line_start + self.previous_position.buffer_offset
} else {
self.previous_position.line_start_buffer_offset
};
let end_offset = self.previous_position.buffer_offset + offset_range.end;
let (end_extra_line_jumps, end_line_start) =
Self::find_number_of_line_jumps_and_start_of_last_line(
&self.data[self.previous_position.buffer_offset..end_offset],
);
let end_line_start = if end_extra_line_jumps > 0 {
end_line_start + self.previous_position.buffer_offset
} else {
self.previous_position.line_start_buffer_offset
};
TextPosition {
line: self.previous_position.global_line + start_extra_line_jumps,
column: Self::column_from_bytes(&self.data[start_line_start..start_offset]),
offset: self.previous_position.global_offset
+ u64::try_from(offset_range.start).unwrap(),
}..TextPosition {
line: self.previous_position.global_line + end_extra_line_jumps,
column: Self::column_from_bytes(&self.data[end_line_start..end_offset]),
offset: self.previous_position.global_offset + u64::try_from(offset_range.end).unwrap(),
}
}
pub fn last_token_location(&self) -> Range<TextPosition> {
TextPosition {
line: self.previous_position.global_line,
column: Self::column_from_bytes(
&self.data[self.previous_position.line_start_buffer_offset
..self.previous_position.buffer_offset],
),
offset: self.previous_position.global_offset,
}..TextPosition {
line: self.position.global_line,
column: Self::column_from_bytes(
&self.data[self.position.line_start_buffer_offset..self.position.buffer_offset],
),
offset: self.position.global_offset,
}
}
pub fn last_token_source(&self) -> Cow<'_, str> {
String::from_utf8_lossy(
&self.data[self.previous_position.buffer_offset..self.position.buffer_offset],
)
}
pub fn is_end(&self) -> bool {
self.is_ending && self.data.len() == self.start
self.is_ending && self.data.len() == self.position.buffer_offset
}
#[allow(clippy::unwrap_in_result)]
fn skip_whitespaces_and_comments(&mut self) -> Option<()> {
loop {
self.skip_whitespaces();
self.skip_whitespaces()?;
let buf = &self.data[self.start..];
let buf = &self.data[self.position.buffer_offset..];
if let Some(line_comment_start) = self.line_comment_start {
if buf.starts_with(line_comment_start) {
// Comment
if let Some(end) = memchr2(b'\r', b'\n', &buf[line_comment_start.len()..]) {
self.start += end + line_comment_start.len();
self.position += end + line_comment_start.len();
let mut end_position = line_comment_start.len() + end;
if buf.get(end_position).copied() == Some(b'\r') {
// We look for \n for Windows line end style
if let Some(c) = buf.get(end_position + 1) {
if *c == b'\n' {
end_position += 1;
}
} else if !self.is_ending {
return None; // We need to read more
}
}
let comment_size = end_position + 1;
self.position.buffer_offset += comment_size;
self.position.line_start_buffer_offset = self.position.buffer_offset;
self.position.global_offset += u64::try_from(comment_size).unwrap();
self.position.global_line += 1;
continue;
}
if self.is_ending {
self.start = self.data.len(); // EOF
self.position.buffer_offset = self.data.len(); // EOF
return Some(());
}
return None; // We need more data
@ -224,80 +332,98 @@ impl<R: TokenRecognizer> Lexer<R> {
}
}
fn skip_whitespaces(&mut self) {
fn skip_whitespaces(&mut self) -> Option<()> {
if self.is_line_jump_whitespace {
for (i, c) in self.data[self.start..].iter().enumerate() {
if !matches!(c, b' ' | b'\t' | b'\r' | b'\n') {
self.start += i;
self.position += i;
return;
}
let mut i = self.position.buffer_offset;
while let Some(c) = self.data.get(i) {
match c {
b' ' | b'\t' => {
self.position.buffer_offset += 1;
self.position.global_offset += 1;
}
b'\r' => {
// We look for \n for Windows line end style
let mut increment: u8 = 1;
if let Some(c) = self.data.get(i + 1) {
if *c == b'\n' {
increment += 1;
i += 1;
}
} else if !self.is_ending {
return None; // We need to read more
}
self.position.buffer_offset += usize::from(increment);
self.position.line_start_buffer_offset = self.position.buffer_offset;
self.position.global_offset += u64::from(increment);
self.position.global_line += 1;
}
b'\n' => {
self.position.buffer_offset += 1;
self.position.line_start_buffer_offset = self.position.buffer_offset;
self.position.global_offset += 1;
self.position.global_line += 1;
}
_ => return Some(()),
}
i += 1;
//TODO: SIMD
}
} else {
for (i, c) in self.data[self.start..].iter().enumerate() {
if !matches!(c, b' ' | b'\t') {
self.start += i;
self.position += i;
return;
for c in &self.data[self.position.buffer_offset..] {
if matches!(c, b' ' | b'\t') {
self.position.buffer_offset += 1;
self.position.global_offset += 1;
} else {
return Some(());
}
//TODO: SIMD
}
}
// We only have whitespaces
self.position += self.data.len() - self.start;
self.start = self.data.len();
Some(())
}
fn shrink_data(&mut self) {
if self.start > 0 {
self.data.copy_within(self.start.., 0);
self.data.truncate(self.data.len() - self.start);
self.start = 0;
if self.position.line_start_buffer_offset > 0 {
self.data
.copy_within(self.position.line_start_buffer_offset.., 0);
self.data
.truncate(self.data.len() - self.position.line_start_buffer_offset);
self.position.buffer_offset -= self.position.line_start_buffer_offset;
self.position.line_start_buffer_offset = 0;
self.previous_position = self.position;
}
}
}
#[derive(Debug)]
pub struct LexerError {
position: Range<usize>,
message: String,
}
impl LexerError {
pub fn position(&self) -> Range<usize> {
self.position.clone()
fn find_number_of_line_jumps_and_start_of_last_line(bytes: &[u8]) -> (u64, usize) {
let mut num_of_jumps = 0;
let mut last_jump_pos = 0;
let mut previous_cr = 0;
for pos in memchr2_iter(b'\r', b'\n', bytes) {
if bytes[pos] == b'\r' {
previous_cr = pos;
num_of_jumps += 1;
last_jump_pos = pos + 1;
} else {
if previous_cr < pos - 1 {
// We count \r\n as a single line jump
num_of_jumps += 1;
}
pub fn message(&self) -> &str {
&self.message
last_jump_pos = pos + 1;
}
pub fn into_message(self) -> String {
self.message
}
}
(num_of_jumps, last_jump_pos)
}
impl fmt::Display for LexerError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.position.start + 1 == self.position.end {
write!(
f,
"Lexer error at byte {}: {}",
self.position.start, self.message
)
fn column_from_bytes(bytes: &[u8]) -> u64 {
match str::from_utf8(bytes) {
Ok(s) => u64::try_from(s.chars().count()).unwrap(),
Err(e) => {
if e.valid_up_to() == 0 {
0
} else {
write!(
f,
"Lexer error between bytes {} and {}: {}",
self.position.start, self.position.end, self.message
)
Self::column_from_bytes(&bytes[..e.valid_up_to()])
}
}
}
}
impl Error for LexerError {
fn description(&self) -> &str {
self.message()
}
}
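
Columns are counted in Unicode code points on the current line, not in bytes. A standalone sketch of the `column_from_bytes` computation above, which falls back to the longest valid UTF-8 prefix when the line ends mid-code-point:

```rust
// Count Unicode code points, tolerating truncated or invalid UTF-8
fn column_from_bytes(bytes: &[u8]) -> u64 {
    match std::str::from_utf8(bytes) {
        Ok(s) => s.chars().count() as u64,
        Err(e) if e.valid_up_to() > 0 => column_from_bytes(&bytes[..e.valid_up_to()]),
        Err(_) => 0,
    }
}

fn main() {
    assert_eq!(column_from_bytes("fooé".as_bytes()), 4); // 5 bytes, 4 code points
    assert_eq!(column_from_bytes(&[0xFF]), 0); // invalid from the first byte
}
```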

@ -2,12 +2,12 @@
//!
//! Provides the basic code to write plain Rust lexers and parsers able to read files chunk by chunk.
mod error;
mod lexer;
mod parser;
pub use self::lexer::{Lexer, LexerError, TokenRecognizer, TokenRecognizerError};
pub use self::error::{ParseError, SyntaxError, TextPosition};
pub use self::lexer::{Lexer, TokenRecognizer, TokenRecognizerError};
#[cfg(feature = "async-tokio")]
pub use self::parser::FromTokioAsyncReadIterator;
pub use self::parser::{
FromReadIterator, ParseError, Parser, RuleRecognizer, RuleRecognizerError, SyntaxError,
};
pub use self::parser::{FromReadIterator, Parser, RuleRecognizer, RuleRecognizerError};

@ -1,9 +1,6 @@
use crate::toolkit::lexer::TokenWithPosition;
use crate::toolkit::{Lexer, LexerError, TokenRecognizer};
use std::error::Error;
use crate::toolkit::error::{ParseError, SyntaxError};
use crate::toolkit::lexer::{Lexer, TokenRecognizer};
use std::io::Read;
use std::ops::Range;
use std::{fmt, io};
#[cfg(feature = "async-tokio")]
use tokio::io::AsyncRead;
@ -42,7 +39,6 @@ pub struct Parser<RR: RuleRecognizer> {
state: Option<RR>,
results: Vec<RR::Output>,
errors: Vec<RuleRecognizerError>,
position: Range<usize>,
default_lexer_options: <RR::TokenRecognizer as TokenRecognizer>::Options,
}
@ -53,7 +49,6 @@ impl<RR: RuleRecognizer> Parser<RR> {
state: Some(recognizer),
results: vec![],
errors: vec![],
position: 0..0,
default_lexer_options: <RR::TokenRecognizer as TokenRecognizer>::Options::default(),
}
}
@ -76,8 +71,10 @@ impl<RR: RuleRecognizer> Parser<RR> {
loop {
if let Some(error) = self.errors.pop() {
return Some(Err(SyntaxError {
position: self.position.clone(),
message: error.message,
location: self.lexer.last_token_location(),
message: error
.message
.replace("TOKEN", &self.lexer.last_token_source()),
}));
}
if let Some(result) = self.results.pop() {
@ -89,8 +86,7 @@ impl<RR: RuleRecognizer> Parser<RR> {
.map_or(&self.default_lexer_options, |p| p.lexer_options()),
) {
match result {
Ok(TokenWithPosition { token, position }) => {
self.position = position;
Ok(token) => {
self.state = self.state.take().map(|state| {
state.recognize_next(token, &mut self.results, &mut self.errors)
});
@ -98,7 +94,7 @@ impl<RR: RuleRecognizer> Parser<RR> {
}
Err(e) => {
self.state = self.state.take().map(RR::error_recovery_state);
return Some(Err(e.into()));
return Some(Err(e));
}
}
}
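
This is where the `TOKEN` placeholder from line_formats.rs, n3.rs and terse.rs pays off: recognizers now hand back static message templates instead of calling `format!` on every speculative error path, and the parser splices in the offending token's source text only when an error is actually reported. The substitution in isolation:

```rust
fn main() {
    // Sketch of the replace("TOKEN", ...) done in read_next above
    let template = "The subject of a triple should be an IRI or a blank node, TOKEN found";
    let last_token_source = "\"p\""; // what the lexer read for the last token
    assert_eq!(
        template.replace("TOKEN", last_token_source),
        "The subject of a triple should be an IRI or a blank node, \"p\" found"
    );
}
```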
@ -126,128 +122,6 @@ impl<RR: RuleRecognizer> Parser<RR> {
}
}
/// An error in the syntax of the parsed file.
///
/// It is composed of a message and a byte range in the input.
#[derive(Debug)]
pub struct SyntaxError {
position: Range<usize>,
message: String,
}
impl SyntaxError {
/// The invalid byte range in the input.
#[inline]
pub fn position(&self) -> Range<usize> {
self.position.clone()
}
/// The error message.
#[inline]
pub fn message(&self) -> &str {
&self.message
}
/// Converts this error to an error message.
#[inline]
pub fn into_message(self) -> String {
self.message
}
}
impl fmt::Display for SyntaxError {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.position.start + 1 == self.position.end {
write!(
f,
"Parser error at byte {}: {}",
self.position.start, self.message
)
} else {
write!(
f,
"Parser error between bytes {} and {}: {}",
self.position.start, self.position.end, self.message
)
}
}
}
impl Error for SyntaxError {}
impl From<SyntaxError> for io::Error {
#[inline]
fn from(error: SyntaxError) -> Self {
io::Error::new(io::ErrorKind::InvalidData, error)
}
}
impl From<LexerError> for SyntaxError {
#[inline]
fn from(e: LexerError) -> Self {
Self {
position: e.position(),
message: e.into_message(),
}
}
}
/// A parsing error.
///
/// It is the union of [`SyntaxError`] and [`std::io::Error`].
#[derive(Debug)]
pub enum ParseError {
/// I/O error during parsing (file not found...).
Io(io::Error),
/// An error in the file syntax.
Syntax(SyntaxError),
}
impl fmt::Display for ParseError {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Io(e) => e.fmt(f),
Self::Syntax(e) => e.fmt(f),
}
}
}
impl Error for ParseError {
#[inline]
fn source(&self) -> Option<&(dyn Error + 'static)> {
Some(match self {
Self::Io(e) => e,
Self::Syntax(e) => e,
})
}
}
impl From<SyntaxError> for ParseError {
#[inline]
fn from(error: SyntaxError) -> Self {
Self::Syntax(error)
}
}
impl From<io::Error> for ParseError {
#[inline]
fn from(error: io::Error) -> Self {
Self::Io(error)
}
}
impl From<ParseError> for io::Error {
#[inline]
fn from(error: ParseError) -> Self {
match error {
ParseError::Syntax(e) => e.into(),
ParseError::Io(e) => e,
}
}
}
pub struct FromReadIterator<R: Read, RR: RuleRecognizer> {
read: R,
parser: Parser<RR>,

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o> .
<http://example.com/s> <http:// /p> <http://example.com/o> .

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o> . # foo
<http://example.com/s> <http:// /p> <http://example.com/o> .

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o> . # foo
<http://example.com/s> <http:// /p> <http://example.com/o> .

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o> .
<http://example.com/s> <http:// /p> <http://example.com/o> .

@ -0,0 +1 @@
Parser error at line 2 between columns 24 and 36: Invalid IRI code point ' '

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o> .
<http://example.com/s> "p" <http://example.com/o> .

@ -0,0 +1 @@
Parser error at line 2 between columns 24 and 27: "p" is not a valid predicate

@ -0,0 +1 @@
<http://example.com/s> <http://example.com/p> "fooé \a baré" .

@ -0,0 +1 @@
Parser error at line 1 between columns 53 and 55: Unexpected escape character '\a'

@ -0,0 +1,66 @@
@prefix mf: <http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix rdft: <http://www.w3.org/ns/rdftest#> .
<>
rdf:type mf:Manifest ;
rdfs:comment "Oxigraph parser error test cases" ;
mf:entries (
<#invalid_iri>
<#invalid_iri_crlf>
<#invalid_iri_comment>
<#invalid_iri_comment_crlf>
<#invalid_string_escape>
<#unexpected_eof>
<#unexpected_eof_crlf>
<#invalid_predicate>
) .
<#invalid_iri>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "bad IRI" ;
mf:action <invalid_iri.nt> ;
mf:result <invalid_iri_error.txt> .
<#invalid_iri_crlf>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "bad IRI" ;
mf:action <invalid_iri_crlf.nt> ;
mf:result <invalid_iri_error.txt> .
<#invalid_iri_comment>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "bad IRI" ;
mf:action <invalid_iri_comment.nt> ;
mf:result <invalid_iri_error.txt> .
<#invalid_iri_comment_crlf>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "bad IRI" ;
mf:action <invalid_iri_comment_crlf.nt> ;
mf:result <invalid_iri_error.txt> .
<#invalid_string_escape>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "bad string escape" ;
mf:action <invalid_string_escape.nt> ;
mf:result <invalid_string_escape_error.txt> .
<#unexpected_eof>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "unexpected end of file" ;
mf:action <unexpected_eof.nt> ;
mf:result <unexpected_eof_error.txt> .
<#unexpected_eof_crlf>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "unexpected end of file" ;
mf:action <unexpected_eof_crlf.nt> ;
mf:result <unexpected_eof_error.txt> .
<#invalid_predicate>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "invalid predicate" ;
mf:action <invalid_predicate.nt> ;
mf:result <invalid_predicate_error.txt> .

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o

@ -0,0 +1 @@
Parser error at line 2 column 3: Unexpected end of file

@ -1,8 +1,8 @@
use crate::evaluator::TestEvaluator;
use crate::files::{guess_rdf_format, load_dataset, load_n3};
use crate::files::{guess_rdf_format, load_dataset, load_n3, read_file_to_string};
use crate::manifest::Test;
use crate::report::dataset_diff;
use anyhow::{anyhow, ensure, Result};
use crate::report::{dataset_diff, format_diff};
use anyhow::{anyhow, bail, ensure, Result};
use oxigraph::io::RdfFormat;
use oxigraph::model::{BlankNode, Dataset, Quad};
use oxttl::n3::{N3Quad, N3Term};
@ -116,10 +116,17 @@ fn evaluate_negative_syntax_test(test: &Test, format: RdfFormat) -> Result<()> {
.action
.as_deref()
.ok_or_else(|| anyhow!("No action found"))?;
let Err(error) = load_dataset(action, format, false) else {
bail!("File parsed without errors even if it should not");
};
if let Some(result) = &test.result {
let expected = read_file_to_string(result)?;
ensure!(
load_dataset(action, format, false).is_err(),
"File parsed without errors even if it should not"
expected == error.to_string(),
"Not expected error message:\n{}",
format_diff(&expected, &error.to_string(), "message")
);
}
Ok(())
}

@ -20,6 +20,14 @@ fn oxigraph_parser_recovery_testsuite() -> Result<()> {
)
}
#[test]
fn oxigraph_parser_error_testsuite() -> Result<()> {
check_testsuite(
"https://github.com/oxigraph/oxigraph/tests/parser-error/manifest.ttl",
&[],
)
}
#[test]
fn oxigraph_sparql_testsuite() -> Result<()> {
check_testsuite(
