OxTTL: return file position in errors

pull/622/head
Tpt 1 year ago committed by Thomas Tanon
parent 8193cac86d
commit 13c3515d7b
  1. fuzz/fuzz_targets/nquads.rs (10 lines changed)
  2. fuzz/fuzz_targets/trig.rs (10 lines changed)
  3. lib/oxrdfio/src/error.rs (34 lines changed)
  4. lib/oxrdfxml/src/error.rs (12 lines changed)
  5. lib/oxrdfxml/src/parser.rs (6 lines changed)
  6. lib/oxttl/src/lexer.rs (6 lines changed)
  7. lib/oxttl/src/lib.rs (2 lines changed)
  8. lib/oxttl/src/line_formats.rs (8 lines changed)
  9. lib/oxttl/src/n3.rs (4 lines changed)
  10. lib/oxttl/src/terse.rs (20 lines changed)
  11. lib/oxttl/src/toolkit/error.rs (132 lines changed)
  12. lib/oxttl/src/toolkit/lexer.rs (336 lines changed)
  13. lib/oxttl/src/toolkit/mod.rs (8 lines changed)
  14. lib/oxttl/src/toolkit/parser.rs (142 lines changed)
  15. testsuite/oxigraph-tests/parser-error/invalid_iri.nt (2 lines changed)
  16. testsuite/oxigraph-tests/parser-error/invalid_iri_comment.nt (2 lines changed)
  17. testsuite/oxigraph-tests/parser-error/invalid_iri_comment_crlf.nt (2 lines changed)
  18. testsuite/oxigraph-tests/parser-error/invalid_iri_crlf.nt (2 lines changed)
  19. testsuite/oxigraph-tests/parser-error/invalid_iri_error.txt (1 line changed)
  20. testsuite/oxigraph-tests/parser-error/invalid_predicate.nt (2 lines changed)
  21. testsuite/oxigraph-tests/parser-error/invalid_predicate_error.txt (1 line changed)
  22. testsuite/oxigraph-tests/parser-error/invalid_string_escape.nt (1 line changed)
  23. testsuite/oxigraph-tests/parser-error/invalid_string_escape_error.txt (1 line changed)
  24. testsuite/oxigraph-tests/parser-error/manifest.ttl (66 lines changed)
  25. testsuite/oxigraph-tests/parser-error/unexpected_eof.nt (2 lines changed)
  26. testsuite/oxigraph-tests/parser-error/unexpected_eof_crlf.nt (2 lines changed)
  27. testsuite/oxigraph-tests/parser-error/unexpected_eof_error.txt (1 line changed)
  28. testsuite/src/parser_evaluator.rs (17 lines changed)
  29. testsuite/tests/oxigraph.rs (8 lines changed)

@ -2,9 +2,9 @@
use libfuzzer_sys::fuzz_target; use libfuzzer_sys::fuzz_target;
use oxrdf::Quad; use oxrdf::Quad;
use oxttl::{NQuadsParser, NQuadsSerializer, SyntaxError}; use oxttl::{NQuadsParser, NQuadsSerializer};
fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<SyntaxError>) { fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<String>) {
let mut quads = Vec::new(); let mut quads = Vec::new();
let mut errors = Vec::new(); let mut errors = Vec::new();
let mut parser = NQuadsParser::new().with_quoted_triples().parse(); let mut parser = NQuadsParser::new().with_quoted_triples().parse();
@ -13,7 +13,7 @@ fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<Synt
while let Some(result) = parser.read_next() { while let Some(result) = parser.read_next() {
match result { match result {
Ok(quad) => quads.push(quad), Ok(quad) => quads.push(quad),
Err(error) => errors.push(error), Err(error) => errors.push(error.to_string()),
} }
} }
} }
@ -21,7 +21,7 @@ fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<Synt
while let Some(result) = parser.read_next() { while let Some(result) = parser.read_next() {
match result { match result {
Ok(quad) => quads.push(quad), Ok(quad) => quads.push(quad),
Err(error) => errors.push(error), Err(error) => errors.push(error.to_string()),
} }
} }
assert!(parser.is_end()); assert!(parser.is_end());
@ -39,7 +39,7 @@ fuzz_target!(|data: &[u8]| {
.collect::<Vec<_>>() .collect::<Vec<_>>()
.as_slice()]); .as_slice()]);
assert_eq!(quads, quads_without_split); assert_eq!(quads, quads_without_split);
assert_eq!(errors.len(), errors_without_split.len()); assert_eq!(errors, errors_without_split);
// We serialize // We serialize
let mut writer = NQuadsSerializer::new().serialize_to_write(Vec::new()); let mut writer = NQuadsSerializer::new().serialize_to_write(Vec::new());
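For orientation, here is a minimal sketch of the incremental loop both fuzz targets drive. read_next and is_end appear in the hunks above; extend_from_slice and end are assumed from oxttl's low-level parser API:

use oxttl::NQuadsParser;

fn main() {
    // Feed the document in two chunks, pulling quads out as they complete
    let mut parser = NQuadsParser::new().parse();
    for chunk in [
        b"<http://example.com/s> <http://example.com/p> ".as_slice(),
        b"<http://example.com/o> .\n".as_slice(),
    ] {
        parser.extend_from_slice(chunk);
        while let Some(result) = parser.read_next() {
            match result {
                Ok(quad) => println!("{quad}"),
                // The fuzz targets collect errors as strings so the chunked
                // and unchunked runs can be compared with assert_eq!
                Err(error) => eprintln!("{error}"),
            }
        }
    }
    parser.end();
    while let Some(result) = parser.read_next() {
        let _ = result; // drain the remaining quads or errors
    }
    assert!(parser.is_end());
}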

@ -2,9 +2,9 @@
use libfuzzer_sys::fuzz_target; use libfuzzer_sys::fuzz_target;
use oxrdf::{Dataset, GraphName, Quad, Subject, Term, Triple}; use oxrdf::{Dataset, GraphName, Quad, Subject, Term, Triple};
use oxttl::{SyntaxError, TriGParser, TriGSerializer}; use oxttl::{TriGParser, TriGSerializer};
fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<SyntaxError>) { fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<String>) {
let mut quads = Vec::new(); let mut quads = Vec::new();
let mut errors = Vec::new(); let mut errors = Vec::new();
let mut parser = TriGParser::new() let mut parser = TriGParser::new()
@ -17,7 +17,7 @@ fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<Synt
while let Some(result) = parser.read_next() { while let Some(result) = parser.read_next() {
match result { match result {
Ok(quad) => quads.push(quad), Ok(quad) => quads.push(quad),
Err(error) => errors.push(error), Err(error) => errors.push(error.to_string()),
} }
} }
} }
@ -25,7 +25,7 @@ fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<Synt
while let Some(result) = parser.read_next() { while let Some(result) = parser.read_next() {
match result { match result {
Ok(quad) => quads.push(quad), Ok(quad) => quads.push(quad),
Err(error) => errors.push(error), Err(error) => errors.push(error.to_string()),
} }
} }
assert!(parser.is_end()); assert!(parser.is_end());
@ -96,7 +96,7 @@ fuzz_target!(|data: &[u8]| {
String::from_utf8_lossy(&serialize_quads(&quads_without_split)) String::from_utf8_lossy(&serialize_quads(&quads_without_split))
); );
} }
assert_eq!(errors.len(), errors_without_split.len()); assert_eq!(errors, errors_without_split);
// We serialize // We serialize
let new_serialization = serialize_quads(&quads); let new_serialization = serialize_quads(&quads);

@ -1,4 +1,5 @@
use std::error::Error; use std::error::Error;
use std::ops::Range;
use std::{fmt, io}; use std::{fmt, io};
/// Error returned during RDF format parsing. /// Error returned during RDF format parsing.
@ -110,10 +111,33 @@ pub struct SyntaxError {
enum SyntaxErrorKind { enum SyntaxErrorKind {
Turtle(oxttl::SyntaxError), Turtle(oxttl::SyntaxError),
RdfXml(oxrdfxml::SyntaxError), RdfXml(oxrdfxml::SyntaxError),
Msg { msg: &'static str }, Msg { msg: &'static str },
} }
impl SyntaxError {
/// The location of the error inside of the file.
#[inline]
pub fn location(&self) -> Option<Range<TextPosition>> {
match &self.inner {
SyntaxErrorKind::Turtle(e) => {
let location = e.location();
Some(
TextPosition {
line: location.start.line,
column: location.start.column,
offset: location.start.offset,
}..TextPosition {
line: location.end.line,
column: location.end.column,
offset: location.end.offset,
},
)
}
SyntaxErrorKind::RdfXml(_) | SyntaxErrorKind::Msg { .. } => None,
}
}
}
impl fmt::Display for SyntaxError { impl fmt::Display for SyntaxError {
#[inline] #[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
@ -146,3 +170,11 @@ impl From<SyntaxError> for io::Error {
} }
} }
} }
/// A position in a text, i.e. a `line` number starting from 0, a `column` number starting from 0 (in number of code points), and a global file `offset` starting from 0 (in number of bytes).
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub struct TextPosition {
pub line: u64,
pub column: u64,
pub offset: u64,
}

@ -72,15 +72,6 @@ impl From<quick_xml::Error> for ParseError {
} }
} }
impl From<quick_xml::events::attributes::AttrError> for ParseError {
#[inline]
fn from(error: quick_xml::events::attributes::AttrError) -> Self {
Self::Syntax(SyntaxError {
inner: SyntaxErrorKind::XmlAttribute(error),
})
}
}
/// An error in the syntax of the parsed file. /// An error in the syntax of the parsed file.
#[derive(Debug)] #[derive(Debug)]
pub struct SyntaxError { pub struct SyntaxError {
@ -90,7 +81,6 @@ pub struct SyntaxError {
#[derive(Debug)] #[derive(Debug)]
pub enum SyntaxErrorKind { pub enum SyntaxErrorKind {
Xml(quick_xml::Error), Xml(quick_xml::Error),
XmlAttribute(quick_xml::events::attributes::AttrError),
InvalidIri { InvalidIri {
iri: String, iri: String,
error: IriParseError, error: IriParseError,
@ -119,7 +109,6 @@ impl fmt::Display for SyntaxError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match &self.inner { match &self.inner {
SyntaxErrorKind::Xml(error) => error.fmt(f), SyntaxErrorKind::Xml(error) => error.fmt(f),
SyntaxErrorKind::XmlAttribute(error) => error.fmt(f),
SyntaxErrorKind::InvalidIri { iri, error } => { SyntaxErrorKind::InvalidIri { iri, error } => {
write!(f, "error while parsing IRI '{iri}': {error}") write!(f, "error while parsing IRI '{iri}': {error}")
} }
@ -136,7 +125,6 @@ impl Error for SyntaxError {
fn source(&self) -> Option<&(dyn Error + 'static)> { fn source(&self) -> Option<&(dyn Error + 'static)> {
match &self.inner { match &self.inner {
SyntaxErrorKind::Xml(error) => Some(error), SyntaxErrorKind::Xml(error) => Some(error),
SyntaxErrorKind::XmlAttribute(error) => Some(error),
SyntaxErrorKind::InvalidIri { error, .. } => Some(error), SyntaxErrorKind::InvalidIri { error, .. } => Some(error),
SyntaxErrorKind::InvalidLanguageTag { error, .. } => Some(error), SyntaxErrorKind::InvalidLanguageTag { error, .. } => Some(error),
SyntaxErrorKind::Msg { .. } => None, SyntaxErrorKind::Msg { .. } => None,

@ -8,7 +8,7 @@ use quick_xml::escape::unescape_with;
use quick_xml::events::attributes::Attribute; use quick_xml::events::attributes::Attribute;
use quick_xml::events::*; use quick_xml::events::*;
use quick_xml::name::{LocalName, QName, ResolveResult}; use quick_xml::name::{LocalName, QName, ResolveResult};
use quick_xml::{NsReader, Writer}; use quick_xml::{Error, NsReader, Writer};
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use std::io::{BufReader, Read}; use std::io::{BufReader, Read};
use std::str; use std::str;
@ -515,7 +515,7 @@ impl<R> RdfXmlReader<R> {
.to_string(), .to_string(),
); );
for attr in event.attributes() { for attr in event.attributes() {
clean_event.push_attribute(attr?); clean_event.push_attribute(attr.map_err(Error::InvalidAttr)?);
} }
writer.write_event(Event::Start(clean_event))?; writer.write_event(Event::Start(clean_event))?;
self.in_literal_depth += 1; self.in_literal_depth += 1;
@ -544,7 +544,7 @@ impl<R> RdfXmlReader<R> {
let mut type_attr = None; let mut type_attr = None;
for attribute in event.attributes() { for attribute in event.attributes() {
let attribute = attribute?; let attribute = attribute.map_err(Error::InvalidAttr)?;
if attribute.key.as_ref().starts_with(b"xml") { if attribute.key.as_ref().starts_with(b"xml") {
if attribute.key.as_ref() == b"xml:lang" { if attribute.key.as_ref() == b"xml:lang" {
let tag = self.convert_attribute(&attribute)?; let tag = self.convert_attribute(&attribute)?;
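A minimal restatement of the mapping used at the two call sites above: attribute errors are folded into quick_xml::Error via its InvalidAttr variant, which is what allows dropping the dedicated XmlAttribute error variant in error.rs:

use quick_xml::events::attributes::{AttrError, Attribute};
use quick_xml::Error;

// Fold an attribute iteration error into the general quick_xml error type,
// as done inline by `attr.map_err(Error::InvalidAttr)` above
fn wrap(attr: Result<Attribute<'_>, AttrError>) -> Result<Attribute<'_>, Error> {
    attr.map_err(Error::InvalidAttr)
}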

@ -266,7 +266,7 @@ impl N3Lexer {
)); ));
} }
} }
Err(e) => return Some((e.position.end, Err(e))), Err(e) => return Some((e.location.end, Err(e))),
} }
} else if is_ending { } else if is_ending {
while data[..i].ends_with(b".") { while data[..i].ends_with(b".") {
@ -447,7 +447,7 @@ impl N3Lexer {
return Some((i, Ok((buffer, might_be_invalid_iri)))); return Some((i, Ok((buffer, might_be_invalid_iri))));
} }
} }
Err(e) => return Some((e.position.end, Err(e))), Err(e) => return Some((e.location.end, Err(e))),
} }
} else if is_ending { } else if is_ending {
let buffer = if let Some(mut buffer) = buffer { let buffer = if let Some(mut buffer) = buffer {
@ -515,7 +515,7 @@ impl N3Lexer {
} }
i += consumed; i += consumed;
} }
Err(e) => return Some((e.position.end, Err(e))), Err(e) => return Some((e.location.end, Err(e))),
} }
} }
} }

@ -17,7 +17,7 @@ pub mod turtle;
pub use crate::n3::N3Parser; pub use crate::n3::N3Parser;
pub use crate::nquads::{NQuadsParser, NQuadsSerializer}; pub use crate::nquads::{NQuadsParser, NQuadsSerializer};
pub use crate::ntriples::{NTriplesParser, NTriplesSerializer}; pub use crate::ntriples::{NTriplesParser, NTriplesSerializer};
pub use crate::toolkit::{ParseError, SyntaxError}; pub use crate::toolkit::{ParseError, SyntaxError, TextPosition};
pub use crate::trig::{TriGParser, TriGSerializer}; pub use crate::trig::{TriGParser, TriGSerializer};
pub use crate::turtle::{TurtleParser, TurtleSerializer}; pub use crate::turtle::{TurtleParser, TurtleSerializer};
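With TextPosition re-exported, downstream code can point at the exact failing span. A hedged sketch, assuming parse_read is the crate's reader-based entry point:

use oxttl::{ParseError, TurtleParser};

fn main() {
    let data = b"<http://example.com/s> <http:// /p> <http://example.com/o> .";
    for result in TurtleParser::new().parse_read(data.as_slice()) {
        if let Err(ParseError::Syntax(error)) = result {
            let location = error.location();
            // line and column are 0-based; add 1 for human-friendly output
            println!(
                "line {} column {}: {}",
                location.start.line + 1,
                location.start.column + 1,
                error.message()
            );
        }
    }
}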

@ -76,7 +76,7 @@ impl RuleRecognizer for NQuadsRecognizer {
} }
_ => self.error( _ => self.error(
errors, errors,
format!("The subject of a triple should be an IRI or a blank node, {token:?} found"), "The subject of a triple should be an IRI or a blank node, TOKEN found",
), ),
}, },
NQuadsState::ExpectPredicate => match token { NQuadsState::ExpectPredicate => match token {
@ -88,7 +88,7 @@ impl RuleRecognizer for NQuadsRecognizer {
} }
_ => self.error( _ => self.error(
errors, errors,
format!("The predicate of a triple should be an IRI, {token:?} found"), "The predicate of a triple should be an IRI, TOKEN found",
), ),
}, },
NQuadsState::ExpectedObject => match token { NQuadsState::ExpectedObject => match token {
@ -118,7 +118,7 @@ impl RuleRecognizer for NQuadsRecognizer {
} }
_ => self.error( _ => self.error(
errors, errors,
format!("The object of a triple should be an IRI, a blank node or a literal, {token:?} found"), "The object of a triple should be an IRI, a blank node or a literal, TOKEN found",
), ),
}, },
NQuadsState::ExpectLiteralAnnotationOrGraphNameOrDot { value } => match token { NQuadsState::ExpectLiteralAnnotationOrGraphNameOrDot { value } => match token {
@ -159,7 +159,7 @@ impl RuleRecognizer for NQuadsRecognizer {
.push(NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple); .push(NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple);
self self
} }
_ => self.error(errors, format!("A literal datatype must be an IRI, found {token:?}")), _ => self.error(errors, "A literal datatype must be an IRI, found TOKEN"),
}, },
NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple => { NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple => {
if self.stack.is_empty() { if self.stack.is_empty() {
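The TOKEN placeholder that replaces the format! calls lets recognizers emit static message templates instead of allocating a string on every error path; the parser splices in the offending source text later (see the toolkit/parser.rs hunk below). A sketch of that substitution, with a hypothetical token value:

fn main() {
    // Static template pushed by the recognizer above
    let template = "The predicate of a triple should be an IRI, TOKEN found";
    // Hypothetical value of lexer.last_token_source() for the input `"p"`
    let token_source = "\"p\"";
    assert_eq!(
        template.replace("TOKEN", token_source),
        "The predicate of a triple should be an IRI, \"p\" found"
    );
}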

@ -836,7 +836,7 @@ impl RuleRecognizer for N3Recognizer {
self.stack.push(N3State::FormulaContent); self.stack.push(N3State::FormulaContent);
self self
} }
_ => self.error(errors, format!("This is not a valid RDF value: {token:?}")) _ => self.error(errors, "TOKEN is not a valid RDF value")
} }
} }
N3State::PropertyListMiddle => match token { N3State::PropertyListMiddle => match token {
@ -950,7 +950,7 @@ impl RuleRecognizer for N3Recognizer {
Err(e) => self.error(errors, e) Err(e) => self.error(errors, e)
} }
_ => { _ => {
self.error(errors, format!("Expecting a datatype IRI after '^^, found {token:?}")).recognize_next(token, results, errors) self.error(errors, "Expecting a datatype IRI after '^^, found TOKEN").recognize_next(token, results, errors)
} }
} }
} }

@ -167,7 +167,7 @@ impl RuleRecognizer for TriGRecognizer {
self self
} }
_ => { _ => {
self.error(errors, format!("The token {token:?} is not a valid subject or graph name")) self.error(errors, "TOKEN is not a valid subject or graph name")
} }
} }
TriGState::WrappedGraphOrPredicateObjectList { term } => { TriGState::WrappedGraphOrPredicateObjectList { term } => {
@ -317,7 +317,7 @@ impl RuleRecognizer for TriGRecognizer {
self self
} }
_ => { _ => {
self.error(errors, format!("The token {token:?} is not a valid RDF subject")) self.error(errors, "TOKEN is not a valid RDF subject")
} }
}, },
TriGState::TriplesBlankNodePropertyListCurrent => if token == N3Token::Punctuation("]") { TriGState::TriplesBlankNodePropertyListCurrent => if token == N3Token::Punctuation("]") {
@ -350,7 +350,7 @@ impl RuleRecognizer for TriGRecognizer {
self self
} }
_ => { _ => {
self.error(errors, format!("The token {token:?} is not a valid graph name")) self.error(errors, "TOKEN is not a valid graph name")
} }
} }
TriGState::GraphNameAnonEnd => if token == N3Token::Punctuation("]") { TriGState::GraphNameAnonEnd => if token == N3Token::Punctuation("]") {
@ -456,7 +456,7 @@ impl RuleRecognizer for TriGRecognizer {
Err(e) => self.error(errors, e) Err(e) => self.error(errors, e)
} }
_ => { _ => {
self.error(errors, format!("The token {token:?} is not a valid predicate")) self.error(errors, "TOKEN is not a valid predicate")
} }
} }
// [12] object ::= iri | BlankNode | collection | blankNodePropertyList | literal | quotedTriple // [12] object ::= iri | BlankNode | collection | blankNodePropertyList | literal | quotedTriple
@ -536,7 +536,7 @@ impl RuleRecognizer for TriGRecognizer {
self self
} }
_ => { _ => {
self.error(errors, format!("This is not a valid RDF object: {token:?}")) self.error(errors, "TOKEN is not a valid RDF object")
} }
} }
@ -637,7 +637,7 @@ impl RuleRecognizer for TriGRecognizer {
Err(e) => self.error(errors, e) Err(e) => self.error(errors, e)
} }
_ => { _ => {
self.error(errors, format!("Expecting a datatype IRI after '^^, found {token:?}")).recognize_next(token, results, errors) self.error(errors, "Expecting a datatype IRI after ^^, found TOKEN").recognize_next(token, results, errors)
} }
} }
} }
@ -653,7 +653,7 @@ impl RuleRecognizer for TriGRecognizer {
if token == N3Token::Punctuation(">>") { if token == N3Token::Punctuation(">>") {
self self
} else { } else {
self.error(errors, format!("Expecting '>>' to close a quoted triple, found {token:?}")) self.error(errors, "Expecting '>>' to close a quoted triple, found TOKEN")
} }
} }
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
@ -670,7 +670,7 @@ impl RuleRecognizer for TriGRecognizer {
if token == N3Token::Punctuation(">>") { if token == N3Token::Punctuation(">>") {
self self
} else { } else {
self.error(errors, format!("Expecting '>>' to close a quoted triple, found {token:?}")) self.error(errors, "Expecting '>>' to close a quoted triple, found TOKEN")
} }
} }
// [28t] qtSubject ::= iri | BlankNode | quotedTriple // [28t] qtSubject ::= iri | BlankNode | quotedTriple
@ -703,7 +703,7 @@ impl RuleRecognizer for TriGRecognizer {
self.stack.push(TriGState::QuotedSubject); self.stack.push(TriGState::QuotedSubject);
self self
} }
_ => self.error(errors, format!("This is not a valid RDF quoted triple subject: {token:?}")) _ => self.error(errors, "TOKEN is not a valid RDF quoted triple subject: TOKEN")
} }
// [29t] qtObject ::= iri | BlankNode | literal | quotedTriple // [29t] qtObject ::= iri | BlankNode | literal | quotedTriple
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
@ -759,7 +759,7 @@ impl RuleRecognizer for TriGRecognizer {
self.stack.push(TriGState::QuotedSubject); self.stack.push(TriGState::QuotedSubject);
self self
} }
_ => self.error(errors, format!("This is not a valid RDF quoted triple object: {token:?}")) _ => self.error(errors, "TOKEN is not a valid RDF quoted triple object")
} }
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
TriGState::QuotedAnonEnd => if token == N3Token::Punctuation("]") { TriGState::QuotedAnonEnd => if token == N3Token::Punctuation("]") {

@ -0,0 +1,132 @@
use std::error::Error;
use std::ops::Range;
use std::{fmt, io};
/// A position in a text, i.e. a `line` number starting from 0, a `column` number starting from 0 (in number of code points), and a global file `offset` starting from 0 (in number of bytes).
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub struct TextPosition {
pub line: u64,
pub column: u64,
pub offset: u64,
}
/// An error in the syntax of the parsed file.
///
/// It is composed of a message and a byte range in the input.
#[derive(Debug)]
pub struct SyntaxError {
pub(super) location: Range<TextPosition>,
pub(super) message: String,
}
impl SyntaxError {
/// The location of the error inside of the file.
#[inline]
pub fn location(&self) -> Range<TextPosition> {
self.location.clone()
}
/// The error message.
#[inline]
pub fn message(&self) -> &str {
&self.message
}
}
impl fmt::Display for SyntaxError {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.location.start.offset + 1 >= self.location.end.offset {
write!(
f,
"Parser error at line {} column {}: {}",
self.location.start.line + 1,
self.location.start.column + 1,
self.message
)
} else if self.location.start.line == self.location.end.line {
write!(
f,
"Parser error between at line {} between columns {} and column {}: {}",
self.location.start.line + 1,
self.location.start.column + 1,
self.location.end.column + 1,
self.message
)
} else {
write!(
f,
"Parser error between line {} column {} and line {} column {}: {}",
self.location.start.line + 1,
self.location.start.column + 1,
self.location.end.line + 1,
self.location.end.column + 1,
self.message
)
}
}
}
impl Error for SyntaxError {}
impl From<SyntaxError> for io::Error {
#[inline]
fn from(error: SyntaxError) -> Self {
io::Error::new(io::ErrorKind::InvalidData, error)
}
}
/// A parsing error.
///
/// It is the union of [`SyntaxError`] and [`std::io::Error`].
#[derive(Debug)]
pub enum ParseError {
/// I/O error during parsing (file not found...).
Io(io::Error),
/// An error in the file syntax.
Syntax(SyntaxError),
}
impl fmt::Display for ParseError {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Io(e) => e.fmt(f),
Self::Syntax(e) => e.fmt(f),
}
}
}
impl Error for ParseError {
#[inline]
fn source(&self) -> Option<&(dyn Error + 'static)> {
Some(match self {
Self::Io(e) => e,
Self::Syntax(e) => e,
})
}
}
impl From<SyntaxError> for ParseError {
#[inline]
fn from(error: SyntaxError) -> Self {
Self::Syntax(error)
}
}
impl From<io::Error> for ParseError {
#[inline]
fn from(error: io::Error) -> Self {
Self::Io(error)
}
}
impl From<ParseError> for io::Error {
#[inline]
fn from(error: ParseError) -> Self {
match error {
ParseError::Syntax(e) => e.into(),
ParseError::Io(e) => e,
}
}
}
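An illustration of how the first two Display branches render, written as a crate-internal test because the fields are pub(super); the positions are made up:

#[test]
fn syntax_error_display() {
    let at = |line, column, offset| TextPosition { line, column, offset };
    let eof = SyntaxError {
        location: at(1, 2, 70)..at(1, 2, 70), // empty range: single-position branch
        message: "Unexpected end of file".into(),
    };
    assert_eq!(
        eof.to_string(),
        "Parser error at line 2 column 3: Unexpected end of file"
    );
    let bad_iri = SyntaxError {
        location: at(1, 23, 94)..at(1, 35, 106), // same line: column-range branch
        message: "Invalid IRI code point ' '".into(),
    };
    assert_eq!(
        bad_iri.to_string(),
        "Parser error at line 2 between columns 24 and 36: Invalid IRI code point ' '"
    );
}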

@ -1,9 +1,10 @@
use memchr::memchr2; use crate::toolkit::error::{SyntaxError, TextPosition};
use memchr::{memchr2, memchr2_iter};
use std::borrow::Cow;
use std::cmp::min; use std::cmp::min;
use std::error::Error;
use std::fmt;
use std::io::{self, Read}; use std::io::{self, Read};
use std::ops::{Range, RangeInclusive}; use std::ops::{Range, RangeInclusive};
use std::str;
#[cfg(feature = "async-tokio")] #[cfg(feature = "async-tokio")]
use tokio::io::{AsyncRead, AsyncReadExt}; use tokio::io::{AsyncRead, AsyncReadExt};
@ -22,14 +23,14 @@ pub trait TokenRecognizer {
} }
pub struct TokenRecognizerError { pub struct TokenRecognizerError {
pub position: Range<usize>, pub location: Range<usize>,
pub message: String, pub message: String,
} }
impl<S: Into<String>> From<(Range<usize>, S)> for TokenRecognizerError { impl<S: Into<String>> From<(Range<usize>, S)> for TokenRecognizerError {
fn from((position, message): (Range<usize>, S)) -> Self { fn from((location, message): (Range<usize>, S)) -> Self {
Self { Self {
position, location,
message: message.into(), message: message.into(),
} }
} }
@ -37,34 +38,37 @@ impl<S: Into<String>> From<(Range<usize>, S)> for TokenRecognizerError {
#[allow(clippy::range_plus_one)] #[allow(clippy::range_plus_one)]
impl<S: Into<String>> From<(RangeInclusive<usize>, S)> for TokenRecognizerError { impl<S: Into<String>> From<(RangeInclusive<usize>, S)> for TokenRecognizerError {
fn from((position, message): (RangeInclusive<usize>, S)) -> Self { fn from((location, message): (RangeInclusive<usize>, S)) -> Self {
(*position.start()..*position.end() + 1, message).into() (*location.start()..*location.end() + 1, message).into()
} }
} }
impl<S: Into<String>> From<(usize, S)> for TokenRecognizerError { impl<S: Into<String>> From<(usize, S)> for TokenRecognizerError {
fn from((position, message): (usize, S)) -> Self { fn from((location, message): (usize, S)) -> Self {
(position..=position, message).into() (location..=location, message).into()
} }
} }
pub struct TokenWithPosition<T> {
pub token: T,
pub position: Range<usize>,
}
pub struct Lexer<R: TokenRecognizer> { pub struct Lexer<R: TokenRecognizer> {
parser: R, parser: R,
data: Vec<u8>, data: Vec<u8>,
start: usize, position: Position,
previous_position: Position, // Lexer position before the last emitted token
is_ending: bool, is_ending: bool,
position: usize,
min_buffer_size: usize, min_buffer_size: usize,
max_buffer_size: usize, max_buffer_size: usize,
is_line_jump_whitespace: bool, is_line_jump_whitespace: bool,
line_comment_start: Option<&'static [u8]>, line_comment_start: Option<&'static [u8]>,
} }
#[derive(Clone, Copy)]
struct Position {
line_start_buffer_offset: usize,
buffer_offset: usize,
global_offset: u64,
global_line: u64,
}
impl<R: TokenRecognizer> Lexer<R> { impl<R: TokenRecognizer> Lexer<R> {
pub fn new( pub fn new(
parser: R, parser: R,
@ -76,9 +80,19 @@ impl<R: TokenRecognizer> Lexer<R> {
Self { Self {
parser, parser,
data: Vec::new(), data: Vec::new(),
start: 0, position: Position {
line_start_buffer_offset: 0,
buffer_offset: 0,
global_offset: 0,
global_line: 0,
},
previous_position: Position {
line_start_buffer_offset: 0,
buffer_offset: 0,
global_offset: 0,
global_line: 0,
},
is_ending: false, is_ending: false,
position: 0,
min_buffer_size, min_buffer_size,
max_buffer_size, max_buffer_size,
is_line_jump_whitespace, is_line_jump_whitespace,
@ -148,24 +162,43 @@ impl<R: TokenRecognizer> Lexer<R> {
Ok(()) Ok(())
} }
pub fn read_next( #[allow(clippy::unwrap_in_result)]
&mut self, pub fn read_next(&mut self, options: &R::Options) -> Option<Result<R::Token<'_>, SyntaxError>> {
options: &R::Options,
) -> Option<Result<TokenWithPosition<R::Token<'_>>, LexerError>> {
self.skip_whitespaces_and_comments()?; self.skip_whitespaces_and_comments()?;
let Some((consumed, result)) = self.previous_position = self.position;
self.parser let Some((consumed, result)) = self.parser.recognize_next_token(
.recognize_next_token(&self.data[self.start..], self.is_ending, options) &self.data[self.position.buffer_offset..],
else { self.is_ending,
options,
) else {
return if self.is_ending { return if self.is_ending {
if self.start == self.data.len() { if self.position.buffer_offset == self.data.len() {
None // We have finished None // We have finished
} else { } else {
let error = LexerError { let (new_line_jumps, new_line_start) =
position: self.position..self.position + (self.data.len() - self.start), Self::find_number_of_line_jumps_and_start_of_last_line(
&self.data[self.position.buffer_offset..],
);
if new_line_jumps > 0 {
self.position.line_start_buffer_offset =
self.position.buffer_offset + new_line_start;
}
self.position.global_offset +=
u64::try_from(self.data.len() - self.position.buffer_offset).unwrap();
self.position.buffer_offset = self.data.len();
self.position.global_line += new_line_jumps;
let new_position = TextPosition {
line: self.position.global_line,
column: Self::column_from_bytes(
&self.data[self.position.line_start_buffer_offset..],
),
offset: self.position.global_offset,
};
let error = SyntaxError {
location: new_position..new_position,
message: "Unexpected end of file".into(), message: "Unexpected end of file".into(),
}; };
self.start = self.data.len(); // We consume everything self.position.buffer_offset = self.data.len(); // We consume everything
Some(Err(error)) Some(Err(error))
} }
} else { } else {
@ -177,44 +210,119 @@ impl<R: TokenRecognizer> Lexer<R> {
"The lexer must consume at least one byte each time" "The lexer must consume at least one byte each time"
); );
debug_assert!( debug_assert!(
self.start + consumed <= self.data.len(), self.position.buffer_offset + consumed <= self.data.len(),
"The lexer tried to consumed {consumed} bytes but only {} bytes are readable", "The lexer tried to consumed {consumed} bytes but only {} bytes are readable",
self.data.len() - self.start self.data.len() - self.position.buffer_offset
);
let (new_line_jumps, new_line_start) =
Self::find_number_of_line_jumps_and_start_of_last_line(
&self.data[self.position.buffer_offset..self.position.buffer_offset + consumed],
); );
let old_position = self.position; if new_line_jumps > 0 {
self.start += consumed; self.position.line_start_buffer_offset = self.position.buffer_offset + new_line_start;
self.position += consumed; }
Some(match result { self.position.buffer_offset += consumed;
Ok(token) => Ok(TokenWithPosition { self.position.global_offset += u64::try_from(consumed).unwrap();
token, self.position.global_line += new_line_jumps;
position: old_position..self.position, Some(result.map_err(|e| SyntaxError {
}), location: self.location_from_buffer_offset_range(e.location),
Err(e) => Err(LexerError {
position: e.position.start + self.position..e.position.end + self.position,
message: e.message, message: e.message,
}), }))
}) }
pub fn location_from_buffer_offset_range(
&self,
offset_range: Range<usize>,
) -> Range<TextPosition> {
let start_offset = self.previous_position.buffer_offset + offset_range.start;
let (start_extra_line_jumps, start_line_start) =
Self::find_number_of_line_jumps_and_start_of_last_line(
&self.data[self.previous_position.buffer_offset..start_offset],
);
let start_line_start = if start_extra_line_jumps > 0 {
start_line_start + self.previous_position.buffer_offset
} else {
self.previous_position.line_start_buffer_offset
};
let end_offset = self.previous_position.buffer_offset + offset_range.end;
let (end_extra_line_jumps, end_line_start) =
Self::find_number_of_line_jumps_and_start_of_last_line(
&self.data[self.previous_position.buffer_offset..end_offset],
);
let end_line_start = if end_extra_line_jumps > 0 {
end_line_start + self.previous_position.buffer_offset
} else {
self.previous_position.line_start_buffer_offset
};
TextPosition {
line: self.previous_position.global_line + start_extra_line_jumps,
column: Self::column_from_bytes(&self.data[start_line_start..start_offset]),
offset: self.previous_position.global_offset
+ u64::try_from(offset_range.start).unwrap(),
}..TextPosition {
line: self.previous_position.global_line + end_extra_line_jumps,
column: Self::column_from_bytes(&self.data[end_line_start..end_offset]),
offset: self.previous_position.global_offset + u64::try_from(offset_range.end).unwrap(),
}
}
pub fn last_token_location(&self) -> Range<TextPosition> {
TextPosition {
line: self.previous_position.global_line,
column: Self::column_from_bytes(
&self.data[self.previous_position.line_start_buffer_offset
..self.previous_position.buffer_offset],
),
offset: self.previous_position.global_offset,
}..TextPosition {
line: self.position.global_line,
column: Self::column_from_bytes(
&self.data[self.position.line_start_buffer_offset..self.position.buffer_offset],
),
offset: self.position.global_offset,
}
}
pub fn last_token_source(&self) -> Cow<'_, str> {
String::from_utf8_lossy(
&self.data[self.previous_position.buffer_offset..self.position.buffer_offset],
)
} }
pub fn is_end(&self) -> bool { pub fn is_end(&self) -> bool {
self.is_ending && self.data.len() == self.start self.is_ending && self.data.len() == self.position.buffer_offset
} }
#[allow(clippy::unwrap_in_result)]
fn skip_whitespaces_and_comments(&mut self) -> Option<()> { fn skip_whitespaces_and_comments(&mut self) -> Option<()> {
loop { loop {
self.skip_whitespaces(); self.skip_whitespaces()?;
let buf = &self.data[self.start..]; let buf = &self.data[self.position.buffer_offset..];
if let Some(line_comment_start) = self.line_comment_start { if let Some(line_comment_start) = self.line_comment_start {
if buf.starts_with(line_comment_start) { if buf.starts_with(line_comment_start) {
// Comment // Comment
if let Some(end) = memchr2(b'\r', b'\n', &buf[line_comment_start.len()..]) { if let Some(end) = memchr2(b'\r', b'\n', &buf[line_comment_start.len()..]) {
self.start += end + line_comment_start.len(); let mut end_position = line_comment_start.len() + end;
self.position += end + line_comment_start.len(); if buf.get(end_position).copied() == Some(b'\r') {
// We look for \n for Windows line end style
if let Some(c) = buf.get(end_position + 1) {
if *c == b'\n' {
end_position += 1;
}
} else if !self.is_ending {
return None; // We need to read more
}
}
let comment_size = end_position + 1;
self.position.buffer_offset += comment_size;
self.position.line_start_buffer_offset = self.position.buffer_offset;
self.position.global_offset += u64::try_from(comment_size).unwrap();
self.position.global_line += 1;
continue; continue;
} }
if self.is_ending { if self.is_ending {
self.start = self.data.len(); // EOF self.position.buffer_offset = self.data.len(); // EOF
return Some(()); return Some(());
} }
return None; // We need more data return None; // We need more data
@ -224,80 +332,98 @@ impl<R: TokenRecognizer> Lexer<R> {
} }
} }
fn skip_whitespaces(&mut self) { fn skip_whitespaces(&mut self) -> Option<()> {
if self.is_line_jump_whitespace { if self.is_line_jump_whitespace {
for (i, c) in self.data[self.start..].iter().enumerate() { let mut i = self.position.buffer_offset;
if !matches!(c, b' ' | b'\t' | b'\r' | b'\n') { while let Some(c) = self.data.get(i) {
self.start += i; match c {
self.position += i; b' ' | b'\t' => {
return; self.position.buffer_offset += 1;
} self.position.global_offset += 1;
}
b'\r' => {
// We look for \n for Windows line end style
let mut increment: u8 = 1;
if let Some(c) = self.data.get(i + 1) {
if *c == b'\n' {
increment += 1;
i += 1;
}
} else if !self.is_ending {
return None; // We need to read more
}
self.position.buffer_offset += usize::from(increment);
self.position.line_start_buffer_offset = self.position.buffer_offset;
self.position.global_offset += u64::from(increment);
self.position.global_line += 1;
}
b'\n' => {
self.position.buffer_offset += 1;
self.position.line_start_buffer_offset = self.position.buffer_offset;
self.position.global_offset += 1;
self.position.global_line += 1;
}
_ => return Some(()),
}
i += 1;
//TODO: SIMD //TODO: SIMD
} }
} else { } else {
for (i, c) in self.data[self.start..].iter().enumerate() { for c in &self.data[self.position.buffer_offset..] {
if !matches!(c, b' ' | b'\t') { if matches!(c, b' ' | b'\t') {
self.start += i; self.position.buffer_offset += 1;
self.position += i; self.position.global_offset += 1;
return; } else {
return Some(());
} }
//TODO: SIMD //TODO: SIMD
} }
} }
// We only have whitespaces Some(())
self.position += self.data.len() - self.start;
self.start = self.data.len();
} }
fn shrink_data(&mut self) { fn shrink_data(&mut self) {
if self.start > 0 { if self.position.line_start_buffer_offset > 0 {
self.data.copy_within(self.start.., 0); self.data
self.data.truncate(self.data.len() - self.start); .copy_within(self.position.line_start_buffer_offset.., 0);
self.start = 0; self.data
.truncate(self.data.len() - self.position.line_start_buffer_offset);
self.position.buffer_offset -= self.position.line_start_buffer_offset;
self.position.line_start_buffer_offset = 0;
self.previous_position = self.position;
} }
} }
}
#[derive(Debug)]
pub struct LexerError {
position: Range<usize>,
message: String,
}
impl LexerError { fn find_number_of_line_jumps_and_start_of_last_line(bytes: &[u8]) -> (u64, usize) {
pub fn position(&self) -> Range<usize> { let mut num_of_jumps = 0;
self.position.clone() let mut last_jump_pos = 0;
let mut previous_cr = 0;
for pos in memchr2_iter(b'\r', b'\n', bytes) {
if bytes[pos] == b'\r' {
previous_cr = pos;
num_of_jumps += 1;
last_jump_pos = pos + 1;
} else {
if previous_cr < pos - 1 {
// We count \r\n as a single line jump
num_of_jumps += 1;
} }
last_jump_pos = pos + 1;
pub fn message(&self) -> &str {
&self.message
} }
pub fn into_message(self) -> String {
self.message
} }
} (num_of_jumps, last_jump_pos)
}
impl fmt::Display for LexerError { fn column_from_bytes(bytes: &[u8]) -> u64 {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match str::from_utf8(bytes) {
if self.position.start + 1 == self.position.end { Ok(s) => u64::try_from(s.chars().count()).unwrap(),
write!( Err(e) => {
f, if e.valid_up_to() == 0 {
"Lexer error at byte {}: {}", 0
self.position.start, self.message
)
} else { } else {
write!( Self::column_from_bytes(&bytes[..e.valid_up_to()])
f, }
"Lexer error between bytes {} and {}: {}",
self.position.start, self.position.end, self.message
)
} }
} }
}
impl Error for LexerError {
fn description(&self) -> &str {
self.message()
} }
} }
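A standalone restatement of column_from_bytes above, runnable outside the lexer, showing that columns count code points and that invalid UTF-8 (for example a read that stops mid-character) falls back to the longest valid prefix:

fn column_from_bytes(bytes: &[u8]) -> u64 {
    match std::str::from_utf8(bytes) {
        // Columns count code points, not bytes
        Ok(s) => u64::try_from(s.chars().count()).unwrap(),
        // On invalid UTF-8, count only the longest valid prefix; the
        // recursion terminates because the prefix shrinks to empty
        Err(e) => column_from_bytes(&bytes[..e.valid_up_to()]),
    }
}

fn main() {
    assert_eq!(column_from_bytes("fooé".as_bytes()), 4); // 5 bytes, 4 columns
    assert_eq!(column_from_bytes(b"foo"), 3);
}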

@ -2,12 +2,12 @@
//! //!
//! Provides the basic code to write plain Rust lexers and parsers able to read files chunk by chunk. //! Provides the basic code to write plain Rust lexers and parsers able to read files chunk by chunk.
mod error;
mod lexer; mod lexer;
mod parser; mod parser;
pub use self::lexer::{Lexer, LexerError, TokenRecognizer, TokenRecognizerError}; pub use self::error::{ParseError, SyntaxError, TextPosition};
pub use self::lexer::{Lexer, TokenRecognizer, TokenRecognizerError};
#[cfg(feature = "async-tokio")] #[cfg(feature = "async-tokio")]
pub use self::parser::FromTokioAsyncReadIterator; pub use self::parser::FromTokioAsyncReadIterator;
pub use self::parser::{ pub use self::parser::{FromReadIterator, Parser, RuleRecognizer, RuleRecognizerError};
FromReadIterator, ParseError, Parser, RuleRecognizer, RuleRecognizerError, SyntaxError,
};

@ -1,9 +1,6 @@
use crate::toolkit::lexer::TokenWithPosition; use crate::toolkit::error::{ParseError, SyntaxError};
use crate::toolkit::{Lexer, LexerError, TokenRecognizer}; use crate::toolkit::lexer::{Lexer, TokenRecognizer};
use std::error::Error;
use std::io::Read; use std::io::Read;
use std::ops::Range;
use std::{fmt, io};
#[cfg(feature = "async-tokio")] #[cfg(feature = "async-tokio")]
use tokio::io::AsyncRead; use tokio::io::AsyncRead;
@ -42,7 +39,6 @@ pub struct Parser<RR: RuleRecognizer> {
state: Option<RR>, state: Option<RR>,
results: Vec<RR::Output>, results: Vec<RR::Output>,
errors: Vec<RuleRecognizerError>, errors: Vec<RuleRecognizerError>,
position: Range<usize>,
default_lexer_options: <RR::TokenRecognizer as TokenRecognizer>::Options, default_lexer_options: <RR::TokenRecognizer as TokenRecognizer>::Options,
} }
@ -53,7 +49,6 @@ impl<RR: RuleRecognizer> Parser<RR> {
state: Some(recognizer), state: Some(recognizer),
results: vec![], results: vec![],
errors: vec![], errors: vec![],
position: 0..0,
default_lexer_options: <RR::TokenRecognizer as TokenRecognizer>::Options::default(), default_lexer_options: <RR::TokenRecognizer as TokenRecognizer>::Options::default(),
} }
} }
@ -76,8 +71,10 @@ impl<RR: RuleRecognizer> Parser<RR> {
loop { loop {
if let Some(error) = self.errors.pop() { if let Some(error) = self.errors.pop() {
return Some(Err(SyntaxError { return Some(Err(SyntaxError {
position: self.position.clone(), location: self.lexer.last_token_location(),
message: error.message, message: error
.message
.replace("TOKEN", &self.lexer.last_token_source()),
})); }));
} }
if let Some(result) = self.results.pop() { if let Some(result) = self.results.pop() {
@ -89,8 +86,7 @@ impl<RR: RuleRecognizer> Parser<RR> {
.map_or(&self.default_lexer_options, |p| p.lexer_options()), .map_or(&self.default_lexer_options, |p| p.lexer_options()),
) { ) {
match result { match result {
Ok(TokenWithPosition { token, position }) => { Ok(token) => {
self.position = position;
self.state = self.state.take().map(|state| { self.state = self.state.take().map(|state| {
state.recognize_next(token, &mut self.results, &mut self.errors) state.recognize_next(token, &mut self.results, &mut self.errors)
}); });
@ -98,7 +94,7 @@ impl<RR: RuleRecognizer> Parser<RR> {
} }
Err(e) => { Err(e) => {
self.state = self.state.take().map(RR::error_recovery_state); self.state = self.state.take().map(RR::error_recovery_state);
return Some(Err(e.into())); return Some(Err(e));
} }
} }
} }
@ -126,128 +122,6 @@ impl<RR: RuleRecognizer> Parser<RR> {
} }
} }
/// An error in the syntax of the parsed file.
///
/// It is composed of a message and a byte range in the input.
#[derive(Debug)]
pub struct SyntaxError {
position: Range<usize>,
message: String,
}
impl SyntaxError {
/// The invalid byte range in the input.
#[inline]
pub fn position(&self) -> Range<usize> {
self.position.clone()
}
/// The error message.
#[inline]
pub fn message(&self) -> &str {
&self.message
}
/// Converts this error to an error message.
#[inline]
pub fn into_message(self) -> String {
self.message
}
}
impl fmt::Display for SyntaxError {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.position.start + 1 == self.position.end {
write!(
f,
"Parser error at byte {}: {}",
self.position.start, self.message
)
} else {
write!(
f,
"Parser error between bytes {} and {}: {}",
self.position.start, self.position.end, self.message
)
}
}
}
impl Error for SyntaxError {}
impl From<SyntaxError> for io::Error {
#[inline]
fn from(error: SyntaxError) -> Self {
io::Error::new(io::ErrorKind::InvalidData, error)
}
}
impl From<LexerError> for SyntaxError {
#[inline]
fn from(e: LexerError) -> Self {
Self {
position: e.position(),
message: e.into_message(),
}
}
}
/// A parsing error.
///
/// It is the union of [`SyntaxError`] and [`std::io::Error`].
#[derive(Debug)]
pub enum ParseError {
/// I/O error during parsing (file not found...).
Io(io::Error),
/// An error in the file syntax.
Syntax(SyntaxError),
}
impl fmt::Display for ParseError {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Io(e) => e.fmt(f),
Self::Syntax(e) => e.fmt(f),
}
}
}
impl Error for ParseError {
#[inline]
fn source(&self) -> Option<&(dyn Error + 'static)> {
Some(match self {
Self::Io(e) => e,
Self::Syntax(e) => e,
})
}
}
impl From<SyntaxError> for ParseError {
#[inline]
fn from(error: SyntaxError) -> Self {
Self::Syntax(error)
}
}
impl From<io::Error> for ParseError {
#[inline]
fn from(error: io::Error) -> Self {
Self::Io(error)
}
}
impl From<ParseError> for io::Error {
#[inline]
fn from(error: ParseError) -> Self {
match error {
ParseError::Syntax(e) => e.into(),
ParseError::Io(e) => e,
}
}
}
pub struct FromReadIterator<R: Read, RR: RuleRecognizer> { pub struct FromReadIterator<R: Read, RR: RuleRecognizer> {
read: R, read: R,
parser: Parser<RR>, parser: Parser<RR>,

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o> .
<http://example.com/s> <http:// /p> <http://example.com/o> .

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o> . # foo
<http://example.com/s> <http:// /p> <http://example.com/o> .

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o> . # foo
<http://example.com/s> <http:// /p> <http://example.com/o> .

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o> .
<http://example.com/s> <http:// /p> <http://example.com/o> .

@ -0,0 +1 @@
Parser error at line 2 between columns 24 and 36: Invalid IRI code point ' '

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o> .
<http://example.com/s> "p" <http://example.com/o> .

@ -0,0 +1 @@
Parser error at line 2 between columns 24 and 27: "p" is not a valid predicate

@ -0,0 +1 @@
<http://example.com/s> <http://example.com/p> "fooé \a baré" .

@ -0,0 +1 @@
Parser error at line 1 between columns 53 and 55: Unexpected escape character '\a'

@ -0,0 +1,66 @@
@prefix mf: <http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix rdft: <http://www.w3.org/ns/rdftest#> .
<>
rdf:type mf:Manifest ;
rdfs:comment "Oxigraph parser error test cases" ;
mf:entries (
<#invalid_iri>
<#invalid_iri_crlf>
<#invalid_iri_comment>
<#invalid_iri_comment_crlf>
<#invalid_string_escape>
<#unexpected_eof>
<#unexpected_eof_crlf>
<#invalid_predicate>
) .
<#invalid_iri>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "bad IRI" ;
mf:action <invalid_iri.nt> ;
mf:result <invalid_iri_error.txt> .
<#invalid_iri_crlf>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "bad IRI" ;
mf:action <invalid_iri_crlf.nt> ;
mf:result <invalid_iri_error.txt> .
<#invalid_iri_comment>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "bad IRI" ;
mf:action <invalid_iri_comment.nt> ;
mf:result <invalid_iri_error.txt> .
<#invalid_iri_comment_crlf>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "bad IRI" ;
mf:action <invalid_iri_comment_crlf.nt> ;
mf:result <invalid_iri_error.txt> .
<#invalid_string_escape>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "bad string escape" ;
mf:action <invalid_string_escape.nt> ;
mf:result <invalid_string_escape_error.txt> .
<#unexpected_eof>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "unexpected end of file" ;
mf:action <unexpected_eof.nt> ;
mf:result <unexpected_eof_error.txt> .
<#unexpected_eof_crlf>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "unexpected end of file" ;
mf:action <unexpected_eof_crlf.nt> ;
mf:result <unexpected_eof_error.txt> .
<#invalid_predicate>
rdf:type rdft:TestTurtleNegativeSyntax ;
mf:name "invalid predicate" ;
mf:action <invalid_predicate.nt> ;
mf:result <invalid_predicate_error.txt> .

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o

@ -0,0 +1,2 @@
<http://example.com/s> <http://example.com/p> <http://example.com/o

@ -0,0 +1 @@
Parser error at line 2 column 3: Unexpected end of file

@ -1,8 +1,8 @@
use crate::evaluator::TestEvaluator; use crate::evaluator::TestEvaluator;
use crate::files::{guess_rdf_format, load_dataset, load_n3}; use crate::files::{guess_rdf_format, load_dataset, load_n3, read_file_to_string};
use crate::manifest::Test; use crate::manifest::Test;
use crate::report::dataset_diff; use crate::report::{dataset_diff, format_diff};
use anyhow::{anyhow, ensure, Result}; use anyhow::{anyhow, bail, ensure, Result};
use oxigraph::io::RdfFormat; use oxigraph::io::RdfFormat;
use oxigraph::model::{BlankNode, Dataset, Quad}; use oxigraph::model::{BlankNode, Dataset, Quad};
use oxttl::n3::{N3Quad, N3Term}; use oxttl::n3::{N3Quad, N3Term};
@ -116,10 +116,17 @@ fn evaluate_negative_syntax_test(test: &Test, format: RdfFormat) -> Result<()> {
.action .action
.as_deref() .as_deref()
.ok_or_else(|| anyhow!("No action found"))?; .ok_or_else(|| anyhow!("No action found"))?;
let Err(error) = load_dataset(action, format, false) else {
bail!("File parsed without errors even if it should not");
};
if let Some(result) = &test.result {
let expected = read_file_to_string(result)?;
ensure!( ensure!(
load_dataset(action, format, false).is_err(), expected == error.to_string(),
"File parsed without errors even if it should not" "Not expected error message:\n{}",
format_diff(&expected, &error.to_string(), "message")
); );
}
Ok(()) Ok(())
} }

@ -20,6 +20,14 @@ fn oxigraph_parser_recovery_testsuite() -> Result<()> {
) )
} }
#[test]
fn oxigraph_parser_error_testsuite() -> Result<()> {
check_testsuite(
"https://github.com/oxigraph/oxigraph/tests/parser-error/manifest.ttl",
&[],
)
}
#[test] #[test]
fn oxigraph_sparql_testsuite() -> Result<()> { fn oxigraph_sparql_testsuite() -> Result<()> {
check_testsuite( check_testsuite(
