From d280f7d2f76ab9cd4f5457f93badc96b8b2f4478 Mon Sep 17 00:00:00 2001
From: Tpt
Date: Mon, 25 Sep 2023 22:06:51 +0200
Subject: [PATCH] Adds basic location support to sparesults SyntaxError

---
 lib/oxrdfio/src/lib.rs      |   2 +-
 lib/sparesults/src/csv.rs   | 148 +++++++++++++++++++++++++-----------
 lib/sparesults/src/error.rs |  70 +++++++++++++++--
 lib/sparesults/src/lib.rs   |   2 +-
 lib/src/io/mod.rs           |   2 +-
 lib/src/sparql/results.rs   |   2 +-
 python/src/io.rs            |   8 +-
 python/src/sparql.rs        |  66 ++++++++++++----
 python/tests/test_io.py     |  27 +++++++
 9 files changed, 252 insertions(+), 75 deletions(-)

diff --git a/lib/oxrdfio/src/lib.rs b/lib/oxrdfio/src/lib.rs
index d31ec656..4b51dcf8 100644
--- a/lib/oxrdfio/src/lib.rs
+++ b/lib/oxrdfio/src/lib.rs
@@ -9,7 +9,7 @@ mod format;
 mod parser;
 mod serializer;
 
-pub use error::{ParseError, SyntaxError};
+pub use error::{ParseError, SyntaxError, TextPosition};
 pub use format::RdfFormat;
 #[cfg(feature = "async-tokio")]
 pub use parser::FromTokioAsyncReadQuadReader;
diff --git a/lib/sparesults/src/csv.rs b/lib/sparesults/src/csv.rs
index 6fbfeb40..e45f521f 100644
--- a/lib/sparesults/src/csv.rs
+++ b/lib/sparesults/src/csv.rs
@@ -1,10 +1,10 @@
 //! Implementation of [SPARQL 1.1 Query Results CSV and TSV Formats](https://www.w3.org/TR/sparql11-results-csv-tsv/)
 
-use crate::error::{ParseError, SyntaxError, SyntaxErrorKind};
+use crate::error::{ParseError, SyntaxError, SyntaxErrorKind, TextPosition};
 use memchr::memchr;
 use oxrdf::Variable;
 use oxrdf::{vocab::xsd, *};
-use std::io::{self, BufRead, Read, Write};
+use std::io::{self, Read, Write};
 use std::str::{self, FromStr};
 
 const MAX_BUFFER_SIZE: usize = 4096 * 4096;
@@ -283,12 +283,13 @@ pub enum TsvQueryResultsReader<R: Read> {
 }
 
 impl<R: Read> TsvQueryResultsReader<R> {
-    pub fn read(read: R) -> Result<Self, ParseError> {
-        let mut reader = LineReader::new(read);
+    pub fn read(mut read: R) -> Result<Self, ParseError> {
+        let mut reader = LineReader::new();
+        let mut buffer = Vec::new();
         // We read the header
         let line = reader
-            .next_line()?
+            .next_line(&mut buffer, &mut read)?
             .trim_matches(|c| matches!(c, ' ' | '\r' | '\n'));
         if line.eq_ignore_ascii_case("true") {
             return Ok(Self::Boolean(true));
         }
@@ -318,34 +319,65 @@ impl<R: Read> TsvQueryResultsReader<R> {
         let column_len = variables.len();
         Ok(Self::Solutions {
             variables,
-            solutions: TsvSolutionsReader { reader, column_len },
+            solutions: TsvSolutionsReader {
+                read,
+                buffer,
+                reader,
+                column_len,
+            },
         })
     }
 }
 
 pub struct TsvSolutionsReader<R: Read> {
-    reader: LineReader<R>,
+    read: R,
+    buffer: Vec<u8>,
+    reader: LineReader,
     column_len: usize,
 }
 
-impl<R: BufRead> TsvSolutionsReader<R> {
+impl<R: Read> TsvSolutionsReader<R> {
+    #[allow(clippy::unwrap_in_result)]
     pub fn read_next(&mut self) -> Result<Option<Vec<Option<Term>>>, ParseError> {
-        let line = self.reader.next_line()?;
+        let line = self.reader.next_line(&mut self.buffer, &mut self.read)?;
         if line.is_empty() {
             return Ok(None); // EOF
         }
         let elements = line
             .split('\t')
-            .map(|v| {
+            .enumerate()
+            .map(|(i, v)| {
                 let v = v.trim();
                 if v.is_empty() {
                     Ok(None)
                 } else {
-                    Ok(Some(Term::from_str(v).map_err(|e| SyntaxError {
-                        inner: SyntaxErrorKind::Term {
-                            error: e,
-                            term: v.into(),
-                        },
+                    Ok(Some(Term::from_str(v).map_err(|e| {
+                        let start_position_char = line
+                            .split('\t')
+                            .take(i)
+                            .map(|c| c.chars().count() + 1)
+                            .sum::<usize>();
+                        let start_position_bytes =
+                            line.split('\t').take(i).map(|c| c.len() + 1).sum::<usize>();
+                        SyntaxError {
+                            inner: SyntaxErrorKind::Term {
+                                error: e,
+                                term: v.into(),
+                                location: TextPosition {
+                                    line: self.reader.line_count - 1,
+                                    column: start_position_char.try_into().unwrap(),
+                                    offset: self.reader.last_line_start
+                                        + u64::try_from(start_position_bytes).unwrap(),
+                                }..TextPosition {
+                                    line: self.reader.line_count - 1,
+                                    column: (start_position_char + v.chars().count())
+                                        .try_into()
+                                        .unwrap(),
+                                    offset: self.reader.last_line_start
+                                        + u64::try_from(start_position_bytes + v.len()).unwrap(),
+                                },
+                            },
+                        }
                     })?))
                 }
             })
@@ -355,64 +387,88 @@ impl<R: BufRead> TsvSolutionsReader<R> {
         } else if self.column_len == 0 && elements == [None] {
             Ok(Some(Vec::new())) // Zero columns case
         } else {
-            Err(SyntaxError::msg(format!(
-                "This TSV files has {} columns but we found a row with {} columns: {}",
-                self.column_len,
-                elements.len(),
-                line
-            ))
+            Err(SyntaxError::located_message(
+                format!(
+                    "This TSV file has {} columns but we found a row on line {} with {} columns: {}",
+                    self.column_len,
+                    self.reader.line_count - 1,
+                    elements.len(),
+                    line
+                ),
+                TextPosition {
+                    line: self.reader.line_count - 1,
+                    column: 0,
+                    offset: self.reader.last_line_start,
+                }..TextPosition {
+                    line: self.reader.line_count - 1,
+                    column: line.chars().count().try_into().unwrap(),
+                    offset: self.reader.last_line_end,
+                },
+            )
             .into())
         }
     }
 }
 
-struct LineReader<R: Read> {
-    read: R,
-    buffer: Vec<u8>,
-    start: usize,
-    end: usize,
+struct LineReader {
+    buffer_start: usize,
+    buffer_end: usize,
+    line_count: u64,
+    last_line_start: u64,
+    last_line_end: u64,
 }
 
-impl<R: Read> LineReader<R> {
-    fn new(read: R) -> Self {
+impl LineReader {
+    fn new() -> Self {
         Self {
-            read,
-            buffer: Vec::new(),
-            start: 0,
-            end: 0,
+            buffer_start: 0,
+            buffer_end: 0,
+            line_count: 0,
+            last_line_start: 0,
+            last_line_end: 0,
         }
     }
 
-    fn next_line(&mut self) -> io::Result<&str> {
-        self.buffer.copy_within(self.start..self.end, 0);
-        self.end -= self.start;
-        self.start = 0;
+    #[allow(clippy::unwrap_in_result)]
+    fn next_line<'a>(
+        &mut self,
+        buffer: &'a mut Vec<u8>,
+        read: &mut impl Read,
+    ) -> io::Result<&'a str> {
         let line_end = loop {
-            if let Some(eol) = memchr(b'\n', &self.buffer[self.start..self.end]) {
-                break self.start + eol + 1;
+            if let Some(eol) = memchr(b'\n', &buffer[self.buffer_start..self.buffer_end]) {
+                break self.buffer_start + eol + 1;
+            }
+            if self.buffer_start > buffer.len() / 2 {
+                buffer.copy_within(self.buffer_start..self.buffer_end, 0);
+                self.buffer_end -= self.buffer_start;
+                self.buffer_start = 0;
             }
-            if self.end + 1024 > self.buffer.len() {
-                if self.end + 1024 > MAX_BUFFER_SIZE {
+            if self.buffer_end + 1024 > buffer.len() {
+                if self.buffer_end + 1024 > MAX_BUFFER_SIZE {
                     return Err(io::Error::new(
                         io::ErrorKind::OutOfMemory,
                         format!("Reached the buffer maximal size of {MAX_BUFFER_SIZE}"),
                     ));
                 }
-                self.buffer.resize(self.end + 1024, b'\0');
+                buffer.resize(self.buffer_end + 1024, b'\0');
             }
-            let read = self.read.read(&mut self.buffer[self.end..])?;
+            let read = read.read(&mut buffer[self.buffer_end..])?;
             if read == 0 {
-                break self.end;
+                break self.buffer_end;
             }
-            self.end += read;
+            self.buffer_end += read;
         };
-        let result = str::from_utf8(&self.buffer[self.start..line_end]).map_err(|e| {
+        let result = str::from_utf8(&buffer[self.buffer_start..line_end]).map_err(|e| {
             io::Error::new(
                 io::ErrorKind::InvalidData,
                 format!("Invalid UTF-8 in the TSV file: {e}"),
             )
         });
-        self.start = line_end;
+        self.line_count += 1;
+        self.last_line_start = self.last_line_end;
+        self.last_line_end += u64::try_from(line_end - self.buffer_start).unwrap();
+        self.buffer_start = line_end;
         result
     }
 }
diff --git a/lib/sparesults/src/error.rs b/lib/sparesults/src/error.rs
index 1096b393..b8fe2eff 100644
--- a/lib/sparesults/src/error.rs
+++ b/lib/sparesults/src/error.rs
@@ -1,5 +1,6 @@
 use oxrdf::TermParseError;
 use std::error::Error;
+use std::ops::Range;
 use std::sync::Arc;
 use std::{fmt, io};
 
@@ -90,8 +91,15 @@ pub struct SyntaxError {
 pub(crate) enum SyntaxErrorKind {
     Json(json_event_parser::SyntaxError),
     Xml(quick_xml::Error),
-    Term { error: TermParseError, term: String },
-    Msg { msg: String },
+    Term {
+        error: TermParseError,
+        term: String,
+        location: Range<TextPosition>,
+    },
+    Msg {
+        msg: String,
+        location: Option<Range<TextPosition>>,
+    },
 }
 
@@ -99,7 +107,45 @@ impl SyntaxError {
     #[inline]
     pub(crate) fn msg(msg: impl Into<String>) -> Self {
         Self {
-            inner: SyntaxErrorKind::Msg { msg: msg.into() },
+            inner: SyntaxErrorKind::Msg {
+                msg: msg.into(),
+                location: None,
+            },
         }
     }
+
+    /// Builds an error from a printable error message and a location
+    #[inline]
+    pub(crate) fn located_message(msg: impl Into<String>, location: Range<TextPosition>) -> Self {
+        Self {
+            inner: SyntaxErrorKind::Msg {
+                msg: msg.into(),
+                location: Some(location),
+            },
+        }
+    }
+
+    /// The location of the error inside of the file.
+    #[inline]
+    pub fn location(&self) -> Option<Range<TextPosition>> {
+        match &self.inner {
+            SyntaxErrorKind::Json(e) => {
+                let location = e.location();
+                Some(
+                    TextPosition {
+                        line: location.start.line,
+                        column: location.start.column,
+                        offset: location.start.offset,
+                    }..TextPosition {
+                        line: location.end.line,
+                        column: location.end.column,
+                        offset: location.end.offset,
+                    },
+                )
+            }
+            SyntaxErrorKind::Term { location, .. } => Some(location.clone()),
+            SyntaxErrorKind::Msg { location, .. } => location.clone(),
+            SyntaxErrorKind::Xml(_) => None,
+        }
+    }
 }
@@ -110,8 +156,12 @@ impl fmt::Display for SyntaxError {
         match &self.inner {
             SyntaxErrorKind::Json(e) => e.fmt(f),
             SyntaxErrorKind::Xml(e) => e.fmt(f),
-            SyntaxErrorKind::Term { error, term } => write!(f, "{error}: {term}"),
-            SyntaxErrorKind::Msg { msg } => f.write_str(msg),
+            SyntaxErrorKind::Term {
+                error,
+                term,
+                location,
+            } => write!(f, "{error} on '{term}' in line {}", location.start.line + 1),
+            SyntaxErrorKind::Msg { msg, .. } => f.write_str(msg),
         }
     }
 }
@@ -144,7 +194,7 @@ impl From<SyntaxError> for io::Error {
                 _ => Self::new(io::ErrorKind::InvalidData, error),
             },
             SyntaxErrorKind::Term { .. } => Self::new(io::ErrorKind::InvalidData, error),
-            SyntaxErrorKind::Msg { msg } => Self::new(io::ErrorKind::InvalidData, msg),
+            SyntaxErrorKind::Msg { msg, .. } => Self::new(io::ErrorKind::InvalidData, msg),
         }
     }
 }
@@ -156,3 +206,11 @@ impl From for SyntaxError {
         }
     }
 }
+
+/// A position in a text i.e. a `line` number starting from 0, a `column` number starting from 0 (in number of code points) and a global file `offset` starting from 0 (in number of bytes).
+#[derive(Eq, PartialEq, Debug, Clone, Copy)]
+pub struct TextPosition {
+    pub line: u64,
+    pub column: u64,
+    pub offset: u64,
+}
diff --git a/lib/sparesults/src/lib.rs b/lib/sparesults/src/lib.rs
index 95bcef93..301dc2c8 100644
--- a/lib/sparesults/src/lib.rs
+++ b/lib/sparesults/src/lib.rs
@@ -13,7 +13,7 @@ mod serializer;
 pub mod solution;
 mod xml;
 
-pub use crate::error::{ParseError, SyntaxError};
+pub use crate::error::{ParseError, SyntaxError, TextPosition};
 pub use crate::format::QueryResultsFormat;
 pub use crate::parser::{FromReadQueryResultsReader, FromReadSolutionsReader, QueryResultsParser};
 pub use crate::serializer::{QueryResultsSerializer, ToWriteSolutionsWriter};
diff --git a/lib/src/io/mod.rs b/lib/src/io/mod.rs
index 2e4dd2f4..5e1cb271 100644
--- a/lib/src/io/mod.rs
+++ b/lib/src/io/mod.rs
@@ -35,6 +35,6 @@ pub use self::read::{DatasetParser, GraphParser};
 #[allow(deprecated)]
 pub use self::write::{DatasetSerializer, GraphSerializer};
 pub use oxrdfio::{
-    FromReadQuadReader, ParseError, RdfFormat, RdfParser, RdfSerializer, SyntaxError,
+    FromReadQuadReader, ParseError, RdfFormat, RdfParser, RdfSerializer, SyntaxError, TextPosition,
     ToWriteQuadWriter,
 };
diff --git a/lib/src/sparql/results.rs b/lib/src/sparql/results.rs
index 0716752b..bbafe70d 100644
--- a/lib/src/sparql/results.rs
+++ b/lib/src/sparql/results.rs
@@ -43,5 +43,5 @@ pub use sparesults::{
     FromReadQueryResultsReader, FromReadSolutionsReader, ParseError, QueryResultsFormat,
-    QueryResultsParser, QueryResultsSerializer, SyntaxError,
+    QueryResultsParser, QueryResultsSerializer, SyntaxError, TextPosition,
 };
diff --git a/python/src/io.rs b/python/src/io.rs
index e8da9a8c..4e04a4da 100644
--- a/python/src/io.rs
+++ b/python/src/io.rs
@@ -401,7 +401,7 @@ pub fn map_parse_error(error: ParseError, file_path: Option<PathBuf>) -> PyErr {
     match error {
         ParseError::Syntax(error) => {
             // Python 3.9 does not support end line and end column
-            if python_version() >= (3, 10, 0) {
+            if python_version() >= (3, 10) {
                 let params = if let Some(location) = error.location() {
                     (
                         file_path,
@@ -458,12 +458,12 @@ pub fn allow_threads_unsafe<T>(_py: Python<'_>, f: impl FnOnce() -> T) -> T {
     f()
 }
 
-fn python_version() -> (u8, u8, u8) {
-    static VERSION: OnceLock<(u8, u8, u8)> = OnceLock::new();
+pub fn python_version() -> (u8, u8) {
+    static VERSION: OnceLock<(u8, u8)> = OnceLock::new();
     *VERSION.get_or_init(|| {
         Python::with_gil(|py| {
             let v = py.version_info();
-            (v.major, v.minor, v.patch)
+            (v.major, v.minor)
         })
     })
 }
diff --git a/python/src/sparql.rs b/python/src/sparql.rs
index 1316eecf..b30d27fe 100644
--- a/python/src/sparql.rs
+++ b/python/src/sparql.rs
@@ -190,7 +190,10 @@ pub struct PyQuerySolutions {
 }
 enum PyQuerySolutionsVariant {
     Query(QuerySolutionIter),
-    Reader(FromReadSolutionsReader<BufReader<PyReadable>>),
+    Reader {
+        iter: FromReadSolutionsReader<BufReader<PyReadable>>,
+        file_path: Option<PathBuf>,
+    },
 }
 
 #[pymethods]
@@ -207,8 +210,8 @@ impl PyQuerySolutions {
             PyQuerySolutionsVariant::Query(inner) => {
                 inner.variables().iter().map(|v| v.clone().into()).collect()
             }
-            PyQuerySolutionsVariant::Reader(inner) => {
-                inner.variables().iter().map(|v| v.clone().into()).collect()
+            PyQuerySolutionsVariant::Reader { iter, .. } => {
+                iter.variables().iter().map(|v| v.clone().into()).collect()
             }
         }
     }
@@ -252,7 +255,9 @@ impl PyQuerySolutions {
             output,
             match &self.inner {
                 PyQuerySolutionsVariant::Query(inner) => inner.variables().to_vec(),
-                PyQuerySolutionsVariant::Reader(inner) => inner.variables().to_vec(),
+                PyQuerySolutionsVariant::Reader { iter, .. } => {
+                    iter.variables().to_vec()
+                }
             },
         )
         .map_err(map_io_err)?;
@@ -264,10 +269,12 @@ impl PyQuerySolutions {
                     .map_err(map_io_err)?;
                 }
             }
-            PyQuerySolutionsVariant::Reader(inner) => {
-                for solution in inner {
+            PyQuerySolutionsVariant::Reader { iter, file_path } => {
+                for solution in iter {
                     writer
-                        .write(&solution.map_err(map_query_results_parse_error)?)
+                        .write(&solution.map_err(|e| {
+                            map_query_results_parse_error(e, file_path.clone())
+                        })?)
                         .map_err(map_io_err)?;
                 }
             }
@@ -290,10 +297,10 @@ impl PyQuerySolutions {
             PyQuerySolutionsVariant::Query(inner) => allow_threads_unsafe(py, || {
                 inner.next().transpose().map_err(map_evaluation_error)
            }),
-            PyQuerySolutionsVariant::Reader(inner) => inner
+            PyQuerySolutionsVariant::Reader { iter, file_path } => iter
                .next()
                .transpose()
-                .map_err(map_query_results_parse_error),
+                .map_err(|e| map_query_results_parse_error(e, file_path.clone())),
         }?
         .map(move |inner| PyQuerySolution { inner }))
     }
@@ -498,10 +505,10 @@ pub fn parse_query_results(
     };
     let results = QueryResultsParser::from_format(format)
         .parse_read(BufReader::new(input))
-        .map_err(map_query_results_parse_error)?;
+        .map_err(|e| map_query_results_parse_error(e, file_path.clone()))?;
     Ok(match results {
-        FromReadQueryResultsReader::Solutions(inner) => PyQuerySolutions {
-            inner: PyQuerySolutionsVariant::Reader(inner),
+        FromReadQueryResultsReader::Solutions(iter) => PyQuerySolutions {
+            inner: PyQuerySolutionsVariant::Reader { iter, file_path },
         }
         .into_py(py),
         FromReadQueryResultsReader::Boolean(inner) => PyQueryBoolean { inner }.into_py(py),
@@ -513,7 +520,7 @@ pub fn map_evaluation_error(error: EvaluationError) -> PyErr {
         EvaluationError::Parsing(error) => PySyntaxError::new_err(error.to_string()),
         EvaluationError::Storage(error) => map_storage_error(error),
         EvaluationError::GraphParsing(error) => map_parse_error(error, None),
-        EvaluationError::ResultsParsing(error) => map_query_results_parse_error(error),
+        EvaluationError::ResultsParsing(error) => map_query_results_parse_error(error, None),
         EvaluationError::ResultsSerialization(error) => map_io_err(error),
         EvaluationError::Service(error) => match error.downcast() {
             Ok(error) => map_io_err(*error),
@@ -523,9 +530,38 @@ pub fn map_evaluation_error(error: EvaluationError) -> PyErr {
     }
 }
 
-pub fn map_query_results_parse_error(error: ParseError) -> PyErr {
+pub fn map_query_results_parse_error(error: ParseError, file_path: Option<PathBuf>) -> PyErr {
     match error {
-        ParseError::Syntax(error) => PySyntaxError::new_err(error.to_string()),
+        ParseError::Syntax(error) => {
+            // Python 3.9 does not support end line and end column
+            if python_version() >= (3, 10) {
+                let params = if let Some(location) = error.location() {
+                    (
+                        file_path,
+                        Some(location.start.line + 1),
+                        Some(location.start.column + 1),
+                        None::<Vec<u8>>,
+                        Some(location.end.line + 1),
+                        Some(location.end.column + 1),
+                    )
+                } else {
+                    (None, None, None, None, None, None)
+                };
+                PySyntaxError::new_err((error.to_string(), params))
+            } else {
+                let params = if let Some(location) = error.location() {
+                    (
+                        file_path,
+                        Some(location.start.line + 1),
+                        Some(location.start.column + 1),
+                        None::<Vec<u8>>,
+                    )
+                } else {
+                    (None, None, None, None)
+                };
+                PySyntaxError::new_err((error.to_string(), params))
+            }
+        }
         ParseError::Io(error) => map_io_err(error),
     }
 }
diff --git a/python/tests/test_io.py b/python/tests/test_io.py
index 9d4a1326..c761148f 100644
--- a/python/tests/test_io.py
+++ b/python/tests/test_io.py
@@ -205,3 +205,30 @@ class TestParseQuerySolutions(unittest.TestCase):
     def test_parse_io_error(self) -> None:
         with self.assertRaises(UnsupportedOperation) as _, TemporaryFile("wb") as fp:
             parse_query_results(fp, "srx")
+
+    def test_parse_syntax_error_json(self) -> None:
+        with NamedTemporaryFile() as fp:
+            fp.write(b"{]")
+            fp.flush()
+            with self.assertRaises(SyntaxError) as ctx:
+                list(parse_query_results(fp.name, "srj"))  # type: ignore[arg-type]
+            self.assertEqual(ctx.exception.filename, fp.name)
+            self.assertEqual(ctx.exception.lineno, 1)
+            self.assertEqual(ctx.exception.offset, 2)
+            if sys.version_info >= (3, 10):
+                self.assertEqual(ctx.exception.end_lineno, 1)
+                self.assertEqual(ctx.exception.end_offset, 3)
+
+    def test_parse_syntax_error_tsv(self) -> None:
+        with NamedTemporaryFile() as fp:
+            fp.write(b"?a\t?test\n")
+            fp.write(b"1\t<foo >\n")
+            fp.flush()
+            with self.assertRaises(SyntaxError) as ctx:
+                list(parse_query_results(fp.name, "tsv"))  # type: ignore[arg-type]
+            self.assertEqual(ctx.exception.filename, fp.name)
+            self.assertEqual(ctx.exception.lineno, 2)
+            self.assertEqual(ctx.exception.offset, 3)
+            if sys.version_info >= (3, 10):
+                self.assertEqual(ctx.exception.end_lineno, 2)
+                self.assertEqual(ctx.exception.end_offset, 9)
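
Usage note (not part of the patch): the sketch below shows how a sparesults caller might surface the new SyntaxError::location() information introduced above. It only leans on items visible in this diff (QueryResultsParser::parse_read, FromReadQueryResultsReader, ParseError::Syntax, TextPosition); the QueryResultsFormat::Tsv variant name and the exact error-reporting text are assumptions.

// Hedged sketch, assuming the sparesults API shown in the diff above.
use sparesults::{FromReadQueryResultsReader, ParseError, QueryResultsFormat, QueryResultsParser};

fn main() {
    // The second row contains an invalid term: the IRI <foo > has a space inside it.
    let tsv: &[u8] = b"?a\t?b\n<http://example.com/s>\t<foo >\n";
    let parser = QueryResultsParser::from_format(QueryResultsFormat::Tsv);
    match parser.parse_read(tsv) {
        Ok(FromReadQueryResultsReader::Solutions(solutions)) => {
            for solution in solutions {
                if let Err(ParseError::Syntax(e)) = solution {
                    // location() returns Option<Range<TextPosition>>; line and column are
                    // 0-based, offset is a byte offset from the start of the document.
                    if let Some(location) = e.location() {
                        eprintln!(
                            "syntax error at line {}, column {} (byte offset {}): {e}",
                            location.start.line + 1,
                            location.start.column + 1,
                            location.start.offset
                        );
                    } else {
                        eprintln!("syntax error: {e}");
                    }
                }
            }
        }
        Ok(FromReadQueryResultsReader::Boolean(value)) => println!("boolean result: {value}"),
        Err(e) => eprintln!("failed to parse the header: {e}"),
    }
}

The Python bindings in this patch follow the same pattern: the Range<TextPosition> is mapped onto SyntaxError's lineno/offset/end_lineno/end_offset attributes, as exercised by the new tests above.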