Adds basic location support to sparesults SyntaxError

pull/643/head
Tpt 1 year ago committed by Thomas Tanon
parent dbb39d867a
commit d280f7d2f7
  1. 2
      lib/oxrdfio/src/lib.rs
  2. 134
      lib/sparesults/src/csv.rs
  3. 70
      lib/sparesults/src/error.rs
  4. 2
      lib/sparesults/src/lib.rs
  5. 2
      lib/src/io/mod.rs
  6. 2
      lib/src/sparql/results.rs
  7. 8
      python/src/io.rs
  8. 66
      python/src/sparql.rs
  9. 27
      python/tests/test_io.py

@ -9,7 +9,7 @@ mod format;
mod parser; mod parser;
mod serializer; mod serializer;
pub use error::{ParseError, SyntaxError}; pub use error::{ParseError, SyntaxError, TextPosition};
pub use format::RdfFormat; pub use format::RdfFormat;
#[cfg(feature = "async-tokio")] #[cfg(feature = "async-tokio")]
pub use parser::FromTokioAsyncReadQuadReader; pub use parser::FromTokioAsyncReadQuadReader;

@ -1,10 +1,10 @@
//! Implementation of [SPARQL 1.1 Query Results CSV and TSV Formats](https://www.w3.org/TR/sparql11-results-csv-tsv/) //! Implementation of [SPARQL 1.1 Query Results CSV and TSV Formats](https://www.w3.org/TR/sparql11-results-csv-tsv/)
use crate::error::{ParseError, SyntaxError, SyntaxErrorKind}; use crate::error::{ParseError, SyntaxError, SyntaxErrorKind, TextPosition};
use memchr::memchr; use memchr::memchr;
use oxrdf::Variable; use oxrdf::Variable;
use oxrdf::{vocab::xsd, *}; use oxrdf::{vocab::xsd, *};
use std::io::{self, BufRead, Read, Write}; use std::io::{self, Read, Write};
use std::str::{self, FromStr}; use std::str::{self, FromStr};
const MAX_BUFFER_SIZE: usize = 4096 * 4096; const MAX_BUFFER_SIZE: usize = 4096 * 4096;
@ -283,12 +283,13 @@ pub enum TsvQueryResultsReader<R: Read> {
} }
impl<R: Read> TsvQueryResultsReader<R> { impl<R: Read> TsvQueryResultsReader<R> {
pub fn read(read: R) -> Result<Self, ParseError> { pub fn read(mut read: R) -> Result<Self, ParseError> {
let mut reader = LineReader::new(read); let mut reader = LineReader::new();
let mut buffer = Vec::new();
// We read the header // We read the header
let line = reader let line = reader
.next_line()? .next_line(&mut buffer, &mut read)?
.trim_matches(|c| matches!(c, ' ' | '\r' | '\n')); .trim_matches(|c| matches!(c, ' ' | '\r' | '\n'));
if line.eq_ignore_ascii_case("true") { if line.eq_ignore_ascii_case("true") {
return Ok(Self::Boolean(true)); return Ok(Self::Boolean(true));
@ -318,34 +319,65 @@ impl<R: Read> TsvQueryResultsReader<R> {
let column_len = variables.len(); let column_len = variables.len();
Ok(Self::Solutions { Ok(Self::Solutions {
variables, variables,
solutions: TsvSolutionsReader { reader, column_len }, solutions: TsvSolutionsReader {
read,
buffer,
reader,
column_len,
},
}) })
} }
} }
pub struct TsvSolutionsReader<R: Read> { pub struct TsvSolutionsReader<R: Read> {
reader: LineReader<R>, read: R,
buffer: Vec<u8>,
reader: LineReader,
column_len: usize, column_len: usize,
} }
impl<R: BufRead> TsvSolutionsReader<R> { impl<R: Read> TsvSolutionsReader<R> {
#[allow(clippy::unwrap_in_result)]
pub fn read_next(&mut self) -> Result<Option<Vec<Option<Term>>>, ParseError> { pub fn read_next(&mut self) -> Result<Option<Vec<Option<Term>>>, ParseError> {
let line = self.reader.next_line()?; let line = self.reader.next_line(&mut self.buffer, &mut self.read)?;
if line.is_empty() { if line.is_empty() {
return Ok(None); // EOF return Ok(None); // EOF
} }
let elements = line let elements = line
.split('\t') .split('\t')
.map(|v| { .enumerate()
.map(|(i, v)| {
let v = v.trim(); let v = v.trim();
if v.is_empty() { if v.is_empty() {
Ok(None) Ok(None)
} else { } else {
Ok(Some(Term::from_str(v).map_err(|e| SyntaxError { Ok(Some(Term::from_str(v).map_err(|e| {
let start_position_char = line
.split('\t')
.take(i)
.map(|c| c.chars().count() + 1)
.sum::<usize>();
let start_position_bytes =
line.split('\t').take(i).map(|c| c.len() + 1).sum::<usize>();
SyntaxError {
inner: SyntaxErrorKind::Term { inner: SyntaxErrorKind::Term {
error: e, error: e,
term: v.into(), term: v.into(),
location: TextPosition {
line: self.reader.line_count - 1,
column: start_position_char.try_into().unwrap(),
offset: self.reader.last_line_start
+ u64::try_from(start_position_bytes).unwrap(),
}..TextPosition {
line: self.reader.line_count - 1,
column: (start_position_char + v.chars().count())
.try_into()
.unwrap(),
offset: self.reader.last_line_start
+ u64::try_from(start_position_bytes + v.len()).unwrap(),
},
}, },
}
})?)) })?))
} }
}) })
@ -355,64 +387,88 @@ impl<R: BufRead> TsvSolutionsReader<R> {
} else if self.column_len == 0 && elements == [None] { } else if self.column_len == 0 && elements == [None] {
Ok(Some(Vec::new())) // Zero columns case Ok(Some(Vec::new())) // Zero columns case
} else { } else {
Err(SyntaxError::msg(format!( Err(SyntaxError::located_message(
"This TSV files has {} columns but we found a row with {} columns: {}", format!(
"This TSV files has {} columns but we found a row on line {} with {} columns: {}",
self.column_len, self.column_len,
self.reader.line_count - 1,
elements.len(), elements.len(),
line line
)) ),
TextPosition {
line: self.reader.line_count - 1,
column: 0,
offset: self.reader.last_line_start,
}..TextPosition {
line: self.reader.line_count - 1,
column: line.chars().count().try_into().unwrap(),
offset: self.reader.last_line_end,
},
)
.into()) .into())
} }
} }
} }
struct LineReader<R: Read> { struct LineReader {
read: R, buffer_start: usize,
buffer: Vec<u8>, buffer_end: usize,
start: usize, line_count: u64,
end: usize, last_line_start: u64,
last_line_end: u64,
} }
impl<R: Read> LineReader<R> { impl LineReader {
fn new(read: R) -> Self { fn new() -> Self {
Self { Self {
read, buffer_start: 0,
buffer: Vec::new(), buffer_end: 0,
start: 0, line_count: 0,
end: 0, last_line_start: 0,
last_line_end: 0,
} }
} }
fn next_line(&mut self) -> io::Result<&str> { #[allow(clippy::unwrap_in_result)]
self.buffer.copy_within(self.start..self.end, 0); fn next_line<'a>(
self.end -= self.start; &mut self,
self.start = 0; buffer: &'a mut Vec<u8>,
read: &mut impl Read,
) -> io::Result<&'a str> {
let line_end = loop { let line_end = loop {
if let Some(eol) = memchr(b'\n', &self.buffer[self.start..self.end]) { if let Some(eol) = memchr(b'\n', &buffer[self.buffer_start..self.buffer_end]) {
break self.start + eol + 1; break self.buffer_start + eol + 1;
}
if self.buffer_start > buffer.len() / 2 {
buffer.copy_within(self.buffer_start..self.buffer_end, 0);
self.buffer_end -= self.buffer_start;
self.buffer_start = 0;
} }
if self.end + 1024 > self.buffer.len() { if self.buffer_end + 1024 > buffer.len() {
if self.end + 1024 > MAX_BUFFER_SIZE { if self.buffer_end + 1024 > MAX_BUFFER_SIZE {
return Err(io::Error::new( return Err(io::Error::new(
io::ErrorKind::OutOfMemory, io::ErrorKind::OutOfMemory,
format!("Reached the buffer maximal size of {MAX_BUFFER_SIZE}"), format!("Reached the buffer maximal size of {MAX_BUFFER_SIZE}"),
)); ));
} }
self.buffer.resize(self.end + 1024, b'\0'); buffer.resize(self.buffer_end + 1024, b'\0');
} }
let read = self.read.read(&mut self.buffer[self.end..])?; let read = read.read(&mut buffer[self.buffer_end..])?;
if read == 0 { if read == 0 {
break self.end; break self.buffer_end;
} }
self.end += read; self.buffer_end += read;
}; };
let result = str::from_utf8(&self.buffer[self.start..line_end]).map_err(|e| { let result = str::from_utf8(&buffer[self.buffer_start..line_end]).map_err(|e| {
io::Error::new( io::Error::new(
io::ErrorKind::InvalidData, io::ErrorKind::InvalidData,
format!("Invalid UTF-8 in the TSV file: {e}"), format!("Invalid UTF-8 in the TSV file: {e}"),
) )
}); });
self.start = line_end; self.line_count += 1;
self.last_line_start = self.last_line_end;
self.last_line_end += u64::try_from(line_end - self.buffer_start).unwrap();
self.buffer_start = line_end;
result result
} }
} }

@ -1,5 +1,6 @@
use oxrdf::TermParseError; use oxrdf::TermParseError;
use std::error::Error; use std::error::Error;
use std::ops::Range;
use std::sync::Arc; use std::sync::Arc;
use std::{fmt, io}; use std::{fmt, io};
@ -90,8 +91,15 @@ pub struct SyntaxError {
pub(crate) enum SyntaxErrorKind { pub(crate) enum SyntaxErrorKind {
Json(json_event_parser::SyntaxError), Json(json_event_parser::SyntaxError),
Xml(quick_xml::Error), Xml(quick_xml::Error),
Term { error: TermParseError, term: String }, Term {
Msg { msg: String }, error: TermParseError,
term: String,
location: Range<TextPosition>,
},
Msg {
msg: String,
location: Option<Range<TextPosition>>,
},
} }
impl SyntaxError { impl SyntaxError {
@ -99,7 +107,45 @@ impl SyntaxError {
#[inline] #[inline]
pub(crate) fn msg(msg: impl Into<String>) -> Self { pub(crate) fn msg(msg: impl Into<String>) -> Self {
Self { Self {
inner: SyntaxErrorKind::Msg { msg: msg.into() }, inner: SyntaxErrorKind::Msg {
msg: msg.into(),
location: None,
},
}
}
/// Builds an error from a printable error message and a location
#[inline]
pub(crate) fn located_message(msg: impl Into<String>, location: Range<TextPosition>) -> Self {
Self {
inner: SyntaxErrorKind::Msg {
msg: msg.into(),
location: Some(location),
},
}
}
/// The location of the error inside of the file.
#[inline]
pub fn location(&self) -> Option<Range<TextPosition>> {
match &self.inner {
SyntaxErrorKind::Json(e) => {
let location = e.location();
Some(
TextPosition {
line: location.start.line,
column: location.start.column,
offset: location.start.offset,
}..TextPosition {
line: location.end.line,
column: location.end.column,
offset: location.end.offset,
},
)
}
SyntaxErrorKind::Term { location, .. } => Some(location.clone()),
SyntaxErrorKind::Msg { location, .. } => location.clone(),
SyntaxErrorKind::Xml(_) => None,
} }
} }
} }
@ -110,8 +156,12 @@ impl fmt::Display for SyntaxError {
match &self.inner { match &self.inner {
SyntaxErrorKind::Json(e) => e.fmt(f), SyntaxErrorKind::Json(e) => e.fmt(f),
SyntaxErrorKind::Xml(e) => e.fmt(f), SyntaxErrorKind::Xml(e) => e.fmt(f),
SyntaxErrorKind::Term { error, term } => write!(f, "{error}: {term}"), SyntaxErrorKind::Term {
SyntaxErrorKind::Msg { msg } => f.write_str(msg), error,
term,
location,
} => write!(f, "{error} on '{term}' in line {}", location.start.line + 1),
SyntaxErrorKind::Msg { msg, .. } => f.write_str(msg),
} }
} }
} }
@ -144,7 +194,7 @@ impl From<SyntaxError> for io::Error {
_ => Self::new(io::ErrorKind::InvalidData, error), _ => Self::new(io::ErrorKind::InvalidData, error),
}, },
SyntaxErrorKind::Term { .. } => Self::new(io::ErrorKind::InvalidData, error), SyntaxErrorKind::Term { .. } => Self::new(io::ErrorKind::InvalidData, error),
SyntaxErrorKind::Msg { msg } => Self::new(io::ErrorKind::InvalidData, msg), SyntaxErrorKind::Msg { msg, .. } => Self::new(io::ErrorKind::InvalidData, msg),
} }
} }
} }
@ -156,3 +206,11 @@ impl From<json_event_parser::SyntaxError> for SyntaxError {
} }
} }
} }
/// A position in a text i.e. a `line` number starting from 0, a `column` number starting from 0 (in number of code points) and a global file `offset` starting from 0 (in number of bytes).
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub struct TextPosition {
pub line: u64,
pub column: u64,
pub offset: u64,
}

@ -13,7 +13,7 @@ mod serializer;
pub mod solution; pub mod solution;
mod xml; mod xml;
pub use crate::error::{ParseError, SyntaxError}; pub use crate::error::{ParseError, SyntaxError, TextPosition};
pub use crate::format::QueryResultsFormat; pub use crate::format::QueryResultsFormat;
pub use crate::parser::{FromReadQueryResultsReader, FromReadSolutionsReader, QueryResultsParser}; pub use crate::parser::{FromReadQueryResultsReader, FromReadSolutionsReader, QueryResultsParser};
pub use crate::serializer::{QueryResultsSerializer, ToWriteSolutionsWriter}; pub use crate::serializer::{QueryResultsSerializer, ToWriteSolutionsWriter};

@ -35,6 +35,6 @@ pub use self::read::{DatasetParser, GraphParser};
#[allow(deprecated)] #[allow(deprecated)]
pub use self::write::{DatasetSerializer, GraphSerializer}; pub use self::write::{DatasetSerializer, GraphSerializer};
pub use oxrdfio::{ pub use oxrdfio::{
FromReadQuadReader, ParseError, RdfFormat, RdfParser, RdfSerializer, SyntaxError, FromReadQuadReader, ParseError, RdfFormat, RdfParser, RdfSerializer, SyntaxError, TextPosition,
ToWriteQuadWriter, ToWriteQuadWriter,
}; };

@ -43,5 +43,5 @@
pub use sparesults::{ pub use sparesults::{
FromReadQueryResultsReader, FromReadSolutionsReader, ParseError, QueryResultsFormat, FromReadQueryResultsReader, FromReadSolutionsReader, ParseError, QueryResultsFormat,
QueryResultsParser, QueryResultsSerializer, SyntaxError, QueryResultsParser, QueryResultsSerializer, SyntaxError, TextPosition,
}; };

@ -401,7 +401,7 @@ pub fn map_parse_error(error: ParseError, file_path: Option<PathBuf>) -> PyErr {
match error { match error {
ParseError::Syntax(error) => { ParseError::Syntax(error) => {
// Python 3.9 does not support end line and end column // Python 3.9 does not support end line and end column
if python_version() >= (3, 10, 0) { if python_version() >= (3, 10) {
let params = if let Some(location) = error.location() { let params = if let Some(location) = error.location() {
( (
file_path, file_path,
@ -458,12 +458,12 @@ pub fn allow_threads_unsafe<T>(_py: Python<'_>, f: impl FnOnce() -> T) -> T {
f() f()
} }
fn python_version() -> (u8, u8, u8) { pub fn python_version() -> (u8, u8) {
static VERSION: OnceLock<(u8, u8, u8)> = OnceLock::new(); static VERSION: OnceLock<(u8, u8)> = OnceLock::new();
*VERSION.get_or_init(|| { *VERSION.get_or_init(|| {
Python::with_gil(|py| { Python::with_gil(|py| {
let v = py.version_info(); let v = py.version_info();
(v.major, v.minor, v.patch) (v.major, v.minor)
}) })
}) })
} }

@ -190,7 +190,10 @@ pub struct PyQuerySolutions {
} }
enum PyQuerySolutionsVariant { enum PyQuerySolutionsVariant {
Query(QuerySolutionIter), Query(QuerySolutionIter),
Reader(FromReadSolutionsReader<BufReader<PyReadable>>), Reader {
iter: FromReadSolutionsReader<BufReader<PyReadable>>,
file_path: Option<PathBuf>,
},
} }
#[pymethods] #[pymethods]
@ -207,8 +210,8 @@ impl PyQuerySolutions {
PyQuerySolutionsVariant::Query(inner) => { PyQuerySolutionsVariant::Query(inner) => {
inner.variables().iter().map(|v| v.clone().into()).collect() inner.variables().iter().map(|v| v.clone().into()).collect()
} }
PyQuerySolutionsVariant::Reader(inner) => { PyQuerySolutionsVariant::Reader { iter, .. } => {
inner.variables().iter().map(|v| v.clone().into()).collect() iter.variables().iter().map(|v| v.clone().into()).collect()
} }
} }
} }
@ -252,7 +255,9 @@ impl PyQuerySolutions {
output, output,
match &self.inner { match &self.inner {
PyQuerySolutionsVariant::Query(inner) => inner.variables().to_vec(), PyQuerySolutionsVariant::Query(inner) => inner.variables().to_vec(),
PyQuerySolutionsVariant::Reader(inner) => inner.variables().to_vec(), PyQuerySolutionsVariant::Reader { iter, .. } => {
iter.variables().to_vec()
}
}, },
) )
.map_err(map_io_err)?; .map_err(map_io_err)?;
@ -264,10 +269,12 @@ impl PyQuerySolutions {
.map_err(map_io_err)?; .map_err(map_io_err)?;
} }
} }
PyQuerySolutionsVariant::Reader(inner) => { PyQuerySolutionsVariant::Reader { iter, file_path } => {
for solution in inner { for solution in iter {
writer writer
.write(&solution.map_err(map_query_results_parse_error)?) .write(&solution.map_err(|e| {
map_query_results_parse_error(e, file_path.clone())
})?)
.map_err(map_io_err)?; .map_err(map_io_err)?;
} }
} }
@ -290,10 +297,10 @@ impl PyQuerySolutions {
PyQuerySolutionsVariant::Query(inner) => allow_threads_unsafe(py, || { PyQuerySolutionsVariant::Query(inner) => allow_threads_unsafe(py, || {
inner.next().transpose().map_err(map_evaluation_error) inner.next().transpose().map_err(map_evaluation_error)
}), }),
PyQuerySolutionsVariant::Reader(inner) => inner PyQuerySolutionsVariant::Reader { iter, file_path } => iter
.next() .next()
.transpose() .transpose()
.map_err(map_query_results_parse_error), .map_err(|e| map_query_results_parse_error(e, file_path.clone())),
}? }?
.map(move |inner| PyQuerySolution { inner })) .map(move |inner| PyQuerySolution { inner }))
} }
@ -498,10 +505,10 @@ pub fn parse_query_results(
}; };
let results = QueryResultsParser::from_format(format) let results = QueryResultsParser::from_format(format)
.parse_read(BufReader::new(input)) .parse_read(BufReader::new(input))
.map_err(map_query_results_parse_error)?; .map_err(|e| map_query_results_parse_error(e, file_path.clone()))?;
Ok(match results { Ok(match results {
FromReadQueryResultsReader::Solutions(inner) => PyQuerySolutions { FromReadQueryResultsReader::Solutions(iter) => PyQuerySolutions {
inner: PyQuerySolutionsVariant::Reader(inner), inner: PyQuerySolutionsVariant::Reader { iter, file_path },
} }
.into_py(py), .into_py(py),
FromReadQueryResultsReader::Boolean(inner) => PyQueryBoolean { inner }.into_py(py), FromReadQueryResultsReader::Boolean(inner) => PyQueryBoolean { inner }.into_py(py),
@ -513,7 +520,7 @@ pub fn map_evaluation_error(error: EvaluationError) -> PyErr {
EvaluationError::Parsing(error) => PySyntaxError::new_err(error.to_string()), EvaluationError::Parsing(error) => PySyntaxError::new_err(error.to_string()),
EvaluationError::Storage(error) => map_storage_error(error), EvaluationError::Storage(error) => map_storage_error(error),
EvaluationError::GraphParsing(error) => map_parse_error(error, None), EvaluationError::GraphParsing(error) => map_parse_error(error, None),
EvaluationError::ResultsParsing(error) => map_query_results_parse_error(error), EvaluationError::ResultsParsing(error) => map_query_results_parse_error(error, None),
EvaluationError::ResultsSerialization(error) => map_io_err(error), EvaluationError::ResultsSerialization(error) => map_io_err(error),
EvaluationError::Service(error) => match error.downcast() { EvaluationError::Service(error) => match error.downcast() {
Ok(error) => map_io_err(*error), Ok(error) => map_io_err(*error),
@ -523,9 +530,38 @@ pub fn map_evaluation_error(error: EvaluationError) -> PyErr {
} }
} }
pub fn map_query_results_parse_error(error: ParseError) -> PyErr { pub fn map_query_results_parse_error(error: ParseError, file_path: Option<PathBuf>) -> PyErr {
match error { match error {
ParseError::Syntax(error) => PySyntaxError::new_err(error.to_string()), ParseError::Syntax(error) => {
// Python 3.9 does not support end line and end column
if python_version() >= (3, 10) {
let params = if let Some(location) = error.location() {
(
file_path,
Some(location.start.line + 1),
Some(location.start.column + 1),
None::<Vec<u8>>,
Some(location.end.line + 1),
Some(location.end.column + 1),
)
} else {
(None, None, None, None, None, None)
};
PySyntaxError::new_err((error.to_string(), params))
} else {
let params = if let Some(location) = error.location() {
(
file_path,
Some(location.start.line + 1),
Some(location.start.column + 1),
None::<Vec<u8>>,
)
} else {
(None, None, None, None)
};
PySyntaxError::new_err((error.to_string(), params))
}
}
ParseError::Io(error) => map_io_err(error), ParseError::Io(error) => map_io_err(error),
} }
} }

@ -205,3 +205,30 @@ class TestParseQuerySolutions(unittest.TestCase):
def test_parse_io_error(self) -> None: def test_parse_io_error(self) -> None:
with self.assertRaises(UnsupportedOperation) as _, TemporaryFile("wb") as fp: with self.assertRaises(UnsupportedOperation) as _, TemporaryFile("wb") as fp:
parse_query_results(fp, "srx") parse_query_results(fp, "srx")
def test_parse_syntax_error_json(self) -> None:
with NamedTemporaryFile() as fp:
fp.write(b"{]")
fp.flush()
with self.assertRaises(SyntaxError) as ctx:
list(parse_query_results(fp.name, "srj")) # type: ignore[arg-type]
self.assertEqual(ctx.exception.filename, fp.name)
self.assertEqual(ctx.exception.lineno, 1)
self.assertEqual(ctx.exception.offset, 2)
if sys.version_info >= (3, 10):
self.assertEqual(ctx.exception.end_lineno, 1)
self.assertEqual(ctx.exception.end_offset, 3)
def test_parse_syntax_error_tsv(self) -> None:
with NamedTemporaryFile() as fp:
fp.write(b"?a\t?test\n")
fp.write(b"1\t<foo >\n")
fp.flush()
with self.assertRaises(SyntaxError) as ctx:
list(parse_query_results(fp.name, "tsv")) # type: ignore[arg-type]
self.assertEqual(ctx.exception.filename, fp.name)
self.assertEqual(ctx.exception.lineno, 2)
self.assertEqual(ctx.exception.offset, 3)
if sys.version_info >= (3, 10):
self.assertEqual(ctx.exception.end_lineno, 2)
self.assertEqual(ctx.exception.end_offset, 9)

Loading…
Cancel
Save