Adds basic location support to sparesults SyntaxError

pull/643/head
Authored by Tpt 1 year ago, committed by Thomas Tanon
parent dbb39d867a
commit d280f7d2f7
  1. lib/oxrdfio/src/lib.rs (2 changed lines)
  2. lib/sparesults/src/csv.rs (148 changed lines)
  3. lib/sparesults/src/error.rs (70 changed lines)
  4. lib/sparesults/src/lib.rs (2 changed lines)
  5. lib/src/io/mod.rs (2 changed lines)
  6. lib/src/sparql/results.rs (2 changed lines)
  7. python/src/io.rs (8 changed lines)
  8. python/src/sparql.rs (66 changed lines)
  9. python/tests/test_io.py (27 changed lines)
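
For context, a minimal sketch (not part of the diff) of how a consumer of sparesults could surface the new location information. It assumes the API as exported by this change (`ParseError::Syntax`, `SyntaxError::location()`, `TextPosition`), plus a `QueryResultsFormat::Tsv` variant and `parse_read` accepting any `Read`; the input mirrors the TSV test case added to `test_io.py` below.

use sparesults::{FromReadQueryResultsReader, ParseError, QueryResultsFormat, QueryResultsParser};

fn main() -> Result<(), ParseError> {
    // Same shape as the TSV test case added below: the second row contains an invalid IRI.
    let data = b"?a\t?b\n1\t<foo >\n";
    // Assumed entry points: QueryResultsFormat::Tsv and parse_read over an in-memory reader.
    let parser = QueryResultsParser::from_format(QueryResultsFormat::Tsv);
    if let FromReadQueryResultsReader::Solutions(solutions) = parser.parse_read(data.as_slice())? {
        for solution in solutions {
            match solution {
                Ok(_) => {} // a successfully parsed row; ignored in this sketch
                Err(ParseError::Syntax(e)) => {
                    // location() is the accessor introduced by this commit.
                    if let Some(range) = e.location() {
                        eprintln!(
                            "syntax error on line {}, column {} (byte offset {}): {e}",
                            range.start.line + 1,
                            range.start.column + 1,
                            range.start.offset
                        );
                    }
                }
                Err(e) => return Err(e),
            }
        }
    }
    Ok(())
}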

@@ -9,7 +9,7 @@ mod format;
mod parser;
mod serializer;
pub use error::{ParseError, SyntaxError};
pub use error::{ParseError, SyntaxError, TextPosition};
pub use format::RdfFormat;
#[cfg(feature = "async-tokio")]
pub use parser::FromTokioAsyncReadQuadReader;

@@ -1,10 +1,10 @@
//! Implementation of [SPARQL 1.1 Query Results CSV and TSV Formats](https://www.w3.org/TR/sparql11-results-csv-tsv/)
use crate::error::{ParseError, SyntaxError, SyntaxErrorKind};
use crate::error::{ParseError, SyntaxError, SyntaxErrorKind, TextPosition};
use memchr::memchr;
use oxrdf::Variable;
use oxrdf::{vocab::xsd, *};
use std::io::{self, BufRead, Read, Write};
use std::io::{self, Read, Write};
use std::str::{self, FromStr};
const MAX_BUFFER_SIZE: usize = 4096 * 4096;
@@ -283,12 +283,13 @@ pub enum TsvQueryResultsReader<R: Read> {
}
impl<R: Read> TsvQueryResultsReader<R> {
pub fn read(read: R) -> Result<Self, ParseError> {
let mut reader = LineReader::new(read);
pub fn read(mut read: R) -> Result<Self, ParseError> {
let mut reader = LineReader::new();
let mut buffer = Vec::new();
// We read the header
let line = reader
.next_line()?
.next_line(&mut buffer, &mut read)?
.trim_matches(|c| matches!(c, ' ' | '\r' | '\n'));
if line.eq_ignore_ascii_case("true") {
return Ok(Self::Boolean(true));
@@ -318,34 +319,65 @@ impl<R: Read> TsvQueryResultsReader<R> {
let column_len = variables.len();
Ok(Self::Solutions {
variables,
solutions: TsvSolutionsReader { reader, column_len },
solutions: TsvSolutionsReader {
read,
buffer,
reader,
column_len,
},
})
}
}
pub struct TsvSolutionsReader<R: Read> {
reader: LineReader<R>,
read: R,
buffer: Vec<u8>,
reader: LineReader,
column_len: usize,
}
impl<R: BufRead> TsvSolutionsReader<R> {
impl<R: Read> TsvSolutionsReader<R> {
#[allow(clippy::unwrap_in_result)]
pub fn read_next(&mut self) -> Result<Option<Vec<Option<Term>>>, ParseError> {
let line = self.reader.next_line()?;
let line = self.reader.next_line(&mut self.buffer, &mut self.read)?;
if line.is_empty() {
return Ok(None); // EOF
}
let elements = line
.split('\t')
.map(|v| {
.enumerate()
.map(|(i, v)| {
let v = v.trim();
if v.is_empty() {
Ok(None)
} else {
Ok(Some(Term::from_str(v).map_err(|e| SyntaxError {
inner: SyntaxErrorKind::Term {
error: e,
term: v.into(),
},
Ok(Some(Term::from_str(v).map_err(|e| {
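// Where does the offending cell start? Sum the lengths of the preceding cells plus one
// per tab separator, once in code points (for the column) and once in bytes (for the offset).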
let start_position_char = line
.split('\t')
.take(i)
.map(|c| c.chars().count() + 1)
.sum::<usize>();
let start_position_bytes =
line.split('\t').take(i).map(|c| c.len() + 1).sum::<usize>();
SyntaxError {
inner: SyntaxErrorKind::Term {
error: e,
term: v.into(),
location: TextPosition {
line: self.reader.line_count - 1,
column: start_position_char.try_into().unwrap(),
offset: self.reader.last_line_start
+ u64::try_from(start_position_bytes).unwrap(),
}..TextPosition {
line: self.reader.line_count - 1,
column: (start_position_char + v.chars().count())
.try_into()
.unwrap(),
offset: self.reader.last_line_start
+ u64::try_from(start_position_bytes + v.len()).unwrap(),
},
},
}
})?))
}
})
@@ -355,64 +387,88 @@ impl<R: BufRead> TsvSolutionsReader<R> {
} else if self.column_len == 0 && elements == [None] {
Ok(Some(Vec::new())) // Zero columns case
} else {
Err(SyntaxError::msg(format!(
"This TSV files has {} columns but we found a row with {} columns: {}",
self.column_len,
elements.len(),
line
))
Err(SyntaxError::located_message(
format!(
"This TSV files has {} columns but we found a row on line {} with {} columns: {}",
self.column_len,
self.reader.line_count - 1,
elements.len(),
line
),
TextPosition {
line: self.reader.line_count - 1,
column: 0,
offset: self.reader.last_line_start,
}..TextPosition {
line: self.reader.line_count - 1,
column: line.chars().count().try_into().unwrap(),
offset: self.reader.last_line_end,
},
)
.into())
}
}
}
struct LineReader<R: Read> {
read: R,
buffer: Vec<u8>,
start: usize,
end: usize,
struct LineReader {
buffer_start: usize,
buffer_end: usize,
line_count: u64,
last_line_start: u64,
last_line_end: u64,
}
impl<R: Read> LineReader<R> {
fn new(read: R) -> Self {
impl LineReader {
fn new() -> Self {
Self {
read,
buffer: Vec::new(),
start: 0,
end: 0,
buffer_start: 0,
buffer_end: 0,
line_count: 0,
last_line_start: 0,
last_line_end: 0,
}
}
fn next_line(&mut self) -> io::Result<&str> {
self.buffer.copy_within(self.start..self.end, 0);
self.end -= self.start;
self.start = 0;
#[allow(clippy::unwrap_in_result)]
fn next_line<'a>(
&mut self,
buffer: &'a mut Vec<u8>,
read: &mut impl Read,
) -> io::Result<&'a str> {
let line_end = loop {
if let Some(eol) = memchr(b'\n', &self.buffer[self.start..self.end]) {
break self.start + eol + 1;
if let Some(eol) = memchr(b'\n', &buffer[self.buffer_start..self.buffer_end]) {
break self.buffer_start + eol + 1;
}
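// No complete line in the buffer yet: compact it (dropping already-consumed bytes)
// once more than half of it has been consumed, then read more input.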
if self.buffer_start > buffer.len() / 2 {
buffer.copy_within(self.buffer_start..self.buffer_end, 0);
self.buffer_end -= self.buffer_start;
self.buffer_start = 0;
}
if self.end + 1024 > self.buffer.len() {
if self.end + 1024 > MAX_BUFFER_SIZE {
if self.buffer_end + 1024 > buffer.len() {
if self.buffer_end + 1024 > MAX_BUFFER_SIZE {
return Err(io::Error::new(
io::ErrorKind::OutOfMemory,
format!("Reached the buffer maximal size of {MAX_BUFFER_SIZE}"),
));
}
self.buffer.resize(self.end + 1024, b'\0');
buffer.resize(self.buffer_end + 1024, b'\0');
}
let read = self.read.read(&mut self.buffer[self.end..])?;
let read = read.read(&mut buffer[self.buffer_end..])?;
if read == 0 {
break self.end;
break self.buffer_end;
}
self.end += read;
self.buffer_end += read;
};
let result = str::from_utf8(&self.buffer[self.start..line_end]).map_err(|e| {
let result = str::from_utf8(&buffer[self.buffer_start..line_end]).map_err(|e| {
io::Error::new(
io::ErrorKind::InvalidData,
format!("Invalid UTF-8 in the TSV file: {e}"),
)
});
self.start = line_end;
self.line_count += 1;
self.last_line_start = self.last_line_end;
self.last_line_end += u64::try_from(line_end - self.buffer_start).unwrap();
self.buffer_start = line_end;
result
}
}

@@ -1,5 +1,6 @@
use oxrdf::TermParseError;
use std::error::Error;
use std::ops::Range;
use std::sync::Arc;
use std::{fmt, io};
@@ -90,8 +91,15 @@ pub struct SyntaxError {
pub(crate) enum SyntaxErrorKind {
Json(json_event_parser::SyntaxError),
Xml(quick_xml::Error),
Term { error: TermParseError, term: String },
Msg { msg: String },
Term {
error: TermParseError,
term: String,
location: Range<TextPosition>,
},
Msg {
msg: String,
location: Option<Range<TextPosition>>,
},
}
impl SyntaxError {
@@ -99,7 +107,45 @@ impl SyntaxError {
#[inline]
pub(crate) fn msg(msg: impl Into<String>) -> Self {
Self {
inner: SyntaxErrorKind::Msg { msg: msg.into() },
inner: SyntaxErrorKind::Msg {
msg: msg.into(),
location: None,
},
}
}
/// Builds an error from a printable error message and a location
#[inline]
pub(crate) fn located_message(msg: impl Into<String>, location: Range<TextPosition>) -> Self {
Self {
inner: SyntaxErrorKind::Msg {
msg: msg.into(),
location: Some(location),
},
}
}
/// The location of the error inside of the file.
#[inline]
pub fn location(&self) -> Option<Range<TextPosition>> {
match &self.inner {
SyntaxErrorKind::Json(e) => {
let location = e.location();
Some(
TextPosition {
line: location.start.line,
column: location.start.column,
offset: location.start.offset,
}..TextPosition {
line: location.end.line,
column: location.end.column,
offset: location.end.offset,
},
)
}
SyntaxErrorKind::Term { location, .. } => Some(location.clone()),
SyntaxErrorKind::Msg { location, .. } => location.clone(),
SyntaxErrorKind::Xml(_) => None,
}
}
}
@@ -110,8 +156,12 @@ impl fmt::Display for SyntaxError {
match &self.inner {
SyntaxErrorKind::Json(e) => e.fmt(f),
SyntaxErrorKind::Xml(e) => e.fmt(f),
SyntaxErrorKind::Term { error, term } => write!(f, "{error}: {term}"),
SyntaxErrorKind::Msg { msg } => f.write_str(msg),
SyntaxErrorKind::Term {
error,
term,
location,
} => write!(f, "{error} on '{term}' in line {}", location.start.line + 1),
SyntaxErrorKind::Msg { msg, .. } => f.write_str(msg),
}
}
}
@@ -144,7 +194,7 @@ impl From<SyntaxError> for io::Error {
_ => Self::new(io::ErrorKind::InvalidData, error),
},
SyntaxErrorKind::Term { .. } => Self::new(io::ErrorKind::InvalidData, error),
SyntaxErrorKind::Msg { msg } => Self::new(io::ErrorKind::InvalidData, msg),
SyntaxErrorKind::Msg { msg, .. } => Self::new(io::ErrorKind::InvalidData, msg),
}
}
}
@@ -156,3 +206,11 @@ impl From<json_event_parser::SyntaxError> for SyntaxError {
}
}
}
/// A position in a text, i.e. a `line` number starting from 0, a `column` number starting from 0 (in number of code points), and a global file `offset` starting from 0 (in number of bytes).
#[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub struct TextPosition {
pub line: u64,
pub column: u64,
pub offset: u64,
}
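
As a side note on the doc comment above: `column` counts code points while `offset` counts bytes, so the two diverge as soon as non-ASCII characters appear. A minimal illustration (not part of the diff), assuming `TextPosition` as exported from `sparesults`:

use sparesults::TextPosition;

fn main() {
    // "café" is 4 code points but 5 UTF-8 bytes, because 'é' is encoded on 2 bytes.
    let first_cell = "café";
    assert_eq!(first_cell.chars().count(), 4);
    assert_eq!(first_cell.len(), 5);

    // A position right after that cell on the first line of a file would therefore be:
    let after_cell = TextPosition {
        line: 0,   // lines start at 0
        column: 4, // code points before the position
        offset: 5, // bytes from the start of the file
    };
    assert_eq!((after_cell.column, after_cell.offset), (4, 5));
}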

@@ -13,7 +13,7 @@ mod serializer;
pub mod solution;
mod xml;
pub use crate::error::{ParseError, SyntaxError};
pub use crate::error::{ParseError, SyntaxError, TextPosition};
pub use crate::format::QueryResultsFormat;
pub use crate::parser::{FromReadQueryResultsReader, FromReadSolutionsReader, QueryResultsParser};
pub use crate::serializer::{QueryResultsSerializer, ToWriteSolutionsWriter};

@@ -35,6 +35,6 @@ pub use self::read::{DatasetParser, GraphParser};
#[allow(deprecated)]
pub use self::write::{DatasetSerializer, GraphSerializer};
pub use oxrdfio::{
FromReadQuadReader, ParseError, RdfFormat, RdfParser, RdfSerializer, SyntaxError,
FromReadQuadReader, ParseError, RdfFormat, RdfParser, RdfSerializer, SyntaxError, TextPosition,
ToWriteQuadWriter,
};

@@ -43,5 +43,5 @@
pub use sparesults::{
FromReadQueryResultsReader, FromReadSolutionsReader, ParseError, QueryResultsFormat,
QueryResultsParser, QueryResultsSerializer, SyntaxError,
QueryResultsParser, QueryResultsSerializer, SyntaxError, TextPosition,
};

@@ -401,7 +401,7 @@ pub fn map_parse_error(error: ParseError, file_path: Option<PathBuf>) -> PyErr {
match error {
ParseError::Syntax(error) => {
// Python 3.9 does not support end line and end column
if python_version() >= (3, 10, 0) {
if python_version() >= (3, 10) {
let params = if let Some(location) = error.location() {
(
file_path,
@@ -458,12 +458,12 @@ pub fn allow_threads_unsafe<T>(_py: Python<'_>, f: impl FnOnce() -> T) -> T {
f()
}
fn python_version() -> (u8, u8, u8) {
static VERSION: OnceLock<(u8, u8, u8)> = OnceLock::new();
pub fn python_version() -> (u8, u8) {
static VERSION: OnceLock<(u8, u8)> = OnceLock::new();
*VERSION.get_or_init(|| {
Python::with_gil(|py| {
let v = py.version_info();
(v.major, v.minor, v.patch)
(v.major, v.minor)
})
})
}

@@ -190,7 +190,10 @@ pub struct PyQuerySolutions {
}
enum PyQuerySolutionsVariant {
Query(QuerySolutionIter),
Reader(FromReadSolutionsReader<BufReader<PyReadable>>),
Reader {
iter: FromReadSolutionsReader<BufReader<PyReadable>>,
file_path: Option<PathBuf>,
},
}
#[pymethods]
@@ -207,8 +210,8 @@ impl PyQuerySolutions {
PyQuerySolutionsVariant::Query(inner) => {
inner.variables().iter().map(|v| v.clone().into()).collect()
}
PyQuerySolutionsVariant::Reader(inner) => {
inner.variables().iter().map(|v| v.clone().into()).collect()
PyQuerySolutionsVariant::Reader { iter, .. } => {
iter.variables().iter().map(|v| v.clone().into()).collect()
}
}
}
@@ -252,7 +255,9 @@ impl PyQuerySolutions {
output,
match &self.inner {
PyQuerySolutionsVariant::Query(inner) => inner.variables().to_vec(),
PyQuerySolutionsVariant::Reader(inner) => inner.variables().to_vec(),
PyQuerySolutionsVariant::Reader { iter, .. } => {
iter.variables().to_vec()
}
},
)
.map_err(map_io_err)?;
@@ -264,10 +269,12 @@ impl PyQuerySolutions {
.map_err(map_io_err)?;
}
}
PyQuerySolutionsVariant::Reader(inner) => {
for solution in inner {
PyQuerySolutionsVariant::Reader { iter, file_path } => {
for solution in iter {
writer
.write(&solution.map_err(map_query_results_parse_error)?)
.write(&solution.map_err(|e| {
map_query_results_parse_error(e, file_path.clone())
})?)
.map_err(map_io_err)?;
}
}
@@ -290,10 +297,10 @@ impl PyQuerySolutions {
PyQuerySolutionsVariant::Query(inner) => allow_threads_unsafe(py, || {
inner.next().transpose().map_err(map_evaluation_error)
}),
PyQuerySolutionsVariant::Reader(inner) => inner
PyQuerySolutionsVariant::Reader { iter, file_path } => iter
.next()
.transpose()
.map_err(map_query_results_parse_error),
.map_err(|e| map_query_results_parse_error(e, file_path.clone())),
}?
.map(move |inner| PyQuerySolution { inner }))
}
@@ -498,10 +505,10 @@ pub fn parse_query_results(
};
let results = QueryResultsParser::from_format(format)
.parse_read(BufReader::new(input))
.map_err(map_query_results_parse_error)?;
.map_err(|e| map_query_results_parse_error(e, file_path.clone()))?;
Ok(match results {
FromReadQueryResultsReader::Solutions(inner) => PyQuerySolutions {
inner: PyQuerySolutionsVariant::Reader(inner),
FromReadQueryResultsReader::Solutions(iter) => PyQuerySolutions {
inner: PyQuerySolutionsVariant::Reader { iter, file_path },
}
.into_py(py),
FromReadQueryResultsReader::Boolean(inner) => PyQueryBoolean { inner }.into_py(py),
@@ -513,7 +520,7 @@ pub fn map_evaluation_error(error: EvaluationError) -> PyErr {
EvaluationError::Parsing(error) => PySyntaxError::new_err(error.to_string()),
EvaluationError::Storage(error) => map_storage_error(error),
EvaluationError::GraphParsing(error) => map_parse_error(error, None),
EvaluationError::ResultsParsing(error) => map_query_results_parse_error(error),
EvaluationError::ResultsParsing(error) => map_query_results_parse_error(error, None),
EvaluationError::ResultsSerialization(error) => map_io_err(error),
EvaluationError::Service(error) => match error.downcast() {
Ok(error) => map_io_err(*error),
@@ -523,9 +530,38 @@ pub fn map_evaluation_error(error: EvaluationError) -> PyErr {
}
}
pub fn map_query_results_parse_error(error: ParseError) -> PyErr {
pub fn map_query_results_parse_error(error: ParseError, file_path: Option<PathBuf>) -> PyErr {
match error {
ParseError::Syntax(error) => PySyntaxError::new_err(error.to_string()),
ParseError::Syntax(error) => {
// Python 3.9 does not support end line and end column
if python_version() >= (3, 10) {
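// PySyntaxError detail tuple: (filename, lineno, offset, text, end_lineno, end_offset);
// Python expects 1-based line and column numbers, hence the + 1 below.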
let params = if let Some(location) = error.location() {
(
file_path,
Some(location.start.line + 1),
Some(location.start.column + 1),
None::<Vec<u8>>,
Some(location.end.line + 1),
Some(location.end.column + 1),
)
} else {
(None, None, None, None, None, None)
};
PySyntaxError::new_err((error.to_string(), params))
} else {
let params = if let Some(location) = error.location() {
(
file_path,
Some(location.start.line + 1),
Some(location.start.column + 1),
None::<Vec<u8>>,
)
} else {
(None, None, None, None)
};
PySyntaxError::new_err((error.to_string(), params))
}
}
ParseError::Io(error) => map_io_err(error),
}
}

@@ -205,3 +205,30 @@ class TestParseQuerySolutions(unittest.TestCase):
def test_parse_io_error(self) -> None:
with self.assertRaises(UnsupportedOperation) as _, TemporaryFile("wb") as fp:
parse_query_results(fp, "srx")
def test_parse_syntax_error_json(self) -> None:
with NamedTemporaryFile() as fp:
fp.write(b"{]")
fp.flush()
with self.assertRaises(SyntaxError) as ctx:
list(parse_query_results(fp.name, "srj")) # type: ignore[arg-type]
self.assertEqual(ctx.exception.filename, fp.name)
self.assertEqual(ctx.exception.lineno, 1)
self.assertEqual(ctx.exception.offset, 2)
if sys.version_info >= (3, 10):
self.assertEqual(ctx.exception.end_lineno, 1)
self.assertEqual(ctx.exception.end_offset, 3)
def test_parse_syntax_error_tsv(self) -> None:
with NamedTemporaryFile() as fp:
fp.write(b"?a\t?test\n")
fp.write(b"1\t<foo >\n")
fp.flush()
with self.assertRaises(SyntaxError) as ctx:
list(parse_query_results(fp.name, "tsv")) # type: ignore[arg-type]
self.assertEqual(ctx.exception.filename, fp.name)
self.assertEqual(ctx.exception.lineno, 2)
self.assertEqual(ctx.exception.offset, 3)
if sys.version_info >= (3, 10):
self.assertEqual(ctx.exception.end_lineno, 2)
self.assertEqual(ctx.exception.end_offset, 9)
