From c6d26a2a37b828e2c2a7427ea216e1cbeba8096d Mon Sep 17 00:00:00 2001
From: Tpt
Date: Thu, 20 Aug 2020 23:32:55 +0200
Subject: [PATCH] Easy parsing of terms from their serialization and SPARQL TSV parser

---
 CHANGELOG.md                      |   2 +
 lib/src/model/mod.rs              |   2 +
 lib/src/model/parser.rs           | 364 ++++++++++++++++++++++++++++++
 lib/src/sparql/csv_results.rs     |  68 +++++-
 lib/src/sparql/model.rs           |   7 +-
 testsuite/src/sparql_evaluator.rs |   5 +
 testsuite/tests/sparql.rs         |  13 ++
 7 files changed, 457 insertions(+), 4 deletions(-)
 create mode 100644 lib/src/model/parser.rs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab781eac..947adcee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ### Added
 - [SPARQL 1.1 Update](https://www.w3.org/TR/sparql11-update/) support for Rust, Python and JavaScript.
+- [SPARQL 1.1 Query Results CSV and TSV Formats](https://www.w3.org/TR/sparql11-results-csv-tsv/) serializers and TSV format parser.
+- `std::str::FromStr` implementations for `NamedNode`, `BlankNode`, `Literal`, `Term` and `Variable`, making it easy to parse the Turtle/SPARQL serialization of these terms.
 
 ## Changed
 - Fixes evaluation of `MONTH()` and `DAY()` functions on the `xsd:date` values.
diff --git a/lib/src/model/mod.rs b/lib/src/model/mod.rs
index b3ec5e95..c9fde7fa 100644
--- a/lib/src/model/mod.rs
+++ b/lib/src/model/mod.rs
@@ -5,6 +5,7 @@
 mod blank_node;
 mod literal;
 mod named_node;
+mod parser;
 #[cfg(feature = "sophia")]
 mod sophia;
 mod triple;
@@ -14,6 +15,7 @@ pub(crate) mod xsd;
 pub use crate::model::blank_node::{BlankNode, BlankNodeIdParseError, BlankNodeRef};
 pub use crate::model::literal::{Literal, LiteralRef};
 pub use crate::model::named_node::{NamedNode, NamedNodeRef};
+pub use crate::model::parser::TermParseError;
 pub use crate::model::triple::{
     GraphName, GraphNameRef, NamedOrBlankNode, NamedOrBlankNodeRef, Quad, QuadRef, Term, TermRef,
     Triple, TripleRef,
diff --git a/lib/src/model/parser.rs b/lib/src/model/parser.rs
new file mode 100644
index 00000000..1fc19142
--- /dev/null
+++ b/lib/src/model/parser.rs
@@ -0,0 +1,364 @@
+use crate::model::blank_node::{BlankNode, BlankNodeIdParseError};
+use crate::model::named_node::NamedNode;
+use crate::model::vocab::xsd;
+use crate::model::{Literal, Term};
+use crate::sparql::{Variable, VariableNameParseError};
+use oxilangtag::LanguageTagParseError;
+use oxiri::IriParseError;
+use std::char;
+use std::error::Error;
+use std::fmt;
+use std::str::{Chars, FromStr};
+
+impl FromStr for NamedNode {
+    type Err = TermParseError;
+
+    /// Parses a named node from its NTriples and Turtle serialization
+    ///
+    /// ```
+    /// use oxigraph::model::NamedNode;
+    /// use std::str::FromStr;
+    ///
+    /// assert_eq!(NamedNode::from_str("<http://example.com>").unwrap(), NamedNode::new("http://example.com").unwrap())
+    /// ```
+    fn from_str(s: &str) -> Result<Self, TermParseError> {
+        if !s.starts_with('<') || !s.ends_with('>') {
+            return Err(TermParseError::msg(
+                "Named node serialization should be enclosed between < and >",
+            ));
+        }
+        NamedNode::new(&s[1..s.len() - 1]).map_err(|error| TermParseError {
+            kind: TermParseErrorKind::Iri {
+                value: s.to_owned(),
+                error,
+            },
+        })
+    }
+}
+
+impl FromStr for BlankNode {
+    type Err = TermParseError;
+
+    /// Parses a blank node from its NTriples and Turtle serialization
+    ///
+    /// ```
+    /// use oxigraph::model::BlankNode;
+    /// use std::str::FromStr;
+    ///
+    /// assert_eq!(BlankNode::from_str("_:ex").unwrap(), BlankNode::new("ex").unwrap())
+    /// ```
+    fn from_str(s: &str) -> Result<Self, TermParseError> {
+        if !s.starts_with("_:") {
+            return Err(TermParseError::msg(
+                "Blank node serialization should start with '_:'",
+            ));
+        }
+        BlankNode::new(&s[2..]).map_err(|error| TermParseError {
+            kind: TermParseErrorKind::BlankNode {
+                value: s.to_owned(),
+                error,
+            },
+        })
+    }
+}
+
+impl FromStr for Literal {
+    type Err = TermParseError;
+
+    /// Parses a literal from its NTriples or Turtle serialization
+    ///
+    /// Quoted strings (optionally followed by a language tag or a datatype) and the
+    /// `true`, `false`, integer, decimal and double shorthand syntaxes are supported.
+    ///
+    /// ```
+    /// use oxigraph::model::{Literal, NamedNode, vocab::xsd};
+    /// use std::str::FromStr;
+    ///
+    /// assert_eq!(Literal::from_str("\"ex\\n\"").unwrap(), Literal::new_simple_literal("ex\n"));
+    /// assert_eq!(Literal::from_str("\"ex\"@en").unwrap(), Literal::new_language_tagged_literal("ex", "en").unwrap());
+    /// assert_eq!(Literal::from_str("\"2020\"^^<http://www.w3.org/2001/XMLSchema#gYear>").unwrap(), Literal::new_typed_literal("2020", NamedNode::new("http://www.w3.org/2001/XMLSchema#gYear").unwrap()));
+    /// assert_eq!(Literal::from_str("true").unwrap(), Literal::new_typed_literal("true", xsd::BOOLEAN));
+    /// assert_eq!(Literal::from_str("+122").unwrap(), Literal::new_typed_literal("+122", xsd::INTEGER));
+    /// assert_eq!(Literal::from_str("-122.23").unwrap(), Literal::new_typed_literal("-122.23", xsd::DECIMAL));
+    /// assert_eq!(Literal::from_str("-122e+1").unwrap(), Literal::new_typed_literal("-122e+1", xsd::DOUBLE));
+    /// assert!(Literal::from_str("\"").is_err());
+    /// assert!(Literal::from_str("e1").is_err());
+    /// ```
+    fn from_str(s: &str) -> Result<Self, TermParseError> {
+        if s.starts_with('"') {
+            // Capacity hint only: saturate so that a lone `"` cannot underflow
+            let mut value = String::with_capacity(s.len().saturating_sub(2));
+            let mut chars = s[1..].chars();
+            while let Some(c) = chars.next() {
+                match c {
+                    '"' => {
+                        let remain = chars.as_str();
+                        return if remain.is_empty() {
+                            Ok(Literal::new_simple_literal(value))
+                        } else if remain.starts_with('@') {
+                            Literal::new_language_tagged_literal(value, &remain[1..]).map_err(
+                                |error| TermParseError {
+                                    kind: TermParseErrorKind::LanguageTag {
+                                        value: remain[1..].to_owned(),
+                                        error,
+                                    },
+                                },
+                            )
+                        } else if remain.starts_with("^^") {
+                            Ok(Literal::new_typed_literal(
+                                value,
+                                NamedNode::from_str(&remain[2..])?,
+                            ))
+                        } else {
+                            Err(TermParseError::msg("Unexpected characters after a literal"))
+                        };
+                    }
+                    '\\' => {
+                        if let Some(c) = chars.next() {
+                            value.push(match c {
+                                't' => '\t',
+                                'b' => '\u{8}',
+                                'n' => '\n',
+                                'r' => '\r',
+                                'f' => '\u{C}',
+                                '"' => '"',
+                                '\'' => '\'',
+                                '\\' => '\\',
+                                'u' => read_hexa_char(&mut chars, 4)?,
+                                'U' => read_hexa_char(&mut chars, 8)?,
+                                _ => return Err(TermParseError::msg("Unexpected escaped char")),
+                            })
+                        } else {
+                            return Err(TermParseError::msg("Unexpected literal end"));
+                        }
+                    }
+                    c => value.push(c),
+                }
+            }
+            Err(TermParseError::msg("Unexpected literal end"))
+        } else if s == "true" {
+            Ok(Literal::new_typed_literal("true", xsd::BOOLEAN))
+        } else if s == "false" {
+            Ok(Literal::new_typed_literal("false", xsd::BOOLEAN))
+        } else {
+            let input = s.as_bytes();
+            if input.is_empty() {
+                return Err(TermParseError::msg("Empty term serialization"));
+            }
+
+            // Optional sign
+            let mut cursor = match input.get(0) {
+                Some(b'+') | Some(b'-') => 1,
+                _ => 0,
+            };
+
+            // Digits of the integer part
+            let mut count_before: usize = 0;
+            while cursor < input.len() && b'0' <= input[cursor] && input[cursor] <= b'9' {
+                count_before += 1;
+                cursor += 1;
+            }
+
+            if cursor == input.len() {
+                return if count_before > 0 {
+                    Ok(Literal::new_typed_literal(s, xsd::INTEGER))
+                } else {
+                    Err(TermParseError::msg("Empty integer serialization"))
+                };
+            }
+
+            // Optional fractional part
+            let mut count_after: usize = 0;
+            if input[cursor] == b'.' {
+                cursor += 1;
+                while cursor < input.len() && b'0' <= input[cursor] && input[cursor] <= b'9' {
+                    count_after += 1;
+                    cursor += 1;
+                }
+            }
+
+            if cursor == input.len() {
+                return if count_after > 0 {
+                    Ok(Literal::new_typed_literal(s, xsd::DECIMAL))
+                } else {
+                    Err(TermParseError::msg(
+                        "Decimal serialization without floating part",
+                    ))
+                };
+            }
+
+            // Anything left has to be an exponent
+            if input[cursor] != b'e' && input[cursor] != b'E' {
+                return Err(TermParseError::msg("Double serialization without exponent"));
+            }
+            if count_before == 0 && count_after == 0 {
+                // The DOUBLE production requires at least one mantissa digit ("e1" is invalid)
+                return Err(TermParseError::msg("Double serialization without mantissa digits"));
+            }
+            cursor += 1;
+            cursor += match input.get(cursor) {
+                Some(b'+') | Some(b'-') => 1,
+                _ => 0,
+            };
+            let mut count_exponent = 0;
+            while cursor < input.len() && b'0' <= input[cursor] && input[cursor] <= b'9' {
+                count_exponent += 1;
+                cursor += 1;
+            }
+            if cursor == input.len() && count_exponent > 0 {
+                Ok(Literal::new_typed_literal(s, xsd::DOUBLE))
+            } else {
+                Err(TermParseError::msg(
+                    "Double serialization with an invalid exponent",
+                ))
+            }
+        }
+    }
+}
+
+/// Reads exactly `len` hexadecimal digits from `input` and returns the character they encode.
+///
+/// Fails on a premature end, a non-hexadecimal digit or a value that is not a valid `char`.
+fn read_hexa_char(input: &mut Chars<'_>, len: usize) -> Result<char, TermParseError> {
+    let mut value = 0;
+    for _ in 0..len {
+        if let Some(c) = input.next() {
+            value = value * 16
+                + match c {
+                    '0'..='9' => u32::from(c) - u32::from('0'),
+                    'a'..='f' => u32::from(c) - u32::from('a') + 10,
+                    'A'..='F' => u32::from(c) - u32::from('A') + 10,
+                    _ => {
+                        return Err(TermParseError::msg(
+                            "Unexpected character in a unicode escape",
+                        ))
+                    }
+                }
+        } else {
+            return Err(TermParseError::msg("Unexpected literal string end"));
+        }
+    }
+    char::from_u32(value).ok_or_else(|| TermParseError::msg("Invalid encoded unicode code point"))
+}
+
+impl FromStr for Term {
+    type Err = TermParseError;
+
+    /// Parses a term from its NTriples or Turtle serialization
+    ///
+    /// ```
+    /// use oxigraph::model::{Literal, Term};
+    /// use std::str::FromStr;
+    ///
+    /// assert_eq!(Term::from_str("\"ex\"").unwrap(), Literal::new_simple_literal("ex").into())
+    /// ```
+    fn from_str(s: &str) -> Result<Self, TermParseError> {
+        Ok(if s.starts_with('<') {
+            NamedNode::from_str(s)?.into()
+        } else if s.starts_with('_') {
+            BlankNode::from_str(s)?.into()
+        } else {
+            Literal::from_str(s)?.into()
+        })
+    }
+}
+
+impl FromStr for Variable {
+    type Err = TermParseError;
+
+    /// Parses a variable from its SPARQL serialization
+    ///
+    /// ```
+    /// use oxigraph::sparql::Variable;
+    /// use std::str::FromStr;
+    ///
+    /// assert_eq!(Variable::from_str("$foo").unwrap(), Variable::new("foo").unwrap())
+    /// ```
+    fn from_str(s: &str) -> Result<Self, TermParseError> {
+        if !s.starts_with('?') && !s.starts_with('$') {
+            return Err(TermParseError::msg(
+                "Variable serialization should start with ? or $",
+            ));
+        }
+        Variable::new(&s[1..]).map_err(|error| TermParseError {
+            kind: TermParseErrorKind::Variable {
+                value: s.to_owned(),
+                error,
+            },
+        })
+    }
+}
+
+/// An error raised during term serialization parsing.
+#[allow(missing_copy_implementations)]
+#[derive(Debug)]
+pub struct TermParseError {
+    kind: TermParseErrorKind,
+}
+
+/// The possible causes of a [`TermParseError`].
+#[derive(Debug)]
+enum TermParseErrorKind {
+    /// The IRI of a named node is invalid.
+    Iri {
+        error: IriParseError,
+        value: String,
+    },
+    /// The identifier of a blank node is invalid.
+    BlankNode {
+        error: BlankNodeIdParseError,
+        value: String,
+    },
+    /// The language tag of a literal is invalid.
+    LanguageTag {
+        error: LanguageTagParseError,
+        value: String,
+    },
+    /// The name of a variable is invalid.
+    Variable {
+        error: VariableNameParseError,
+        value: String,
+    },
+    /// Any other syntax error, described by a static message.
+    Msg {
+        msg: &'static str,
+    },
+}
+
+impl fmt::Display for TermParseError {
+    #[inline]
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match &self.kind {
+            TermParseErrorKind::Iri { error, value } => write!(
+                f,
+                "Error while parsing the named node '{}': {}",
+                value, error
+            ),
+            TermParseErrorKind::BlankNode { error, value } => write!(
+                f,
+                "Error while parsing the blank node '{}': {}",
+                value, error
+            ),
+            TermParseErrorKind::LanguageTag { error, value } => write!(
+                f,
+                "Error while parsing the language tag '{}': {}",
+                value, error
+            ),
+            TermParseErrorKind::Variable { error, value } => {
+                write!(f, "Error while parsing the variable '{}': {}", value, error)
+            }
+            TermParseErrorKind::Msg { msg } => write!(f, "{}", msg),
+        }
+    }
+}
+
+impl Error for TermParseError {}
+
+impl TermParseError {
+    /// Builds an error that only carries a static message.
+    pub(crate) fn msg(msg: &'static str) -> Self {
+        Self {
+            kind: TermParseErrorKind::Msg { msg },
+        }
+    }
+}
diff --git a/lib/src/sparql/csv_results.rs b/lib/src/sparql/csv_results.rs
index a89d5768..521c32e8 100644
--- a/lib/src/sparql/csv_results.rs
+++ b/lib/src/sparql/csv_results.rs
@@ -1,9 +1,12 @@
 //! Implementation of [SPARQL 1.1 Query Results CSV and TSV Formats](https://www.w3.org/TR/sparql11-results-csv-tsv/)
 
+use crate::error::invalid_data_error;
 use crate::model::{vocab::xsd, *};
 use crate::sparql::error::EvaluationError;
 use crate::sparql::model::*;
-use std::io::{self, Write};
+use std::io::{self, BufRead, Write};
+use std::rc::Rc;
+use std::str::FromStr;
 
 pub fn write_csv_results(
     results: QueryResults,
@@ -164,6 +167,69 @@ fn write_tsv_term<'a>(term: impl Into<TermRef<'a>>, mut sink: impl Write) -> io:
     }
 }
 
+/// Reads SPARQL query results serialized in the TSV format.
+///
+/// The first line is either a boolean (`true` or `false`, case insensitive) or the
+/// tab-separated list of variables. Solutions are then parsed lazily, one per line.
+pub fn read_tsv_results(mut source: impl BufRead + 'static) -> Result<QueryResults, io::Error> {
+    let mut buffer = String::new();
+
+    // We read the header
+    source.read_line(&mut buffer)?;
+    if buffer.trim().eq_ignore_ascii_case("true") {
+        return Ok(QueryResults::Boolean(true));
+    }
+    if buffer.trim().eq_ignore_ascii_case("false") {
+        return Ok(QueryResults::Boolean(false));
+    }
+    let variables = buffer
+        .split('\t')
+        .map(|v| Variable::from_str(v.trim()).map_err(invalid_data_error))
+        .collect::<Result<Vec<_>, io::Error>>()?;
+
+    Ok(QueryResults::Solutions(QuerySolutionIter::new(
+        Rc::new(variables),
+        Box::new(TsvResultsIterator { buffer, source }),
+    )))
+}
+
+/// Lazily parses one solution per line of a TSV results document.
+struct TsvResultsIterator<R: BufRead> {
+    source: R,
+    buffer: String,
+}
+
+impl<R: BufRead> Iterator for TsvResultsIterator<R> {
+    type Item = Result<Vec<Option<Term>>, EvaluationError>;
+
+    fn next(&mut self) -> Option<Result<Vec<Option<Term>>, EvaluationError>> {
+        self.read_next().transpose()
+    }
+}
+
+impl<R: BufRead> TsvResultsIterator<R> {
+    /// Parses the next line into a solution; an empty cell is an unbound variable.
+    fn read_next(&mut self) -> Result<Option<Vec<Option<Term>>>, EvaluationError> {
+        self.buffer.clear();
+        if self.source.read_line(&mut self.buffer)? == 0 {
+            return Ok(None);
+        }
+        Ok(Some(
+            self.buffer
+                .split('\t')
+                .map(|v| {
+                    let v = v.trim();
+                    if v.is_empty() {
+                        Ok(None)
+                    } else {
+                        Ok(Some(Term::from_str(v).map_err(invalid_data_error)?))
+                    }
+                })
+                .collect::<Result<Vec<_>, EvaluationError>>()?,
+        ))
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/lib/src/sparql/model.rs b/lib/src/sparql/model.rs
index b7e0089e..ab8d62fc 100644
--- a/lib/src/sparql/model.rs
+++ b/lib/src/sparql/model.rs
@@ -2,7 +2,7 @@ use crate::error::invalid_input_error;
 use crate::io::GraphFormat;
 use crate::io::GraphSerializer;
 use crate::model::*;
-use crate::sparql::csv_results::{write_csv_results, write_tsv_results};
+use crate::sparql::csv_results::{read_tsv_results, write_csv_results, write_tsv_results};
 use crate::sparql::error::EvaluationError;
 use crate::sparql::json_results::write_json_results;
 use crate::sparql::xml_results::{read_xml_results, write_xml_results};
@@ -33,9 +33,10 @@ impl QueryResults {
             QueryResultsFormat::Json => Err(invalid_input_error(
                 "JSON SPARQL results format parsing has not been implemented yet",
             )), //TODO: implement
-            QueryResultsFormat::Csv | QueryResultsFormat::Tsv => Err(invalid_input_error(
-                "CSV and TSV SPARQL results format parsing is not implemented",
+            QueryResultsFormat::Csv => Err(invalid_input_error(
+                "CSV SPARQL results format parsing is not implemented",
             )),
+            QueryResultsFormat::Tsv => read_tsv_results(reader),
         }
     }
 
diff --git a/testsuite/src/sparql_evaluator.rs b/testsuite/src/sparql_evaluator.rs
index e3e729de..ac2ddc65 100644
--- a/testsuite/src/sparql_evaluator.rs
+++ b/testsuite/src/sparql_evaluator.rs
@@ -223,6 +223,11 @@ fn load_sparql_query_result(url: &str) -> Result<StaticQueryResults> {
             QueryResults::read(read_file(url)?, QueryResultsFormat::Json)?,
             false,
         )
+    } else if url.ends_with(".tsv") {
+        StaticQueryResults::from_query_results(
+            QueryResults::read(read_file(url)?, QueryResultsFormat::Tsv)?,
+            false,
+        )
     } else {
         Ok(StaticQueryResults::from_dataset(load_store(url)?))
     }
diff --git a/testsuite/tests/sparql.rs b/testsuite/tests/sparql.rs
index 6308c551..271dc863 100644
--- a/testsuite/tests/sparql.rs
+++ b/testsuite/tests/sparql.rs
@@ -128,3 +128,16 @@ fn sparql11_update_w3c_evaluation_testsuite() -> Result<()> {
         vec![],
     )
 }
+
+#[test]
+fn sparql11_tsv_w3c_evaluation_testsuite() -> Result<()> {
+    run_testsuite(
+        "http://www.w3.org/2009/sparql/docs/tests/data-sparql11/csv-tsv-res/manifest.ttl",
+        vec![
+            // We do not run CSVResultFormatTest tests yet
+            "http://www.w3.org/2009/sparql/docs/tests/data-sparql11/csv-tsv-res/manifest#csv01",
+            "http://www.w3.org/2009/sparql/docs/tests/data-sparql11/csv-tsv-res/manifest#csv02",
+            "http://www.w3.org/2009/sparql/docs/tests/data-sparql11/csv-tsv-res/manifest#csv03",
+        ],
+    )
+}