From c6d26a2a37b828e2c2a7427ea216e1cbeba8096d Mon Sep 17 00:00:00 2001
From: Tpt <thomaspt@hotmail.fr>
Date: Thu, 20 Aug 2020 23:32:55 +0200
Subject: [PATCH] Easy parsing of terms from their serialization and SPARQL TSV
 parser

---
 CHANGELOG.md                      |   2 +
 lib/src/model/mod.rs              |   2 +
 lib/src/model/parser.rs           | 340 ++++++++++++++++++++++++++++++
 lib/src/sparql/csv_results.rs     |  62 +++++-
 lib/src/sparql/model.rs           |   5 +-
 testsuite/src/sparql_evaluator.rs |   5 +
 testsuite/tests/sparql.rs         |  13 ++
 7 files changed, 426 insertions(+), 3 deletions(-)
 create mode 100644 lib/src/model/parser.rs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab781eac..947adcee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ### Added
 - [SPARQL 1.1 Update](https://www.w3.org/TR/sparql11-update/) support for Rust, Python and JavaScript.
+- [SPARQL 1.1 Query Results CSV and TSV Formats](https://www.w3.org/TR/sparql11-results-csv-tsv/) serializers and TSV format parser.
+- `std::str::FromStr` implementations for `NamedNode`, `BlankNode`, `Literal`, `Term` and `Variable`, making it easy to parse the Turtle/SPARQL serialization of these terms.
 
 ## Changed
 - Fixes evaluation of `MONTH()` and `DAY()` functions on the `xsd:date` values.
diff --git a/lib/src/model/mod.rs b/lib/src/model/mod.rs
index b3ec5e95..c9fde7fa 100644
--- a/lib/src/model/mod.rs
+++ b/lib/src/model/mod.rs
@@ -5,6 +5,7 @@
 mod blank_node;
 mod literal;
 mod named_node;
+mod parser;
 #[cfg(feature = "sophia")]
 mod sophia;
 mod triple;
@@ -14,6 +15,7 @@ pub(crate) mod xsd;
 pub use crate::model::blank_node::{BlankNode, BlankNodeIdParseError, BlankNodeRef};
 pub use crate::model::literal::{Literal, LiteralRef};
 pub use crate::model::named_node::{NamedNode, NamedNodeRef};
+pub use crate::model::parser::TermParseError;
 pub use crate::model::triple::{
     GraphName, GraphNameRef, NamedOrBlankNode, NamedOrBlankNodeRef, Quad, QuadRef, Term, TermRef,
     Triple, TripleRef,
diff --git a/lib/src/model/parser.rs b/lib/src/model/parser.rs
new file mode 100644
index 00000000..1fc19142
--- /dev/null
+++ b/lib/src/model/parser.rs
@@ -0,0 +1,340 @@
+use crate::model::blank_node::{BlankNode, BlankNodeIdParseError};
+use crate::model::named_node::NamedNode;
+use crate::model::vocab::xsd;
+use crate::model::{Literal, Term};
+use crate::sparql::{Variable, VariableNameParseError};
+use oxilangtag::LanguageTagParseError;
+use oxiri::IriParseError;
+use std::char;
+use std::error::Error;
+use std::fmt;
+use std::str::{Chars, FromStr};
+
+impl FromStr for NamedNode {
+    type Err = TermParseError;
+
+    /// Parses a named node from its NTriples and Turtle serialization: an IRI between `<` and `>`
+    ///
+    /// ```
+    /// use oxigraph::model::NamedNode;
+    /// use std::str::FromStr;
+    ///
+    /// assert_eq!(NamedNode::from_str("<http://example.com>").unwrap(), NamedNode::new("http://example.com").unwrap())
+    /// ```
+    fn from_str(s: &str) -> Result<Self, TermParseError> {
+        if !(s.starts_with('<') && s.ends_with('>')) {
+            return Err(TermParseError::msg(
+                "Named node serialization should be enclosed between < and >",
+            ));
+        }
+        let iri = &s[1..s.len() - 1];
+        NamedNode::new(iri).map_err(|error| {
+            let value = s.to_owned();
+            let kind = TermParseErrorKind::Iri { value, error };
+            TermParseError { kind }
+        })
+    }
+}
+
+impl FromStr for BlankNode {
+    type Err = TermParseError;
+
+    /// Parses a blank node from its NTriples and Turtle serialization: `_:` followed by its identifier
+    ///
+    /// ```
+    /// use oxigraph::model::BlankNode;
+    /// use std::str::FromStr;
+    ///
+    /// assert_eq!(BlankNode::from_str("_:ex").unwrap(), BlankNode::new("ex").unwrap())
+    /// ```
+    fn from_str(s: &str) -> Result<Self, TermParseError> {
+        if s.starts_with("_:") {
+            BlankNode::new(&s[2..]).map_err(|error| {
+                let value = s.to_owned();
+                let kind = TermParseErrorKind::BlankNode { value, error };
+                TermParseError { kind }
+            })
+        } else {
+            Err(TermParseError::msg(
+                "Blank node serialization should start with '_:'",
+            ))
+        }
+    }
+}
+
+impl FromStr for Literal {
+    type Err = TermParseError;
+
+    /// Parses a literal from its NTriples or Turtle serialization, including the Turtle boolean and numeric shorthands
+    ///
+    /// ```
+    /// use oxigraph::model::{Literal, NamedNode, vocab::xsd};
+    /// use std::str::FromStr;
+    ///
+    /// assert_eq!(Literal::from_str("\"ex\\n\"").unwrap(), Literal::new_simple_literal("ex\n"));
+    /// assert_eq!(Literal::from_str("\"ex\"@en").unwrap(), Literal::new_language_tagged_literal("ex", "en").unwrap());
+    /// assert_eq!(Literal::from_str("\"2020\"^^<http://www.w3.org/2001/XMLSchema#gYear>").unwrap(), Literal::new_typed_literal("2020", NamedNode::new("http://www.w3.org/2001/XMLSchema#gYear").unwrap()));
+    /// assert_eq!(Literal::from_str("true").unwrap(), Literal::new_typed_literal("true", xsd::BOOLEAN));
+    /// assert_eq!(Literal::from_str("+122").unwrap(), Literal::new_typed_literal("+122", xsd::INTEGER));
+    /// assert_eq!(Literal::from_str("-122.23").unwrap(), Literal::new_typed_literal("-122.23", xsd::DECIMAL));
+    /// assert_eq!(Literal::from_str("-122e+1").unwrap(), Literal::new_typed_literal("-122e+1", xsd::DOUBLE));
+    /// ```
+    fn from_str(s: &str) -> Result<Self, TermParseError> {
+        if s.starts_with('"') {
+            // `s` may be a lone `"`: avoid an underflow panic when sizing the buffer.
+            let mut value = String::with_capacity(s.len().saturating_sub(2));
+            let mut chars = s[1..].chars();
+            while let Some(c) = chars.next() {
+                match c {
+                    '"' => {
+                        let remain = chars.as_str();
+                        return if remain.is_empty() {
+                            Ok(Literal::new_simple_literal(value))
+                        } else if remain.starts_with('@') {
+                            Literal::new_language_tagged_literal(value, &remain[1..]).map_err(
+                                |error| TermParseError {
+                                    kind: TermParseErrorKind::LanguageTag {
+                                        value: remain[1..].to_owned(),
+                                        error,
+                                    },
+                                },
+                            )
+                        } else if remain.starts_with("^^") {
+                            Ok(Literal::new_typed_literal(
+                                value,
+                                NamedNode::from_str(&remain[2..])?,
+                            ))
+                        } else {
+                            Err(TermParseError::msg("Unexpected characters after a literal"))
+                        };
+                    }
+                    '\\' => {
+                        if let Some(c) = chars.next() {
+                            value.push(match c {
+                                't' => '\t',
+                                'b' => '\u{8}',
+                                'n' => '\n',
+                                'r' => '\r',
+                                'f' => '\u{C}',
+                                '"' => '"',
+                                '\'' => '\'',
+                                '\\' => '\\',
+                                'u' => read_hexa_char(&mut chars, 4)?,
+                                'U' => read_hexa_char(&mut chars, 8)?,
+                                _ => return Err(TermParseError::msg("Unexpected escaped char")),
+                            })
+                        } else {
+                            return Err(TermParseError::msg("Unexpected literal end"));
+                        }
+                    }
+                    c => value.push(c),
+                }
+            }
+            Err(TermParseError::msg("Unexpected literal end"))
+        } else if s == "true" || s == "false" {
+            Ok(Literal::new_typed_literal(s, xsd::BOOLEAN))
+        } else {
+            let input = s.as_bytes();
+            if input.is_empty() {
+                return Err(TermParseError::msg("Empty term serialization"));
+            }
+
+            let mut cursor = match input.get(0) {
+                Some(b'+') | Some(b'-') => 1,
+                _ => 0,
+            };
+
+            let count_before = count_leading_digits(&input[cursor..]);
+            cursor += count_before;
+
+            if cursor == input.len() {
+                return if count_before > 0 {
+                    Ok(Literal::new_typed_literal(s, xsd::INTEGER))
+                } else {
+                    Err(TermParseError::msg("Empty integer serialization"))
+                };
+            }
+
+            let mut count_after = 0;
+            if input[cursor] == b'.' {
+                cursor += 1;
+                count_after = count_leading_digits(&input[cursor..]);
+                cursor += count_after;
+            }
+
+            if cursor == input.len() {
+                return if count_after > 0 {
+                    Ok(Literal::new_typed_literal(s, xsd::DECIMAL))
+                } else {
+                    Err(TermParseError::msg(
+                        "Decimal serialization without floating part",
+                    ))
+                };
+            }
+
+            if input[cursor] != b'e' && input[cursor] != b'E' {
+                return Err(TermParseError::msg("Double serialization without exponent"));
+            }
+            // An exponent alone ("e1", ".e1") is not a number: the mantissa needs a digit.
+            if count_before == 0 && count_after == 0 {
+                return Err(TermParseError::msg("Double serialization without mantissa"));
+            }
+            cursor += 1;
+            cursor += match input.get(cursor) {
+                Some(b'+') | Some(b'-') => 1,
+                _ => 0,
+            };
+            let count_exponent = count_leading_digits(&input[cursor..]);
+            cursor += count_exponent;
+            if cursor == input.len() && count_exponent > 0 {
+                Ok(Literal::new_typed_literal(s, xsd::DOUBLE))
+            } else {
+                Err(TermParseError::msg(
+                    "Double serialization with an invalid exponent",
+                ))
+            }
+        }
+    }
+}
+
+/// Returns the number of ASCII digits found at the beginning of `input`.
+fn count_leading_digits(input: &[u8]) -> usize {
+    input.iter().take_while(|b| b.is_ascii_digit()).count()
+}
+
+/// Reads exactly `len` hexadecimal digits from `input` and decodes them as a `char`.
+///
+/// Fails if the input ends too early, on a character that is not a hexadecimal digit
+/// or if the decoded number is not a valid `char`.
+fn read_hexa_char(input: &mut Chars<'_>, len: usize) -> Result<char, TermParseError> {
+    let mut code_point = 0;
+    for _ in 0..len {
+        let digit = match input.next().map(|c| c.to_digit(16)) {
+            Some(Some(digit)) => digit,
+            Some(None) => {
+                return Err(TermParseError::msg(
+                    "Unexpected character in a unicode escape",
+                ))
+            }
+            None => return Err(TermParseError::msg("Unexpected literal string end")),
+        };
+        code_point = code_point * 16 + digit;
+    }
+    char::from_u32(code_point)
+        .ok_or_else(|| TermParseError::msg("Invalid encoded unicode code point"))
+}
+
+impl FromStr for Term {
+    type Err = TermParseError;
+
+    /// Parses a term from its NTriples or Turtle serialization
+    ///
+    /// ```
+    /// use oxigraph::model::{Literal, Term};
+    /// use std::str::FromStr;
+    ///
+    /// assert_eq!(Term::from_str("\"ex\"").unwrap(), Literal::new_simple_literal("ex").into())
+    /// ```
+    fn from_str(s: &str) -> Result<Self, TermParseError> {
+        // The leading character is enough to choose the parser to delegate to:
+        // '<' opens a named node, '_' a blank node and anything else is tried as a literal.
+        match s.chars().next() {
+            Some('<') => NamedNode::from_str(s).map(Into::into),
+            Some('_') => BlankNode::from_str(s).map(Into::into),
+            _ => Literal::from_str(s).map(Into::into),
+        }
+    }
+}
+
+impl FromStr for Variable {
+    type Err = TermParseError;
+
+    /// Parses a variable from its SPARQL serialization: a `?` or `$` sigil followed by its name
+    ///
+    /// ```
+    /// use oxigraph::sparql::Variable;
+    /// use std::str::FromStr;
+    ///
+    /// assert_eq!(Variable::from_str("$foo").unwrap(), Variable::new("foo").unwrap())
+    /// ```
+    fn from_str(s: &str) -> Result<Self, TermParseError> {
+        match s.chars().next() {
+            // Both sigils are one byte long, so the name starts at byte offset 1.
+            Some('?') | Some('$') => Variable::new(&s[1..]).map_err(|error| {
+                let value = s.to_owned();
+                let kind = TermParseErrorKind::Variable { value, error };
+                TermParseError { kind }
+            }),
+            _ => Err(TermParseError::msg(
+                "Variable serialization should start with ? or $",
+            )),
+        }
+    }
+}
+
+/// An error raised by the `FromStr` implementations of terms and variables when parsing fails.
+#[allow(missing_copy_implementations)]
+#[derive(Debug)]
+pub struct TermParseError {
+    kind: TermParseErrorKind,
+}
+
+#[derive(Debug)]
+enum TermParseErrorKind {
+    Iri {
+        error: IriParseError,
+        value: String, // the whole named node serialization, `<` and `>` included
+    },
+    BlankNode {
+        error: BlankNodeIdParseError,
+        value: String, // the whole blank node serialization, `_:` prefix included
+    },
+    LanguageTag {
+        error: LanguageTagParseError,
+        value: String, // the language tag alone, without the leading `@`
+    },
+    Variable {
+        error: VariableNameParseError,
+        value: String, // the whole variable serialization, `?` or `$` sigil included
+    },
+    Msg {
+        msg: &'static str, // a self-contained description of the failure
+    },
+}
+
+impl fmt::Display for TermParseError {
+    /// Writes a human readable description of the error.
+    ///
+    /// Failures of a nested parser are prefixed by the kind of element and the value that
+    /// was being parsed, the other messages are written unchanged.
+    #[inline]
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match &self.kind {
+            // Static message without any nested error.
+            TermParseErrorKind::Msg { msg } => f.write_str(msg),
+            // Nested errors, shown after the value they have been raised on.
+            TermParseErrorKind::Iri { error, value } => {
+                write!(f, "Error while parsing the named node '{}': {}", value, error)
+            }
+            TermParseErrorKind::BlankNode { error, value } => {
+                write!(f, "Error while parsing the blank node '{}': {}", value, error)
+            }
+            TermParseErrorKind::LanguageTag { error, value } => {
+                write!(f, "Error while parsing the language tag '{}': {}", value, error)
+            }
+            TermParseErrorKind::Variable { error, value } => {
+                write!(f, "Error while parsing the variable '{}': {}", value, error)
+            }
+        }
+    }
+}
+
+impl Error for TermParseError {} // `source()` is not overridden: the nested error is reported through `Display`
+
+impl TermParseError {
+    /// Builds an error that only carries the static message `msg`.
+    pub(crate) fn msg(msg: &'static str) -> Self {
+        let kind = TermParseErrorKind::Msg { msg };
+        Self { kind }
+    }
+}
diff --git a/lib/src/sparql/csv_results.rs b/lib/src/sparql/csv_results.rs
index a89d5768..521c32e8 100644
--- a/lib/src/sparql/csv_results.rs
+++ b/lib/src/sparql/csv_results.rs
@@ -1,9 +1,12 @@
 //! Implementation of [SPARQL 1.1 Query Results CSV and TSV Formats](https://www.w3.org/TR/sparql11-results-csv-tsv/)
 
+use crate::error::invalid_data_error;
 use crate::model::{vocab::xsd, *};
 use crate::sparql::error::EvaluationError;
 use crate::sparql::model::*;
-use std::io::{self, Write};
+use std::io::{self, BufRead, Write};
+use std::rc::Rc;
+use std::str::FromStr;
 
 pub fn write_csv_results(
     results: QueryResults,
@@ -164,6 +167,63 @@ fn write_tsv_term<'a>(term: impl Into<TermRef<'a>>, mut sink: impl Write) -> io:
     }
 }
 
+/// Reads SPARQL TSV results: a `true`/`false` first line gives a boolean result, otherwise
+/// the first line lists the variables and the next lines are lazily parsed as solutions.
+pub fn read_tsv_results(mut source: impl BufRead + 'static) -> Result<QueryResults, io::Error> {
+    let mut buffer = String::new();
+    source.read_line(&mut buffer)?;
+
+    let header = buffer.trim();
+    if header.eq_ignore_ascii_case("true") {
+        return Ok(QueryResults::Boolean(true));
+    } else if header.eq_ignore_ascii_case("false") {
+        return Ok(QueryResults::Boolean(false));
+    }
+    let mut variables = Vec::new();
+    for name in buffer.split('\t') {
+        variables.push(Variable::from_str(name.trim()).map_err(invalid_data_error)?);
+    }
+    Ok(QueryResults::Solutions(QuerySolutionIter::new(
+        Rc::new(variables),
+        Box::new(TsvResultsIterator { buffer, source }),
+    )))
+}
+
+struct TsvResultsIterator<R: BufRead> {
+    source: R,      // reader handed over by `read_tsv_results` once the header line is consumed
+    buffer: String, // line buffer, cleared and refilled by each `read_next` call
+}
+
+impl<R: BufRead> Iterator for TsvResultsIterator<R> {
+    type Item = Result<Vec<Option<Term>>, EvaluationError>;
+    /// Yields the next row, ending the iteration once `read_next` returns `Ok(None)`.
+    fn next(&mut self) -> Option<Self::Item> {
+        Result::transpose(self.read_next())
+    }
+}
+
+impl<R: BufRead> TsvResultsIterator<R> {
+    /// Reads the next line and parses it as a row, `Ok(None)` meaning the input is exhausted.
+    ///
+    /// Cells are separated by tabs and trimmed before being parsed as terms.
+    fn read_next(&mut self) -> Result<Option<Vec<Option<Term>>>, EvaluationError> {
+        self.buffer.clear();
+        let read = self.source.read_line(&mut self.buffer)?;
+        if read == 0 {
+            return Ok(None);
+        }
+        // An empty cell is a `None`, anything else must be a valid term.
+        let mut row = Vec::new();
+        for cell in self.buffer.split('\t') {
+            row.push(match cell.trim() {
+                "" => None,
+                cell => Some(Term::from_str(cell).map_err(invalid_data_error)?),
+            });
+        }
+        Ok(Some(row))
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/lib/src/sparql/model.rs b/lib/src/sparql/model.rs
index b7e0089e..ab8d62fc 100644
--- a/lib/src/sparql/model.rs
+++ b/lib/src/sparql/model.rs
@@ -2,7 +2,7 @@ use crate::error::invalid_input_error;
 use crate::io::GraphFormat;
 use crate::io::GraphSerializer;
 use crate::model::*;
-use crate::sparql::csv_results::{write_csv_results, write_tsv_results};
+use crate::sparql::csv_results::{read_tsv_results, write_csv_results, write_tsv_results};
 use crate::sparql::error::EvaluationError;
 use crate::sparql::json_results::write_json_results;
 use crate::sparql::xml_results::{read_xml_results, write_xml_results};
@@ -33,9 +33,10 @@ impl QueryResults {
             QueryResultsFormat::Json => Err(invalid_input_error(
                 "JSON SPARQL results format parsing has not been implemented yet",
             )), //TODO: implement
-            QueryResultsFormat::Csv | QueryResultsFormat::Tsv => Err(invalid_input_error(
+            QueryResultsFormat::Csv => Err(invalid_input_error(
                 "CSV and TSV SPARQL results format parsing is not implemented",
             )),
+            QueryResultsFormat::Tsv => read_tsv_results(reader),
         }
     }
 
diff --git a/testsuite/src/sparql_evaluator.rs b/testsuite/src/sparql_evaluator.rs
index e3e729de..ac2ddc65 100644
--- a/testsuite/src/sparql_evaluator.rs
+++ b/testsuite/src/sparql_evaluator.rs
@@ -223,6 +223,11 @@ fn load_sparql_query_result(url: &str) -> Result<StaticQueryResults> {
             QueryResults::read(read_file(url)?, QueryResultsFormat::Json)?,
             false,
         )
+    } else if url.ends_with(".tsv") {
+        StaticQueryResults::from_query_results(
+            QueryResults::read(read_file(url)?, QueryResultsFormat::Tsv)?,
+            false,
+        )
     } else {
         Ok(StaticQueryResults::from_dataset(load_store(url)?))
     }
diff --git a/testsuite/tests/sparql.rs b/testsuite/tests/sparql.rs
index 6308c551..271dc863 100644
--- a/testsuite/tests/sparql.rs
+++ b/testsuite/tests/sparql.rs
@@ -128,3 +128,16 @@ fn sparql11_update_w3c_evaluation_testsuite() -> Result<()> {
         vec![],
     )
 }
+
+#[test]
+fn sparql11_tsv_w3c_evaluation_testsuite() -> Result<()> {
+    let manifest =
+        "http://www.w3.org/2009/sparql/docs/tests/data-sparql11/csv-tsv-res/manifest.ttl";
+    // We do not run CSVResultFormatTest tests yet
+    let not_run_yet = vec![
+        "http://www.w3.org/2009/sparql/docs/tests/data-sparql11/csv-tsv-res/manifest#csv01",
+        "http://www.w3.org/2009/sparql/docs/tests/data-sparql11/csv-tsv-res/manifest#csv02",
+        "http://www.w3.org/2009/sparql/docs/tests/data-sparql11/csv-tsv-res/manifest#csv03",
+    ];
+    run_testsuite(manifest, not_run_yet)
+}