From 8f360568dc286dfdb08eed926e44adf9cc6329c8 Mon Sep 17 00:00:00 2001 From: Tpt Date: Sat, 8 May 2021 07:17:25 +0200 Subject: [PATCH] Makes Term::from_str support RDF-star --- lib/src/model/parser.rs | 398 ++++++++++++++++++++++++---------------- 1 file changed, 242 insertions(+), 156 deletions(-) diff --git a/lib/src/model/parser.rs b/lib/src/model/parser.rs index 4a8e2ded..466cd25a 100644 --- a/lib/src/model/parser.rs +++ b/lib/src/model/parser.rs @@ -1,7 +1,5 @@ -use crate::model::blank_node::{BlankNode, BlankNodeIdParseError}; -use crate::model::named_node::NamedNode; use crate::model::vocab::xsd; -use crate::model::{Literal, Term}; +use crate::model::{BlankNode, BlankNodeIdParseError, Literal, NamedNode, Subject, Term, Triple}; use crate::sparql::{Variable, VariableNameParseError}; use oxilangtag::LanguageTagParseError; use oxiri::IriParseError; @@ -22,17 +20,13 @@ impl FromStr for NamedNode { /// assert_eq!(NamedNode::from_str("").unwrap(), NamedNode::new("http://example.com").unwrap()) /// ``` fn from_str(s: &str) -> Result { - if !s.starts_with('<') || !s.ends_with('>') { + let (term, left) = read_named_node(s)?; + if !left.is_empty() { return Err(TermParseError::msg( - "Named node serialization should be enclosed between < and >", + "Named node serialization should end with a >", )); } - NamedNode::new(&s[1..s.len() - 1]).map_err(|error| TermParseError { - kind: TermParseErrorKind::Iri { - value: s.to_owned(), - error, - }, - }) + Ok(term) } } @@ -48,17 +42,13 @@ impl FromStr for BlankNode { /// assert_eq!(BlankNode::from_str("_:ex").unwrap(), BlankNode::new("ex").unwrap()) /// ``` fn from_str(s: &str) -> Result { - if !s.starts_with("_:") { + let (term, left) = read_blank_node(s)?; + if !left.is_empty() { return Err(TermParseError::msg( - "Blank node serialization should start with '_:'", + "Blank node serialization should not contain whitespaces", )); } - BlankNode::new(&s[2..]).map_err(|error| TermParseError { - kind: TermParseErrorKind::BlankNode { - value: s.to_owned(), - error, - }, - }) + Ok(term) } } @@ -80,107 +70,196 @@ impl FromStr for Literal { /// assert_eq!(Literal::from_str("-122e+1").unwrap(), Literal::new_typed_literal("-122e+1", xsd::DOUBLE)); /// ``` fn from_str(s: &str) -> Result { - if let Some(s) = s.strip_prefix('"') { - let mut value = String::with_capacity(s.len() - 1); - let mut chars = s.chars(); - while let Some(c) = chars.next() { - match c { - '"' => { - let remain = chars.as_str(); - return if remain.is_empty() { - Ok(Literal::new_simple_literal(value)) - } else if let Some(language) = remain.strip_prefix('@') { - Literal::new_language_tagged_literal(value, &remain[1..]).map_err( + let (term, left) = read_literal(s)?; + if !left.is_empty() { + return Err(TermParseError::msg("Invalid literal serialization")); + } + Ok(term) + } +} + +impl FromStr for Term { + type Err = TermParseError; + + /// Parses a term from its NTriples or Turtle serialization + /// + /// ``` + /// use oxigraph::model::*; + /// use std::str::FromStr; + /// + /// assert_eq!(Term::from_str("\"ex\"").unwrap(), Literal::new_simple_literal("ex").into()); + /// assert_eq!(Term::from_str("<< _:s \"o\" >>").unwrap(), Triple::new( + /// BlankNode::new("s").unwrap(), + /// NamedNode::new("http://example.com/p").unwrap(), + /// Literal::new_simple_literal("o") + /// ).into()); + /// ``` + fn from_str(s: &str) -> Result { + let (term, left) = read_term(s)?; + if !left.is_empty() { + return Err(TermParseError::msg("Invalid term serialization")); + } + Ok(term) + } +} + +impl FromStr for Variable { + type Err = TermParseError; + + /// Parses a variable from its SPARQL serialization + /// + /// ``` + /// use oxigraph::sparql::Variable; + /// use std::str::FromStr; + /// + /// assert_eq!(Variable::from_str("$foo").unwrap(), Variable::new("foo").unwrap()) + /// ``` + fn from_str(s: &str) -> Result { + if !s.starts_with('?') && !s.starts_with('$') { + return Err(TermParseError::msg( + "Variable serialization should start with ? or $", + )); + } + Variable::new(&s[1..]).map_err(|error| TermParseError { + kind: TermParseErrorKind::Variable { + value: s.to_owned(), + error, + }, + }) + } +} + +fn read_named_node(s: &str) -> Result<(NamedNode, &str), TermParseError> { + let s = s.trim(); + if let Some(remain) = s.strip_prefix('<') { + let end = remain + .find('>') + .ok_or_else(|| TermParseError::msg("Named node serialization should end with a >"))?; + let (value, remain) = remain.split_at(end); + let remain = &remain[1..]; + let term = NamedNode::new(value).map_err(|error| TermParseError { + kind: TermParseErrorKind::Iri { + value: value.to_owned(), + error, + }, + })?; + Ok((term, remain)) + } else { + Err(TermParseError::msg( + "Named node serialization should start with a <", + )) + } +} + +fn read_blank_node(s: &str) -> Result<(BlankNode, &str), TermParseError> { + let s = s.trim(); + if let Some(remain) = s.strip_prefix("_:") { + let end = remain + .find(|v: char| v.is_whitespace() || matches!(v, '<' | '_' | '?' | '$' | '"' | '\'')) + .unwrap_or_else(|| remain.len()); + let (value, remain) = remain.split_at(end); + let term = BlankNode::new(value).map_err(|error| TermParseError { + kind: TermParseErrorKind::BlankNode { + value: value.to_owned(), + error, + }, + })?; + Ok((term, remain)) + } else { + Err(TermParseError::msg( + "Blank node serialization should start with '_:'", + )) + } +} + +fn read_literal(s: &str) -> Result<(Literal, &str), TermParseError> { + let s = s.trim(); + if let Some(s) = s.strip_prefix('"') { + let mut value = String::with_capacity(s.len() - 1); + let mut chars = s.chars(); + while let Some(c) = chars.next() { + match c { + '"' => { + let remain = chars.as_str(); + return if let Some(remain) = remain.strip_prefix('@') { + let end = remain + .find(|v| !matches!(v, 'a'..='z' | 'A'..='Z' | '-')) + .unwrap_or_else(|| remain.len()); + let (language, remain) = remain.split_at(end); + Ok(( + Literal::new_language_tagged_literal(value, language).map_err( |error| TermParseError { kind: TermParseErrorKind::LanguageTag { value: language.to_owned(), error, }, }, - ) - } else if let Some(datatype) = remain.strip_prefix("^^") { - Ok(Literal::new_typed_literal( - value, - NamedNode::from_str(datatype)?, - )) - } else { - Err(TermParseError::msg("Unexpected characters after a literal")) - }; - } - '\\' => { - if let Some(c) = chars.next() { - value.push(match c { - 't' => '\t', - 'b' => '\u{8}', - 'n' => '\n', - 'r' => '\r', - 'f' => '\u{C}', - '"' => '"', - '\'' => '\'', - '\\' => '\\', - 'u' => read_hexa_char(&mut chars, 4)?, - 'U' => read_hexa_char(&mut chars, 8)?, - _ => return Err(TermParseError::msg("Unexpected escaped char")), - }) - } else { - return Err(TermParseError::msg("Unexpected literal end")); - } + )?, + remain, + )) + } else if let Some(remain) = remain.strip_prefix("^^") { + let (datatype, remain) = read_named_node(remain)?; + Ok((Literal::new_typed_literal(value, datatype), remain)) + } else { + Ok((Literal::new_simple_literal(value), remain)) + }; + } + '\\' => { + if let Some(c) = chars.next() { + value.push(match c { + 't' => '\t', + 'b' => '\u{8}', + 'n' => '\n', + 'r' => '\r', + 'f' => '\u{C}', + '"' => '"', + '\'' => '\'', + '\\' => '\\', + 'u' => read_hexa_char(&mut chars, 4)?, + 'U' => read_hexa_char(&mut chars, 8)?, + _ => return Err(TermParseError::msg("Unexpected escaped char")), + }) + } else { + return Err(TermParseError::msg("Unexpected literal end")); } - c => value.push(c), } + c => value.push(c), } - Err(TermParseError::msg("Unexpected literal end")) - } else if s == "true" { - Ok(Literal::new_typed_literal("true", xsd::BOOLEAN)) - } else if s == "false" { - Ok(Literal::new_typed_literal("false", xsd::BOOLEAN)) - } else { - let input = s.as_bytes(); - if input.is_empty() { - return Err(TermParseError::msg("Empty term serialization")); - } - - let mut cursor = match input.get(0) { - Some(b'+') | Some(b'-') => 1, - _ => 0, - }; + } + Err(TermParseError::msg("Unexpected literal end")) + } else if let Some(remain) = s.strip_prefix("true") { + Ok((Literal::new_typed_literal("true", xsd::BOOLEAN), remain)) + } else if let Some(remain) = s.strip_prefix("false") { + Ok((Literal::new_typed_literal("false", xsd::BOOLEAN), remain)) + } else { + let input = s.as_bytes(); + if input.is_empty() { + return Err(TermParseError::msg("Empty term serialization")); + } - let mut count_before: usize = 0; - while cursor < input.len() && b'0' <= input[cursor] && input[cursor] <= b'9' { - count_before += 1; - cursor += 1; - } + let mut cursor = match input.get(0) { + Some(b'+') | Some(b'-') => 1, + _ => 0, + }; + let mut with_dot = false; - if cursor == input.len() { - return if count_before > 0 { - Ok(Literal::new_typed_literal(s, xsd::INTEGER)) - } else { - Err(TermParseError::msg("Empty integer serialization")) - }; - } + let mut count_before: usize = 0; + while cursor < input.len() && b'0' <= input[cursor] && input[cursor] <= b'9' { + count_before += 1; + cursor += 1; + } - let mut count_after: usize = 0; - if input[cursor] == b'.' { + let mut count_after: usize = 0; + if cursor < input.len() && input[cursor] == b'.' { + with_dot = true; + cursor += 1; + while cursor < input.len() && b'0' <= input[cursor] && input[cursor] <= b'9' { + count_after += 1; cursor += 1; - while cursor < input.len() && b'0' <= input[cursor] && input[cursor] <= b'9' { - count_after += 1; - cursor += 1; - } - } - - if cursor == input.len() { - return if count_after > 0 { - Ok(Literal::new_typed_literal(s, xsd::DECIMAL)) - } else { - Err(TermParseError::msg( - "Decimal serialization without floating part", - )) - }; } + } - if input[cursor] != b'e' && input[cursor] != b'E' { - return Err(TermParseError::msg("Double serialization without exponent")); - } + if cursor < input.len() && (input[cursor] == b'e' || input[cursor] == b'E') { cursor += 1; cursor += match input.get(cursor) { Some(b'+') | Some(b'-') => 1, @@ -191,14 +270,69 @@ impl FromStr for Literal { count_exponent += 1; cursor += 1; } - if cursor == input.len() && count_exponent > 0 { - Ok(Literal::new_typed_literal(s, xsd::DOUBLE)) + if count_exponent > 0 { + Ok((Literal::new_typed_literal(s, xsd::DOUBLE), &s[cursor..])) } else { Err(TermParseError::msg( "Double serialization with an invalid exponent", )) } + } else if with_dot { + if count_after > 0 { + Ok((Literal::new_typed_literal(s, xsd::DECIMAL), &s[cursor..])) + } else { + Err(TermParseError::msg( + "Decimal serialization without floating part", + )) + } + } else if count_before > 0 { + Ok((Literal::new_typed_literal(s, xsd::INTEGER), &s[cursor..])) + } else { + Err(TermParseError::msg("Empty integer serialization")) + } + } +} + +fn read_term(s: &str) -> Result<(Term, &str), TermParseError> { + let s = s.trim(); + if let Some(remain) = s.strip_prefix("<<") { + let (subject, remain) = read_term(remain)?; + let (predicate, remain) = read_named_node(remain)?; + let (object, remain) = read_term(remain)?; + let remain = remain.trim_start(); + if let Some(remain) = remain.strip_prefix(">>") { + Ok(( + Triple { + subject: match subject { + Term::NamedNode(s) => s.into(), + Term::BlankNode(s) => s.into(), + Term::Literal(_) => { + return Err(TermParseError::msg( + "Literals are not allowed in subject position", + )) + } + Term::Triple(s) => Subject::Triple(s), + }, + predicate, + object, + } + .into(), + remain, + )) + } else { + Err(TermParseError::msg( + "Nested triple serialization should be enclosed between << and >>", + )) } + } else if s.starts_with('<') { + let (term, remain) = read_named_node(s)?; + Ok((term.into(), remain)) + } else if s.starts_with('_') { + let (term, remain) = read_blank_node(s)?; + Ok((term.into(), remain)) + } else { + let (term, remain) = read_literal(s)?; + Ok((term.into(), remain)) } } @@ -224,54 +358,6 @@ fn read_hexa_char(input: &mut Chars<'_>, len: usize) -> Result Result { - Ok(if s.starts_with('<') { - NamedNode::from_str(s)?.into() - } else if s.starts_with('_') { - BlankNode::from_str(s)?.into() - } else { - Literal::from_str(s)?.into() - }) - } -} - -impl FromStr for Variable { - type Err = TermParseError; - - /// Parses a variable from its SPARQL serialization - /// - /// ``` - /// use oxigraph::sparql::Variable; - /// use std::str::FromStr; - /// - /// assert_eq!(Variable::from_str("$foo").unwrap(), Variable::new("foo").unwrap()) - /// ``` - fn from_str(s: &str) -> Result { - if !s.starts_with('?') && !s.starts_with('$') { - return Err(TermParseError::msg( - "Variable serialization should start with ? or $", - )); - } - Variable::new(&s[1..]).map_err(|error| TermParseError { - kind: TermParseErrorKind::Variable { - value: s.to_owned(), - error, - }, - }) - } -} - /// An error raised during term serialization parsing. #[allow(missing_copy_implementations)] #[derive(Debug)] @@ -324,7 +410,7 @@ impl fmt::Display for TermParseError { TermParseErrorKind::Variable { error, value } => { write!(f, "Error while parsing the variable '{}': {}", value, error) } - TermParseErrorKind::Msg { msg } => write!(f, "{}", msg), + TermParseErrorKind::Msg { msg } => f.write_str(msg), } } }