From b673471b3957be719b181a32be27f458d5bb7a0c Mon Sep 17 00:00:00 2001 From: Tpt Date: Thu, 21 Feb 2019 17:00:25 +0100 Subject: [PATCH] Use LanguageTag inside of Literal Allows to normalize language tags case and fixes language tagged string SPARQL lookup --- lib/src/model/literal.rs | 18 +++++-- lib/src/model/mod.rs | 1 + lib/src/rio/ntriples/ntriples_grammar.rustpeg | 7 ++- lib/src/rio/turtle/turtle_grammar.rustpeg | 7 ++- lib/src/rio/xml.rs | 54 ++++++++++--------- lib/src/sparql/sparql_grammar.rustpeg | 7 ++- lib/src/sparql/xml_results.rs | 9 ++-- lib/src/store/memory.rs | 5 ++ lib/src/store/numeric_encoder.rs | 37 ++++++++----- lib/src/store/rocksdb.rs | 8 ++- lib/tests/sparql_test_cases.rs | 10 ---- python/src/lib.rs | 7 ++- 12 files changed, 104 insertions(+), 66 deletions(-) diff --git a/lib/src/model/literal.rs b/lib/src/model/literal.rs index e99d0efa..657a04da 100644 --- a/lib/src/model/literal.rs +++ b/lib/src/model/literal.rs @@ -1,3 +1,4 @@ +use crate::model::language_tag::LanguageTag; use crate::model::named_node::NamedNode; use crate::model::vocab::rdf; use crate::model::vocab::xsd; @@ -18,6 +19,7 @@ use std::option::Option; /// The default string formatter is returning a N-Triples, Turtle and SPARQL compatible representation: /// ``` /// use rudf::model::Literal; +/// use rudf::model::LanguageTag; /// use rudf::model::vocab::xsd; /// /// assert_eq!( @@ -32,7 +34,7 @@ use std::option::Option; /// /// assert_eq!( /// "\"foo\"@en", -/// Literal::new_language_tagged_literal("foo", "en").to_string() +/// Literal::new_language_tagged_literal("foo", LanguageTag::parse("en").unwrap()).to_string() /// ); /// ``` #[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Hash)] @@ -41,7 +43,10 @@ pub struct Literal(LiteralContent); #[derive(PartialEq, Eq, Ord, PartialOrd, Debug, Clone, Hash)] enum LiteralContent { String(String), - LanguageTaggedString { value: String, language: String }, + LanguageTaggedString { + value: String, + language: LanguageTag, + }, 
Boolean(bool), Float(OrderedFloat), Double(OrderedFloat), @@ -51,7 +56,10 @@ enum LiteralContent { NaiveTime(NaiveTime), DateTime(DateTime), NaiveDateTime(NaiveDateTime), - TypedLiteral { value: String, datatype: NamedNode }, + TypedLiteral { + value: String, + datatype: NamedNode, + }, } impl Literal { @@ -131,7 +139,7 @@ impl Literal { /// Builds a RDF [language-tagged string](https://www.w3.org/TR/rdf11-concepts/#dfn-language-tagged-string) pub fn new_language_tagged_literal( value: impl Into, - language: impl Into, + language: impl Into, ) -> Self { Literal(LiteralContent::LanguageTaggedString { value: value.into(), @@ -159,7 +167,7 @@ impl Literal { /// The literal [language tag](https://www.w3.org/TR/rdf11-concepts/#dfn-language-tag) if it is a [language-tagged string](https://www.w3.org/TR/rdf11-concepts/#dfn-language-tagged-string). /// Language tags are defined by the [BCP47](https://tools.ietf.org/html/bcp47). - pub fn language(&self) -> Option<&str> { + pub fn language(&self) -> Option<&LanguageTag> { match self.0 { LiteralContent::LanguageTaggedString { ref language, .. 
} => Some(language), _ => None, diff --git a/lib/src/model/mod.rs b/lib/src/model/mod.rs index 646ec7f6..1efc744b 100644 --- a/lib/src/model/mod.rs +++ b/lib/src/model/mod.rs @@ -14,6 +14,7 @@ pub use crate::model::blank_node::BlankNode; pub use crate::model::dataset::Dataset; pub use crate::model::dataset::Graph; pub use crate::model::dataset::NamedGraph; +pub use crate::model::language_tag::LanguageTag; pub use crate::model::literal::Literal; pub use crate::model::named_node::NamedNode; pub use crate::model::triple::NamedOrBlankNode; diff --git a/lib/src/rio/ntriples/ntriples_grammar.rustpeg b/lib/src/rio/ntriples/ntriples_grammar.rustpeg index 1f3aa105..c1486fb5 100644 --- a/lib/src/rio/ntriples/ntriples_grammar.rustpeg +++ b/lib/src/rio/ntriples/ntriples_grammar.rustpeg @@ -37,8 +37,11 @@ literal -> Literal = //[144s] -LANGTAG -> &'input str = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) { - l +LANGTAG -> LanguageTag = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {? + match LanguageTag::parse(&l) { + Ok(langtag) => Ok(langtag), + Err(error) => Err("language tag parsing failed") + } } //[7] diff --git a/lib/src/rio/turtle/turtle_grammar.rustpeg b/lib/src/rio/turtle/turtle_grammar.rustpeg index 1025e0c2..9b4dfe90 100644 --- a/lib/src/rio/turtle/turtle_grammar.rustpeg +++ b/lib/src/rio/turtle/turtle_grammar.rustpeg @@ -185,8 +185,11 @@ BLANK_NODE_LABEL -> &'input str = "_:" b:$(([0-9] / PN_CHARS_U) PN_CHARS* ("."+ } //[144s] -LANGTAG -> &'input str = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) { - l +LANGTAG -> LanguageTag = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {? + match LanguageTag::parse(&l) { + Ok(langtag) => Ok(langtag), + Err(error) => Err("language tag parsing failed") + } } //[19] diff --git a/lib/src/rio/xml.rs b/lib/src/rio/xml.rs index 0f9056cb..8565a19f 100644 --- a/lib/src/rio/xml.rs +++ b/lib/src/rio/xml.rs @@ -1,7 +1,6 @@ //! 
Implementation of [RDF XML](https://www.w3.org/TR/rdf-syntax-grammar/) syntax use crate::model::vocab::rdf; -use crate::model::Triple; use crate::model::*; use crate::Result; use failure::format_err; @@ -82,18 +81,18 @@ enum RdfXmlState { }, RDF { base_uri: Option, - language: String, + language: Option, }, NodeElt { base_uri: Option, - language: String, + language: Option, subject: NamedOrBlankNode, }, PropertyElt { //Resource, Literal or Empty property element uri: Url, base_uri: Option, - language: String, + language: Option, subject: NamedOrBlankNode, object: Option, id_attr: Option, @@ -102,7 +101,7 @@ enum RdfXmlState { ParseTypeCollectionPropertyElt { uri: NamedNode, base_uri: Option, - language: String, + language: Option, subject: NamedOrBlankNode, id_attr: Option, }, @@ -120,13 +119,13 @@ impl RdfXmlState { } } - fn language(&self) -> &str { + fn language(&self) -> Option<&LanguageTag> { match self { - RdfXmlState::Doc { .. } => "", - RdfXmlState::RDF { language, .. } => language, - RdfXmlState::NodeElt { language, .. } => language, - RdfXmlState::PropertyElt { language, .. } => language, - RdfXmlState::ParseTypeCollectionPropertyElt { language, .. } => language, + RdfXmlState::Doc { .. } => None, + RdfXmlState::RDF { language, .. } => language.as_ref(), + RdfXmlState::NodeElt { language, .. } => language.as_ref(), + RdfXmlState::PropertyElt { language, .. } => language.as_ref(), + RdfXmlState::ParseTypeCollectionPropertyElt { language, .. 
} => language.as_ref(), } } } @@ -193,10 +192,10 @@ impl RdfXmlIterator { let uri = self.resolve_tag_name(event.name())?; //We read attributes - let mut language = String::default(); + let mut language = None; let mut base_uri = None; if let Some(current_state) = self.state.last() { - language = current_state.language().to_string(); + language = current_state.language().cloned(); base_uri = current_state.base_uri().clone(); } let mut id_attr = None; @@ -212,7 +211,9 @@ impl RdfXmlIterator { let attribute = attribute?; match attribute.key { b"xml:lang" => { - language = attribute.unescape_and_decode_value(&self.reader)?; + language = Some(LanguageTag::parse( + &attribute.unescape_and_decode_value(&self.reader)?, + )?); } b"xml:base" => { base_uri = Some(self.resolve_uri(&attribute.unescaped_value()?, &None)?) @@ -434,7 +435,7 @@ impl RdfXmlIterator { &mut self, uri: NamedNode, base_uri: Option, - language: String, + language: Option, id_attr: Option, node_id_attr: Option, about_attr: Option, @@ -474,7 +475,7 @@ impl RdfXmlIterator { &mut self, uri: NamedNode, base_uri: Option, - language: String, + language: Option, subject: NamedOrBlankNode, id_attr: Option, ) -> RdfXmlState { @@ -544,13 +545,18 @@ impl RdfXmlIterator { Ok(()) } - fn new_literal(&self, text: String, language: String, datatype: Option) -> Literal { + fn new_literal( + &self, + text: String, + language: Option, + datatype: Option, + ) -> Literal { if let Some(datatype) = datatype { Literal::new_typed_literal(text, datatype) - } else if language.is_empty() { - Literal::new_simple_literal(text) - } else { + } else if let Some(language) = language { Literal::new_language_tagged_literal(text, language) + } else { + Literal::new_simple_literal(text) } } @@ -581,16 +587,16 @@ impl RdfXmlIterator { &mut self, subject: &NamedOrBlankNode, literal_attributes: Vec<(NamedNode, String)>, - language: &str, + language: &Option, ) { for (literal_predicate, literal_value) in literal_attributes { 
self.triples_cache.push(Triple::new( subject.clone(), literal_predicate, - if language.is_empty() { - Literal::new_simple_literal(literal_value) + if let Some(language) = language { + Literal::new_language_tagged_literal(literal_value, language.clone()) } else { - Literal::new_language_tagged_literal(literal_value, language) + Literal::new_simple_literal(literal_value) }, )); } diff --git a/lib/src/sparql/sparql_grammar.rustpeg b/lib/src/sparql/sparql_grammar.rustpeg index d1dbfe12..62cc9f03 100644 --- a/lib/src/sparql/sparql_grammar.rustpeg +++ b/lib/src/sparql/sparql_grammar.rustpeg @@ -949,8 +949,11 @@ VAR1 -> &'input str = '?' v:$(VARNAME) { v } VAR2 -> &'input str = '$' v:$(VARNAME) { v } //[145] -LANGTAG -> &'input str = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) { - l +LANGTAG -> LanguageTag = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {? + match LanguageTag::parse(&l) { + Ok(langtag) => Ok(langtag), + Err(error) => Err("language tag parsing failed") + } } //[146] diff --git a/lib/src/sparql/xml_results.rs b/lib/src/sparql/xml_results.rs index 3fa30816..f472384e 100644 --- a/lib/src/sparql/xml_results.rs +++ b/lib/src/sparql/xml_results.rs @@ -81,7 +81,7 @@ pub fn write_xml_results(results: QueryResult<'_>, sink: W) -> Result< Term::Literal(literal) => { let mut literal_tag = BytesStart::borrowed_name(b"literal"); if let Some(language) = literal.language() { - literal_tag.push_attribute(("xml:lang", language)); + literal_tag.push_attribute(("xml:lang", language.as_str())); } else if !literal.is_plain() { literal_tag .push_attribute(("datatype", literal.datatype().as_str())); @@ -336,7 +336,10 @@ impl Iterator for ResultsIterator { if let Ok(attr) = attr { if attr.key == b"xml:lang" { match attr.unescape_and_decode_value(&self.reader) { - Ok(val) => lang = Some(val), + Ok(val) => match LanguageTag::parse(&val) { + Ok(val) => lang = Some(val), + Err(error) => return Some(Err(error.into())), + }, Err(error) => return Some(Err(error.into())), } } else if attr.key == 
b"datatype" { @@ -429,7 +432,7 @@ impl Iterator for ResultsIterator { fn build_literal( value: impl Into, - lang: &Option, + lang: &Option, datatype: &Option, ) -> Literal { match datatype { diff --git a/lib/src/store/memory.rs b/lib/src/store/memory.rs index 373dc3e6..d6487ef3 100644 --- a/lib/src/store/memory.rs +++ b/lib/src/store/memory.rs @@ -1,3 +1,4 @@ +use crate::model::LanguageTag; use crate::store::encoded::*; use crate::store::numeric_encoder::*; use crate::utils::MutexPoisonError; @@ -81,6 +82,10 @@ impl StringStore for MemoryStore { fn get_url(&self, id: u64) -> Result { self.string_store.get_url(id) } + + fn get_language_tag(&self, id: u64) -> Result { + self.string_store.get_language_tag(id) + } } impl EncodedQuadsStore for MemoryStore { diff --git a/lib/src/store/numeric_encoder.rs b/lib/src/store/numeric_encoder.rs index 437e0a96..3638af03 100644 --- a/lib/src/store/numeric_encoder.rs +++ b/lib/src/store/numeric_encoder.rs @@ -13,7 +13,6 @@ use std::io::Read; use std::io::Write; use std::ops::Deref; use std::str; -use std::str::FromStr; use std::sync::RwLock; use url::Url; use uuid::Uuid; @@ -36,6 +35,7 @@ pub trait StringStore { fn insert_str(&self, value: &str) -> Result; fn get_str(&self, id: u64) -> Result; fn get_url(&self, id: u64) -> Result; + fn get_language_tag(&self, id: u64) -> Result; /// Should be called when the bytes store is created fn set_first_strings(&self) -> Result<()> { @@ -74,6 +74,10 @@ impl<'a, S: StringStore> StringStore for &'a S { fn get_url(&self, id: u64) -> Result { (*self).get_url(id) } + + fn get_language_tag(&self, id: u64) -> Result { + (*self).get_language_tag(id) + } } pub struct MemoryStringStore { @@ -120,7 +124,16 @@ impl StringStore for MemoryStringStore { if id2str.len() as u64 <= id { Err(format_err!("value not found in the dictionary")) } else { - Ok(Url::from_str(&id2str[id as usize])?) + Ok(Url::parse(&id2str[id as usize])?) 
+ } + } + + fn get_language_tag(&self, id: u64) -> Result { + let id2str = self.id2str.read().map_err(MutexPoisonError::from)?; + if id2str.len() as u64 <= id { + Err(format_err!("value not found in the dictionary")) + } else { + Ok(LanguageTag::parse(&id2str[id as usize])?) } } } @@ -586,16 +599,10 @@ impl Encoder { } pub fn encode_literal(&self, literal: &Literal) -> Result { - Ok(if literal.is_plain() { - if let Some(language) = literal.language() { - EncodedTerm::LangStringLiteral { - value_id: self.string_store.insert_str(&literal.value())?, - language_id: self.string_store.insert_str(language)?, - } - } else { - EncodedTerm::StringLiteral { - value_id: self.string_store.insert_str(&literal.value())?, - } + Ok(if let Some(language) = literal.language() { + EncodedTerm::LangStringLiteral { + value_id: self.string_store.insert_str(&literal.value())?, + language_id: self.string_store.insert_str(language.as_str())?, } } else if literal.is_string() { EncodedTerm::StringLiteral { @@ -711,7 +718,7 @@ impl Encoder { language_id, } => Ok(Literal::new_language_tagged_literal( self.string_store.get_str(value_id)?, - self.string_store.get_str(language_id)?, + self.string_store.get_language_tag(language_id)?, ) .into()), EncodedTerm::TypedLiteral { @@ -787,6 +794,8 @@ impl Default for Encoder { #[test] fn test_encoding() { + use std::str::FromStr; + let encoder: Encoder = Encoder::default(); let terms: Vec = vec![ NamedNode::from_str("http://foo.com").unwrap().into(), @@ -798,7 +807,7 @@ fn test_encoding() { Literal::from(1.2).into(), Literal::from(1).into(), Literal::from("foo").into(), - Literal::new_language_tagged_literal("foo", "fr").into(), + Literal::new_language_tagged_literal("foo", LanguageTag::parse("fr").unwrap()).into(), ]; for term in terms { let encoded = encoder.encode_term(&term).unwrap(); diff --git a/lib/src/store/rocksdb.rs b/lib/src/store/rocksdb.rs index 122122b3..e0a1705e 100644 --- a/lib/src/store/rocksdb.rs +++ b/lib/src/store/rocksdb.rs @@ -1,3 
+1,4 @@ +use crate::model::LanguageTag; use crate::store::encoded::EncodedQuadsStore; use crate::store::encoded::StoreDataset; use crate::store::numeric_encoder::*; @@ -17,7 +18,6 @@ use std::io::Cursor; use std::ops::Deref; use std::path::Path; use std::str; -use std::str::FromStr; use std::sync::Mutex; use url::Url; @@ -120,7 +120,11 @@ impl StringStore for RocksDbStore { } fn get_url(&self, id: u64) -> Result { - Ok(Url::from_str(&self.get_str(id)?)?) + Ok(Url::parse(&self.get_str(id)?)?) + } + + fn get_language_tag(&self, id: u64) -> Result { + Ok(LanguageTag::parse(&self.get_str(id)?)?) } } diff --git a/lib/tests/sparql_test_cases.rs b/lib/tests/sparql_test_cases.rs index a7a0f2c7..e2113bd6 100644 --- a/lib/tests/sparql_test_cases.rs +++ b/lib/tests/sparql_test_cases.rs @@ -168,16 +168,6 @@ fn sparql_w3c_query_evaluation_testsuite() { NamedNode::from_str( "http://www.w3.org/2001/sw/DataAccess/tests/data-r2/optional-filter/manifest#dawg-optional-filter-005-not-simplified", ).unwrap(), - //Case insensitive language tag comparison - NamedNode::from_str( - "http://www.w3.org/2001/sw/DataAccess/tests/data-r2/expr-builtin/manifest#lang-case-insensitive-eq", - ).unwrap(), - NamedNode::from_str( - "http://www.w3.org/2001/sw/DataAccess/tests/data-r2/expr-builtin/manifest#lang-case-insensitive-ne", - ).unwrap(), - NamedNode::from_str( - "http://www.w3.org/2001/sw/DataAccess/tests/data-r2/expr-builtin/manifest#dawg-lang-3", - ).unwrap(), //DATATYPE("foo"@en) returns rdf:langString in SPARQL 1.1 NamedNode::from_str( "http://www.w3.org/2001/sw/DataAccess/tests/data-r2/expr-builtin/manifest#dawg-datatype-2", diff --git a/python/src/lib.rs b/python/src/lib.rs index ba3f18af..e0ebcfac 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -87,7 +87,10 @@ py_class!(class Literal |py| { def __new__(_cls, value: String, language: Option = None, datatype: Option = None) -> PyResult { Literal::create_instance(py, match language { - Some(language) => 
model::Literal::new_language_tagged_literal(value, language), + Some(language) => { + let language = model::LanguageTag::parse(&language).map_err(|error| new_value_error(py, &error.into()))?; + model::Literal::new_language_tagged_literal(value, language) + }, None => match datatype { Some(datatype) => model::Literal::new_typed_literal(value, datatype.inner(py).clone()), None => model::Literal::new_simple_literal(value) @@ -100,7 +103,7 @@ py_class!(class Literal |py| { } def language(&self) -> PyResult> { - Ok(self.inner(py).language().map(|l| l.to_string())) + Ok(self.inner(py).language().map(|l| l.as_str().to_string())) } def datatype(&self) -> PyResult {