Use LanguageTag inside of Literal

Allows normalizing the case of language tags and fixes SPARQL lookup of language-tagged strings
pull/10/head
Tpt 6 years ago
parent 01c1e812f5
commit b673471b39
  1. 18
      lib/src/model/literal.rs
  2. 1
      lib/src/model/mod.rs
  3. 7
      lib/src/rio/ntriples/ntriples_grammar.rustpeg
  4. 7
      lib/src/rio/turtle/turtle_grammar.rustpeg
  5. 54
      lib/src/rio/xml.rs
  6. 7
      lib/src/sparql/sparql_grammar.rustpeg
  7. 7
      lib/src/sparql/xml_results.rs
  8. 5
      lib/src/store/memory.rs
  9. 33
      lib/src/store/numeric_encoder.rs
  10. 8
      lib/src/store/rocksdb.rs
  11. 10
      lib/tests/sparql_test_cases.rs
  12. 7
      python/src/lib.rs

@ -1,3 +1,4 @@
use crate::model::language_tag::LanguageTag;
use crate::model::named_node::NamedNode;
use crate::model::vocab::rdf;
use crate::model::vocab::xsd;
@ -18,6 +19,7 @@ use std::option::Option;
/// The default string formatter is returning a N-Triples, Turtle and SPARQL compatible representation:
/// ```
/// use rudf::model::Literal;
/// use rudf::model::LanguageTag;
/// use rudf::model::vocab::xsd;
///
/// assert_eq!(
@ -32,7 +34,7 @@ use std::option::Option;
///
/// assert_eq!(
/// "\"foo\"@en",
/// Literal::new_language_tagged_literal("foo", "en").to_string()
/// Literal::new_language_tagged_literal("foo", LanguageTag::parse("en").unwrap()).to_string()
/// );
/// ```
#[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Hash)]
@ -41,7 +43,10 @@ pub struct Literal(LiteralContent);
#[derive(PartialEq, Eq, Ord, PartialOrd, Debug, Clone, Hash)]
enum LiteralContent {
String(String),
LanguageTaggedString { value: String, language: String },
LanguageTaggedString {
value: String,
language: LanguageTag,
},
Boolean(bool),
Float(OrderedFloat<f32>),
Double(OrderedFloat<f64>),
@ -51,7 +56,10 @@ enum LiteralContent {
NaiveTime(NaiveTime),
DateTime(DateTime<FixedOffset>),
NaiveDateTime(NaiveDateTime),
TypedLiteral { value: String, datatype: NamedNode },
TypedLiteral {
value: String,
datatype: NamedNode,
},
}
impl Literal {
@ -131,7 +139,7 @@ impl Literal {
/// Builds a RDF [language-tagged string](https://www.w3.org/TR/rdf11-concepts/#dfn-language-tagged-string)
pub fn new_language_tagged_literal(
value: impl Into<String>,
language: impl Into<String>,
language: impl Into<LanguageTag>,
) -> Self {
Literal(LiteralContent::LanguageTaggedString {
value: value.into(),
@ -159,7 +167,7 @@ impl Literal {
/// The literal [language tag](https://www.w3.org/TR/rdf11-concepts/#dfn-language-tag) if it is a [language-tagged string](https://www.w3.org/TR/rdf11-concepts/#dfn-language-tagged-string).
/// Language tags are defined by the [BCP47](https://tools.ietf.org/html/bcp47).
pub fn language(&self) -> Option<&str> {
pub fn language(&self) -> Option<&LanguageTag> {
match self.0 {
LiteralContent::LanguageTaggedString { ref language, .. } => Some(language),
_ => None,

@ -14,6 +14,7 @@ pub use crate::model::blank_node::BlankNode;
pub use crate::model::dataset::Dataset;
pub use crate::model::dataset::Graph;
pub use crate::model::dataset::NamedGraph;
pub use crate::model::language_tag::LanguageTag;
pub use crate::model::literal::Literal;
pub use crate::model::named_node::NamedNode;
pub use crate::model::triple::NamedOrBlankNode;

@ -37,8 +37,11 @@ literal -> Literal =
//[144s]
LANGTAG -> &'input str = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {
l
// The text after "@" is validated and converted by LanguageTag::parse.
// The `{? ... }` action returns a Result: an Err makes this rule fail to match,
// reporting the given static message. The underlying parse error is discarded
// because the action can only carry a `&'static str`.
LANGTAG -> LanguageTag = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {?
    LanguageTag::parse(&l).map_err(|_| "language tag parsing failed")
}
//[7]

@ -185,8 +185,11 @@ BLANK_NODE_LABEL -> &'input str = "_:" b:$(([0-9] / PN_CHARS_U) PN_CHARS* ("."+
}
//[144s]
LANGTAG -> &'input str = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {
l
// The text after "@" is validated and converted by LanguageTag::parse.
// The `{? ... }` action returns a Result: an Err makes this rule fail to match,
// reporting the given static message. The underlying parse error is discarded
// because the action can only carry a `&'static str`.
LANGTAG -> LanguageTag = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {?
    LanguageTag::parse(&l).map_err(|_| "language tag parsing failed")
}
//[19]

@ -1,7 +1,6 @@
//! Implementation of [RDF XML](https://www.w3.org/TR/rdf-syntax-grammar/) syntax
use crate::model::vocab::rdf;
use crate::model::Triple;
use crate::model::*;
use crate::Result;
use failure::format_err;
@ -82,18 +81,18 @@ enum RdfXmlState {
},
RDF {
base_uri: Option<Url>,
language: String,
language: Option<LanguageTag>,
},
NodeElt {
base_uri: Option<Url>,
language: String,
language: Option<LanguageTag>,
subject: NamedOrBlankNode,
},
PropertyElt {
//Resource, Literal or Empty property element
uri: Url,
base_uri: Option<Url>,
language: String,
language: Option<LanguageTag>,
subject: NamedOrBlankNode,
object: Option<NamedOrBlankNode>,
id_attr: Option<NamedNode>,
@ -102,7 +101,7 @@ enum RdfXmlState {
ParseTypeCollectionPropertyElt {
uri: NamedNode,
base_uri: Option<Url>,
language: String,
language: Option<LanguageTag>,
subject: NamedOrBlankNode,
id_attr: Option<NamedNode>,
},
@ -120,13 +119,13 @@ impl RdfXmlState {
}
}
fn language(&self) -> &str {
fn language(&self) -> Option<&LanguageTag> {
match self {
RdfXmlState::Doc { .. } => "",
RdfXmlState::RDF { language, .. } => language,
RdfXmlState::NodeElt { language, .. } => language,
RdfXmlState::PropertyElt { language, .. } => language,
RdfXmlState::ParseTypeCollectionPropertyElt { language, .. } => language,
RdfXmlState::Doc { .. } => None,
RdfXmlState::RDF { language, .. } => language.as_ref(),
RdfXmlState::NodeElt { language, .. } => language.as_ref(),
RdfXmlState::PropertyElt { language, .. } => language.as_ref(),
RdfXmlState::ParseTypeCollectionPropertyElt { language, .. } => language.as_ref(),
}
}
}
@ -193,10 +192,10 @@ impl<R: BufRead> RdfXmlIterator<R> {
let uri = self.resolve_tag_name(event.name())?;
//We read attributes
let mut language = String::default();
let mut language = None;
let mut base_uri = None;
if let Some(current_state) = self.state.last() {
language = current_state.language().to_string();
language = current_state.language().cloned();
base_uri = current_state.base_uri().clone();
}
let mut id_attr = None;
@ -212,7 +211,9 @@ impl<R: BufRead> RdfXmlIterator<R> {
let attribute = attribute?;
match attribute.key {
b"xml:lang" => {
language = attribute.unescape_and_decode_value(&self.reader)?;
language = Some(LanguageTag::parse(
&attribute.unescape_and_decode_value(&self.reader)?,
)?);
}
b"xml:base" => {
base_uri = Some(self.resolve_uri(&attribute.unescaped_value()?, &None)?)
@ -434,7 +435,7 @@ impl<R: BufRead> RdfXmlIterator<R> {
&mut self,
uri: NamedNode,
base_uri: Option<Url>,
language: String,
language: Option<LanguageTag>,
id_attr: Option<NamedNode>,
node_id_attr: Option<BlankNode>,
about_attr: Option<NamedNode>,
@ -474,7 +475,7 @@ impl<R: BufRead> RdfXmlIterator<R> {
&mut self,
uri: NamedNode,
base_uri: Option<Url>,
language: String,
language: Option<LanguageTag>,
subject: NamedOrBlankNode,
id_attr: Option<NamedNode>,
) -> RdfXmlState {
@ -544,13 +545,18 @@ impl<R: BufRead> RdfXmlIterator<R> {
Ok(())
}
fn new_literal(&self, text: String, language: String, datatype: Option<NamedNode>) -> Literal {
fn new_literal(
&self,
text: String,
language: Option<LanguageTag>,
datatype: Option<NamedNode>,
) -> Literal {
if let Some(datatype) = datatype {
Literal::new_typed_literal(text, datatype)
} else if language.is_empty() {
Literal::new_simple_literal(text)
} else {
} else if let Some(language) = language {
Literal::new_language_tagged_literal(text, language)
} else {
Literal::new_simple_literal(text)
}
}
@ -581,16 +587,16 @@ impl<R: BufRead> RdfXmlIterator<R> {
&mut self,
subject: &NamedOrBlankNode,
literal_attributes: Vec<(NamedNode, String)>,
language: &str,
language: &Option<LanguageTag>,
) {
for (literal_predicate, literal_value) in literal_attributes {
self.triples_cache.push(Triple::new(
subject.clone(),
literal_predicate,
if language.is_empty() {
Literal::new_simple_literal(literal_value)
if let Some(language) = language {
Literal::new_language_tagged_literal(literal_value, language.clone())
} else {
Literal::new_language_tagged_literal(literal_value, language)
Literal::new_simple_literal(literal_value)
},
));
}

@ -949,8 +949,11 @@ VAR1 -> &'input str = '?' v:$(VARNAME) { v }
VAR2 -> &'input str = '$' v:$(VARNAME) { v }
//[145]
LANGTAG -> &'input str = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {
l
// The text after "@" is validated and converted by LanguageTag::parse.
// The `{? ... }` action returns a Result: an Err makes this rule fail to match,
// reporting the given static message. The underlying parse error is discarded
// because the action can only carry a `&'static str`.
LANGTAG -> LanguageTag = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {?
    LanguageTag::parse(&l).map_err(|_| "language tag parsing failed")
}
//[146]

@ -81,7 +81,7 @@ pub fn write_xml_results<W: Write>(results: QueryResult<'_>, sink: W) -> Result<
Term::Literal(literal) => {
let mut literal_tag = BytesStart::borrowed_name(b"literal");
if let Some(language) = literal.language() {
literal_tag.push_attribute(("xml:lang", language));
literal_tag.push_attribute(("xml:lang", language.as_str()));
} else if !literal.is_plain() {
literal_tag
.push_attribute(("datatype", literal.datatype().as_str()));
@ -336,8 +336,11 @@ impl<R: BufRead> Iterator for ResultsIterator<R> {
if let Ok(attr) = attr {
if attr.key == b"xml:lang" {
match attr.unescape_and_decode_value(&self.reader) {
Ok(val) => match LanguageTag::parse(&val) {
Ok(val) => lang = Some(val),
Err(error) => return Some(Err(error.into())),
},
Err(error) => return Some(Err(error.into())),
}
} else if attr.key == b"datatype" {
match attr.unescaped_value() {
@ -429,7 +432,7 @@ impl<R: BufRead> Iterator for ResultsIterator<R> {
fn build_literal(
value: impl Into<String>,
lang: &Option<String>,
lang: &Option<LanguageTag>,
datatype: &Option<NamedNode>,
) -> Literal {
match datatype {

@ -1,3 +1,4 @@
use crate::model::LanguageTag;
use crate::store::encoded::*;
use crate::store::numeric_encoder::*;
use crate::utils::MutexPoisonError;
@ -81,6 +82,10 @@ impl StringStore for MemoryStore {
/// Returns the URL registered under `id` by delegating to the inner `string_store`.
fn get_url(&self, id: u64) -> Result<Url> {
self.string_store.get_url(id)
}
/// Returns the language tag registered under `id` by delegating to the inner `string_store`.
fn get_language_tag(&self, id: u64) -> Result<LanguageTag> {
self.string_store.get_language_tag(id)
}
}
impl EncodedQuadsStore for MemoryStore {

@ -13,7 +13,6 @@ use std::io::Read;
use std::io::Write;
use std::ops::Deref;
use std::str;
use std::str::FromStr;
use std::sync::RwLock;
use url::Url;
use uuid::Uuid;
@ -36,6 +35,7 @@ pub trait StringStore {
fn insert_str(&self, value: &str) -> Result<u64>;
fn get_str(&self, id: u64) -> Result<Self::StringType>;
fn get_url(&self, id: u64) -> Result<Url>;
fn get_language_tag(&self, id: u64) -> Result<LanguageTag>;
/// Should be called when the bytes store is created
fn set_first_strings(&self) -> Result<()> {
@ -74,6 +74,10 @@ impl<'a, S: StringStore> StringStore for &'a S {
/// Forwards the URL lookup for `id` to the store behind this reference.
fn get_url(&self, id: u64) -> Result<Url> {
// `self` is a reference to the `&S` wrapper; dereference once to call through the wrapped `&S`.
(*self).get_url(id)
}
/// Forwards the language tag lookup for `id` to the store behind this reference.
fn get_language_tag(&self, id: u64) -> Result<LanguageTag> {
// `self` is a reference to the `&S` wrapper; dereference once to call through the wrapped `&S`.
(*self).get_language_tag(id)
}
}
pub struct MemoryStringStore {
@ -120,7 +124,16 @@ impl StringStore for MemoryStringStore {
if id2str.len() as u64 <= id {
Err(format_err!("value not found in the dictionary"))
} else {
Ok(Url::from_str(&id2str[id as usize])?)
Ok(Url::parse(&id2str[id as usize])?)
}
}
/// Reads the dictionary entry stored at index `id` and parses it as a language tag.
///
/// Returns an error if the lock guarding `id2str` is poisoned, if `id` is not
/// a valid index into the dictionary, or if `LanguageTag::parse` rejects the entry.
fn get_language_tag(&self, id: u64) -> Result<LanguageTag> {
    let strings = self.id2str.read().map_err(MutexPoisonError::from)?;
    if id < strings.len() as u64 {
        // The bounds check above guarantees the index is in range.
        let tag = LanguageTag::parse(&strings[id as usize])?;
        Ok(tag)
    } else {
        Err(format_err!("value not found in the dictionary"))
    }
}
}
@ -586,16 +599,10 @@ impl<S: StringStore> Encoder<S> {
}
pub fn encode_literal(&self, literal: &Literal) -> Result<EncodedTerm> {
Ok(if literal.is_plain() {
if let Some(language) = literal.language() {
Ok(if let Some(language) = literal.language() {
EncodedTerm::LangStringLiteral {
value_id: self.string_store.insert_str(&literal.value())?,
language_id: self.string_store.insert_str(language)?,
}
} else {
EncodedTerm::StringLiteral {
value_id: self.string_store.insert_str(&literal.value())?,
}
language_id: self.string_store.insert_str(language.as_str())?,
}
} else if literal.is_string() {
EncodedTerm::StringLiteral {
@ -711,7 +718,7 @@ impl<S: StringStore> Encoder<S> {
language_id,
} => Ok(Literal::new_language_tagged_literal(
self.string_store.get_str(value_id)?,
self.string_store.get_str(language_id)?,
self.string_store.get_language_tag(language_id)?,
)
.into()),
EncodedTerm::TypedLiteral {
@ -787,6 +794,8 @@ impl<S: StringStore + Default> Default for Encoder<S> {
#[test]
fn test_encoding() {
use std::str::FromStr;
let encoder: Encoder<MemoryStringStore> = Encoder::default();
let terms: Vec<Term> = vec![
NamedNode::from_str("http://foo.com").unwrap().into(),
@ -798,7 +807,7 @@ fn test_encoding() {
Literal::from(1.2).into(),
Literal::from(1).into(),
Literal::from("foo").into(),
Literal::new_language_tagged_literal("foo", "fr").into(),
Literal::new_language_tagged_literal("foo", LanguageTag::parse("fr").unwrap()).into(),
];
for term in terms {
let encoded = encoder.encode_term(&term).unwrap();

@ -1,3 +1,4 @@
use crate::model::LanguageTag;
use crate::store::encoded::EncodedQuadsStore;
use crate::store::encoded::StoreDataset;
use crate::store::numeric_encoder::*;
@ -17,7 +18,6 @@ use std::io::Cursor;
use std::ops::Deref;
use std::path::Path;
use std::str;
use std::str::FromStr;
use std::sync::Mutex;
use url::Url;
@ -120,7 +120,11 @@ impl StringStore for RocksDbStore {
}
fn get_url(&self, id: u64) -> Result<Url> {
Ok(Url::from_str(&self.get_str(id)?)?)
Ok(Url::parse(&self.get_str(id)?)?)
}
/// Fetches the string stored under `id` and parses it as a language tag.
///
/// Any error from `get_str` or from `LanguageTag::parse` is propagated to the caller.
fn get_language_tag(&self, id: u64) -> Result<LanguageTag> {
    let raw = self.get_str(id)?;
    let tag = LanguageTag::parse(&raw)?;
    Ok(tag)
}
}

@ -168,16 +168,6 @@ fn sparql_w3c_query_evaluation_testsuite() {
NamedNode::from_str(
"http://www.w3.org/2001/sw/DataAccess/tests/data-r2/optional-filter/manifest#dawg-optional-filter-005-not-simplified",
).unwrap(),
//Case insensitive language tag comparison
NamedNode::from_str(
"http://www.w3.org/2001/sw/DataAccess/tests/data-r2/expr-builtin/manifest#lang-case-insensitive-eq",
).unwrap(),
NamedNode::from_str(
"http://www.w3.org/2001/sw/DataAccess/tests/data-r2/expr-builtin/manifest#lang-case-insensitive-ne",
).unwrap(),
NamedNode::from_str(
"http://www.w3.org/2001/sw/DataAccess/tests/data-r2/expr-builtin/manifest#dawg-lang-3",
).unwrap(),
//DATATYPE("foo"@en) returns rdf:langString in SPARQL 1.1
NamedNode::from_str(
"http://www.w3.org/2001/sw/DataAccess/tests/data-r2/expr-builtin/manifest#dawg-datatype-2",

@ -87,7 +87,10 @@ py_class!(class Literal |py| {
def __new__(_cls, value: String, language: Option<String> = None, datatype: Option<NamedNode> = None) -> PyResult<Literal> {
Literal::create_instance(py, match language {
Some(language) => model::Literal::new_language_tagged_literal(value, language),
Some(language) => {
let language = model::LanguageTag::parse(&language).map_err(|error| new_value_error(py, &error.into()))?;
model::Literal::new_language_tagged_literal(value, language)
},
None => match datatype {
Some(datatype) => model::Literal::new_typed_literal(value, datatype.inner(py).clone()),
None => model::Literal::new_simple_literal(value)
@ -100,7 +103,7 @@ py_class!(class Literal |py| {
}
def language(&self) -> PyResult<Option<String>> {
Ok(self.inner(py).language().map(|l| l.to_string()))
Ok(self.inner(py).language().map(|l| l.as_str().to_string()))
}
def datatype(&self) -> PyResult<NamedNode> {

Loading…
Cancel
Save