From 4c9bd526145b38fcd1de6d0b0fc818cfece4984b Mon Sep 17 00:00:00 2001 From: Tpt Date: Sat, 4 Apr 2020 18:25:38 +0200 Subject: [PATCH] Validates language tags Also makes sure they are all lowercase Closes #24 --- lib/src/error.rs | 12 ++++++++++++ lib/src/model/literal.rs | 20 +++++++++++++++++--- lib/src/sparql/eval.rs | 18 +++++++++++++----- lib/src/sparql/parser.rs | 2 +- lib/src/sparql/xml_results.rs | 10 +++++----- lib/src/store/numeric_encoder.rs | 30 ++++++++++-------------------- 6 files changed, 58 insertions(+), 34 deletions(-) diff --git a/lib/src/error.rs b/lib/src/error.rs index 124c19ab..d7d4c24f 100644 --- a/lib/src/error.rs +++ b/lib/src/error.rs @@ -1,6 +1,7 @@ use peg::error::ParseError; use peg::str::LineCol; use rio_api::iri::IriParseError; +use rio_api::language_tag::LanguageTagParseError; use rio_turtle::TurtleError; use rio_xml::RdfXmlError; use std::error; @@ -26,6 +27,7 @@ impl fmt::Display for Error { ErrorKind::FromUtf8(e) => e.fmt(f), ErrorKind::Poison => write!(f, "Mutex was poisoned"), ErrorKind::Iri(e) => e.fmt(f), + ErrorKind::LanguageTag(e) => e.fmt(f), ErrorKind::Other(e) => e.fmt(f), } } @@ -39,6 +41,7 @@ impl error::Error for Error { ErrorKind::FromUtf8(e) => Some(e), ErrorKind::Poison => None, ErrorKind::Iri(e) => Some(e), + ErrorKind::LanguageTag(e) => Some(e), ErrorKind::Other(e) => Some(e.as_ref()), } } @@ -67,6 +70,7 @@ enum ErrorKind { FromUtf8(FromUtf8Error), Poison, Iri(IriParseError), + LanguageTag(LanguageTagParseError), Other(Box), } @@ -94,6 +98,14 @@ impl From for Error { } } +impl From for Error { + fn from(error: LanguageTagParseError) -> Self { + Self { + inner: ErrorKind::LanguageTag(error), + } + } +} + impl From for Error { fn from(error: TurtleError) -> Self { Self::wrap(error) diff --git a/lib/src/model/literal.rs b/lib/src/model/literal.rs index 9aa0096e..1414b3f6 100644 --- a/lib/src/model/literal.rs +++ b/lib/src/model/literal.rs @@ -2,6 +2,8 @@ use crate::model::named_node::NamedNode; use 
crate::model::vocab::rdf; use crate::model::vocab::xsd; use crate::model::xsd::*; +use crate::Result; +use rio_api::language_tag::LanguageTag; use rio_api::model as rio; use std::borrow::Cow; use std::fmt; @@ -11,6 +13,7 @@ use std::option::Option; /// /// The default string formatter is returning a N-Triples, Turtle and SPARQL compatible representation: /// ``` +/// # use oxigraph::Result; /// use oxigraph::model::Literal; /// use oxigraph::model::vocab::xsd; /// @@ -26,8 +29,9 @@ use std::option::Option; /// /// assert_eq!( /// "\"foo\"@en", -/// Literal::new_language_tagged_literal("foo", "en").to_string() +/// Literal::new_language_tagged_literal("foo", "en")?.to_string() /// ); +/// # Result::Ok(()) /// ``` #[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Hash)] pub struct Literal(LiteralContent); @@ -60,12 +64,22 @@ impl Literal { pub fn new_language_tagged_literal( value: impl Into, language: impl Into, - ) -> Self { + ) -> Result { let mut language = language.into(); language.make_ascii_lowercase(); + Ok(Literal(LiteralContent::LanguageTaggedString { + value: value.into(), + language: LanguageTag::parse(language)?.into_inner(), + })) + } + + pub(crate) fn new_language_tagged_literal_unchecked( + value: impl Into, + language: impl Into, + ) -> Self { Literal(LiteralContent::LanguageTaggedString { value: value.into(), - language, + language: language.into(), }) } diff --git a/lib/src/sparql/eval.rs b/lib/src/sparql/eval.rs index 517b08a1..2c4f6f6c 100644 --- a/lib/src/sparql/eval.rs +++ b/lib/src/sparql/eval.rs @@ -14,6 +14,7 @@ use md5::Md5; use rand::random; use regex::{Regex, RegexBuilder}; use rio_api::iri::Iri; +use rio_api::language_tag::LanguageTag; use rio_api::model as rio; use sha1::Sha1; use sha2::{Sha256, Sha384, Sha512}; @@ -906,10 +907,12 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { _ => None, }, PlanExpression::LangMatches(language_tag, language_range) => { - let language_tag = + let mut language_tag = 
self.to_simple_string(self.eval_expression(language_tag, tuple)?)?; - let language_range = + language_tag.make_ascii_lowercase(); + let mut language_range = self.to_simple_string(self.eval_expression(language_range, tuple)?)?; + language_range.make_ascii_lowercase(); Some( if &*language_range == "*" { !language_tag.is_empty() @@ -917,7 +920,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { !ZipLongest::new(language_range.split('-'), language_tag.split('-')).any( |parts| match parts { (Some(range_subtag), Some(language_subtag)) => { - !range_subtag.eq_ignore_ascii_case(language_subtag) + range_subtag != language_subtag } (Some(_), None) => true, (None, _) => false, @@ -1243,8 +1246,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { Some(EncodedTerm::LangStringLiteral { value_id: self .to_simple_string_id(self.eval_expression(lexical_form, tuple)?)?, - language_id: self - .to_simple_string_id(self.eval_expression(lang_tag, tuple)?)?, + language_id: self.build_language_id(self.eval_expression(lang_tag, tuple)?)?, }) } PlanExpression::StrDT(lexical_form, datatype) => { @@ -1498,6 +1500,12 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { Some(value_id) } + fn build_language_id(&self, value: EncodedTerm) -> Option { + let mut language = self.to_simple_string(value)?; + language.make_ascii_lowercase(); + self.build_string_id(LanguageTag::parse(language).ok()?.as_str()) + } + fn to_argument_compatible_strings( &self, arg1: EncodedTerm, diff --git a/lib/src/sparql/parser.rs b/lib/src/sparql/parser.rs index a873eec6..719abdef 100644 --- a/lib/src/sparql/parser.rs +++ b/lib/src/sparql/parser.rs @@ -1431,7 +1431,7 @@ parser! { //[129] rule RDFLiteral() -> Literal = v:String() _ "^^" _ t:iri() { Literal::new_typed_literal(v, t) } / - v:String() _ l:LANGTAG() { Literal::new_language_tagged_literal(v, l) } / + v:String() _ l:LANGTAG() {? 
Literal::new_language_tagged_literal(v, l).map_err(|_| "language tag parsing failed") } / v:String() { Literal::new_simple_literal(v) } //[130] diff --git a/lib/src/sparql/xml_results.rs b/lib/src/sparql/xml_results.rs index 3285364b..2e5f4dd5 100644 --- a/lib/src/sparql/xml_results.rs +++ b/lib/src/sparql/xml_results.rs @@ -385,7 +385,7 @@ impl ResultsIterator { self.reader.decode(&data)?, lang.take(), datatype.take(), - ) + )? .into(), ); } @@ -413,7 +413,7 @@ impl ResultsIterator { State::Literal => { if term.is_none() { //We default to the empty literal - term = Some(build_literal("", lang.take(), datatype.take()).into()) + term = Some(build_literal("", lang.take(), datatype.take())?.into()) } state = State::Binding; } @@ -430,12 +430,12 @@ fn build_literal( value: impl Into, lang: Option, datatype: Option, -) -> Literal { +) -> Result { match datatype { - Some(datatype) => Literal::new_typed_literal(value, datatype), + Some(datatype) => Ok(Literal::new_typed_literal(value, datatype)), None => match lang { Some(lang) => Literal::new_language_tagged_literal(value, lang), - None => Literal::new_simple_literal(value), + None => Ok(Literal::new_simple_literal(value)), }, } } diff --git a/lib/src/store/numeric_encoder.rs b/lib/src/store/numeric_encoder.rs index 200d34bc..c5086758 100644 --- a/lib/src/store/numeric_encoder.rs +++ b/lib/src/store/numeric_encoder.rs @@ -389,11 +389,7 @@ impl<'a> From> for EncodedTerm { rio::Literal::LanguageTaggedString { value, language } => { EncodedTerm::LangStringLiteral { value_id: get_str_id(value), - language_id: if language.bytes().all(|b| b.is_ascii_lowercase()) { - get_str_id(language) - } else { - get_str_id(&language.to_ascii_lowercase()) - }, + language_id: get_str_id(language), } } rio::Literal::Typed { value, datatype } => { @@ -986,18 +982,8 @@ impl Encoder for S { rio::Literal::LanguageTaggedString { value, language } => { let value_id = get_str_id(value); self.insert_str(value_id, value)?; - - let language_id = if 
language.bytes().all(|b| b.is_ascii_lowercase()) { - let language_id = get_str_id(language); - self.insert_str(language_id, language)?; - language_id - } else { - let language = language.to_ascii_lowercase(); - let language_id = get_str_id(&language); - self.insert_str(language_id, &language)?; - language_id - }; - + let language_id = get_str_id(language); + self.insert_str(language_id, language)?; EncodedTerm::LangStringLiteral { value_id, language_id, @@ -1161,7 +1147,7 @@ impl Decoder for S { EncodedTerm::LangStringLiteral { value_id, language_id, - } => Ok(Literal::new_language_tagged_literal( + } => Ok(Literal::new_language_tagged_literal_unchecked( get_required_str(self, value_id)?, get_required_str(self, language_id)?, ) @@ -1209,8 +1195,12 @@ fn test_encoding() { Literal::from(1.2).into(), Literal::from(1).into(), Literal::from("foo").into(), - Literal::new_language_tagged_literal("foo", "fr").into(), - Literal::new_language_tagged_literal("foo", "FR").into(), + Literal::new_language_tagged_literal("foo", "fr") + .unwrap() + .into(), + Literal::new_language_tagged_literal("foo", "FR") + .unwrap() + .into(), Literal::new_typed_literal("-1.32", xsd::DECIMAL.clone()).into(), Literal::new_typed_literal("2020-01-01T01:01:01Z", xsd::DATE_TIME.clone()).into(), Literal::new_typed_literal("2020-01-01", xsd::DATE.clone()).into(),