Validates language tags

Also makes sure they are all lowercase

Closes #24
pull/26/head
Tpt 5 years ago
parent a8dbc94d6f
commit 4c9bd52614
  1. 12
      lib/src/error.rs
  2. 20
      lib/src/model/literal.rs
  3. 18
      lib/src/sparql/eval.rs
  4. 2
      lib/src/sparql/parser.rs
  5. 10
      lib/src/sparql/xml_results.rs
  6. 30
      lib/src/store/numeric_encoder.rs

@ -1,6 +1,7 @@
use peg::error::ParseError; use peg::error::ParseError;
use peg::str::LineCol; use peg::str::LineCol;
use rio_api::iri::IriParseError; use rio_api::iri::IriParseError;
use rio_api::language_tag::LanguageTagParseError;
use rio_turtle::TurtleError; use rio_turtle::TurtleError;
use rio_xml::RdfXmlError; use rio_xml::RdfXmlError;
use std::error; use std::error;
@ -26,6 +27,7 @@ impl fmt::Display for Error {
ErrorKind::FromUtf8(e) => e.fmt(f), ErrorKind::FromUtf8(e) => e.fmt(f),
ErrorKind::Poison => write!(f, "Mutex was poisoned"), ErrorKind::Poison => write!(f, "Mutex was poisoned"),
ErrorKind::Iri(e) => e.fmt(f), ErrorKind::Iri(e) => e.fmt(f),
ErrorKind::LanguageTag(e) => e.fmt(f),
ErrorKind::Other(e) => e.fmt(f), ErrorKind::Other(e) => e.fmt(f),
} }
} }
@ -39,6 +41,7 @@ impl error::Error for Error {
ErrorKind::FromUtf8(e) => Some(e), ErrorKind::FromUtf8(e) => Some(e),
ErrorKind::Poison => None, ErrorKind::Poison => None,
ErrorKind::Iri(e) => Some(e), ErrorKind::Iri(e) => Some(e),
ErrorKind::LanguageTag(e) => Some(e),
ErrorKind::Other(e) => Some(e.as_ref()), ErrorKind::Other(e) => Some(e.as_ref()),
} }
} }
@ -67,6 +70,7 @@ enum ErrorKind {
FromUtf8(FromUtf8Error), FromUtf8(FromUtf8Error),
Poison, Poison,
Iri(IriParseError), Iri(IriParseError),
LanguageTag(LanguageTagParseError),
Other(Box<dyn error::Error + Send + Sync + 'static>), Other(Box<dyn error::Error + Send + Sync + 'static>),
} }
@ -94,6 +98,14 @@ impl From<IriParseError> for Error {
} }
} }
/// Converts a language tag parsing failure into the crate `Error`,
/// storing it in the dedicated `ErrorKind::LanguageTag` variant.
impl From<LanguageTagParseError> for Error {
fn from(error: LanguageTagParseError) -> Self {
Self {
inner: ErrorKind::LanguageTag(error),
}
}
}
impl From<TurtleError> for Error { impl From<TurtleError> for Error {
fn from(error: TurtleError) -> Self { fn from(error: TurtleError) -> Self {
Self::wrap(error) Self::wrap(error)

@ -2,6 +2,8 @@ use crate::model::named_node::NamedNode;
use crate::model::vocab::rdf; use crate::model::vocab::rdf;
use crate::model::vocab::xsd; use crate::model::vocab::xsd;
use crate::model::xsd::*; use crate::model::xsd::*;
use crate::Result;
use rio_api::language_tag::LanguageTag;
use rio_api::model as rio; use rio_api::model as rio;
use std::borrow::Cow; use std::borrow::Cow;
use std::fmt; use std::fmt;
@ -11,6 +13,7 @@ use std::option::Option;
/// ///
/// The default string formatter is returning a N-Triples, Turtle and SPARQL compatible representation: /// The default string formatter is returning a N-Triples, Turtle and SPARQL compatible representation:
/// ``` /// ```
/// # use oxigraph::Result;
/// use oxigraph::model::Literal; /// use oxigraph::model::Literal;
/// use oxigraph::model::vocab::xsd; /// use oxigraph::model::vocab::xsd;
/// ///
@ -26,8 +29,9 @@ use std::option::Option;
/// ///
/// assert_eq!( /// assert_eq!(
/// "\"foo\"@en", /// "\"foo\"@en",
/// Literal::new_language_tagged_literal("foo", "en").to_string() /// Literal::new_language_tagged_literal("foo", "en")?.to_string()
/// ); /// );
/// # Result::Ok(())
/// ``` /// ```
#[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Hash)] #[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Hash)]
pub struct Literal(LiteralContent); pub struct Literal(LiteralContent);
@ -60,12 +64,22 @@ impl Literal {
pub fn new_language_tagged_literal( pub fn new_language_tagged_literal(
value: impl Into<String>, value: impl Into<String>,
language: impl Into<String>, language: impl Into<String>,
) -> Self { ) -> Result<Self> {
let mut language = language.into(); let mut language = language.into();
language.make_ascii_lowercase(); language.make_ascii_lowercase();
Ok(Literal(LiteralContent::LanguageTaggedString {
value: value.into(),
language: LanguageTag::parse(language)?.into_inner(),
}))
}
pub(crate) fn new_language_tagged_literal_unchecked(
value: impl Into<String>,
language: impl Into<String>,
) -> Self {
Literal(LiteralContent::LanguageTaggedString { Literal(LiteralContent::LanguageTaggedString {
value: value.into(), value: value.into(),
language, language: language.into(),
}) })
} }

@ -14,6 +14,7 @@ use md5::Md5;
use rand::random; use rand::random;
use regex::{Regex, RegexBuilder}; use regex::{Regex, RegexBuilder};
use rio_api::iri::Iri; use rio_api::iri::Iri;
use rio_api::language_tag::LanguageTag;
use rio_api::model as rio; use rio_api::model as rio;
use sha1::Sha1; use sha1::Sha1;
use sha2::{Sha256, Sha384, Sha512}; use sha2::{Sha256, Sha384, Sha512};
@ -906,10 +907,12 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
_ => None, _ => None,
}, },
PlanExpression::LangMatches(language_tag, language_range) => { PlanExpression::LangMatches(language_tag, language_range) => {
let language_tag = let mut language_tag =
self.to_simple_string(self.eval_expression(language_tag, tuple)?)?; self.to_simple_string(self.eval_expression(language_tag, tuple)?)?;
let language_range = language_tag.make_ascii_lowercase();
let mut language_range =
self.to_simple_string(self.eval_expression(language_range, tuple)?)?; self.to_simple_string(self.eval_expression(language_range, tuple)?)?;
language_range.make_ascii_lowercase();
Some( Some(
if &*language_range == "*" { if &*language_range == "*" {
!language_tag.is_empty() !language_tag.is_empty()
@ -917,7 +920,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
!ZipLongest::new(language_range.split('-'), language_tag.split('-')).any( !ZipLongest::new(language_range.split('-'), language_tag.split('-')).any(
|parts| match parts { |parts| match parts {
(Some(range_subtag), Some(language_subtag)) => { (Some(range_subtag), Some(language_subtag)) => {
!range_subtag.eq_ignore_ascii_case(language_subtag) range_subtag != language_subtag
} }
(Some(_), None) => true, (Some(_), None) => true,
(None, _) => false, (None, _) => false,
@ -1243,8 +1246,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
Some(EncodedTerm::LangStringLiteral { Some(EncodedTerm::LangStringLiteral {
value_id: self value_id: self
.to_simple_string_id(self.eval_expression(lexical_form, tuple)?)?, .to_simple_string_id(self.eval_expression(lexical_form, tuple)?)?,
language_id: self language_id: self.build_language_id(self.eval_expression(lang_tag, tuple)?)?,
.to_simple_string_id(self.eval_expression(lang_tag, tuple)?)?,
}) })
} }
PlanExpression::StrDT(lexical_form, datatype) => { PlanExpression::StrDT(lexical_form, datatype) => {
@ -1498,6 +1500,12 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
Some(value_id) Some(value_id)
} }
/// Builds the string id of a language tag taken from an evaluated term.
///
/// The term is converted with `to_simple_string`, lowercased (ASCII only) and
/// then validated with `LanguageTag::parse`. Returns `None` when the conversion
/// fails, when the tag does not parse, or when `build_string_id` returns `None`.
fn build_language_id(&self, value: EncodedTerm) -> Option<u128> {
let mut language = self.to_simple_string(value)?;
// Lowercase before validating so that the id is built from the lowercase form.
language.make_ascii_lowercase();
// A parse error is discarded here: an invalid tag simply yields `None`.
self.build_string_id(LanguageTag::parse(language).ok()?.as_str())
}
fn to_argument_compatible_strings( fn to_argument_compatible_strings(
&self, &self,
arg1: EncodedTerm, arg1: EncodedTerm,

@ -1431,7 +1431,7 @@ parser! {
//[129] //[129]
rule RDFLiteral() -> Literal = rule RDFLiteral() -> Literal =
v:String() _ "^^" _ t:iri() { Literal::new_typed_literal(v, t) } / v:String() _ "^^" _ t:iri() { Literal::new_typed_literal(v, t) } /
v:String() _ l:LANGTAG() { Literal::new_language_tagged_literal(v, l) } / v:String() _ l:LANGTAG() {? Literal::new_language_tagged_literal(v, l).map_err(|_| "language tag parsing failed") } /
v:String() { Literal::new_simple_literal(v) } v:String() { Literal::new_simple_literal(v) }
//[130] //[130]

@ -385,7 +385,7 @@ impl<R: BufRead> ResultsIterator<R> {
self.reader.decode(&data)?, self.reader.decode(&data)?,
lang.take(), lang.take(),
datatype.take(), datatype.take(),
) )?
.into(), .into(),
); );
} }
@ -413,7 +413,7 @@ impl<R: BufRead> ResultsIterator<R> {
State::Literal => { State::Literal => {
if term.is_none() { if term.is_none() {
//We default to the empty literal //We default to the empty literal
term = Some(build_literal("", lang.take(), datatype.take()).into()) term = Some(build_literal("", lang.take(), datatype.take())?.into())
} }
state = State::Binding; state = State::Binding;
} }
@ -430,12 +430,12 @@ fn build_literal(
value: impl Into<String>, value: impl Into<String>,
lang: Option<String>, lang: Option<String>,
datatype: Option<NamedNode>, datatype: Option<NamedNode>,
) -> Literal { ) -> Result<Literal> {
match datatype { match datatype {
Some(datatype) => Literal::new_typed_literal(value, datatype), Some(datatype) => Ok(Literal::new_typed_literal(value, datatype)),
None => match lang { None => match lang {
Some(lang) => Literal::new_language_tagged_literal(value, lang), Some(lang) => Literal::new_language_tagged_literal(value, lang),
None => Literal::new_simple_literal(value), None => Ok(Literal::new_simple_literal(value)),
}, },
} }
} }

@ -389,11 +389,7 @@ impl<'a> From<rio::Literal<'a>> for EncodedTerm {
rio::Literal::LanguageTaggedString { value, language } => { rio::Literal::LanguageTaggedString { value, language } => {
EncodedTerm::LangStringLiteral { EncodedTerm::LangStringLiteral {
value_id: get_str_id(value), value_id: get_str_id(value),
language_id: if language.bytes().all(|b| b.is_ascii_lowercase()) { language_id: get_str_id(language),
get_str_id(language)
} else {
get_str_id(&language.to_ascii_lowercase())
},
} }
} }
rio::Literal::Typed { value, datatype } => { rio::Literal::Typed { value, datatype } => {
@ -986,18 +982,8 @@ impl<S: StrContainer> Encoder for S {
rio::Literal::LanguageTaggedString { value, language } => { rio::Literal::LanguageTaggedString { value, language } => {
let value_id = get_str_id(value); let value_id = get_str_id(value);
self.insert_str(value_id, value)?; self.insert_str(value_id, value)?;
let language_id = get_str_id(language);
let language_id = if language.bytes().all(|b| b.is_ascii_lowercase()) { self.insert_str(language_id, language)?;
let language_id = get_str_id(language);
self.insert_str(language_id, language)?;
language_id
} else {
let language = language.to_ascii_lowercase();
let language_id = get_str_id(&language);
self.insert_str(language_id, &language)?;
language_id
};
EncodedTerm::LangStringLiteral { EncodedTerm::LangStringLiteral {
value_id, value_id,
language_id, language_id,
@ -1161,7 +1147,7 @@ impl<S: StrLookup> Decoder for S {
EncodedTerm::LangStringLiteral { EncodedTerm::LangStringLiteral {
value_id, value_id,
language_id, language_id,
} => Ok(Literal::new_language_tagged_literal( } => Ok(Literal::new_language_tagged_literal_unchecked(
get_required_str(self, value_id)?, get_required_str(self, value_id)?,
get_required_str(self, language_id)?, get_required_str(self, language_id)?,
) )
@ -1209,8 +1195,12 @@ fn test_encoding() {
Literal::from(1.2).into(), Literal::from(1.2).into(),
Literal::from(1).into(), Literal::from(1).into(),
Literal::from("foo").into(), Literal::from("foo").into(),
Literal::new_language_tagged_literal("foo", "fr").into(), Literal::new_language_tagged_literal("foo", "fr")
Literal::new_language_tagged_literal("foo", "FR").into(), .unwrap()
.into(),
Literal::new_language_tagged_literal("foo", "FR")
.unwrap()
.into(),
Literal::new_typed_literal("-1.32", xsd::DECIMAL.clone()).into(), Literal::new_typed_literal("-1.32", xsd::DECIMAL.clone()).into(),
Literal::new_typed_literal("2020-01-01T01:01:01Z", xsd::DATE_TIME.clone()).into(), Literal::new_typed_literal("2020-01-01T01:01:01Z", xsd::DATE_TIME.clone()).into(),
Literal::new_typed_literal("2020-01-01", xsd::DATE.clone()).into(), Literal::new_typed_literal("2020-01-01", xsd::DATE.clone()).into(),

Loading…
Cancel
Save