From 6906bcd351d3fd1bd5544ffd7bf171542be28254 Mon Sep 17 00:00:00 2001 From: Tpt Date: Sat, 19 May 2018 10:38:34 +0200 Subject: [PATCH] Adds proper relative IRI resolution --- Cargo.toml | 3 +- src/lib.rs | 1 + src/model/data.rs | 17 +++++--- src/rio/ntriples/ntriples_grammar.rustpeg | 10 ++++- src/rio/turtle/mod.rs | 13 +++++- src/rio/turtle/turtle_grammar.rustpeg | 51 +++++++++++++++-------- 6 files changed, 67 insertions(+), 28 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 73b06cee..0be8054e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,8 @@ build = "build.rs" travis-ci = { repository = "Tpt/rudf" } [dependencies] -lazy_static = "^1.0" +lazy_static = "1.0" +url = "1.7" [build-dependencies] peg = "0.5" diff --git a/src/lib.rs b/src/lib.rs index 6ffc142c..b34fdf24 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ #[macro_use] extern crate lazy_static; +extern crate url; pub mod model; pub mod rio; diff --git a/src/model/data.rs b/src/model/data.rs index 08c5aebd..bbd4cc21 100644 --- a/src/model/data.rs +++ b/src/model/data.rs @@ -4,22 +4,27 @@ use std::fmt; use std::option::Option; use std::sync::Arc; use std::sync::Mutex; +use url::Url; /// A RDF [IRI](https://www.w3.org/TR/rdf11-concepts/#dfn-iri) #[derive(Eq, PartialEq, Debug, Clone, Hash)] pub struct NamedNode { - iri: String, + iri: Arc, } impl NamedNode { pub fn value(&self) -> &str { + self.iri.as_str() + } + + pub fn url(&self) -> &Url { &self.iri } } impl fmt::Display for NamedNode { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "<{}>", self.value()) + write!(f, "<{}>", self.iri) } } @@ -51,10 +56,10 @@ pub enum Literal { lazy_static! { static ref XSD_STRING: NamedNode = NamedNode { - iri: "http://www.w3.org/2001/XMLSchema#string".to_owned() + iri: Arc::new(Url::parse("http://www.w3.org/2001/XMLSchema#string").unwrap()) }; static ref RDF_LANG_STRING: NamedNode = NamedNode { - iri: "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString".to_owned() + iri: Arc::new(Url::parse("http://www.w3.org/1999/02/22-rdf-syntax-ns#langString").unwrap()) }; } @@ -366,8 +371,8 @@ impl Default for DataFactory { impl DataFactory { /// Builds a RDF [IRI](https://www.w3.org/TR/rdf11-concepts/#dfn-iri) - pub fn named_node(&self, iri: impl Into) -> NamedNode { - NamedNode { iri: iri.into() } + pub fn named_node(&self, iri: impl Into) -> NamedNode { + NamedNode { iri: Arc::new(iri.into()) } } /// Builds a RDF [blank node](https://www.w3.org/TR/rdf11-concepts/#dfn-blank-node) with a known id diff --git a/src/rio/ntriples/ntriples_grammar.rustpeg b/src/rio/ntriples/ntriples_grammar.rustpeg index 6a245287..5c36294f 100644 --- a/src/rio/ntriples/ntriples_grammar.rustpeg +++ b/src/rio/ntriples/ntriples_grammar.rustpeg @@ -1,6 +1,8 @@ //See https://www.w3.org/TR/2014/REC-n-triples-20140225/#n-triples-grammar +use std::iter::FromIterator; use std::char; +use url::Url; use model::data::*; #![arguments(data_factory: &DataFactory)] @@ -43,8 +45,12 @@ LANGTAG -> &'input str = "@" l: $([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) { EOL = [\r\n]+ //[8] -IRIREF -> String = "<" _ i: ((_IRIREF_simple_char / UCHAR)*) _ ">" { - i.into_iter().collect() +IRIREF -> Url = "<" _ i: ((_IRIREF_simple_char / UCHAR)*) _ ">" {? + let s = String::from_iter(i.into_iter()); + match Url::parse(&s) { + Ok(url) => Ok(url), + Err(error) => Err("IRI parsing failed") + } } _IRIREF_simple_char -> char = c: $([^\u{00}-\u{20}<>"{}|^\u{60}\u{5c}]) { c.chars().next().unwrap() } diff --git a/src/rio/turtle/mod.rs b/src/rio/turtle/mod.rs index 715e4db4..b82d1ec1 100644 --- a/src/rio/turtle/mod.rs +++ b/src/rio/turtle/mod.rs @@ -9,22 +9,31 @@ use rio::*; use std::collections::HashMap; use std::io::BufReader; use std::io::Read; +use url::Url; +use url::ParseOptions; //TODO: make private pub struct ParserState { - pub base_uri: String, + pub base_uri: Option, pub namespaces: HashMap, pub cur_subject: Vec, pub cur_predicate: Vec, } +impl ParserState { + fn url_parser<'a>(&'a self) -> ParseOptions<'a> { + Url::options().base_url(self.base_uri.as_ref()) + } +} + pub fn read_turtle<'a, R: Read + 'a>( source: R, data_factory: &'a DataFactory, + base_uri: impl Into> ) -> RioResult> { let factory = data_factory.clone(); //TODO: try to avoid clone here let mut state = ParserState { - base_uri: String::default(), + base_uri: base_uri.into(), namespaces: HashMap::default(), cur_subject: Vec::default(), cur_predicate: Vec::default(), diff --git a/src/rio/turtle/turtle_grammar.rustpeg b/src/rio/turtle/turtle_grammar.rustpeg index 33adacba..ad435ece 100644 --- a/src/rio/turtle/turtle_grammar.rustpeg +++ b/src/rio/turtle/turtle_grammar.rustpeg @@ -1,8 +1,9 @@ //See https://www.w3.org/TR/turtle/#sec-grammar use std::char; -use std::iter; +use url::Url; use model::data::*; +use std::iter; use rio::turtle::ParserState; #![arguments(state: &mut ParserState, buffer: &mut Vec, data_factory: &DataFactory)] @@ -23,13 +24,25 @@ prefixID -> () = "@prefix" _ ns:PNAME_NS _ i:IRIREF _ "." { } //[5] -base -> () = "@base" _ i:IRIREF _ "." { - state.base_uri = i.into(); +base -> () = "@base" _ i:IRIREF _ "." {? + match Url::parse(&i) { + Ok(url) => { + state.base_uri = Some(url); + Ok(()) + }, + Err(error) => Err("IRI parsing failed") + } } //[5s] -sparqlBase -> () = "BASE"i _ i:IRIREF { - state.base_uri = i.into(); +sparqlBase -> () = "BASE"i _ i:IRIREF {? + match Url::parse(&i) { + Ok(url) => { + state.base_uri = Some(url); + Ok(()) + }, + Err(error) => Err("IRI parsing failed") + } } //[6s] @@ -57,7 +70,7 @@ objectList -> () = object _ ("," _ object _)* //[9] verb -> NamedNode = predicate / - "a" { data_factory.named_node("http://www.w3.org/1999/02/22-rdf-syntax-ns#type") } + "a" { data_factory.named_node(Url::parse("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap()) } // [10] subject -> NamedOrBlankNode = @@ -102,11 +115,13 @@ blankNodePropertyList_open -> () = "[" { //[15] collection -> NamedOrBlankNode = '(' _ o:(collection_value*) ')' { - let mut current_list_node = NamedOrBlankNode::from(data_factory.named_node("http://www.w3.org/1999/02/22-rdf-syntax-ns#nil")); + let first = data_factory.named_node(Url::parse("http://www.w3.org/1999/02/22-rdf-syntax-ns#first").unwrap()); + let rest = data_factory.named_node(Url::parse("http://www.w3.org/1999/02/22-rdf-syntax-ns#rest").unwrap()); + let mut current_list_node = NamedOrBlankNode::from(data_factory.named_node(Url::parse("http://www.w3.org/1999/02/22-rdf-syntax-ns#nil").unwrap())); for obj in o.into_iter().rev() { let new_blank_node = NamedOrBlankNode::from(data_factory.new_blank_node()); - buffer.push(data_factory.triple(new_blank_node.clone(), data_factory.named_node("http://www.w3.org/1999/02/22-rdf-syntax-ns#first"), obj)); - buffer.push(data_factory.triple(new_blank_node.clone(), data_factory.named_node("http://www.w3.org/1999/02/22-rdf-syntax-ns#rest"), current_list_node)); + buffer.push(data_factory.triple(new_blank_node.clone(), first.clone(), obj)); + buffer.push(data_factory.triple(new_blank_node.clone(), rest.clone(), current_list_node)); current_list_node = new_blank_node; } current_list_node @@ -115,9 +130,9 @@ collection_value -> Term = o:object_value _ { o } //[16] NumericLiteral -> Literal = - d:$(DOUBLE) { data_factory.typed_literal(d, data_factory.named_node("http://www.w3.org/2001/XMLSchema#double")) } / - d:$(DECIMAL) { data_factory.typed_literal(d, data_factory.named_node("http://www.w3.org/2001/XMLSchema#decimal")) } / - i:$(INTEGER) { data_factory.typed_literal(i, data_factory.named_node("http://www.w3.org/2001/XMLSchema#integer")) } + d:$(DOUBLE) { data_factory.typed_literal(d, data_factory.named_node(Url::parse("http://www.w3.org/2001/XMLSchema#double").unwrap())) } / + d:$(DECIMAL) { data_factory.typed_literal(d, data_factory.named_node(Url::parse("http://www.w3.org/2001/XMLSchema#decimal").unwrap())) } / + i:$(INTEGER) { data_factory.typed_literal(i, data_factory.named_node(Url::parse("http://www.w3.org/2001/XMLSchema#integer").unwrap())) } //[128s] RDFLiteral -> Literal = @@ -127,15 +142,18 @@ RDFLiteral -> Literal = //[133s] BooleanLiteral -> Literal = - "true" { data_factory.typed_literal("true", data_factory.named_node("http://www.w3.org/2001/XMLSchema#boolean")) } / - "false" { data_factory.typed_literal("false", data_factory.named_node("http://www.w3.org/2001/XMLSchema#boolean")) } + "true" { data_factory.typed_literal("true", data_factory.named_node(Url::parse("http://www.w3.org/2001/XMLSchema#boolean").unwrap())) } / + "false" { data_factory.typed_literal("false", data_factory.named_node(Url::parse("http://www.w3.org/2001/XMLSchema#boolean").unwrap())) } //[17] String -> String = STRING_LITERAL_LONG_SINGLE_QUOTE / STRING_LITERAL_LONG_QUOTE / STRING_LITERAL_QUOTE / STRING_LITERAL_SINGLE_QUOTE //[135s] -iri -> NamedNode = i:(IRIREF / PrefixedName) { - data_factory.named_node(i) +iri -> NamedNode = i:(IRIREF / PrefixedName) {? + match state.url_parser().parse(&i) { + Ok(url) => Ok(data_factory.named_node(url)), + Err(error) => Err("IRI parsing failed") + } } //[136s] @@ -149,7 +167,6 @@ BlankNode -> BlankNode = //[18] IRIREF -> String = "<" i:((_IRIREF_simple_char / UCHAR)*) ">" { - //TODO: relative URIs resolution i.into_iter().collect() } _IRIREF_simple_char -> char = c:$([^\u{00}-\u{20}<>"{}|^\u{60}\u{5c}]) { c.chars().next().unwrap() }