diff --git a/build.rs b/build.rs
index b198e889..c63acd89 100644
--- a/build.rs
+++ b/build.rs
@@ -1,5 +1,6 @@
 extern crate peg;
 
 fn main() {
-    peg::cargo_build("src/rio/ntriples/grammar.rustpeg");
+    peg::cargo_build("src/rio/ntriples/ntriples_grammar.rustpeg");
+    peg::cargo_build("src/rio/turtle/turtle_grammar.rustpeg");
 }
diff --git a/src/rio/mod.rs b/src/rio/mod.rs
index 5bbe8544..e4794582 100644
--- a/src/rio/mod.rs
+++ b/src/rio/mod.rs
@@ -2,6 +2,7 @@ use std::error::Error;
 use std::fmt;
 
 pub mod ntriples;
+pub mod turtle;
 
 pub type RioResult = Result;
 
diff --git a/src/rio/ntriples/mod.rs b/src/rio/ntriples/mod.rs
index ac52aec4..d1c2a6a6 100644
--- a/src/rio/ntriples/mod.rs
+++ b/src/rio/ntriples/mod.rs
@@ -1,7 +1,7 @@
 ///Implements https://www.w3.org/TR/n-triples/
 
 mod grammar {
-    include!(concat!(env!("OUT_DIR"), "/grammar.rs"));
+    include!(concat!(env!("OUT_DIR"), "/ntriples_grammar.rs"));
 }
 
 use model::data::*;
@@ -15,7 +15,7 @@ pub fn read_ntriples<'a, R: Read + 'a>(
     data_factory: &'a DataFactory,
 ) -> impl Iterator> {
     let factory = data_factory.clone(); //TODO: try to avoid clone here
-    //TODO: use read_lines to avoid allocations
+    //TODO: use read_lines to avoid allocations
     BufReader::new(source)
         .lines()
         .flat_map(move |line| match line {
diff --git a/src/rio/ntriples/grammar.rustpeg b/src/rio/ntriples/ntriples_grammar.rustpeg
similarity index 93%
rename from src/rio/ntriples/grammar.rustpeg
rename to src/rio/ntriples/ntriples_grammar.rustpeg
index a1d7317c..549ac4ff 100644
--- a/src/rio/ntriples/grammar.rustpeg
+++ b/src/rio/ntriples/ntriples_grammar.rustpeg
@@ -35,8 +35,8 @@ literal -> Literal =
 
 
 //[144s]
-LANGTAG -> String = "@" l: $([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {
-    l.into()
+LANGTAG -> &'input str = "@" l: $([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {
+    l
 }
 
 //[7]
@@ -55,8 +55,8 @@ STRING_LITERAL_QUOTE -> String = "\"" l: ((STRING_LITERAL_QUOTE_simple_char / EC
 STRING_LITERAL_QUOTE_simple_char -> char = c: $([^\u{0022}\u{005c}\u{000a}\u{000d}]) { c.chars().next().unwrap() }
 
 //[141s]
-BLANK_NODE_LABEL -> String = "_:" b: $((PN_CHARS_U / [0-9]) ((PN_CHARS / ".")* PN_CHARS)?) {
-    b.into()
+BLANK_NODE_LABEL -> &'input str = "_:" b: $((PN_CHARS_U / [0-9]) ((PN_CHARS / ".")* PN_CHARS)?) {
+    b
 }
 
 //[10]
@@ -93,6 +93,7 @@ PN_CHARS -> () = PN_CHARS_U / [\-0-9\u{00B7}\u{0300}-\u{036F}\u{203F}-\u{2040}]
 //[162s]
 HEX -> () = ([0-9A-Fa-f])
 
+
 //space
 _ = #quiet<[ \t]*>
 //comment
diff --git a/src/rio/turtle/mod.rs b/src/rio/turtle/mod.rs
new file mode 100644
index 00000000..715e4db4
--- /dev/null
+++ b/src/rio/turtle/mod.rs
@@ -0,0 +1,62 @@
+/// Implements https://www.w3.org/TR/turtle/
+
+mod grammar {
+    include!(concat!(env!("OUT_DIR"), "/turtle_grammar.rs"));
+}
+
+use model::data::*;
+use rio::*;
+use std::collections::HashMap;
+use std::io::BufReader;
+use std::io::Read;
+
+/// Mutable state the generated grammar updates while it walks a document.
+///
+/// NOTE(review): the generic parameters below were reconstructed from how the
+/// grammar uses each field (keys/values inserted as strings, subjects pushed as
+/// `NamedOrBlankNode`, predicates as `NamedNode`) — confirm against the model.
+//TODO: make private
+pub struct ParserState {
+    /// IRI set by the most recent `@base` / `BASE` directive; empty until one is seen.
+    pub base_uri: String,
+    /// Prefix label as written, trailing `:` included, mapped to its namespace IRI.
+    pub namespaces: HashMap<String, String>,
+    /// Subject stack; the last entry is the subject of the triple being built.
+    pub cur_subject: Vec<NamedOrBlankNode>,
+    /// Predicate stack; the last entry is the predicate of the triple being built.
+    pub cur_predicate: Vec<NamedNode>,
+}
+
+/// Parses the Turtle document read from `source` and returns its triples.
+///
+/// The input is read into memory and parsed in one pass, so the returned
+/// iterator only walks triples that were already built.
+///
+/// NOTE(review): `Triple` is assumed to be the type `DataFactory::triple`
+/// returns — confirm.
+///
+/// # Errors
+///
+/// Returns a `RioError` when `source` cannot be read into a string or when the
+/// grammar rejects the document.
+pub fn read_turtle<'a, R: Read + 'a>(
+    source: R,
+    data_factory: &'a DataFactory,
+) -> RioResult<impl Iterator<Item = Triple>> {
+    let mut string_buffer = String::default();
+    BufReader::new(source)
+        .read_to_string(&mut string_buffer)
+        .map_err(|error| RioError::new(error))?;
+
+    let mut state = ParserState {
+        base_uri: String::default(),
+        namespaces: HashMap::default(),
+        cur_subject: Vec::default(),
+        cur_predicate: Vec::default(),
+    };
+    let mut triple_buffer = Vec::default();
+    // The grammar only borrows the factory, so the caller's reference is passed as is.
+    grammar::turtleDoc(&string_buffer, &mut state, &mut triple_buffer, data_factory)
+        .map_err(|error| RioError::new(error))?;
+    Ok(triple_buffer.into_iter())
+}
diff --git a/src/rio/turtle/turtle_grammar.rustpeg b/src/rio/turtle/turtle_grammar.rustpeg
new file mode 100644
index 00000000..0a09c1ad
--- /dev/null
+++ b/src/rio/turtle/turtle_grammar.rustpeg
@@ -0,0 +1,275 @@
+//See https://www.w3.org/TR/turtle/#sec-grammar
+
+use std::char;
+use std::iter;
+use model::data::*;
+use rio::turtle::ParserState;
+
+#![arguments(state: &mut ParserState, buffer: &mut Vec<Triple>, data_factory: &DataFactory)]
+
+//[1] entry point: triples are appended to `buffer` as they are recognised
+#[pub]
+turtleDoc -> () = _ (statement _)*
+
+//[2]
+statement -> () = directive / triples "."
+
+//[3]
+directive -> () = prefixID / base / sparqlPrefix / sparqlBase
+
+//[4] the prefix is stored as written, trailing ":" included
+prefixID -> () = "@prefix" _ ns:PNAME_NS _ i:IRIREF _ "." {
+    state.namespaces.insert(ns.into(), i.into());
+}
+
+//[5]
+base -> () = "@base" _ i:IRIREF _ "." {
+    state.base_uri = i.into();
+}
+
+//[5s]
+sparqlBase -> () = "BASE"i _ i:IRIREF {
+    state.base_uri = i.into();
+}
+
+//[6s]
+sparqlPrefix -> () = "PREFIX"i _ ns:PNAME_NS _ i:IRIREF {
+    state.namespaces.insert(ns.into(), i.into());
+}
+
+//[6] the subject is pushed on state.cur_subject so that `object` can read it back
+triples -> () = subject_push _ predicateObjectList / triples_blankNodePropertyList_push _ predicateObjectList?
+subject_push -> () = s:subject {
+    state.cur_subject.push(s)
+}
+triples_blankNodePropertyList_push -> () = s: blankNodePropertyList {
+    state.cur_subject.push(s)
+}
+
+//[7] every predicate_push is balanced by the pop performed at the end of objectList
+predicateObjectList -> () = predicate_push _ objectList _ (";" _ (predicate_push _ objectList _)?)*
+predicate_push -> () = v:verb {
+    state.cur_predicate.push(v)
+}
+
+//[8] after its objects are emitted, drops the predicate pushed by predicate_push so an enclosing list sees its own predicate again
+objectList -> () = object _ ("," _ object _)* { state.cur_predicate.pop(); }
+
+//[9]
+verb -> NamedNode = predicate /
+    "a" { data_factory.named_node("http://www.w3.org/1999/02/22-rdf-syntax-ns#type") }
+
+// [10]
+subject -> NamedOrBlankNode =
+    i:iri { i.into() } /
+    b:BlankNode { b.into() } /
+    c:collection { c }
+
+//[11]
+predicate -> NamedNode = iri
+
+// [12] emits one triple from the subject and predicate currently on top of their stacks
+object -> () = o:object_value {?
+    match state.cur_subject.last() {
+        Some(s) => match state.cur_predicate.last() {
+            Some(p) => {
+                buffer.push(data_factory.triple(s.clone(), p.clone(), o));
+                Ok(())
+            }
+            None => Err("Predicate not found")
+        },
+        None => Err("Subject not found")
+    }
+}
+
+object_value -> Term =
+    i:iri { i.into() } /
+    b:BlankNode { b.into() } /
+    c:collection { c.into() } /
+    b:blankNodePropertyList { b.into() } /
+    l:literal { l.into() }
+
+//[13]
+literal -> Literal = RDFLiteral / NumericLiteral / BooleanLiteral
+
+//[14] the fresh blank node is the subject inside the brackets and the value of the whole rule
+blankNodePropertyList -> NamedOrBlankNode = blankNodePropertyList_open _ predicateObjectList _ "]" {?
+    state.cur_subject.pop().ok_or("No subject found in the stack")
+}
+blankNodePropertyList_open -> () = "[" {
+    state.cur_subject.push(data_factory.new_blank_node().into())
+}
+
+//[15] the list is built back to front so each node's rdf:rest can point at the node created just before it
+collection -> NamedOrBlankNode = '(' _ o:(collection_value*) ')' {
+    let mut current_list_node = NamedOrBlankNode::from(data_factory.named_node("http://www.w3.org/1999/02/22-rdf-syntax-ns#nil"));
+    for obj in o.into_iter().rev() {
+        let new_blank_node = NamedOrBlankNode::from(data_factory.new_blank_node());
+        buffer.push(data_factory.triple(new_blank_node.clone(), data_factory.named_node("http://www.w3.org/1999/02/22-rdf-syntax-ns#first"), obj));
+        buffer.push(data_factory.triple(new_blank_node.clone(), data_factory.named_node("http://www.w3.org/1999/02/22-rdf-syntax-ns#rest"), current_list_node));
+        current_list_node = new_blank_node;
+    }
+    current_list_node
+}
+collection_value -> Term = o:object_value _ { o }
+
+//[16] DOUBLE and DECIMAL come before INTEGER: ordered choice would otherwise stop at the integer prefix of "1.5"
+NumericLiteral -> Literal =
+    d:$(DOUBLE) { data_factory.typed_literal(d, data_factory.named_node("http://www.w3.org/2001/XMLSchema#double")) } /
+    d:$(DECIMAL) { data_factory.typed_literal(d, data_factory.named_node("http://www.w3.org/2001/XMLSchema#decimal")) } /
+    i:$(INTEGER) { data_factory.typed_literal(i, data_factory.named_node("http://www.w3.org/2001/XMLSchema#integer")) }
+
+//[128s]
+RDFLiteral -> Literal =
+    v:String _ "^^" _ t:iri { data_factory.typed_literal(v, t) } /
+    v:String _ l:LANGTAG { data_factory.language_tagged_literal(v, l) } /
+    v:String { data_factory.simple_literal(v) }
+
+//[133s]
+BooleanLiteral -> Literal =
+    "true" { data_factory.typed_literal("true", data_factory.named_node("http://www.w3.org/2001/XMLSchema#boolean")) } /
+    "false" { data_factory.typed_literal("false", data_factory.named_node("http://www.w3.org/2001/XMLSchema#boolean")) }
+
+//[17] long forms first: a short-string rule would otherwise accept the first two quotes of a triple quote as an empty string
+String -> String = STRING_LITERAL_LONG_SINGLE_QUOTE / STRING_LITERAL_LONG_QUOTE / STRING_LITERAL_QUOTE / STRING_LITERAL_SINGLE_QUOTE
+
+//[135s]
+iri -> NamedNode = i:(IRIREF / PrefixedName) {
+    data_factory.named_node(i)
+}
+
+//[136s]
+PrefixedName -> String = PNAME_LN /
+    ns:PNAME_NS {? state.namespaces.get(ns).map(|v| v.clone()).ok_or("Prefix not found") }
+
+//[137s]
+BlankNode -> BlankNode =
+    b:BLANK_NODE_LABEL { data_factory.blank_node(b) } /
+    ANON { data_factory.new_blank_node() }
+
+//[18] nothing is skipped inside the brackets: `_` also matches comments, which would swallow IRIs such as <#me>
+IRIREF -> String = "<" i:((_IRIREF_simple_char / UCHAR)*) ">" {
+    //TODO: relative URIs resolution
+    i.into_iter().collect()
+}
+_IRIREF_simple_char -> char = c:$([^\u{00}-\u{20}<>"{}|^\u{60}\u{5c}]) { c.chars().next().unwrap() }
+
+//[139s]
+PNAME_NS -> &'input str = ns:$(PN_PREFIX? ":") {
+    ns
+}
+
+//[140s]
+PNAME_LN -> String = ns:$(PNAME_NS) local:$(PN_LOCAL) {?
+    state.namespaces.get(ns).map(|v| v.clone() + local).ok_or("Prefix not found")
+}
+
+//[141s]
+BLANK_NODE_LABEL -> &'input str = "_:" b:$((PN_CHARS_U / [0-9]) ((PN_CHARS / ".")* PN_CHARS)?) {
+    //TODO unescape
+    b
+}
+
+//[144s]
+LANGTAG -> &'input str = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {
+    l
+}
+
+//[19]
+INTEGER -> () = [+-]? [0-9]+
+
+//[20]
+DECIMAL -> () = [+-]? [0-9]* "." [0-9]+
+
+//[21]
+DOUBLE -> () = [+-]? ([0-9]+ "." [0-9]* EXPONENT / "." [0-9]+ EXPONENT / [0-9]+ EXPONENT)
+
+//[154s]
+EXPONENT -> () = [eE] [+-]? [0-9]+
+
+//[22]
+STRING_LITERAL_QUOTE -> String = "\"" l:((STRING_LITERAL_QUOTE_simple_char / ECHAR / UCHAR)*) "\"" {
+    l.into_iter().collect()
+}
+STRING_LITERAL_QUOTE_simple_char -> char = c:$([^\u{0022}\u{005c}\u{000a}\u{000d}]) { c.chars().next().unwrap() }
+
+//[23]
+STRING_LITERAL_SINGLE_QUOTE -> String = "'" l:((STRING_LITERAL_SINGLE_QUOTE_simple_char / ECHAR / UCHAR)*) "'" {
+    l.into_iter().collect()
+}
+STRING_LITERAL_SINGLE_QUOTE_simple_char -> char = c:$([^\u{0027}\u{005c}\u{000a}\u{000d}]) { c.chars().next().unwrap() }
+
+//[24] each body element is an optional run of one or two quotes followed by a non-quote character; the quotes are kept in the value
+STRING_LITERAL_LONG_SINGLE_QUOTE -> String = "'''" l:(STRING_LITERAL_LONG_SINGLE_QUOTE_part*) "'''" { l.into_iter().flat_map(|(q, c)| q.chars().chain(iter::once(c))).collect() }
+STRING_LITERAL_LONG_SINGLE_QUOTE_part -> (&'input str, char) = q:$(("''" / "'")?) c:(STRING_LITERAL_LONG_SINGLE_QUOTE_simple_char / ECHAR / UCHAR) { (q, c) }
+STRING_LITERAL_LONG_SINGLE_QUOTE_simple_char -> char = c:$([^\u{0027}\u{005c}]) { c.chars().next().unwrap() }
+
+//[25]
+// Same shape as [24]: the two-quote alternative is listed first because ordered choice never retries a longer match.
+STRING_LITERAL_LONG_QUOTE -> String = "\"\"\"" l:(STRING_LITERAL_LONG_QUOTE_part*) "\"\"\"" { l.into_iter().flat_map(|(q, c)| q.chars().chain(iter::once(c))).collect() }
+STRING_LITERAL_LONG_QUOTE_part -> (&'input str, char) = q:$(("\"\"" / "\"")?) c:(STRING_LITERAL_LONG_QUOTE_simple_char / ECHAR / UCHAR) { (q, c) }
+STRING_LITERAL_LONG_QUOTE_simple_char -> char = c:$([^\u{0022}\u{005c}]) { c.chars().next().unwrap() }
+
+//[26] an escape naming a surrogate or out-of-range code point is a parse failure, not a panic
+UCHAR -> char = "\\u" h:$(HEX HEX HEX HEX) {?
+    u32::from_str_radix(h, 16).ok().and_then(char::from_u32).ok_or("escape is not a valid Unicode scalar value")
+} / "\\U" h:$(HEX HEX HEX HEX HEX HEX HEX HEX) {?
+    u32::from_str_radix(h, 16).ok().and_then(char::from_u32).ok_or("escape is not a valid Unicode scalar value")
+}
+
+//[159s]
+ECHAR -> char = "\\" c:$([tbnrf"'\\]) {
+    match c {
+        "t" => '\u{0009}',
+        "b" => '\u{0008}',
+        "n" => '\u{000A}',
+        "r" => '\u{000D}',
+        "f" => '\u{000C}',
+        "\"" => '\u{0022}',
+        "'" => '\u{0027}',
+        "\\" => '\u{005C}',
+        _ => panic!("unexpected escaped char") // not possible
+    }
+}
+
+//[161s]
+WS -> () = #quiet<[\u{20}\u{9}\u{D}\u{A}]>
+
+//[162s]
+ANON -> () = "[" WS* "]"
+
+//[163s] includes the supplementary range #x10000-#xEFFFF listed by the Turtle grammar
+PN_CHARS_BASE -> () = [A-Za-z\u{00C0}-\u{00D6}\u{00D8}-\u{00F6}\u{00F8}-\u{02FF}\u{0370}-\u{037D}\u{037F}-\u{1FFF}\u{200C}-\u{200D}\u{2070}-\u{218F}\u{2C00}-\u{2FEF}\u{3001}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFFD}\u{10000}-\u{EFFFF}]
+
+//[164s]
+PN_CHARS_U -> () = PN_CHARS_BASE / "_"
+
+//[166s]
+PN_CHARS -> () = PN_CHARS_U / [\-0-9\u{00B7}\u{0300}-\u{036F}\u{203F}-\u{2040}]
+
+//[167s] interior dots may repeat ("a..b"), but a prefix never ends with a dot
+PN_PREFIX -> () = PN_CHARS_BASE PN_CHARS* ("."+ PN_CHARS+)*
+
+//[168s] same dot handling as PN_PREFIX: a trailing "." is left for the statement terminator
+PN_LOCAL -> () = (PN_CHARS_U / ":" / [0-9] / PLX) (PN_CHARS / ":" / PLX)* ("."+ (PN_CHARS / ":" / PLX)+)*
+
+//[169s]
+PLX -> String =
+    p:$(PERCENT) { p.into() } /
+    e:PN_LOCAL_ESC { iter::once(e).collect() }
+
+//[170s]
+PERCENT -> () = "%" HEX HEX
+
+//[171s]
+HEX -> () = ([0-9A-Fa-f])
+
+//[172s]
+PN_LOCAL_ESC -> char = "\\" c:$([_~\.\-!$&'()*+,;=/?#@%]) { c.chars().next().unwrap() }
+
+
+// optional run of whitespace and comments between tokens
+_ = #quiet<([ \t\n\r] / comment)*>
+// a comment runs from "#" to the end of the line
+comment = #quiet<"#" [^\r\n]*>