Adds a basic Turtle parser

pull/10/head
Tpt 6 years ago
parent 005271c348
commit 4c7fd0168b
  1. 3
      build.rs
  2. 1
      src/rio/mod.rs
  3. 4
      src/rio/ntriples/mod.rs
  4. 9
      src/rio/ntriples/ntriples_grammar.rustpeg
  5. 42
      src/rio/turtle/mod.rs
  6. 275
      src/rio/turtle/turtle_grammar.rustpeg

@@ -1,5 +1,6 @@
extern crate peg;

/// Build script: compiles each rust-peg grammar file into a Rust parser in
/// `OUT_DIR`; the generated sources are pulled in with `include!` by the
/// matching `mod grammar` blocks in `src/rio/*/mod.rs`.
fn main() {
    // The N-Triples grammar was renamed from `grammar.rustpeg` to
    // `ntriples_grammar.rustpeg`; the stale pre-rename build line is gone.
    peg::cargo_build("src/rio/ntriples/ntriples_grammar.rustpeg");
    peg::cargo_build("src/rio/turtle/turtle_grammar.rustpeg");
}

@@ -2,6 +2,7 @@ use std::error::Error;
use std::fmt;
pub mod ntriples;
pub mod turtle;
pub type RioResult<T> = Result<T, RioError>;

@@ -1,7 +1,7 @@
///Implements https://www.w3.org/TR/n-triples/
mod grammar {
include!(concat!(env!("OUT_DIR"), "/grammar.rs"));
include!(concat!(env!("OUT_DIR"), "/ntriples_grammar.rs"));
}
use model::data::*;
@@ -15,7 +15,7 @@ pub fn read_ntriples<'a, R: Read + 'a>(
data_factory: &'a DataFactory,
) -> impl Iterator<Item = RioResult<Triple>> {
let factory = data_factory.clone(); //TODO: try to avoid clone here
//TODO: use read_lines to avoid allocations
//TODO: use read_lines to avoid allocations
BufReader::new(source)
.lines()
.flat_map(move |line| match line {

@@ -35,8 +35,8 @@ literal -> Literal =
//[144s]
LANGTAG -> String = "@" l: $([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {
l.into()
LANGTAG -> &'input str = "@" l: $([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {
l
}
//[7]
@@ -55,8 +55,8 @@ STRING_LITERAL_QUOTE -> String = "\"" l: ((STRING_LITERAL_QUOTE_simple_char / EC
STRING_LITERAL_QUOTE_simple_char -> char = c: $([^\u{0022}\u{005c}\u{000a}\u{000d}]) { c.chars().next().unwrap() }
//[141s]
BLANK_NODE_LABEL -> String = "_:" b: $((PN_CHARS_U / [0-9]) ((PN_CHARS / ".")* PN_CHARS)?) {
b.into()
BLANK_NODE_LABEL -> &'input str = "_:" b: $((PN_CHARS_U / [0-9]) ((PN_CHARS / ".")* PN_CHARS)?) {
b
}
//[10]
@@ -93,6 +93,7 @@ PN_CHARS -> () = PN_CHARS_U / [\-0-9\u{00B7}\u{0300}-\u{036F}\u{203F}-\u{2040}]
//[162s]
HEX -> () = ([0-9A-Fa-f])
//space
_ = #quiet<[ \t]*>
//comment

@@ -0,0 +1,42 @@
/// Implements https://www.w3.org/TR/turtle/
mod grammar {
// Parser generated at build time by the `peg` crate from
// src/rio/turtle/turtle_grammar.rustpeg (see the cargo_build call in build.rs).
include!(concat!(env!("OUT_DIR"), "/turtle_grammar.rs"));
}
use model::data::*;
use rio::*;
use std::collections::HashMap;
use std::io::BufReader;
use std::io::Read;
//TODO: make private
/// Mutable state threaded through the generated Turtle grammar while parsing
/// (passed into every rule action via the grammar's `#![arguments]`).
pub struct ParserState {
// Base IRI set by `@base` / `BASE` directives. NOTE(review): the grammar's
// IRIREF rule does not resolve relative IRIs against it yet (see its TODO).
pub base_uri: String,
// Prefix table filled by `@prefix` / `PREFIX` directives; keys keep the
// trailing ':' exactly as matched by the grammar's PNAME_NS rule.
pub namespaces: HashMap<String, String>,
// Stack of current subjects; nested "[ ... ]" property lists push and pop here.
pub cur_subject: Vec<NamedOrBlankNode>,
// Stack of current predicates, pushed for each verb of a predicate-object list.
pub cur_predicate: Vec<NamedNode>,
}
/// Reads a Turtle document (https://www.w3.org/TR/turtle/) from `source`
/// and returns an iterator over the parsed triples.
///
/// The whole input is buffered in memory and parsed eagerly; the returned
/// iterator only walks the already-built triple buffer.
///
/// # Errors
/// Returns a `RioError` if reading from `source` fails or if the document
/// is not syntactically valid Turtle.
pub fn read_turtle<'a, R: Read + 'a>(
    source: R,
    data_factory: &'a DataFactory,
) -> RioResult<impl Iterator<Item = Triple>> {
    let factory = data_factory.clone(); //TODO: try to avoid clone here
    let mut state = ParserState {
        base_uri: String::default(),
        namespaces: HashMap::default(),
        cur_subject: Vec::default(),
        cur_predicate: Vec::default(),
    };
    let mut string_buffer = String::default();
    let mut triple_buffer = Vec::default();

    // Slurp the full document: the generated PEG parser needs the whole
    // input available as a single &str.
    BufReader::new(source)
        .read_to_string(&mut string_buffer)
        .map_err(RioError::new)?;
    // The grammar's actions push every parsed triple into `triple_buffer`.
    grammar::turtleDoc(&string_buffer, &mut state, &mut triple_buffer, &factory)
        .map_err(RioError::new)?;
    Ok(triple_buffer.into_iter())
}

@@ -0,0 +1,275 @@
//See https://www.w3.org/TR/turtle/#sec-grammar
//
// rust-peg grammar for the Turtle RDF syntax. The "//[n]" comments above each
// rule are the production numbers from the W3C Turtle specification grammar.
// Rule actions receive the extra arguments declared in #![arguments] below:
// a mutable ParserState (base IRI, prefix table, subject/predicate stacks),
// the output triple buffer, and the DataFactory used to build RDF terms.
use std::char;
use std::iter;
use model::data::*;
use rio::turtle::ParserState;
#![arguments(state: &mut ParserState, buffer: &mut Vec<Triple>, data_factory: &DataFactory)]
//[1]
// Entry point: optional leading whitespace/comments, then statements.
#[pub]
turtleDoc -> () = _ (statement _)*
//[2]
statement -> () = directive / triples "."
//[3]
directive -> () = prefixID / base / sparqlPrefix / sparqlBase
//[4]
// The stored prefix key includes the trailing ':' matched by PNAME_NS;
// lookups in PrefixedName / PNAME_LN use the same convention.
prefixID -> () = "@prefix" _ ns:PNAME_NS _ i:IRIREF _ "." {
state.namespaces.insert(ns.into(), i.into());
}
//[5]
// NOTE(review): base_uri is recorded but IRIREF does not resolve relative
// IRIs against it yet (see the TODO in IRIREF) — confirm before relying on @base.
base -> () = "@base" _ i:IRIREF _ "." {
state.base_uri = i.into();
}
//[5s]
sparqlBase -> () = "BASE"i _ i:IRIREF {
state.base_uri = i.into();
}
//[6s]
sparqlPrefix -> () = "PREFIX"i _ ns:PNAME_NS _ i:IRIREF {
state.namespaces.insert(ns.into(), i.into());
}
//[6]
// The current subject/predicate are kept on stacks so nested
// blankNodePropertyList productions can temporarily override them.
// NOTE(review): subject_push/predicate_push entries are never popped when a
// statement ends; after a nested blankNodePropertyList only its *subject* is
// popped, so cur_predicate.last() appears to keep the inner predicate for the
// remaining outer objects (e.g. "s p [ q x ], y .") — confirm with tests.
triples -> () = subject_push _ predicateObjectList / triples_blankNodePropertyList_push _ predicateObjectList?
subject_push -> () = s:subject {
state.cur_subject.push(s)
}
triples_blankNodePropertyList_push -> () = s: blankNodePropertyList {
state.cur_subject.push(s)
}
//[7]
predicateObjectList -> () = predicate_push _ objectList _ (";" _ (predicate_push _ objectList _)?)*
predicate_push -> () = v:verb {
state.cur_predicate.push(v)
}
//[8]
objectList -> () = object _ ("," _ object _)*
//[9]
// "a" is the standard abbreviation for rdf:type.
verb -> NamedNode = predicate /
"a" { data_factory.named_node("http://www.w3.org/1999/02/22-rdf-syntax-ns#type") }
// [10]
subject -> NamedOrBlankNode =
i:iri { i.into() } /
b:BlankNode { b.into() } /
c:collection { c }
//[11]
predicate -> NamedNode = iri
// [12]
// Emits one triple per parsed object, using the tops of the subject and
// predicate stacks; fails the parse (the {?} form) if either stack is empty.
object -> () = o:object_value {?
match state.cur_subject.last() {
Some(s) => match state.cur_predicate.last() {
Some(p) => {
buffer.push(data_factory.triple(s.clone(), p.clone(), o));
Ok(())
}
None => Err("Predicate not found")
},
None => Err("Subject not found")
}
}
object_value -> Term =
i:iri { i.into() } /
b:BlankNode { b.into() } /
c:collection { c.into() } /
b:blankNodePropertyList { b.into() } /
l:literal { l.into() }
//[13]
literal -> Literal = RDFLiteral / NumericLiteral / BooleanLiteral
//[14]
// "[" pushes a fresh blank node as current subject; "]" pops and returns it.
blankNodePropertyList -> NamedOrBlankNode = blankNodePropertyList_open _ predicateObjectList _ "]" {?
state.cur_subject.pop().ok_or("No subject found in the stack")
}
blankNodePropertyList_open -> () = "[" {
state.cur_subject.push(data_factory.new_blank_node().into())
}
//[15]
// Builds the rdf:first/rdf:rest linked list back-to-front, ending in rdf:nil;
// an empty collection "( )" is just the rdf:nil node.
collection -> NamedOrBlankNode = '(' _ o:(collection_value*) ')' {
let mut current_list_node = NamedOrBlankNode::from(data_factory.named_node("http://www.w3.org/1999/02/22-rdf-syntax-ns#nil"));
for obj in o.into_iter().rev() {
let new_blank_node = NamedOrBlankNode::from(data_factory.new_blank_node());
buffer.push(data_factory.triple(new_blank_node.clone(), data_factory.named_node("http://www.w3.org/1999/02/22-rdf-syntax-ns#first"), obj));
buffer.push(data_factory.triple(new_blank_node.clone(), data_factory.named_node("http://www.w3.org/1999/02/22-rdf-syntax-ns#rest"), current_list_node));
current_list_node = new_blank_node;
}
current_list_node
}
collection_value -> Term = o:object_value _ { o }
//[16]
// Numeric literals keep their lexical form and get an XSD datatype.
NumericLiteral -> Literal =
i:$(INTEGER) { data_factory.typed_literal(i, data_factory.named_node("http://www.w3.org/2001/XMLSchema#integer")) } /
d:$(DECIMAL) { data_factory.typed_literal(d, data_factory.named_node("http://www.w3.org/2001/XMLSchema#decimal")) } /
d:$(DOUBLE) { data_factory.typed_literal(d, data_factory.named_node("http://www.w3.org/2001/XMLSchema#double")) }
//[128s]
RDFLiteral -> Literal =
v:String _ "^^" _ t:iri { data_factory.typed_literal(v, t) } /
v:String _ l:LANGTAG { data_factory.language_tagged_literal(v, l) } /
v:String { data_factory.simple_literal(v) }
//[133s]
BooleanLiteral -> Literal =
"true" { data_factory.typed_literal("true", data_factory.named_node("http://www.w3.org/2001/XMLSchema#boolean")) } /
"false" { data_factory.typed_literal("false", data_factory.named_node("http://www.w3.org/2001/XMLSchema#boolean")) }
//[17]
// NOTE(review): PEG choice is ordered and committing — STRING_LITERAL_QUOTE can
// match the empty string "" at the start of a long literal ("""..."""), which
// would make the long-string alternatives unreachable here; the long forms
// usually need to be tried first in a PEG. Confirm with """ test cases.
String -> String = STRING_LITERAL_QUOTE / STRING_LITERAL_SINGLE_QUOTE / STRING_LITERAL_LONG_SINGLE_QUOTE / STRING_LITERAL_LONG_QUOTE
//[135s]
iri -> NamedNode = i:(IRIREF / PrefixedName) {
data_factory.named_node(i)
}
//[136s]
// Expands a prefixed name via the prefix table; an unknown prefix fails the
// parse (via {?}) rather than panicking.
PrefixedName -> String = PNAME_LN /
ns:PNAME_NS {? state.namespaces.get(ns).map(|v| v.clone()).ok_or("Prefix not found") }
//[137s]
BlankNode -> BlankNode =
b:BLANK_NODE_LABEL { data_factory.blank_node(b) } /
ANON { data_factory.new_blank_node() }
//[18]
// NOTE(review): the `_` inside IRIREF accepts whitespace and comments inside
// an IRI, which the Turtle spec does not allow — confirm whether intended.
IRIREF -> String = "<" _ i:((_IRIREF_simple_char / UCHAR)*) _ ">" {
//TODO: relative URIs resolution
i.into_iter().collect()
}
_IRIREF_simple_char -> char = c:$([^\u{00}-\u{20}<>"{}|^\u{60}\u{5c}]) { c.chars().next().unwrap() }
//[139s]
// Returns the prefix including its trailing ':' (possibly just ":").
PNAME_NS -> &'input str = ns:$(PN_PREFIX? ":") {
ns
}
//[140s]
PNAME_LN -> String = ns:$(PNAME_NS) local:$(PN_LOCAL) {?
state.namespaces.get(ns).map(|v| v.clone() + local).ok_or("Prefix not found")
}
//[141s]
BLANK_NODE_LABEL -> &'input str = "_:" b:$((PN_CHARS_U / [0-9]) ((PN_CHARS / ".")* PN_CHARS)?) {
//TODO unescape
b
}
//[144s]
LANGTAG -> &'input str = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {
l
}
//[19]
INTEGER -> () = [+-]? [0-9]+
//[20]
DECIMAL -> () = [+-]? [0-9]* "." [0-9]+
//[21]
DOUBLE -> () = [+-]? ([0-9]+ "." [0-9]* EXPONENT / "." [0-9]+ EXPONENT / [0-9]+ EXPONENT)
//[154s]
EXPONENT -> () = [eE] [+-]? [0-9]+
//[22]
STRING_LITERAL_QUOTE -> String = "\"" l:((STRING_LITERAL_QUOTE_simple_char / ECHAR / UCHAR)*) "\"" {
l.into_iter().collect()
}
STRING_LITERAL_QUOTE_simple_char -> char = c:$([^\u{0022}\u{005c}\u{000a}\u{000d}]) { c.chars().next().unwrap() }
//[23]
STRING_LITERAL_SINGLE_QUOTE -> String = "'" l:((STRING_LITERAL_SINGLE_QUOTE_simple_char / ECHAR / UCHAR)*) "'" {
l.into_iter().collect()
}
STRING_LITERAL_SINGLE_QUOTE_simple_char -> char = c:$([^\u{0027}\u{005c}\u{000a}\u{000d}]) { c.chars().next().unwrap() }
//[24]
STRING_LITERAL_LONG_SINGLE_QUOTE -> String = "'''" ("'" / "''")? l:((STRING_LITERAL_LONG_SINGLE_QUOTE_simple_char / ECHAR / UCHAR)*) "'''" {
l.into_iter().collect()
}
STRING_LITERAL_LONG_SINGLE_QUOTE_simple_char -> char = c:$([^\u{0027}\u{005c}]) { c.chars().next().unwrap() }
//[25]
STRING_LITERAL_LONG_QUOTE -> String = "\"\"\"" ("\"" / "\"\"")? l:((STRING_LITERAL_LONG_QUOTE_simple_char / ECHAR / UCHAR)*) "\"\"\"" {
l.into_iter().collect()
}
STRING_LITERAL_LONG_QUOTE_simple_char -> char = c:$([^\u{0022}\u{005c}]) { c.chars().next().unwrap() }
//[26]
// Decodes \uXXXX and \UXXXXXXXX escapes.
// NOTE(review): char::from_u32 returns None for surrogates/out-of-range code
// points, so the unwrap() can panic on input such as "\uD800" — confirm.
UCHAR -> char = "\\u" h:$(HEX HEX HEX HEX) {
u32::from_str_radix(h, 16).ok().and_then(char::from_u32).unwrap()
} / "\\U" h:$(HEX HEX HEX HEX HEX HEX HEX HEX) {
u32::from_str_radix(h, 16).ok().and_then(char::from_u32).unwrap()
}
//[159s]
// Single-character string escapes (\t, \b, \n, \r, \f, \", \', \\).
ECHAR -> char = "\\" c:$([tbnrf"'\\]) {
match c {
"t" => '\u{0009}',
"b" => '\u{0008}',
"n" => '\u{000A}',
"r" => '\u{000D}',
"f" => '\u{000C}',
"\"" => '\u{0022}',
"'" => '\u{0027}',
"\\" => '\u{005C}',
_ => panic!("unexpected escaped char") // not possible
}
}
//[161s]
WS -> () = #quiet<[\u{20}\u{9}\u{D}\u{A}]>
//[162s]
ANON -> () = "[" WS* "]"
//[163s]
PN_CHARS_BASE -> () = [A-Za-z\u{00C0}-\u{00D6}\u{00D8}-\u{00F6}\u{00F8}-\u{02FF}\u{0370}-\u{037D}\u{037F}-\u{1FFF}\u{200C}-\u{200D}\u{2070}-\u{218F}\u{2C00}-\u{2FEF}\u{3001}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFFD}]
//[164s]
PN_CHARS_U -> () = PN_CHARS_BASE / "_"
//[166s]
PN_CHARS -> () = PN_CHARS_U / [\-0-9\u{00B7}\u{0300}-\u{036F}\u{203F}-\u{2040}]
//[167s]
PN_PREFIX -> () = PN_CHARS_BASE PN_CHARS* ("." PN_CHARS+)*
//[168s]
PN_LOCAL -> () = (PN_CHARS_U / ":" / [0-9] / PLX) (PN_CHARS / ":" / PLX)* ("." (PN_CHARS / ":" / PLX)+)*
//[169s]
PLX -> String =
p:$(PERCENT) { p.into() } /
e:PN_LOCAL_ESC { iter::once(e).collect() }
//[170s]
PERCENT -> () = "%" HEX HEX
//[171s]
HEX -> () = ([0-9A-Fa-f])
//[172s]
// NOTE(review): PN_LOCAL_ESC decodes "\x" to the bare char, but PN_LOCAL is
// only captured textually via $(...) in PNAME_LN, so the decoded value is
// discarded and local names keep their backslashes — confirm unescaping intent.
PN_LOCAL_ESC -> char = "\\" c:$([_~\.\-!$&'()*+,;=/?#@%]) { c.chars().next().unwrap() }
//space
_ = #quiet<([ \t\n\r] / comment)*>
//comment
comment = #quiet<"#" [^\r\n]*>
Loading…
Cancel
Save