Adds proper decoding of N-Triples escaped characters

pull/10/head
Tpt 7 years ago
parent 1dd7f43cc1
commit 005271c348
  1. 51
      src/rio/ntriples/grammar.rustpeg
  2. 2
      src/rio/ntriples/mod.rs

@ -10,50 +10,62 @@ use model::data::*;
triple -> Option<Triple> = triple -> Option<Triple> =
_ s:subject _ p:predicate _ o:object _ "." _ comment? { Some(data_factory.triple(s, p, o)) } / _ s:subject _ p:predicate _ o:object _ "." _ comment? { Some(data_factory.triple(s, p, o)) } /
_ comment? { None } _ comment? { None }
//[3] //[3]
subject -> NamedOrBlankNode = subject -> NamedOrBlankNode =
i: IRIREF { data_factory.named_node(i).into() } / i: IRIREF { data_factory.named_node(i).into() } /
b: BLANK_NODE_LABEL { data_factory.blank_node(b).into() } b: BLANK_NODE_LABEL { data_factory.blank_node(b).into() }
//[4] //[4]
predicate -> NamedNode = i:IRIREF { predicate -> NamedNode = i:IRIREF {
data_factory.named_node(i) data_factory.named_node(i)
} }
//[5] //[5]
object -> Term = object -> Term =
i: IRIREF { data_factory.named_node(i).into() } / i: IRIREF { data_factory.named_node(i).into() } /
b: BLANK_NODE_LABEL { data_factory.blank_node(b).into() } / b: BLANK_NODE_LABEL { data_factory.blank_node(b).into() } /
l: literal { l.into() } l: literal { l.into() }
//[6] //[6]
literal -> Literal = literal -> Literal =
v: STRING_LITERAL_QUOTE _ "^^" _ t:IRIREF { data_factory.typed_literal(v, data_factory.named_node(t)) } / v: STRING_LITERAL_QUOTE _ "^^" _ t:IRIREF { data_factory.typed_literal(v, data_factory.named_node(t)) } /
v: STRING_LITERAL_QUOTE _ l:LANGTAG { data_factory.language_tagged_literal(v, l) } / v: STRING_LITERAL_QUOTE _ l:LANGTAG { data_factory.language_tagged_literal(v, l) } /
v: STRING_LITERAL_QUOTE { data_factory.simple_literal(v) } v: STRING_LITERAL_QUOTE { data_factory.simple_literal(v) }
//[144s] //[144s]
LANGTAG -> String = "@" l: $([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) { LANGTAG -> String = "@" l: $([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {
l.into() l.into()
} }
//[7] //[7]
EOL = [\r\n]+ EOL = [\r\n]+
//[8] //[8]
IRIREF -> String = "<" _ i: $(([^<>{}] / UCHAR)*) _ ">" { IRIREF -> String = "<" _ i: ((_IRIREF_simple_char / UCHAR)*) _ ">" {
i.into() i.into_iter().collect()
} }
_IRIREF_simple_char -> char = c: $([^\u{00}-\u{20}<>"{}|^\u{60}\u{5c}]) { c.chars().next().unwrap() }
//[9] //[9]
STRING_LITERAL_QUOTE -> String = "\"" l: ((NOT_BAD_LITERAL_VALUE / ECHAR / UCHAR)*) "\"" { STRING_LITERAL_QUOTE -> String = "\"" l: ((STRING_LITERAL_QUOTE_simple_char / ECHAR / UCHAR)*) "\"" {
l.into_iter().collect() l.into_iter().collect()
} }
NOT_BAD_LITERAL_VALUE -> char = c: $([^\u{0022}\u{005c}\u{000a}\u{000d}]) { c.chars().next().unwrap() } STRING_LITERAL_QUOTE_simple_char -> char = c: $([^\u{0022}\u{005c}\u{000a}\u{000d}]) { c.chars().next().unwrap() }
//[141s] //[141s]
BLANK_NODE_LABEL -> String = "_:" b: $((PN_CHARS_U / [0-9]) ((PN_CHARS / ".")* PN_CHARS)?) { BLANK_NODE_LABEL -> String = "_:" b: $((PN_CHARS_U / [0-9]) ((PN_CHARS / ".")* PN_CHARS)?) {
b.into() b.into()
} }
//[10] //[10]
UCHAR -> char = "\\u" h: $(HEX HEX HEX HEX) { UCHAR -> char = "\\u" h: $(HEX HEX HEX HEX) {
u32::from_str_radix(h, 16).ok().and_then(char::from_u32).unwrap() u32::from_str_radix(h, 16).ok().and_then(char::from_u32).unwrap()
} / "\\U" h: $(HEX HEX HEX HEX HEX HEX HEX HEX) { } / "\\U" h: $(HEX HEX HEX HEX HEX HEX HEX HEX) {
u32::from_str_radix(h, 16).ok().and_then(char::from_u32).unwrap() u32::from_str_radix(h, 16).ok().and_then(char::from_u32).unwrap()
} }
//[153s] //[153s]
ECHAR -> char = '\\' c: $([tbnrf"'\\]) { ECHAR -> char = '\\' c: $([tbnrf"'\\]) {
match c { match c {
@ -68,35 +80,20 @@ ECHAR -> char = '\\' c: $([tbnrf"'\\]) {
_ => panic!("unexpected escaped char") // not possible _ => panic!("unexpected escaped char") // not possible
} }
} }
//[157s] //[157s]
PN_CHARS_BASE -> char = c: $([A-Za-z\u{00C0}-\u{00D6}\u{00D8}-\u{00F6}\u{00F8}-\u{02FF}\u{0370}-\u{037D}\u{037F}-\u{1FFF}\u{200C}-\u{200D}\u{2070}-\u{218F}\u{2C00}-\u{2FEF}\u{3001}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFFD}]) { c.chars().next().unwrap() } PN_CHARS_BASE -> () = [A-Za-z\u{00C0}-\u{00D6}\u{00D8}-\u{00F6}\u{00F8}-\u{02FF}\u{0370}-\u{037D}\u{037F}-\u{1FFF}\u{200C}-\u{200D}\u{2070}-\u{218F}\u{2C00}-\u{2FEF}\u{3001}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFFD}]
//[158s] //[158s]
PN_CHARS_U -> char = PN_CHARS_BASE / '_' { '_' } / ':' { ':' } PN_CHARS_U -> () = PN_CHARS_BASE / '_' / ':'
//[160s] //[160s]
PN_CHARS -> char = PN_CHARS_U / c: $([\-0-9\u{00B7}\u{0300}-\u{036F}\u{203F}-\u{2040}]) { c.chars().next().unwrap() } PN_CHARS -> () = PN_CHARS_U / [\-0-9\u{00B7}\u{0300}-\u{036F}\u{203F}-\u{2040}]
//[162s] //[162s]
HEX -> char = c: $([0-9A-Fa-f]) { c.chars().next().unwrap() } HEX -> () = ([0-9A-Fa-f])
//space //space
_ = #quiet<[ \t]*> _ = #quiet<[ \t]*>
//comment //comment
comment = #quiet<"#" [^\r\n]*> comment = #quiet<"#" [^\r\n]*>
/*grammar;
pub NTripleLine: Option<Triple> = {
Comment? => None,
<t:NTriple> Comment? => Some(t)
};
pub NQuadLine: Option<Quad> = {
Comment? => None,
<t:NQuad> Comment? => Some(t)
};
NTriple: Triple = <s:Node> <p:IRI> <o:RDFTerm> "." => data_factory.triple(s, p, o);
NQuad: Quad = {
<s:Node> <p:IRI> <o:RDFTerm> <g:Node> "." => data_factory.quad(s, p, o, Some(g)),
<s:Node> <p:IRI> <o:RDFTerm> "." => data_factory.quad(s, p, o, None)
};
*/

@ -9,14 +9,12 @@ use rio::*;
use std::io::BufRead; use std::io::BufRead;
use std::io::BufReader; use std::io::BufReader;
use std::io::Read; use std::io::Read;
use std::sync::Arc;
pub fn read_ntriples<'a, R: Read + 'a>( pub fn read_ntriples<'a, R: Read + 'a>(
source: R, source: R,
data_factory: &'a DataFactory, data_factory: &'a DataFactory,
) -> impl Iterator<Item = RioResult<Triple>> { ) -> impl Iterator<Item = RioResult<Triple>> {
let factory = data_factory.clone(); //TODO: try to avoid clone here let factory = data_factory.clone(); //TODO: try to avoid clone here
let mut input = String::new();
//TODO: use read_lines to avoid allocations //TODO: use read_lines to avoid allocations
BufReader::new(source) BufReader::new(source)
.lines() .lines()

Loading…
Cancel
Save