Optimizes PNAME_LN parsing in the Turtle and SPARQL grammars

7 years ago · c67ab460d0
parent d84190bd87
commit c67ab460d0
4 changed files with 53 additions and 35 deletions
--- a/lib/src/rio/turtle/mod.rs
+++ b/lib/src/rio/turtle/mod.rs
@ -73,9 +73,30 @@ mod grammar {
        );
    }

-    pub fn unescape_echars(input: &str) -> Cow<str> {
+    fn unescape_echars(input: &str) -> Cow<str> { // thin wrapper: fixes the character tables passed to unescape_characters
        unescape_characters(input, &UNESCAPE_CHARACTERS, &UNESCAPE_REPLACEMENT) // both tables are defined outside this hunk
    }
+
+    const UNESCAPE_PN_CHARACTERS: [u8; 20] = [ // the same 20 characters as the PN_LOCAL_ESC character class in the grammar
+        b'_', b'~', b'.', b'-', b'!', b'$', b'&', b'\'', b'(', b')', b'*', b'+', b',', b';', b'=',
+        b'/', b'?', b'#', b'@', b'%',
+    ];
+    lazy_static! {
+        static ref UNESCAPE_PN_REPLACEMENT: StaticSliceMap<char, char> = StaticSliceMap::new(
+            &[ // first slice: presumably the lookup keys — TODO confirm against StaticSliceMap::new
+                '_', '~', '.', '-', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', '/',
+                '?', '#', '@', '%'
+            ],
+            &[ // second slice: identical to the first, so every character is paired with itself
+                '_', '~', '.', '-', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', '/',
+                '?', '#', '@', '%'
+            ]
+        );
+    }
+
+    pub fn unescape_pn_local(input: &str) -> Cow<str> { // called by the PNAME_LN grammar rule on the raw text matched by PN_LOCAL
+        unescape_characters(input, &UNESCAPE_PN_CHARACTERS, &UNESCAPE_PN_REPLACEMENT) // presumably rewrites `\c` to `c` for the listed characters — confirm in unescape_characters
+    }
 }

 pub use self::grammar::read_turtle;
--- a/lib/src/rio/turtle/turtle_grammar.rustpeg
+++ b/lib/src/rio/turtle/turtle_grammar.rustpeg
@ -3,7 +3,6 @@
 use std::char;
 use model::vocab::rdf;
 use model::vocab::xsd;
-use std::iter;
 use std::str::FromStr;
 use rio::utils::unescape_unicode_codepoints;

@ -170,9 +169,9 @@ PNAME_NS -> &'input str = ns:$(PN_PREFIX? ":") {
 }

 //[140s]
-PNAME_LN -> Url = ns:$(PNAME_NS) local:PN_LOCAL {?
+PNAME_LN -> Url = ns:$(PNAME_NS) local:$(PN_LOCAL) {?
    match state.namespaces.get(ns) {
-        Some(ns) => match Url::parse(&(ns.to_string() + &local)) {
+        Some(ns) => match Url::parse(&(ns.to_string() + &unescape_pn_local(local))) {
            Ok(url) => Ok(url),
            Err(error) => Err("IRI parsing failed")
        },
@ -249,21 +248,10 @@ PN_CHARS -> () = [\-0-9\u{00B7}\u{0300}-\u{036F}\u{203F}-\u{2040}] / PN_CHARS_U
 PN_PREFIX -> () = PN_CHARS_BASE PN_CHARS* ("."+ PN_CHARS+)*

 //[168s]
-PN_LOCAL -> String = f:PN_LOCAL_first c:(PN_LOCAL_next*) e:(PN_LOCAL_next_dot*) {
-    f.to_string() + &c.concat() + &e.concat()
-}
-PN_LOCAL_first -> String =
-    c:$(":" / [0-9] / PN_CHARS_U) { c.into() } /
-    s:PLX { s }
-PN_LOCAL_next -> String =
-    c:$(":" / PN_CHARS) { c.into() } /
-    s:PLX { s }
-PN_LOCAL_next_dot -> String = d:$("."+) f:PN_LOCAL_next { d.to_string() + &f}
+PN_LOCAL -> () = (PN_CHARS_U / ':' / [0-9] / PLX) (PN_CHARS / ':' / PLX)* ('.'+ (PN_CHARS / ':' / PLX)+)* // dot groups repeat (e.g. a.b.c); a trailing '.' is never consumed

 //[169s]
-PLX -> String =
-    p:$(PERCENT) { p.into() } /
-    e:PN_LOCAL_ESC { iter::once(e).collect() }
+PLX -> () = PERCENT / PN_LOCAL_ESC // recognition only: returns no value, PNAME_LN captures the text with $(PN_LOCAL)

 //[170s]
 PERCENT -> () = "%" HEX HEX
@ -272,7 +260,7 @@ PERCENT -> () = "%" HEX HEX
 HEX -> () = ([0-9A-Fa-f])

 //[172s]
-PN_LOCAL_ESC -> char = "\\" c:$([_~\.\-!$&'()*+,;=/?#@%]) { c.chars().next().unwrap() }
+PN_LOCAL_ESC -> () = "\\" [_~\.\-!$&'()*+,;=/?#@%] // a backslash followed by one of these characters; the escape is resolved later by unescape_pn_local in PNAME_LN


 //space
--- a/lib/src/sparql/parser.rs
+++ b/lib/src/sparql/parser.rs
@ -329,6 +329,27 @@ mod grammar {
        unescape_characters(input, &UNESCAPE_CHARACTERS, &UNESCAPE_REPLACEMENT)
    }

+    const UNESCAPE_PN_CHARACTERS: [u8; 20] = [ // the same 20 characters as the PN_LOCAL_ESC character class in the grammar
+        b'_', b'~', b'.', b'-', b'!', b'$', b'&', b'\'', b'(', b')', b'*', b'+', b',', b';', b'=',
+        b'/', b'?', b'#', b'@', b'%',
+    ];
+    lazy_static! {
+        static ref UNESCAPE_PN_REPLACEMENT: StaticSliceMap<char, char> = StaticSliceMap::new(
+            &[ // first slice: presumably the lookup keys — TODO confirm against StaticSliceMap::new
+                '_', '~', '.', '-', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', '/',
+                '?', '#', '@', '%'
+            ],
+            &[ // second slice: identical to the first, so every character is paired with itself
+                '_', '~', '.', '-', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', '/',
+                '?', '#', '@', '%'
+            ]
+        );
+    }
+
+    pub fn unescape_pn_local(input: &str) -> Cow<str> { // called by the PNAME_LN grammar rule on the raw text matched by PN_LOCAL
+        unescape_characters(input, &UNESCAPE_PN_CHARACTERS, &UNESCAPE_PN_REPLACEMENT) // presumably rewrites `\c` to `c` for the listed characters — confirm in unescape_characters
+    }
+
    include!(concat!(env!("OUT_DIR"), "/sparql_grammar.rs"));

    pub fn read_sparql_query<'a, R: Read + 'a>(
--- a/lib/src/sparql/sparql_grammar.rustpeg
+++ b/lib/src/sparql/sparql_grammar.rustpeg
@ -3,7 +3,6 @@
 use std::char;
 use model::vocab::rdf;
 use model::vocab::xsd;
-use std::iter;
 use std::str::FromStr;

 #![arguments(state: &mut ParserState)]
@ -933,8 +932,8 @@ PNAME_NS -> &'input str = ns:$(PN_PREFIX? ":") {
 }

 //[141]
-PNAME_LN -> String = ns:$(PNAME_NS) local:PN_LOCAL {?
-    state.namespaces.get(ns).map(|v| v.clone() + &local).ok_or("Prefix not found")
+PNAME_LN -> String = ns:$(PNAME_NS) local:$(PN_LOCAL) {? // `local` is the raw matched text, so it still contains any backslash escapes
+    state.namespaces.get(ns).map(|v| v.clone() + &unescape_pn_local(local)).ok_or("Prefix not found") // namespace value + unescaped local part; Err when `ns` is not in state.namespaces
 }

 //[142]
@ -1038,21 +1037,10 @@ PN_CHARS -> () = [\-0-9\u{00B7}\u{0300}-\u{036F}\u{203F}-\u{2040}] / PN_CHARS_U
 PN_PREFIX -> () = PN_CHARS_BASE PN_CHARS* ("."+ PN_CHARS+)*

 //[169]
-PN_LOCAL -> String = f:PN_LOCAL_first c:(PN_LOCAL_next*) e:(PN_LOCAL_next_dot*) {
-    f.to_string() + &c.concat() + &e.concat()
-}
-PN_LOCAL_first -> String =
-    c:$(":" / [0-9] / PN_CHARS_U) { c.into() } /
-    PLX
-PN_LOCAL_next -> String =
-    c:$(":" / PN_CHARS) { c.into() } /
-    PLX
-PN_LOCAL_next_dot -> String = d:$('.'+) f:PN_LOCAL_next* { d.to_string() + &f.concat()}
+PN_LOCAL -> () = (PN_CHARS_U / ':' / [0-9] / PLX) (PN_CHARS / ':' / PLX)* ('.'+ (PN_CHARS / ':' / PLX)+)* // dot groups repeat (e.g. a.b.c); a trailing '.' is never consumed

 //[170]
-PLX -> String =
-    p:$(PERCENT) { p.into() } /
-    e:PN_LOCAL_ESC { iter::once(e).collect() }
+PLX -> () = PERCENT / PN_LOCAL_ESC // recognition only: returns no value, PNAME_LN captures the text with $(PN_LOCAL)

 //[171]
 PERCENT -> () = "%" HEX HEX
@ -1061,7 +1049,7 @@ PERCENT -> () = "%" HEX HEX
 HEX -> () = ([0-9A-Fa-f])

 //[173]
-PN_LOCAL_ESC -> char = "\\" c:$([_~\.\-!$&'()*+,;=/?#@%]) { c.chars().next().unwrap() } //TODO: added '/' to make tests pass but is it valid?
+PN_LOCAL_ESC -> () = "\\" [_~\.\-!$&'()*+,;=/?#@%] // '/' is valid here: the SPARQL 1.1 grammar lists it in PN_LOCAL_ESC (production [173])

 //space
 _ = #quiet<([ \t\n\r] / comment)*>