Optimises NTriples encoded string decoding

Avoids copies when possible
7 years ago · 47de822737
parent 642dd15131
commit 47de822737
6 changed files with 212 additions and 122 deletions
--- a/src/rio/mod.rs
+++ b/src/rio/mod.rs
@ -1,2 +1,3 @@
 pub mod ntriples;
 pub mod turtle;
+pub(crate) mod utils;
--- a/src/rio/ntriples/mod.rs
+++ b/src/rio/ntriples/mod.rs
@ -3,6 +3,26 @@
 mod grammar {
    #![allow(unknown_lints)]
    #![allow(clippy)]
+
+    use rio::utils::unescape_characters;
+    use std::borrow::Cow;
+    use utils::StaticSliceMap;
+
+    // Bytes that may follow the escape character in an ECHAR; handed to
+    // `unescape_characters`, which uses them to decide whether a string needs rewriting.
+    const UNESCAPE_CHARACTERS: [u8; 8] = [b't', b'b', b'n', b'r', b'f', b'"', b'\'', b'\\'];
+    lazy_static! {
+        // Maps each escape letter (listed in the same order as `UNESCAPE_CHARACTERS`) to the
+        // character it stands for.
+        static ref UNESCAPE_REPLACEMENT: StaticSliceMap<char, char> = StaticSliceMap::new(
+            &['t', 'b', 'n', 'r', 'f', '"', '\'', '\\'],
+            &[
+                '\u{0009}', '\u{0008}', '\u{000A}', '\u{000D}', '\u{000C}', '\u{0022}', '\u{0027}',
+                '\u{005C}'
+            ]
+        );
+    }
+
+    /// Unescapes `input` with `unescape_characters`, using the escape letters of
+    /// `UNESCAPE_CHARACTERS` and the replacement table `UNESCAPE_REPLACEMENT`.
+    ///
+    /// The result borrows `input` when `unescape_characters` decides that nothing has to be
+    /// rewritten.
+    pub fn unescape_echars(input: &str) -> Cow<str> {
+        unescape_characters(input, &UNESCAPE_CHARACTERS, &UNESCAPE_REPLACEMENT)
+    }
+
    include!(concat!(env!("OUT_DIR"), "/ntriples_grammar.rs"));
 }

--- a/src/rio/ntriples/ntriples_grammar.rustpeg
+++ b/src/rio/ntriples/ntriples_grammar.rustpeg
@ -1,10 +1,10 @@
 //See https://www.w3.org/TR/2014/REC-n-triples-20140225/#n-triples-grammar

-use std::iter::FromIterator;
 use std::char;
 use std::str::FromStr;
 use model::*;
 use std::collections::BTreeMap;
+use rio::utils::unescape_unicode_codepoints;

 #![arguments(bnodes_map: &mut BTreeMap<String, BlankNode>)]

@ -45,20 +45,19 @@ LANGTAG -> &'input str = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {
 EOL = [\r\n]+

 //[8]
-IRIREF -> NamedNode = "<" _ i:((_IRIREF_simple_char / UCHAR)*) _ ">" {?
-    let s = String::from_iter(i.into_iter());
+IRIREF -> NamedNode = "<" _ i:$(([^\u{00}-\u{20}<>"{}|^\u{60}\u{5c}] / UCHAR)*) _ ">" {?
+    let s = unescape_unicode_codepoints(i);
    match NamedNode::from_str(&s) {
        Ok(named_node) => Ok(named_node),
        Err(error) => Err("IRI parsing failed")
    }
 }
-_IRIREF_simple_char -> char = c:$([^\u{00}-\u{20}<>"{}|^\u{60}\u{5c}]) { c.chars().next().unwrap() }
+// NOTE(review): IRIREF no longer references this rule, and `$()` captures an empty string, so
+// the `unwrap()` below would panic if the rule were ever used. This looks like a leftover
+// that should be deleted; confirm nothing else in the grammar refers to it.
+_IRIREF_simple_char -> char = c:$() { c.chars().next().unwrap() }

 //[9]
-STRING_LITERAL_QUOTE -> String = "\"" l:((STRING_LITERAL_QUOTE_simple_char / ECHAR / UCHAR)*) "\"" {
-    l.into_iter().collect()
+// NOTE(review): the ECHAR pass runs first and its output is fed to the UCHAR pass, so an
+// escaped backslash followed by `u` and four hex digits can end up decoded as a code point
+// instead of staying a literal backslash sequence. Confirm, and consider decoding both kinds
+// of escapes in a single pass.
+STRING_LITERAL_QUOTE -> String = "\"" l:$(([^\u{0022}\u{005c}\u{000a}\u{000d}] / ECHAR / UCHAR)*) "\"" {
+    unescape_unicode_codepoints(&unescape_echars(l)).into_owned()
 }
-STRING_LITERAL_QUOTE_simple_char -> char = c: $([^\u{0022}\u{005c}\u{000a}\u{000d}]) { c.chars().next().unwrap() }

 //[141s]
 BLANK_NODE_LABEL -> BlankNode = "_:" b:$(([0-9] / PN_CHARS_U) PN_CHARS* ("."+ PN_CHARS+)*) {
@ -66,26 +65,10 @@ BLANK_NODE_LABEL -> BlankNode = "_:" b:$(([0-9] / PN_CHARS_U) PN_CHARS* ("."+ PN
 }

 //[10]
-UCHAR -> char = "\\u" h: $(HEX HEX HEX HEX) {
-    u32::from_str_radix(h, 16).ok().and_then(char::from_u32).unwrap()
-} / "\\U" h: $(HEX HEX HEX HEX HEX HEX HEX HEX) {
-    u32::from_str_radix(h, 16).ok().and_then(char::from_u32).unwrap()
-}
+// Only recognises the escape: decoding is done afterwards by `unescape_unicode_codepoints`,
+// which IRIREF and STRING_LITERAL_QUOTE apply to the whole matched slice.
+UCHAR -> () = "\\u" HEX HEX HEX HEX / "\\U" HEX HEX HEX HEX HEX HEX HEX HEX

 //[153s]
-ECHAR -> char = '\\' c:$([tbnrf"'\\]) {
-    match c {
-        "t" => '\u{0009}',
-        "b" => '\u{0008}',
-        "n" => '\u{000A}',
-        "r" => '\u{000D}',
-        "f" => '\u{000C}',
-        "\"" => '\u{0022}',
-        "'" => '\u{0027}',
-        "\\" => '\u{005C}',
-        _ => panic!("unexpected escaped char") // not possible
-    }
-}
+// Only recognises the escape: STRING_LITERAL_QUOTE decodes it afterwards with
+// `unescape_echars`.
+ECHAR -> () = '\\' [tbnrf"'\\]

 //[157s]
 PN_CHARS_BASE -> () = [A-Za-z\u{00C0}-\u{00D6}\u{00D8}-\u{00F6}\u{00F8}-\u{02FF}\u{0370}-\u{037D}\u{037F}-\u{1FFF}\u{200C}-\u{200D}\u{2070}-\u{218F}\u{2C00}-\u{2FEF}\u{3001}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFFD}]
--- a/src/rio/utils.rs
+++ b/src/rio/utils.rs
@ -0,0 +1,156 @@
+use std::borrow::Cow;
+use std::char;
+use std::str::Chars;
+use utils::StaticSliceMap;
+
+/// Decodes the `\uXXXX` and `\UXXXXXXXX` escapes of `input` with
+/// `UnescapeUnicodeCharIterator`.
+///
+/// The input is returned borrowed when `needs_unescape_unicode_codepoints` reports that
+/// there is nothing to decode; otherwise a new string is built.
+pub fn unescape_unicode_codepoints(input: &str) -> Cow<str> {
+    if !needs_unescape_unicode_codepoints(input) {
+        return Cow::Borrowed(input);
+    }
+    let decoded: String = UnescapeUnicodeCharIterator::new(input).collect();
+    Cow::Owned(decoded)
+}
+
+/// Returns `true` when `input` contains a backslash immediately followed by `u` or `U`,
+/// i.e. when it may hold a `\uXXXX` / `\UXXXXXXXX` escape that has to be decoded.
+///
+/// The scan works on bytes: the three ASCII bytes looked for never occur inside a
+/// multi-byte UTF-8 sequence, so no character decoding is needed.
+fn needs_unescape_unicode_codepoints(input: &str) -> bool {
+    input
+        .as_bytes()
+        .windows(2)
+        // The escape character is a backslash, not a forward slash.
+        .any(|pair| pair[0] == b'\\' && (pair[1] == b'u' || pair[1] == b'U'))
+}
+
+/// Iterator over the characters of a string in which `\uXXXX` and `\UXXXXXXXX` escapes are
+/// replaced by the code point they denote.
+///
+/// Anything that is not a well-formed escape (another backslash sequence, truncated input,
+/// non-hexadecimal digits, a value that is not a valid `char`) is emitted unchanged.
+struct UnescapeUnicodeCharIterator<'a> {
+    /// Remaining input.
+    iter: Chars<'a>,
+    /// Characters already read from `iter` that still have to be emitted as they are.
+    buffer: String,
+}
+
+impl<'a> UnescapeUnicodeCharIterator<'a> {
+    fn new(string: &'a str) -> Self {
+        Self {
+            iter: string.chars(),
+            // Room for the longest pending sequence: the `U` marker plus 8 ASCII digits.
+            buffer: String::with_capacity(9),
+        }
+    }
+
+    /// Reads the `digits` characters that follow an escape `marker` and returns the
+    /// character to emit next.
+    ///
+    /// `marker` must be ASCII (`u` or `U`). On success the decoded character is returned
+    /// and `buffer` is left empty. Otherwise a backslash is returned and `buffer` keeps the
+    /// marker and every character read so far, so that they are emitted verbatim by the
+    /// following calls to `next`.
+    fn read_escape(&mut self, marker: char, digits: usize) -> char {
+        self.buffer.push(marker);
+        for _ in 0..digits {
+            match self.iter.next() {
+                Some(c) => self.buffer.push(c),
+                // Truncated escape: give back what was read.
+                None => return '\\',
+            }
+        }
+        let decoded = {
+            // `marker` is ASCII, so byte 1 is always a character boundary.
+            let hex = &self.buffer[1..];
+            // Checking every byte avoids slicing through a multi-byte character and
+            // rejects the leading `+` that `from_str_radix` would otherwise accept.
+            if hex.bytes().all(|b| b.is_ascii_hexdigit()) {
+                u32::from_str_radix(hex, 16).ok().and_then(char::from_u32)
+            } else {
+                None
+            }
+        };
+        match decoded {
+            Some(c) => {
+                self.buffer.clear();
+                c
+            }
+            None => '\\',
+        }
+    }
+}
+
+impl<'a> Iterator for UnescapeUnicodeCharIterator<'a> {
+    type Item = char;
+
+    fn next(&mut self) -> Option<char> {
+        // Flush the characters left over from a sequence that was not decoded.
+        if !self.buffer.is_empty() {
+            return Some(self.buffer.remove(0));
+        }
+        match self.iter.next()? {
+            '\\' => match self.iter.next() {
+                Some('u') => Some(self.read_escape('u', 4)),
+                Some('U') => Some(self.read_escape('U', 8)),
+                Some(c) => {
+                    // Consume both characters together so that the second half of an
+                    // escaped backslash is not taken for the start of a new escape.
+                    self.buffer.push(c);
+                    Some('\\')
+                }
+                None => Some('\\'),
+            },
+            c => Some(c),
+        }
+    }
+}
+
+/// Rewrites `input` through `UnescapeCharsIterator` with the `replacement` table.
+///
+/// `characters` is forwarded to `needs_unescape_characters`; when that check finds nothing
+/// to unescape, `input` is returned borrowed and no string is allocated.
+pub fn unescape_characters<'a>(
+    input: &'a str,
+    characters: &'static [u8],
+    replacement: &'static StaticSliceMap<char, char>,
+) -> Cow<'a, str> {
+    if !needs_unescape_characters(input, characters) {
+        return Cow::Borrowed(input);
+    }
+    let unescaped: String = UnescapeCharsIterator::new(input, replacement).collect();
+    Cow::Owned(unescaped)
+}
+
+/// Returns `true` when `input` contains a backslash immediately followed by one of the
+/// bytes of `characters`, i.e. when there is at least one escape to replace.
+///
+/// The scan works on bytes, which is safe as long as `characters` only holds ASCII bytes:
+/// those never occur inside a multi-byte UTF-8 sequence.
+fn needs_unescape_characters(input: &str, characters: &[u8]) -> bool {
+    input
+        .as_bytes()
+        .windows(2)
+        // The escape character is a backslash, not a forward slash.
+        .any(|pair| pair[0] == b'\\' && characters.contains(&pair[1]))
+}
+
+/// Iterator over the characters of a string in which a backslash followed by a key of
+/// `replacement` is replaced by the character mapped to that key.
+///
+/// A backslash followed by any other character, or ending the input, is emitted unchanged.
+struct UnescapeCharsIterator<'a> {
+    /// Remaining input.
+    iter: Chars<'a>,
+    /// Character read after a backslash that was not an escape and is still to be emitted.
+    buffer: Option<char>,
+    /// Escape letter to replacement character table.
+    replacement: &'static StaticSliceMap<char, char>,
+}
+
+impl<'a> UnescapeCharsIterator<'a> {
+    fn new(string: &'a str, replacement: &'static StaticSliceMap<char, char>) -> Self {
+        let iter = string.chars();
+        Self {
+            iter,
+            buffer: None,
+            replacement,
+        }
+    }
+}
+
+impl<'a> Iterator for UnescapeCharsIterator<'a> {
+    type Item = char;
+
+    fn next(&mut self) -> Option<char> {
+        // A character held back by the previous call goes out first.
+        if let Some(pending) = self.buffer.take() {
+            return Some(pending);
+        }
+        let current = self.iter.next()?;
+        if current != '\\' {
+            return Some(current);
+        }
+        let escaped = match self.iter.next() {
+            Some(escaped) => escaped,
+            // Trailing backslash: nothing to pair it with.
+            None => return Some('\\'),
+        };
+        if let Some(unescaped) = self.replacement.get(escaped) {
+            return Some(unescaped);
+        }
+        // Not a known escape: emit the backslash now and the character on the next call.
+        self.buffer = Some(escaped);
+        Some('\\')
+    }
+}
--- a/src/sparql/parser.rs
+++ b/src/sparql/parser.rs
@ -1,16 +1,11 @@
-use std::borrow::Cow;
-use std::char;
-use std::str::Chars;
-
 mod grammar {
    #![allow(unknown_lints)]
    #![allow(clippy)]

    use model::*;
+    use rio::utils::unescape_unicode_codepoints;
    use sparql::algebra::*;
    use sparql::model::*;
-    use sparql::parser::unescape_unicode_codepoints;
-    use std::borrow::Cow;
    use std::collections::BTreeMap;
    use std::collections::HashMap;
    use std::io::BufReader;
@ -299,7 +294,7 @@ mod grammar {
        BufReader::new(source).read_to_string(&mut string_buffer)?;

        Ok(QueryUnit(
-            &unescape_unicode_codepoints(Cow::from(string_buffer)),
+            &unescape_unicode_codepoints(&string_buffer),
            &mut state,
        )?)
    }
@ -307,93 +302,3 @@ mod grammar {

 pub(crate) type ParseError = self::grammar::ParseError;
 pub use self::grammar::read_sparql_query;
-
-fn needs_unescape_unicode_codepoints(input: &str) -> bool {
-    let bytes = input.as_bytes();
-    for i in 1..bytes.len() {
-        if (bytes[i] == b'u' || bytes[i] == b'U') && bytes[i - 1] == b'/' {
-            return true;
-        }
-    }
-    false
-}
-
-struct UnescapeUnicodeCharIterator<'a> {
-    iter: Chars<'a>,
-    buffer: String,
-}
-
-impl<'a> UnescapeUnicodeCharIterator<'a> {
-    fn new(string: &'a str) -> Self {
-        Self {
-            iter: string.chars(),
-            buffer: String::with_capacity(9),
-        }
-    }
-}
-
-impl<'a> Iterator for UnescapeUnicodeCharIterator<'a> {
-    type Item = char;
-
-    fn next(&mut self) -> Option<char> {
-        if !self.buffer.is_empty() {
-            return Some(self.buffer.remove(0));
-        }
-        match self.iter.next()? {
-            '\\' => match self.iter.next() {
-                Some('u') => {
-                    self.buffer.push('u');
-                    for _ in 0..4 {
-                        if let Some(c) = self.iter.next() {
-                            self.buffer.push(c);
-                        } else {
-                            return Some('\\');
-                        }
-                    }
-                    if let Some(c) = u32::from_str_radix(&self.buffer[1..5], 16)
-                        .ok()
-                        .and_then(char::from_u32)
-                    {
-                        self.buffer.clear();
-                        Some(c)
-                    } else {
-                        Some('\\')
-                    }
-                }
-                Some('U') => {
-                    self.buffer.push('U');
-                    for _ in 0..8 {
-                        if let Some(c) = self.iter.next() {
-                            self.buffer.push(c);
-                        } else {
-                            return Some('\\');
-                        }
-                    }
-                    if let Some(c) = u32::from_str_radix(&self.buffer[1..9], 16)
-                        .ok()
-                        .and_then(char::from_u32)
-                    {
-                        self.buffer.clear();
-                        Some(c)
-                    } else {
-                        Some('\\')
-                    }
-                }
-                Some(c) => {
-                    self.buffer.push(c);
-                    Some('\\')
-                }
-                None => Some('\\'),
-            },
-            c => Some(c),
-        }
-    }
-}
-
-fn unescape_unicode_codepoints(input: Cow<str>) -> Cow<str> {
-    if needs_unescape_unicode_codepoints(&input) {
-        UnescapeUnicodeCharIterator::new(&input).collect()
-    } else {
-        input
-    }
-}
--- a/src/utils.rs
+++ b/src/utils.rs
@ -77,3 +77,28 @@ impl ExactSizeIterator for EscapeRDF {
        }
    }
 }
+
+/// Read-only map backed by two parallel `'static` slices: the value of a key is the
+/// element of `values` found at the position of that key in `keys`.
+pub struct StaticSliceMap<K: 'static + Copy + Eq, V: 'static + Copy> {
+    keys: &'static [K],
+    values: &'static [V],
+}
+
+impl<K: 'static + Copy + Eq, V: 'static + Copy> StaticSliceMap<K, V> {
+    /// Builds the map from its keys and the values found at the same positions.
+    ///
+    /// # Panics
+    ///
+    /// Panics when the two slices do not have the same length.
+    pub fn new(keys: &'static [K], values: &'static [V]) -> Self {
+        assert_eq!(
+            keys.len(),
+            values.len(),
+            "keys and values slices of StaticSliceMap should have the same size"
+        );
+        Self { keys, values }
+    }
+
+    /// Returns the value paired with the first occurrence of `key`, found by a linear
+    /// scan, or `None` when `key` is not one of the keys.
+    pub fn get(&self, key: K) -> Option<V> {
+        self.keys
+            .iter()
+            .zip(self.values.iter())
+            .find(|&(candidate, _)| *candidate == key)
+            .map(|(_, value)| *value)
+    }
+}