From 47de82273791b318b23f9ecd9881020790f9d5bf Mon Sep 17 00:00:00 2001
From: Tpt
Date: Fri, 21 Sep 2018 15:03:55 +0200
Subject: [PATCH] Optimises NTriples encoded string decoding

Avoids copies when possible
---
 src/rio/mod.rs                            |   1 +
 src/rio/ntriples/mod.rs                   |  20 +++
 src/rio/ntriples/ntriples_grammar.rustpeg |  33 ++---
 src/rio/utils.rs                          | 156 ++++++++++++++++++++++
 src/sparql/parser.rs                      |  99 +-------------
 src/utils.rs                              |  25 ++++
 6 files changed, 212 insertions(+), 122 deletions(-)
 create mode 100644 src/rio/utils.rs

diff --git a/src/rio/mod.rs b/src/rio/mod.rs
index 0969fdfd..bca5a2f0 100644
--- a/src/rio/mod.rs
+++ b/src/rio/mod.rs
@@ -1,2 +1,3 @@
 pub mod ntriples;
 pub mod turtle;
+pub(crate) mod utils;
diff --git a/src/rio/ntriples/mod.rs b/src/rio/ntriples/mod.rs
index 3dc0306e..a03bc74d 100644
--- a/src/rio/ntriples/mod.rs
+++ b/src/rio/ntriples/mod.rs
@@ -3,6 +3,26 @@
 mod grammar {
     #![allow(unknown_lints)]
     #![allow(clippy)]
+
+    use rio::utils::unescape_characters;
+    use std::borrow::Cow;
+    use utils::StaticSliceMap;
+
+    const UNESCAPE_CHARACTERS: [u8; 8] = [b't', b'b', b'n', b'r', b'f', b'"', b'\'', b'\\'];
+    lazy_static! {
+        static ref UNESCAPE_REPLACEMENT: StaticSliceMap<char, char> = StaticSliceMap::new(
+            &['t', 'b', 'n', 'r', 'f', '"', '\'', '\\'],
+            &[
+                '\u{0009}', '\u{0008}', '\u{000A}', '\u{000D}', '\u{000C}', '\u{0022}', '\u{0027}',
+                '\u{005C}'
+            ]
+        );
+    }
+
+    pub fn unescape_echars(input: &str) -> Cow<str> {
+        unescape_characters(input, &UNESCAPE_CHARACTERS, &UNESCAPE_REPLACEMENT)
+    }
+
     include!(concat!(env!("OUT_DIR"), "/ntriples_grammar.rs"));
 }
diff --git a/src/rio/ntriples/ntriples_grammar.rustpeg b/src/rio/ntriples/ntriples_grammar.rustpeg
index 50702d20..6f5993ba 100644
--- a/src/rio/ntriples/ntriples_grammar.rustpeg
+++ b/src/rio/ntriples/ntriples_grammar.rustpeg
@@ -1,10 +1,10 @@
 //See https://www.w3.org/TR/2014/REC-n-triples-20140225/#n-triples-grammar
 
-use std::iter::FromIterator;
 use std::char;
 use std::str::FromStr;
 use model::*;
 use std::collections::BTreeMap;
+use rio::utils::unescape_unicode_codepoints;
 
 #![arguments(bnodes_map: &mut BTreeMap<String, BlankNode>)]
@@ -45,20 +45,19 @@ LANGTAG -> &'input str = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {
 EOL = [\r\n]+
 
 //[8]
-IRIREF -> NamedNode = "<" _ i:((_IRIREF_simple_char / UCHAR)*) _ ">" {?
-    let s = String::from_iter(i.into_iter());
+IRIREF -> NamedNode = "<" _ i:$(([^\u{00}-\u{20}<>"{}|^\u{60}\u{5c}] / UCHAR)*) _ ">" {?
+    let s = unescape_unicode_codepoints(i);
     match NamedNode::from_str(&s) {
         Ok(named_node) => Ok(named_node),
         Err(error) => Err("IRI parsing failed")
     }
 }
-_IRIREF_simple_char -> char = c:$([^\u{00}-\u{20}<>"{}|^\u{60}\u{5c}]) { c.chars().next().unwrap() }
 
 //[9]
-STRING_LITERAL_QUOTE -> String = "\"" l:((STRING_LITERAL_QUOTE_simple_char / ECHAR / UCHAR)*) "\"" {
-    l.into_iter().collect()
+STRING_LITERAL_QUOTE -> String = "\"" l:$(([^\u{0022}\u{005c}\u{000a}\u{000d}] / ECHAR / UCHAR)*) "\"" {
+    unescape_unicode_codepoints(&unescape_echars(l)).into_owned()
 }
-STRING_LITERAL_QUOTE_simple_char -> char = c: $([^\u{0022}\u{005c}\u{000a}\u{000d}]) { c.chars().next().unwrap() }
 
 //[141s]
 BLANK_NODE_LABEL -> BlankNode = "_:" b:$(([0-9] / PN_CHARS_U) PN_CHARS* ("."+ PN_CHARS+)*) {
@@ -66,26 +65,10 @@ BLANK_NODE_LABEL -> BlankNode = "_:" b:$(([0-9] / PN_CHARS_U) PN_CHARS* ("."+ PN
 }
 
 //[10]
-UCHAR -> char = "\\u" h: $(HEX HEX HEX HEX) {
-    u32::from_str_radix(h, 16).ok().and_then(char::from_u32).unwrap()
-} / "\\U" h: $(HEX HEX HEX HEX HEX HEX HEX HEX) {
-    u32::from_str_radix(h, 16).ok().and_then(char::from_u32).unwrap()
-}
+UCHAR -> () = "\\u" HEX HEX HEX HEX / "\\U" HEX HEX HEX HEX HEX HEX HEX HEX
 
 //[153s]
-ECHAR -> char = '\\' c:$([tbnrf"'\\]) {
-    match c {
-        "t" => '\u{0009}',
-        "b" => '\u{0008}',
-        "n" => '\u{000A}',
-        "r" => '\u{000D}',
-        "f" => '\u{000C}',
-        "\"" => '\u{0022}',
-        "'" => '\u{0027}',
-        "\\" => '\u{005C}',
-        _ => panic!("unexpected escaped char") // not possible
-    }
-}
+ECHAR -> () = '\\' [tbnrf"'\\]
 
 //[157s]
 PN_CHARS_BASE -> () = [A-Za-z\u{00C0}-\u{00D6}\u{00D8}-\u{00F6}\u{00F8}-\u{02FF}\u{0370}-\u{037D}\u{037F}-\u{1FFF}\u{200C}-\u{200D}\u{2070}-\u{218F}\u{2C00}-\u{2FEF}\u{3001}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFFD}]
diff --git a/src/rio/utils.rs b/src/rio/utils.rs
new file mode 100644
index 00000000..d1cd10fa
--- /dev/null
+++ b/src/rio/utils.rs
@@ -0,0 +1,156 @@
+use std::borrow::Cow;
+use std::char;
+use std::str::Chars;
+use utils::StaticSliceMap;
+
+pub fn unescape_unicode_codepoints(input: &str) -> Cow<str> {
+    if needs_unescape_unicode_codepoints(input) {
+        UnescapeUnicodeCharIterator::new(input).collect()
+    } else {
+        input.into()
+    }
+}
+
+fn needs_unescape_unicode_codepoints(input: &str) -> bool {
+    let bytes = input.as_bytes();
+    for i in 1..bytes.len() {
+        if (bytes[i] == b'u' || bytes[i] == b'U') && bytes[i - 1] == b'\\' {
+            return true;
+        }
+    }
+    false
+}
+
+struct UnescapeUnicodeCharIterator<'a> {
+    iter: Chars<'a>,
+    buffer: String,
+}
+
+impl<'a> UnescapeUnicodeCharIterator<'a> {
+    fn new(string: &'a str) -> Self {
+        Self {
+            iter: string.chars(),
+            buffer: String::with_capacity(9),
+        }
+    }
+}
+
+impl<'a> Iterator for UnescapeUnicodeCharIterator<'a> {
+    type Item = char;
+
+    fn next(&mut self) -> Option<char> {
+        if !self.buffer.is_empty() {
+            return Some(self.buffer.remove(0));
+        }
+        match self.iter.next()? {
+            '\\' => match self.iter.next() {
+                Some('u') => {
+                    self.buffer.push('u');
+                    for _ in 0..4 {
+                        if let Some(c) = self.iter.next() {
+                            self.buffer.push(c);
+                        } else {
+                            return Some('\\');
+                        }
+                    }
+                    if let Some(c) = u32::from_str_radix(&self.buffer[1..5], 16)
+                        .ok()
+                        .and_then(char::from_u32)
+                    {
+                        self.buffer.clear();
+                        Some(c)
+                    } else {
+                        Some('\\')
+                    }
+                }
+                Some('U') => {
+                    self.buffer.push('U');
+                    for _ in 0..8 {
+                        if let Some(c) = self.iter.next() {
+                            self.buffer.push(c);
+                        } else {
+                            return Some('\\');
+                        }
+                    }
+                    if let Some(c) = u32::from_str_radix(&self.buffer[1..9], 16)
+                        .ok()
+                        .and_then(char::from_u32)
+                    {
+                        self.buffer.clear();
+                        Some(c)
+                    } else {
+                        Some('\\')
+                    }
+                }
+                Some(c) => {
+                    self.buffer.push(c);
+                    Some('\\')
+                }
+                None => Some('\\'),
+            },
+            c => Some(c),
+        }
+    }
+}
+
+pub fn unescape_characters<'a>(
+    input: &'a str,
+    characters: &'static [u8],
+    replacement: &'static StaticSliceMap<char, char>,
+) -> Cow<'a, str> {
+    if needs_unescape_characters(input, characters) {
+        UnescapeCharsIterator::new(input, replacement).collect()
+    } else {
+        input.into()
+    }
+}
+
+fn needs_unescape_characters(input: &str, characters: &[u8]) -> bool {
+    let bytes = input.as_bytes();
+    for i in 1..bytes.len() {
+        if bytes[i - 1] == b'\\' && characters.contains(&bytes[i]) {
+            return true;
+        }
+    }
+    false
+}
+
+struct UnescapeCharsIterator<'a> {
+    iter: Chars<'a>,
+    buffer: Option<char>,
+    replacement: &'static StaticSliceMap<char, char>,
+}
+
+impl<'a> UnescapeCharsIterator<'a> {
+    fn new(string: &'a str, replacement: &'static StaticSliceMap<char, char>) -> Self {
+        Self {
+            iter: string.chars(),
+            buffer: None,
+            replacement,
+        }
+    }
+}
+
+impl<'a> Iterator for UnescapeCharsIterator<'a> {
+    type Item = char;
+
+    fn next(&mut self) -> Option<char> {
+        if let Some(ch) = self.buffer {
+            self.buffer = None;
+            return Some(ch);
+        }
+        match self.iter.next()? {
+            '\\' => match self.iter.next() {
+                Some(ch) => match self.replacement.get(ch) {
+                    Some(replace) => Some(replace),
+                    None => {
+                        self.buffer = Some(ch);
+                        Some('\\')
+                    }
+                },
+                None => Some('\\'),
+            },
+            c => Some(c),
+        }
+    }
+}
diff --git a/src/sparql/parser.rs b/src/sparql/parser.rs
index 79ebd972..1e40db95 100644
--- a/src/sparql/parser.rs
+++ b/src/sparql/parser.rs
@@ -1,16 +1,11 @@
-use std::borrow::Cow;
-use std::char;
-use std::str::Chars;
-
 mod grammar {
     #![allow(unknown_lints)]
     #![allow(clippy)]
 
     use model::*;
+    use rio::utils::unescape_unicode_codepoints;
     use sparql::algebra::*;
     use sparql::model::*;
-    use sparql::parser::unescape_unicode_codepoints;
-    use std::borrow::Cow;
     use std::collections::BTreeMap;
     use std::collections::HashMap;
     use std::io::BufReader;
@@ -299,7 +294,7 @@ mod grammar {
         BufReader::new(source).read_to_string(&mut string_buffer)?;
 
         Ok(QueryUnit(
-            &unescape_unicode_codepoints(Cow::from(string_buffer)),
+            &unescape_unicode_codepoints(&string_buffer),
             &mut state,
         )?)
     }
@@ -307,93 +302,3 @@ mod grammar {
 pub(crate) type ParseError = self::grammar::ParseError;
 
 pub use self::grammar::read_sparql_query;
-
-fn needs_unescape_unicode_codepoints(input: &str) -> bool {
-    let bytes = input.as_bytes();
-    for i in 1..bytes.len() {
-        if (bytes[i] == b'u' || bytes[i] == b'U') && bytes[i - 1] == b'\\' {
-            return true;
-        }
-    }
-    false
-}
-
-struct UnescapeUnicodeCharIterator<'a> {
-    iter: Chars<'a>,
-    buffer: String,
-}
-
-impl<'a> UnescapeUnicodeCharIterator<'a> {
-    fn new(string: &'a str) -> Self {
-        Self {
-            iter: string.chars(),
-            buffer: String::with_capacity(9),
-        }
-    }
-}
-
-impl<'a> Iterator for UnescapeUnicodeCharIterator<'a> {
-    type Item = char;
-
-    fn next(&mut self) -> Option<char> {
-        if !self.buffer.is_empty() {
-            return Some(self.buffer.remove(0));
-        }
-        match self.iter.next()? {
-            '\\' => match self.iter.next() {
-                Some('u') => {
-                    self.buffer.push('u');
-                    for _ in 0..4 {
-                        if let Some(c) = self.iter.next() {
-                            self.buffer.push(c);
-                        } else {
-                            return Some('\\');
-                        }
-                    }
-                    if let Some(c) = u32::from_str_radix(&self.buffer[1..5], 16)
-                        .ok()
-                        .and_then(char::from_u32)
-                    {
-                        self.buffer.clear();
-                        Some(c)
-                    } else {
-                        Some('\\')
-                    }
-                }
-                Some('U') => {
-                    self.buffer.push('U');
-                    for _ in 0..8 {
-                        if let Some(c) = self.iter.next() {
-                            self.buffer.push(c);
-                        } else {
-                            return Some('\\');
-                        }
-                    }
-                    if let Some(c) = u32::from_str_radix(&self.buffer[1..9], 16)
-                        .ok()
-                        .and_then(char::from_u32)
-                    {
-                        self.buffer.clear();
-                        Some(c)
-                    } else {
-                        Some('\\')
-                    }
-                }
-                Some(c) => {
-                    self.buffer.push(c);
-                    Some('\\')
-                }
-                None => Some('\\'),
-            },
-            c => Some(c),
-        }
-    }
-}
-
-fn unescape_unicode_codepoints(input: Cow<str>) -> Cow<str> {
-    if needs_unescape_unicode_codepoints(&input) {
-        UnescapeUnicodeCharIterator::new(&input).collect()
-    } else {
-        input
-    }
-}
diff --git a/src/utils.rs b/src/utils.rs
index 85de85b0..a90b338e 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -77,3 +77,28 @@ impl ExactSizeIterator for EscapeRDF {
     }
 }
+
+pub struct StaticSliceMap<K: 'static + Copy + Eq, V: 'static + Copy> {
+    keys: &'static [K],
+    values: &'static [V],
+}
+
+impl<K: 'static + Copy + Eq, V: 'static + Copy> StaticSliceMap<K, V> {
+    pub fn new(keys: &'static [K], values: &'static [V]) -> Self {
+        assert_eq!(
+            keys.len(),
+            values.len(),
+            "keys and values slices of StaticSliceMap should have the same size"
+        );
+        Self { keys, values }
+    }
+
+    pub fn get(&self, key: K) -> Option<V> {
+        for i in 0..self.keys.len() {
+            if self.keys[i] == key {
+                return Some(self.values[i]);
+            }
+        }
+        None
+    }
+}
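
The heart of the change is the Cow-based fast path: the new helpers in src/rio/utils.rs first scan the input for a backslash escape and return the borrowed slice untouched when none is found, so an owned String is only built when an escape actually has to be rewritten. The standalone sketch below is not part of the patch; unescape_tabs is a hypothetical helper that handles only \t, shown just to illustrate the same borrow-or-allocate pattern in miniature.

use std::borrow::Cow;

// Hypothetical helper (not from the patch) that unescapes only `\t`.
// Like the patch's unescape_characters / unescape_unicode_codepoints,
// it checks for an escape introducer first and borrows the input when
// there is nothing to rewrite.
fn unescape_tabs(input: &str) -> Cow<'_, str> {
    if !input.contains('\\') {
        // Fast path: no escape sequence, so no allocation and no copy.
        return Cow::Borrowed(input);
    }
    let mut out = String::with_capacity(input.len());
    let mut chars = input.chars();
    while let Some(c) = chars.next() {
        if c == '\\' {
            match chars.next() {
                Some('t') => out.push('\t'),
                // Unknown escape: keep the backslash and the character.
                Some(other) => {
                    out.push('\\');
                    out.push(other);
                }
                None => out.push('\\'),
            }
        } else {
            out.push(c);
        }
    }
    Cow::Owned(out)
}

fn main() {
    // No escape present: the original slice is borrowed, nothing is copied.
    assert!(matches!(unescape_tabs("plain text"), Cow::Borrowed(_)));
    // An escape present: a new String is built once, then returned.
    assert_eq!(unescape_tabs(r"a\tb"), "a\tb");
}

In the patch itself, needs_unescape_unicode_codepoints and needs_unescape_characters play the role of the contains('\\') check above, and the iterator types build the owned String lazily through collect() only on the slow path.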