Optimises NTriples encoded string decoding

Avoids copies when possible
pull/10/head
Tpt 6 years ago
parent 642dd15131
commit 47de822737
  1. 1
      src/rio/mod.rs
  2. 20
      src/rio/ntriples/mod.rs
  3. 33
      src/rio/ntriples/ntriples_grammar.rustpeg
  4. 156
      src/rio/utils.rs
  5. 99
      src/sparql/parser.rs
  6. 25
      src/utils.rs

@ -1,2 +1,3 @@
pub mod ntriples;
pub mod turtle;
pub(crate) mod utils;

@ -3,6 +3,26 @@
mod grammar {
#![allow(unknown_lints)]
#![allow(clippy)]
use rio::utils::unescape_characters;
use std::borrow::Cow;
use utils::StaticSliceMap;
const UNESCAPE_CHARACTERS: [u8; 8] = [b't', b'b', b'n', b'r', b'f', b'"', b'\'', b'\\'];
lazy_static! {
static ref UNESCAPE_REPLACEMENT: StaticSliceMap<char, char> = StaticSliceMap::new(
&['t', 'b', 'n', 'r', 'f', '"', '\'', '\\'],
&[
'\u{0009}', '\u{0008}', '\u{000A}', '\u{000D}', '\u{000C}', '\u{0022}', '\u{0027}',
'\u{005C}'
]
);
}
pub fn unescape_echars(input: &str) -> Cow<str> {
unescape_characters(input, &UNESCAPE_CHARACTERS, &UNESCAPE_REPLACEMENT)
}
include!(concat!(env!("OUT_DIR"), "/ntriples_grammar.rs"));
}

@ -1,10 +1,10 @@
//See https://www.w3.org/TR/2014/REC-n-triples-20140225/#n-triples-grammar
use std::iter::FromIterator;
use std::char;
use std::str::FromStr;
use model::*;
use std::collections::BTreeMap;
use rio::utils::unescape_unicode_codepoints;
#![arguments(bnodes_map: &mut BTreeMap<String, BlankNode>)]
@ -45,20 +45,19 @@ LANGTAG -> &'input str = "@" l:$([a-zA-Z]+ ("-" [a-zA-Z0-9]+)*) {
EOL = [\r\n]+
//[8]
IRIREF -> NamedNode = "<" _ i:((_IRIREF_simple_char / UCHAR)*) _ ">" {?
let s = String::from_iter(i.into_iter());
IRIREF -> NamedNode = "<" _ i:$(([^\u{00}-\u{20}<>"{}|^\u{60}\u{5c}] / UCHAR)*) _ ">" {?
let s = unescape_unicode_codepoints(i);
match NamedNode::from_str(&s) {
Ok(named_node) => Ok(named_node),
Err(error) => Err("IRI parsing failed")
}
}
_IRIREF_simple_char -> char = c:$([^\u{00}-\u{20}<>"{}|^\u{60}\u{5c}]) { c.chars().next().unwrap() }
_IRIREF_simple_char -> char = c:$() { c.chars().next().unwrap() }
//[9]
STRING_LITERAL_QUOTE -> String = "\"" l:((STRING_LITERAL_QUOTE_simple_char / ECHAR / UCHAR)*) "\"" {
l.into_iter().collect()
STRING_LITERAL_QUOTE -> String = "\"" l:$(([^\u{0022}\u{005c}\u{000a}\u{000d}] / ECHAR / UCHAR)*) "\"" {
unescape_unicode_codepoints(&unescape_echars(l)).into_owned()
}
STRING_LITERAL_QUOTE_simple_char -> char = c: $([^\u{0022}\u{005c}\u{000a}\u{000d}]) { c.chars().next().unwrap() }
//[141s]
BLANK_NODE_LABEL -> BlankNode = "_:" b:$(([0-9] / PN_CHARS_U) PN_CHARS* ("."+ PN_CHARS+)*) {
@ -66,26 +65,10 @@ BLANK_NODE_LABEL -> BlankNode = "_:" b:$(([0-9] / PN_CHARS_U) PN_CHARS* ("."+ PN
}
//[10]
UCHAR -> char = "\\u" h: $(HEX HEX HEX HEX) {
u32::from_str_radix(h, 16).ok().and_then(char::from_u32).unwrap()
} / "\\U" h: $(HEX HEX HEX HEX HEX HEX HEX HEX) {
u32::from_str_radix(h, 16).ok().and_then(char::from_u32).unwrap()
}
UCHAR -> () = "\\u" HEX HEX HEX HEX / "\\U" HEX HEX HEX HEX HEX HEX HEX HEX
//[153s]
ECHAR -> char = '\\' c:$([tbnrf"'\\]) {
match c {
"t" => '\u{0009}',
"b" => '\u{0008}',
"n" => '\u{000A}',
"r" => '\u{000D}',
"f" => '\u{000C}',
"\"" => '\u{0022}',
"'" => '\u{0027}',
"\\" => '\u{005C}',
_ => panic!("unexpected escaped char") // not possible
}
}
ECHAR -> () = '\\' [tbnrf"'\\]
//[157s]
PN_CHARS_BASE -> () = [A-Za-z\u{00C0}-\u{00D6}\u{00D8}-\u{00F6}\u{00F8}-\u{02FF}\u{0370}-\u{037D}\u{037F}-\u{1FFF}\u{200C}-\u{200D}\u{2070}-\u{218F}\u{2C00}-\u{2FEF}\u{3001}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFFD}]

@ -0,0 +1,156 @@
use std::borrow::Cow;
use std::char;
use std::str::Chars;
use utils::StaticSliceMap;
pub fn unescape_unicode_codepoints(input: &str) -> Cow<str> {
if needs_unescape_unicode_codepoints(input) {
UnescapeUnicodeCharIterator::new(input).collect()
} else {
input.into()
}
}
fn needs_unescape_unicode_codepoints(input: &str) -> bool {
let bytes = input.as_bytes();
for i in 1..bytes.len() {
if (bytes[i] == b'u' || bytes[i] == b'U') && bytes[i - 1] == b'/' {
return true;
}
}
false
}
struct UnescapeUnicodeCharIterator<'a> {
iter: Chars<'a>,
buffer: String,
}
impl<'a> UnescapeUnicodeCharIterator<'a> {
fn new(string: &'a str) -> Self {
Self {
iter: string.chars(),
buffer: String::with_capacity(9),
}
}
}
impl<'a> Iterator for UnescapeUnicodeCharIterator<'a> {
type Item = char;
fn next(&mut self) -> Option<char> {
if !self.buffer.is_empty() {
return Some(self.buffer.remove(0));
}
match self.iter.next()? {
'\\' => match self.iter.next() {
Some('u') => {
self.buffer.push('u');
for _ in 0..4 {
if let Some(c) = self.iter.next() {
self.buffer.push(c);
} else {
return Some('\\');
}
}
if let Some(c) = u32::from_str_radix(&self.buffer[1..5], 16)
.ok()
.and_then(char::from_u32)
{
self.buffer.clear();
Some(c)
} else {
Some('\\')
}
}
Some('U') => {
self.buffer.push('U');
for _ in 0..8 {
if let Some(c) = self.iter.next() {
self.buffer.push(c);
} else {
return Some('\\');
}
}
if let Some(c) = u32::from_str_radix(&self.buffer[1..9], 16)
.ok()
.and_then(char::from_u32)
{
self.buffer.clear();
Some(c)
} else {
Some('\\')
}
}
Some(c) => {
self.buffer.push(c);
Some('\\')
}
None => Some('\\'),
},
c => Some(c),
}
}
}
pub fn unescape_characters<'a>(
input: &'a str,
characters: &'static [u8],
replacement: &'static StaticSliceMap<char, char>,
) -> Cow<'a, str> {
if needs_unescape_characters(input, characters) {
UnescapeCharsIterator::new(input, replacement).collect()
} else {
input.into()
}
}
fn needs_unescape_characters(input: &str, characters: &[u8]) -> bool {
let bytes = input.as_bytes();
for i in 1..bytes.len() {
if bytes[i - 1] == b'/' && characters.contains(&bytes[i]) {
return true;
}
}
false
}
struct UnescapeCharsIterator<'a> {
iter: Chars<'a>,
buffer: Option<char>,
replacement: &'static StaticSliceMap<char, char>,
}
impl<'a> UnescapeCharsIterator<'a> {
fn new(string: &'a str, replacement: &'static StaticSliceMap<char, char>) -> Self {
Self {
iter: string.chars(),
buffer: None,
replacement,
}
}
}
impl<'a> Iterator for UnescapeCharsIterator<'a> {
type Item = char;
fn next(&mut self) -> Option<char> {
if let Some(ch) = self.buffer {
self.buffer = None;
return Some(ch);
}
match self.iter.next()? {
'\\' => match self.iter.next() {
Some(ch) => match self.replacement.get(ch) {
Some(replace) => Some(replace),
None => {
self.buffer = Some(ch);
Some('\\')
}
},
None => Some('\\'),
},
c => Some(c),
}
}
}

@ -1,16 +1,11 @@
use std::borrow::Cow;
use std::char;
use std::str::Chars;
mod grammar {
#![allow(unknown_lints)]
#![allow(clippy)]
use model::*;
use rio::utils::unescape_unicode_codepoints;
use sparql::algebra::*;
use sparql::model::*;
use sparql::parser::unescape_unicode_codepoints;
use std::borrow::Cow;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::io::BufReader;
@ -299,7 +294,7 @@ mod grammar {
BufReader::new(source).read_to_string(&mut string_buffer)?;
Ok(QueryUnit(
&unescape_unicode_codepoints(Cow::from(string_buffer)),
&unescape_unicode_codepoints(&string_buffer),
&mut state,
)?)
}
@ -307,93 +302,3 @@ mod grammar {
pub(crate) type ParseError = self::grammar::ParseError;
pub use self::grammar::read_sparql_query;
fn needs_unescape_unicode_codepoints(input: &str) -> bool {
let bytes = input.as_bytes();
for i in 1..bytes.len() {
if (bytes[i] == b'u' || bytes[i] == b'U') && bytes[i - 1] == b'/' {
return true;
}
}
false
}
struct UnescapeUnicodeCharIterator<'a> {
iter: Chars<'a>,
buffer: String,
}
impl<'a> UnescapeUnicodeCharIterator<'a> {
fn new(string: &'a str) -> Self {
Self {
iter: string.chars(),
buffer: String::with_capacity(9),
}
}
}
impl<'a> Iterator for UnescapeUnicodeCharIterator<'a> {
type Item = char;
fn next(&mut self) -> Option<char> {
if !self.buffer.is_empty() {
return Some(self.buffer.remove(0));
}
match self.iter.next()? {
'\\' => match self.iter.next() {
Some('u') => {
self.buffer.push('u');
for _ in 0..4 {
if let Some(c) = self.iter.next() {
self.buffer.push(c);
} else {
return Some('\\');
}
}
if let Some(c) = u32::from_str_radix(&self.buffer[1..5], 16)
.ok()
.and_then(char::from_u32)
{
self.buffer.clear();
Some(c)
} else {
Some('\\')
}
}
Some('U') => {
self.buffer.push('U');
for _ in 0..8 {
if let Some(c) = self.iter.next() {
self.buffer.push(c);
} else {
return Some('\\');
}
}
if let Some(c) = u32::from_str_radix(&self.buffer[1..9], 16)
.ok()
.and_then(char::from_u32)
{
self.buffer.clear();
Some(c)
} else {
Some('\\')
}
}
Some(c) => {
self.buffer.push(c);
Some('\\')
}
None => Some('\\'),
},
c => Some(c),
}
}
}
fn unescape_unicode_codepoints(input: Cow<str>) -> Cow<str> {
if needs_unescape_unicode_codepoints(&input) {
UnescapeUnicodeCharIterator::new(&input).collect()
} else {
input
}
}

@ -77,3 +77,28 @@ impl ExactSizeIterator for EscapeRDF {
}
}
}
pub struct StaticSliceMap<K: 'static + Copy + Eq, V: 'static + Copy> {
keys: &'static [K],
values: &'static [V],
}
impl<K: 'static + Copy + Eq, V: 'static + Copy> StaticSliceMap<K, V> {
pub fn new(keys: &'static [K], values: &'static [V]) -> Self {
assert_eq!(
keys.len(),
values.len(),
"keys and values slices of StaticSliceMap should have the same size"
);
Self { keys, values }
}
pub fn get(&self, key: K) -> Option<V> {
for i in 0..self.keys.len() {
if self.keys[i] == key {
return Some(self.values[i]);
}
}
None
}
}

Loading…
Cancel
Save