/// Resolves the `\uXXXX` and `\UXXXXXXXX` escape sequences of an IRI reference.
///
/// Every other character is copied to the output unchanged.
///
/// # Errors
///
/// Returns a static error message if a backslash is followed by anything other
/// than `u` or `U`, if a backslash is the last character of `input`, or if the
/// escape payload is rejected by [`read_hex_char`].
fn unescape_iriref(mut input: &str) -> Result<String, &'static str> {
    // The unescaped text is never longer (in bytes) than the escaped one.
    let mut output = String::with_capacity(input.len());
    while let Some((before, after)) = input.split_once('\\') {
        output.push_str(before);
        let mut after = after.chars();
        let (escape, after) = match after.next() {
            Some('u') => read_hex_char::<4>(after.as_str())?,
            Some('U') => read_hex_char::<8>(after.as_str())?,
            Some(_) => {
                return Err(
                    "IRIs are only allowed to contain escape sequences \\uXXXX and \\UXXXXXXXX",
                )
            }
            // `\\` (not `\'`) so that the message really shows a backslash.
            None => return Err("IRIs are not allowed to end with a '\\'"),
        };
        output.push(escape);
        // Continue scanning right after the escape sequence that was just decoded.
        input = after;
    }
    output.push_str(input);
    Ok(output)
}

/// Resolves the escape sequences of a string literal.
///
/// Supported escapes are `\t`, `\b`, `\n`, `\r`, `\f`, `\"`, `\'`, `\\`,
/// `\uXXXX` and `\UXXXXXXXX`. Every other character is copied unchanged.
///
/// # Errors
///
/// Returns a static error message if a backslash is followed by an unsupported
/// character, if a backslash is the last character of `input`, or if a
/// `\u`/`\U` payload is rejected by [`read_hex_char`].
fn unescape_string(mut input: &str) -> Result<String, &'static str> {
    // The unescaped text is never longer (in bytes) than the escaped one.
    let mut output = String::with_capacity(input.len());
    while let Some((before, after)) = input.split_once('\\') {
        output.push_str(before);
        let mut after = after.chars();
        let (escape, after) = match after.next() {
            Some('t') => ('\u{0009}', after.as_str()),
            Some('b') => ('\u{0008}', after.as_str()),
            Some('n') => ('\u{000A}', after.as_str()),
            Some('r') => ('\u{000D}', after.as_str()),
            Some('f') => ('\u{000C}', after.as_str()),
            Some('"') => ('\u{0022}', after.as_str()),
            Some('\'') => ('\u{0027}', after.as_str()),
            Some('\\') => ('\u{005C}', after.as_str()),
            Some('u') => read_hex_char::<4>(after.as_str())?,
            Some('U') => read_hex_char::<8>(after.as_str())?,
            Some(_) => return Err("The character that can be escaped in strings are tbnrf\"'\\"),
            // `\\` (not `\'`) so that the message really shows a backslash.
            None => return Err("strings are not allowed to end with a '\\'"),
        };
        output.push(escape);
        // Continue scanning right after the escape sequence that was just decoded.
        input = after;
    }
    output.push_str(input);
    Ok(output)
}

/// Decodes the first `SIZE` bytes of `input` as the hexadecimal code point of a character.
///
/// On success returns the decoded character together with the rest of `input`
/// (everything after the `SIZE` digits).
///
/// # Errors
///
/// Returns a static error message if `input` does not start with exactly `SIZE`
/// ASCII hexadecimal digits or if the encoded number is not a valid `char`
/// (e.g. a surrogate or a value above `U+10FFFF`).
fn read_hex_char<const SIZE: usize>(input: &str) -> Result<(char, &str), &'static str> {
    const INVALID_ESCAPE: &str = "\\u escape sequence should be followed by hexadecimal digits";

    // `get` yields `None` when `input` is too short or when `SIZE` does not fall on a
    // character boundary, so the slicing below cannot panic.
    let escape = input.get(..SIZE).ok_or(INVALID_ESCAPE)?;
    // `u32::from_str_radix` tolerates a leading `+`, which is not a hexadecimal digit:
    // check the digits explicitly so that e.g. `\u+041` is rejected.
    if !escape.bytes().all(|b| b.is_ascii_hexdigit()) {
        return Err(INVALID_ESCAPE);
    }
    let c = u32::from_str_radix(escape, 16)
        .ok()
        .and_then(char::from_u32)
        .ok_or(INVALID_ESCAPE)?;
    Ok((c, &input[SIZE..]))
}
replacement, - } - } -} - -impl<'a> Iterator for UnescapeCharsIterator<'a> { - type Item = char; - - fn next(&mut self) -> Option { - if let Some(ch) = self.buffer { - self.buffer = None; - return Some(ch); - } - match self.iter.next()? { - '\\' => match self.iter.next() { - Some(ch) => { - if let Some(replace) = self.replacement.get(ch) { - Some(replace) - } else { - self.buffer = Some(ch); - Some('\\') - } - } - None => Some('\\'), - }, - c => Some(c), - } - } -} - -pub struct StaticCharSliceMap { - keys: &'static [char], - values: &'static [char], -} - -impl StaticCharSliceMap { - pub const fn new(keys: &'static [char], values: &'static [char]) -> Self { - Self { keys, values } - } - - pub fn get(&self, key: char) -> Option { - for i in 0..self.keys.len() { - if self.keys[i] == key { - return Some(self.values[i]); - } - } - None - } -} - -const UNESCAPE_CHARACTERS: [u8; 8] = [b't', b'b', b'n', b'r', b'f', b'"', b'\'', b'\\']; -const UNESCAPE_REPLACEMENT: StaticCharSliceMap = StaticCharSliceMap::new( - &['t', 'b', 'n', 'r', 'f', '"', '\'', '\\'], - &[ - '\u{0009}', '\u{0008}', '\u{000A}', '\u{000D}', '\u{000C}', '\u{0022}', '\u{0027}', - '\u{005C}', - ], -); - -fn unescape_echars(input: &str) -> Cow<'_, str> { - unescape_characters(input, &UNESCAPE_CHARACTERS, &UNESCAPE_REPLACEMENT) -} - -const UNESCAPE_PN_CHARACTERS: [u8; 20] = [ - b'_', b'~', b'.', b'-', b'!', b'$', b'&', b'\'', b'(', b')', b'*', b'+', b',', b';', b'=', - b'/', b'?', b'#', b'@', b'%', -]; -const UNESCAPE_PN_REPLACEMENT: StaticCharSliceMap = StaticCharSliceMap::new( - &[ - '_', '~', '.', '-', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', '/', '?', '#', - '@', '%', - ], - &[ - '_', '~', '.', '-', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', '/', '?', '#', - '@', '%', - ], -); - -pub fn unescape_pn_local(input: &str) -> Cow<'_, str> { - unescape_characters(input, &UNESCAPE_PN_CHARACTERS, &UNESCAPE_PN_REPLACEMENT) -} - fn variable() -> Variable { 
Variable::new_unchecked(format!("{:x}", random::())) } @@ -2143,7 +1996,7 @@ parser! { } / ANON() { BlankNode::default() } rule IRIREF() -> Iri = "<" i:$((!['>'] [_])*) ">" {? - state.parse_iri(i).map_err(|_| "IRI parsing failed") + state.parse_iri(unescape_iriref(i)?).map_err(|_| "IRI parsing failed") } rule PNAME_NS() -> &'input str = ns:$(PN_PREFIX()?) ":" { @@ -2152,8 +2005,11 @@ parser! { rule PNAME_LN() -> Iri = ns:PNAME_NS() local:$(PN_LOCAL()) {? if let Some(base) = state.namespaces.get(ns) { - let mut iri = base.clone(); - iri.push_str(&unescape_pn_local(local)); + let mut iri = String::with_capacity(base.len() + local.len()); + iri.push_str(base); + for chunk in local.split('\\') { // We remove \ + iri.push_str(chunk); + } Iri::parse(iri).map_err(|_| "IRI parsing failed") } else { Err("Prefix not found") @@ -2192,29 +2048,31 @@ parser! { rule EXPONENT() = ['e' | 'E'] ['+' | '-']? ['0'..='9']+ - rule STRING_LITERAL1() -> String = "'" l:$((STRING_LITERAL1_simple_char() / ECHAR())*) "'" { - unescape_echars(l).to_string() + rule STRING_LITERAL1() -> String = "'" l:$((STRING_LITERAL1_simple_char() / ECHAR() / UCHAR())*) "'" {? + unescape_string(l) } rule STRING_LITERAL1_simple_char() = !['\u{27}' | '\u{5C}' | '\u{A}' | '\u{D}'] [_] - rule STRING_LITERAL2() -> String = "\"" l:$((STRING_LITERAL2_simple_char() / ECHAR())*) "\"" { - unescape_echars(l).to_string() + rule STRING_LITERAL2() -> String = "\"" l:$((STRING_LITERAL2_simple_char() / ECHAR() / UCHAR())*) "\"" {? + unescape_string(l) } rule STRING_LITERAL2_simple_char() = !['\u{22}' | '\u{5C}' | '\u{A}' | '\u{D}'] [_] - rule STRING_LITERAL_LONG1() -> String = "'''" l:$(STRING_LITERAL_LONG1_inner()*) "'''" { - unescape_echars(l).to_string() + rule STRING_LITERAL_LONG1() -> String = "'''" l:$(STRING_LITERAL_LONG1_inner()*) "'''" {? + unescape_string(l) } - rule STRING_LITERAL_LONG1_inner() = ("''" / "'")? (STRING_LITERAL_LONG1_simple_char() / ECHAR()) + rule STRING_LITERAL_LONG1_inner() = ("''" / "'")? 
(STRING_LITERAL_LONG1_simple_char() / ECHAR() / UCHAR()) rule STRING_LITERAL_LONG1_simple_char() = !['\'' | '\\'] [_] - rule STRING_LITERAL_LONG2() -> String = "\"\"\"" l:$(STRING_LITERAL_LONG2_inner()*) "\"\"\"" { - unescape_echars(l).to_string() + rule STRING_LITERAL_LONG2() -> String = "\"\"\"" l:$(STRING_LITERAL_LONG2_inner()*) "\"\"\"" {? + unescape_string(l) } - rule STRING_LITERAL_LONG2_inner() = ("\"\"" / "\"")? (STRING_LITERAL_LONG2_simple_char() / ECHAR()) + rule STRING_LITERAL_LONG2_inner() = ("\"\"" / "\"")? (STRING_LITERAL_LONG2_simple_char() / ECHAR() / UCHAR()) rule STRING_LITERAL_LONG2_simple_char() = !['"' | '\\'] [_] + rule UCHAR() = "\\u" HEX() HEX() HEX() HEX() / "\\U" HEX() HEX() HEX() HEX() HEX() HEX() HEX() HEX() + rule ECHAR() = "\\" ['t' | 'b' | 'n' | 'r' | 'f' | '"' |'\'' | '\\'] rule NIL() = "(" WS()* ")" @@ -2223,7 +2081,7 @@ parser! { rule ANON() = "[" WS()* "]" - rule PN_CHARS_BASE() = ['A' ..= 'Z' | 'a' ..= 'z' | '\u{00C0}' ..='\u{00D6}' | '\u{00D8}'..='\u{00F6}' | '\u{00F8}'..='\u{02FF}' | '\u{0370}'..='\u{037D}' | '\u{037F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}'] + rule PN_CHARS_BASE() = ['A' ..= 'Z' | 'a' ..= 'z' | '\u{00C0}'..='\u{00D6}' | '\u{00D8}'..='\u{00F6}' | '\u{00F8}'..='\u{02FF}' | '\u{0370}'..='\u{037D}' | '\u{037F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}'] rule PN_CHARS_U() = ['_'] / PN_CHARS_BASE()