SPARQL: Do not unescape Unicode escapes everywhere but only in IRIs and strings

Follows most systems' behavior

Issue #376
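For illustration only (not part of this commit): a rough sketch of the new behavior, written as a hypothetical test next to the `parse_query` entry point changed below. The query strings are made-up examples.

// Assumes it lives in the same module as `parse_query` (lib/spargebra/src/parser.rs).
#[test]
fn unicode_escapes_only_in_iris_and_strings() {
    // \u0041 inside a string literal is still decoded, so this parses as "A".
    assert!(parse_query("SELECT * WHERE { ?s ?p \"\\u0041\" }", None).is_ok());
    // Outside IRIs and strings the query text is no longer pre-unescaped,
    // so a variable name written as ?\u0073 is rejected instead of being
    // rewritten to ?s before parsing.
    assert!(parse_query("SELECT * WHERE { ?\\u0073 ?p ?o }", None).is_err());
}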
pull/577/head
Tpt authored 1 year ago; committed by Thomas Tanon
parent 00f179058e
commit 8a398db20e
1 changed file: lib/spargebra/src/parser.rs (284 lines changed)

@@ -8,11 +8,9 @@ use oxrdf::vocab::{rdf, xsd};
 use peg::parser;
 use peg::str::LineCol;
 use rand::random;
-use std::borrow::Cow;
 use std::collections::{HashMap, HashSet};
 use std::error::Error;
 use std::mem::take;
-use std::str::Chars;
 use std::str::FromStr;
 use std::{char, fmt};
@@ -32,7 +30,7 @@ pub fn parse_query(query: &str, base_iri: Option<&str>) -> Result<Query, ParseError> {
         aggregates: Vec::new(),
     };
-    parser::QueryUnit(&unescape_unicode_codepoints(query), &mut state).map_err(|e| ParseError {
+    parser::QueryUnit(query, &mut state).map_err(|e| ParseError {
         inner: ParseErrorKind::Parser(e),
     })
 }
@@ -53,11 +51,8 @@ pub fn parse_update(update: &str, base_iri: Option<&str>) -> Result<Update, ParseError> {
         aggregates: Vec::new(),
     };
-    let operations =
-        parser::UpdateInit(&unescape_unicode_codepoints(update), &mut state).map_err(|e| {
-            ParseError {
-                inner: ParseErrorKind::Parser(e),
-            }
-        })?;
+    let operations = parser::UpdateInit(update, &mut state).map_err(|e| ParseError {
+        inner: ParseErrorKind::Parser(e),
+    })?;
     Ok(Update {
         operations,
@@ -724,11 +719,11 @@ pub struct ParserState {
 }

 impl ParserState {
-    fn parse_iri(&self, iri: &str) -> Result<Iri<String>, IriParseError> {
+    fn parse_iri(&self, iri: String) -> Result<Iri<String>, IriParseError> {
         if let Some(base_iri) = &self.base_iri {
-            base_iri.resolve(iri)
+            base_iri.resolve(&iri)
         } else {
-            Iri::parse(iri.to_owned())
+            Iri::parse(iri)
         }
     }
@@ -746,211 +741,69 @@ impl ParserState {
     }
 }

-pub fn unescape_unicode_codepoints(input: &str) -> Cow<'_, str> {
-    if needs_unescape_unicode_codepoints(input) {
-        UnescapeUnicodeCharIterator::new(input).collect()
-    } else {
-        input.into()
-    }
-}
-
-fn needs_unescape_unicode_codepoints(input: &str) -> bool {
-    let bytes = input.as_bytes();
-    for i in 1..bytes.len() {
-        if (bytes[i] == b'u' || bytes[i] == b'U') && bytes[i - 1] == b'\\' {
-            return true;
-        }
-    }
-    false
-}
-
-struct UnescapeUnicodeCharIterator<'a> {
-    iter: Chars<'a>,
-    buffer: String,
-}
-
-impl<'a> UnescapeUnicodeCharIterator<'a> {
-    fn new(string: &'a str) -> Self {
-        Self {
-            iter: string.chars(),
-            buffer: String::with_capacity(9),
-        }
-    }
-}
-
-impl<'a> Iterator for UnescapeUnicodeCharIterator<'a> {
-    type Item = char;
-
-    fn next(&mut self) -> Option<char> {
-        if !self.buffer.is_empty() {
-            return Some(self.buffer.remove(0));
-        }
-        match self.iter.next()? {
-            '\\' => match self.iter.next() {
-                Some('u') => {
-                    self.buffer.push('u');
-                    for _ in 0..4 {
-                        if let Some(c) = self.iter.next() {
-                            self.buffer.push(c);
-                        } else {
-                            return Some('\\');
-                        }
-                    }
-                    if let Some(c) = u32::from_str_radix(&self.buffer[1..], 16)
-                        .ok()
-                        .and_then(char::from_u32)
-                    {
-                        self.buffer.clear();
-                        Some(c)
-                    } else {
-                        Some('\\')
-                    }
-                }
-                Some('U') => {
-                    self.buffer.push('U');
-                    for _ in 0..8 {
-                        if let Some(c) = self.iter.next() {
-                            self.buffer.push(c);
-                        } else {
-                            return Some('\\');
-                        }
-                    }
-                    if let Some(c) = u32::from_str_radix(&self.buffer[1..], 16)
-                        .ok()
-                        .and_then(char::from_u32)
-                    {
-                        self.buffer.clear();
-                        Some(c)
-                    } else {
-                        Some('\\')
-                    }
-                }
-                Some(c) => {
-                    self.buffer.push(c);
-                    Some('\\')
-                }
-                None => Some('\\'),
-            },
-            c => Some(c),
-        }
-    }
-}
-
-pub fn unescape_characters<'a>(
-    input: &'a str,
-    characters: &'static [u8],
-    replacement: &'static StaticCharSliceMap,
-) -> Cow<'a, str> {
-    if needs_unescape_characters(input, characters) {
-        UnescapeCharsIterator::new(input, replacement).collect()
-    } else {
-        input.into()
-    }
-}
-
-fn needs_unescape_characters(input: &str, characters: &[u8]) -> bool {
-    let bytes = input.as_bytes();
-    for i in 1..bytes.len() {
-        if bytes[i - 1] == b'\\' && characters.contains(&bytes[i]) {
-            return true;
-        }
-    }
-    false
-}
-
-struct UnescapeCharsIterator<'a> {
-    iter: Chars<'a>,
-    buffer: Option<char>,
-    replacement: &'static StaticCharSliceMap,
-}
-
-impl<'a> UnescapeCharsIterator<'a> {
-    fn new(string: &'a str, replacement: &'static StaticCharSliceMap) -> Self {
-        Self {
-            iter: string.chars(),
-            buffer: None,
-            replacement,
-        }
-    }
-}
-
-impl<'a> Iterator for UnescapeCharsIterator<'a> {
-    type Item = char;
-
-    fn next(&mut self) -> Option<char> {
-        if let Some(ch) = self.buffer {
-            self.buffer = None;
-            return Some(ch);
-        }
-        match self.iter.next()? {
-            '\\' => match self.iter.next() {
-                Some(ch) => {
-                    if let Some(replace) = self.replacement.get(ch) {
-                        Some(replace)
-                    } else {
-                        self.buffer = Some(ch);
-                        Some('\\')
-                    }
-                }
-                None => Some('\\'),
-            },
-            c => Some(c),
-        }
-    }
-}
-
-pub struct StaticCharSliceMap {
-    keys: &'static [char],
-    values: &'static [char],
-}
-
-impl StaticCharSliceMap {
-    pub const fn new(keys: &'static [char], values: &'static [char]) -> Self {
-        Self { keys, values }
-    }
-
-    pub fn get(&self, key: char) -> Option<char> {
-        for i in 0..self.keys.len() {
-            if self.keys[i] == key {
-                return Some(self.values[i]);
-            }
-        }
-        None
-    }
-}
-
-const UNESCAPE_CHARACTERS: [u8; 8] = [b't', b'b', b'n', b'r', b'f', b'"', b'\'', b'\\'];
-const UNESCAPE_REPLACEMENT: StaticCharSliceMap = StaticCharSliceMap::new(
-    &['t', 'b', 'n', 'r', 'f', '"', '\'', '\\'],
-    &[
-        '\u{0009}', '\u{0008}', '\u{000A}', '\u{000D}', '\u{000C}', '\u{0022}', '\u{0027}',
-        '\u{005C}',
-    ],
-);
-
-fn unescape_echars(input: &str) -> Cow<'_, str> {
-    unescape_characters(input, &UNESCAPE_CHARACTERS, &UNESCAPE_REPLACEMENT)
-}
-
-const UNESCAPE_PN_CHARACTERS: [u8; 20] = [
-    b'_', b'~', b'.', b'-', b'!', b'$', b'&', b'\'', b'(', b')', b'*', b'+', b',', b';', b'=',
-    b'/', b'?', b'#', b'@', b'%',
-];
-const UNESCAPE_PN_REPLACEMENT: StaticCharSliceMap = StaticCharSliceMap::new(
-    &[
-        '_', '~', '.', '-', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', '/', '?', '#',
-        '@', '%',
-    ],
-    &[
-        '_', '~', '.', '-', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', '/', '?', '#',
-        '@', '%',
-    ],
-);
-
-pub fn unescape_pn_local(input: &str) -> Cow<'_, str> {
-    unescape_characters(input, &UNESCAPE_PN_CHARACTERS, &UNESCAPE_PN_REPLACEMENT)
-}
+fn unescape_iriref(mut input: &str) -> Result<String, &'static str> {
+    let mut output = String::with_capacity(input.len());
+    while let Some((before, after)) = input.split_once('\\') {
+        output.push_str(before);
+        let mut after = after.chars();
+        let (escape, after) = match after.next() {
+            Some('u') => read_hex_char::<4>(after.as_str())?,
+            Some('U') => read_hex_char::<8>(after.as_str())?,
+            Some(_) => {
+                return Err(
+                    "IRIs are only allowed to contain escape sequences \\uXXXX and \\UXXXXXXXX",
+                )
+            }
+            None => return Err("IRIs are not allowed to end with a '\\'"),
+        };
+        output.push(escape);
+        input = after;
+    }
+    output.push_str(input);
+    Ok(output)
+}
+
+fn unescape_string(mut input: &str) -> Result<String, &'static str> {
+    let mut output = String::with_capacity(input.len());
+    while let Some((before, after)) = input.split_once('\\') {
+        output.push_str(before);
+        let mut after = after.chars();
+        let (escape, after) = match after.next() {
+            Some('t') => ('\u{0009}', after.as_str()),
+            Some('b') => ('\u{0008}', after.as_str()),
+            Some('n') => ('\u{000A}', after.as_str()),
+            Some('r') => ('\u{000D}', after.as_str()),
+            Some('f') => ('\u{000C}', after.as_str()),
+            Some('"') => ('\u{0022}', after.as_str()),
+            Some('\'') => ('\u{0027}', after.as_str()),
+            Some('\\') => ('\u{005C}', after.as_str()),
+            Some('u') => read_hex_char::<4>(after.as_str())?,
+            Some('U') => read_hex_char::<8>(after.as_str())?,
+            Some(_) => return Err("The character that can be escaped in strings are tbnrf\"'\\"),
+            None => return Err("strings are not allowed to end with a '\\'"),
+        };
+        output.push(escape);
+        input = after;
+    }
+    output.push_str(input);
+    Ok(output)
+}
+
+fn read_hex_char<const SIZE: usize>(input: &str) -> Result<(char, &str), &'static str> {
+    if let Some(escape) = input.get(..SIZE) {
+        if let Some(char) = u32::from_str_radix(escape, 16)
+            .ok()
+            .and_then(char::from_u32)
+        {
+            Ok((char, &input[SIZE..]))
+        } else {
+            Err("\\u escape sequence should be followed by hexadecimal digits")
+        }
+    } else {
+        Err("\\u escape sequence should be followed by hexadecimal digits")
+    }
+}

 fn variable() -> Variable {
     Variable::new_unchecked(format!("{:x}", random::<u128>()))
 }
@@ -2143,7 +1996,7 @@ parser! {
         } / ANON() { BlankNode::default() }

         rule IRIREF() -> Iri<String> = "<" i:$((!['>'] [_])*) ">" {?
-            state.parse_iri(i).map_err(|_| "IRI parsing failed")
+            state.parse_iri(unescape_iriref(i)?).map_err(|_| "IRI parsing failed")
         }

         rule PNAME_NS() -> &'input str = ns:$(PN_PREFIX()?) ":" {
@@ -2152,8 +2005,11 @@ parser! {
         rule PNAME_LN() -> Iri<String> = ns:PNAME_NS() local:$(PN_LOCAL()) {?
             if let Some(base) = state.namespaces.get(ns) {
-                let mut iri = base.clone();
-                iri.push_str(&unescape_pn_local(local));
+                let mut iri = String::with_capacity(base.len() + local.len());
+                iri.push_str(base);
+                for chunk in local.split('\\') { // We remove \
+                    iri.push_str(chunk);
+                }
                 Iri::parse(iri).map_err(|_| "IRI parsing failed")
             } else {
                 Err("Prefix not found")
             }
@@ -2192,29 +2048,31 @@ parser! {
         rule EXPONENT() = ['e' | 'E'] ['+' | '-']? ['0'..='9']+

-        rule STRING_LITERAL1() -> String = "'" l:$((STRING_LITERAL1_simple_char() / ECHAR())*) "'" {
-            unescape_echars(l).to_string()
+        rule STRING_LITERAL1() -> String = "'" l:$((STRING_LITERAL1_simple_char() / ECHAR() / UCHAR())*) "'" {?
+            unescape_string(l)
         }
         rule STRING_LITERAL1_simple_char() = !['\u{27}' | '\u{5C}' | '\u{A}' | '\u{D}'] [_]

-        rule STRING_LITERAL2() -> String = "\"" l:$((STRING_LITERAL2_simple_char() / ECHAR())*) "\"" {
-            unescape_echars(l).to_string()
+        rule STRING_LITERAL2() -> String = "\"" l:$((STRING_LITERAL2_simple_char() / ECHAR() / UCHAR())*) "\"" {?
+            unescape_string(l)
         }
         rule STRING_LITERAL2_simple_char() = !['\u{22}' | '\u{5C}' | '\u{A}' | '\u{D}'] [_]

-        rule STRING_LITERAL_LONG1() -> String = "'''" l:$(STRING_LITERAL_LONG1_inner()*) "'''" {
-            unescape_echars(l).to_string()
+        rule STRING_LITERAL_LONG1() -> String = "'''" l:$(STRING_LITERAL_LONG1_inner()*) "'''" {?
+            unescape_string(l)
         }
-        rule STRING_LITERAL_LONG1_inner() = ("''" / "'")? (STRING_LITERAL_LONG1_simple_char() / ECHAR())
+        rule STRING_LITERAL_LONG1_inner() = ("''" / "'")? (STRING_LITERAL_LONG1_simple_char() / ECHAR() / UCHAR())
         rule STRING_LITERAL_LONG1_simple_char() = !['\'' | '\\'] [_]

-        rule STRING_LITERAL_LONG2() -> String = "\"\"\"" l:$(STRING_LITERAL_LONG2_inner()*) "\"\"\"" {
-            unescape_echars(l).to_string()
+        rule STRING_LITERAL_LONG2() -> String = "\"\"\"" l:$(STRING_LITERAL_LONG2_inner()*) "\"\"\"" {?
+            unescape_string(l)
         }
-        rule STRING_LITERAL_LONG2_inner() = ("\"\"" / "\"")? (STRING_LITERAL_LONG2_simple_char() / ECHAR())
+        rule STRING_LITERAL_LONG2_inner() = ("\"\"" / "\"")? (STRING_LITERAL_LONG2_simple_char() / ECHAR() / UCHAR())
         rule STRING_LITERAL_LONG2_simple_char() = !['"' | '\\'] [_]

+        rule UCHAR() = "\\u" HEX() HEX() HEX() HEX() / "\\U" HEX() HEX() HEX() HEX() HEX() HEX() HEX() HEX()
+
         rule ECHAR() = "\\" ['t' | 'b' | 'n' | 'r' | 'f' | '"' |'\'' | '\\']

         rule NIL() = "(" WS()* ")"

@@ -2223,7 +2081,7 @@ parser! {
         rule ANON() = "[" WS()* "]"

-        rule PN_CHARS_BASE() = ['A' ..= 'Z' | 'a' ..= 'z' | '\u{00C0}' ..='\u{00D6}' | '\u{00D8}'..='\u{00F6}' | '\u{00F8}'..='\u{02FF}' | '\u{0370}'..='\u{037D}' | '\u{037F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}']
+        rule PN_CHARS_BASE() = ['A' ..= 'Z' | 'a' ..= 'z' | '\u{00C0}'..='\u{00D6}' | '\u{00D8}'..='\u{00F6}' | '\u{00F8}'..='\u{02FF}' | '\u{0370}'..='\u{037D}' | '\u{037F}'..='\u{1FFF}' | '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}']
         rule PN_CHARS_U() = ['_'] / PN_CHARS_BASE()
