|
|
|
@ -8,11 +8,9 @@ use oxrdf::vocab::{rdf, xsd}; |
|
|
|
|
use peg::parser; |
|
|
|
|
use peg::str::LineCol; |
|
|
|
|
use rand::random; |
|
|
|
|
use std::borrow::Cow; |
|
|
|
|
use std::collections::{HashMap, HashSet}; |
|
|
|
|
use std::error::Error; |
|
|
|
|
use std::mem::take; |
|
|
|
|
use std::str::Chars; |
|
|
|
|
use std::str::FromStr; |
|
|
|
|
use std::{char, fmt}; |
|
|
|
|
|
|
|
|
@ -32,7 +30,7 @@ pub fn parse_query(query: &str, base_iri: Option<&str>) -> Result<Query, ParseEr |
|
|
|
|
aggregates: Vec::new(), |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
parser::QueryUnit(&unescape_unicode_codepoints(query), &mut state).map_err(|e| ParseError { |
|
|
|
|
parser::QueryUnit(query, &mut state).map_err(|e| ParseError { |
|
|
|
|
inner: ParseErrorKind::Parser(e), |
|
|
|
|
}) |
|
|
|
|
} |
|
|
|
@ -53,11 +51,8 @@ pub fn parse_update(update: &str, base_iri: Option<&str>) -> Result<Update, Pars |
|
|
|
|
aggregates: Vec::new(), |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
let operations = |
|
|
|
|
parser::UpdateInit(&unescape_unicode_codepoints(update), &mut state).map_err(|e| { |
|
|
|
|
ParseError { |
|
|
|
|
let operations = parser::UpdateInit(update, &mut state).map_err(|e| ParseError { |
|
|
|
|
inner: ParseErrorKind::Parser(e), |
|
|
|
|
} |
|
|
|
|
})?; |
|
|
|
|
Ok(Update { |
|
|
|
|
operations, |
|
|
|
@ -724,11 +719,11 @@ pub struct ParserState { |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
impl ParserState { |
|
|
|
|
fn parse_iri(&self, iri: &str) -> Result<Iri<String>, IriParseError> { |
|
|
|
|
fn parse_iri(&self, iri: String) -> Result<Iri<String>, IriParseError> { |
|
|
|
|
if let Some(base_iri) = &self.base_iri { |
|
|
|
|
base_iri.resolve(iri) |
|
|
|
|
base_iri.resolve(&iri) |
|
|
|
|
} else { |
|
|
|
|
Iri::parse(iri.to_owned()) |
|
|
|
|
Iri::parse(iri) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -746,210 +741,68 @@ impl ParserState { |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
pub fn unescape_unicode_codepoints(input: &str) -> Cow<'_, str> { |
|
|
|
|
if needs_unescape_unicode_codepoints(input) { |
|
|
|
|
UnescapeUnicodeCharIterator::new(input).collect() |
|
|
|
|
} else { |
|
|
|
|
input.into() |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// Reports whether `input` contains a backslash directly followed by `u` or `U`.
///
/// The scan works on raw bytes; it only looks at adjacent byte pairs and does
/// not check that hexadecimal digits actually follow.
fn needs_unescape_unicode_codepoints(input: &str) -> bool {
    input
        .as_bytes()
        .windows(2)
        .any(|pair| pair[0] == b'\\' && matches!(pair[1], b'u' | b'U'))
}
|
|
|
|
|
|
|
|
|
struct UnescapeUnicodeCharIterator<'a> { |
|
|
|
|
iter: Chars<'a>, |
|
|
|
|
buffer: String, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
impl<'a> UnescapeUnicodeCharIterator<'a> { |
|
|
|
|
fn new(string: &'a str) -> Self { |
|
|
|
|
Self { |
|
|
|
|
iter: string.chars(), |
|
|
|
|
buffer: String::with_capacity(9), |
|
|
|
|
fn unescape_iriref(mut input: &str) -> Result<String, &'static str> { |
|
|
|
|
let mut output = String::with_capacity(input.len()); |
|
|
|
|
while let Some((before, after)) = input.split_once('\\') { |
|
|
|
|
output.push_str(before); |
|
|
|
|
let mut after = after.chars(); |
|
|
|
|
let (escape, after) = match after.next() { |
|
|
|
|
Some('u') => read_hex_char::<4>(after.as_str())?, |
|
|
|
|
Some('U') => read_hex_char::<8>(after.as_str())?, |
|
|
|
|
Some(_) => { |
|
|
|
|
return Err( |
|
|
|
|
"IRIs are only allowed to contain escape sequences \\uXXXX and \\UXXXXXXXX", |
|
|
|
|
) |
|
|
|
|
} |
|
|
|
|
None => return Err("IRIs are not allowed to end with a '\'"), |
|
|
|
|
}; |
|
|
|
|
output.push(escape); |
|
|
|
|
input = after; |
|
|
|
|
} |
|
|
|
|
output.push_str(input); |
|
|
|
|
Ok(output) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
fn unescape_string(mut input: &str) -> Result<String, &'static str> { |
|
|
|
|
let mut output = String::with_capacity(input.len()); |
|
|
|
|
while let Some((before, after)) = input.split_once('\\') { |
|
|
|
|
output.push_str(before); |
|
|
|
|
let mut after = after.chars(); |
|
|
|
|
let (escape, after) = match after.next() { |
|
|
|
|
Some('t') => ('\u{0009}', after.as_str()), |
|
|
|
|
Some('b') => ('\u{0008}', after.as_str()), |
|
|
|
|
Some('n') => ('\u{000A}', after.as_str()), |
|
|
|
|
Some('r') => ('\u{000D}', after.as_str()), |
|
|
|
|
Some('f') => ('\u{000C}', after.as_str()), |
|
|
|
|
Some('"') => ('\u{0022}', after.as_str()), |
|
|
|
|
Some('\'') => ('\u{0027}', after.as_str()), |
|
|
|
|
Some('\\') => ('\u{005C}', after.as_str()), |
|
|
|
|
Some('u') => read_hex_char::<4>(after.as_str())?, |
|
|
|
|
Some('U') => read_hex_char::<8>(after.as_str())?, |
|
|
|
|
Some(_) => return Err("The character that can be escaped in strings are tbnrf\"'\\"), |
|
|
|
|
None => return Err("strings are not allowed to end with a '\'"), |
|
|
|
|
}; |
|
|
|
|
output.push(escape); |
|
|
|
|
input = after; |
|
|
|
|
} |
|
|
|
|
output.push_str(input); |
|
|
|
|
Ok(output) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
impl<'a> Iterator for UnescapeUnicodeCharIterator<'a> { |
|
|
|
|
type Item = char; |
|
|
|
|
|
|
|
|
|
fn next(&mut self) -> Option<char> { |
|
|
|
|
if !self.buffer.is_empty() { |
|
|
|
|
return Some(self.buffer.remove(0)); |
|
|
|
|
} |
|
|
|
|
match self.iter.next()? { |
|
|
|
|
'\\' => match self.iter.next() { |
|
|
|
|
Some('u') => { |
|
|
|
|
self.buffer.push('u'); |
|
|
|
|
for _ in 0..4 { |
|
|
|
|
if let Some(c) = self.iter.next() { |
|
|
|
|
self.buffer.push(c); |
|
|
|
|
} else { |
|
|
|
|
return Some('\\'); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if let Some(c) = u32::from_str_radix(&self.buffer[1..], 16) |
|
|
|
|
fn read_hex_char<const SIZE: usize>(input: &str) -> Result<(char, &str), &'static str> { |
|
|
|
|
if let Some(escape) = input.get(..SIZE) { |
|
|
|
|
if let Some(char) = u32::from_str_radix(escape, 16) |
|
|
|
|
.ok() |
|
|
|
|
.and_then(char::from_u32) |
|
|
|
|
{ |
|
|
|
|
self.buffer.clear(); |
|
|
|
|
Some(c) |
|
|
|
|
} else { |
|
|
|
|
Some('\\') |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
Some('U') => { |
|
|
|
|
self.buffer.push('U'); |
|
|
|
|
for _ in 0..8 { |
|
|
|
|
if let Some(c) = self.iter.next() { |
|
|
|
|
self.buffer.push(c); |
|
|
|
|
} else { |
|
|
|
|
return Some('\\'); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if let Some(c) = u32::from_str_radix(&self.buffer[1..], 16) |
|
|
|
|
.ok() |
|
|
|
|
.and_then(char::from_u32) |
|
|
|
|
{ |
|
|
|
|
self.buffer.clear(); |
|
|
|
|
Some(c) |
|
|
|
|
} else { |
|
|
|
|
Some('\\') |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
Some(c) => { |
|
|
|
|
self.buffer.push(c); |
|
|
|
|
Some('\\') |
|
|
|
|
} |
|
|
|
|
None => Some('\\'), |
|
|
|
|
}, |
|
|
|
|
c => Some(c), |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
pub fn unescape_characters<'a>( |
|
|
|
|
input: &'a str, |
|
|
|
|
characters: &'static [u8], |
|
|
|
|
replacement: &'static StaticCharSliceMap, |
|
|
|
|
) -> Cow<'a, str> { |
|
|
|
|
if needs_unescape_characters(input, characters) { |
|
|
|
|
UnescapeCharsIterator::new(input, replacement).collect() |
|
|
|
|
Ok((char, &input[SIZE..])) |
|
|
|
|
} else { |
|
|
|
|
input.into() |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
fn needs_unescape_characters(input: &str, characters: &[u8]) -> bool { |
|
|
|
|
let bytes = input.as_bytes(); |
|
|
|
|
for i in 1..bytes.len() { |
|
|
|
|
if bytes[i - 1] == b'\\' && characters.contains(&bytes[i]) { |
|
|
|
|
return true; |
|
|
|
|
} |
|
|
|
|
Err("\\u escape sequence should be followed by hexadecimal digits") |
|
|
|
|
} |
|
|
|
|
false |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
struct UnescapeCharsIterator<'a> { |
|
|
|
|
iter: Chars<'a>, |
|
|
|
|
buffer: Option<char>, |
|
|
|
|
replacement: &'static StaticCharSliceMap, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
impl<'a> UnescapeCharsIterator<'a> { |
|
|
|
|
fn new(string: &'a str, replacement: &'static StaticCharSliceMap) -> Self { |
|
|
|
|
Self { |
|
|
|
|
iter: string.chars(), |
|
|
|
|
buffer: None, |
|
|
|
|
replacement, |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
impl<'a> Iterator for UnescapeCharsIterator<'a> { |
|
|
|
|
type Item = char; |
|
|
|
|
|
|
|
|
|
fn next(&mut self) -> Option<char> { |
|
|
|
|
if let Some(ch) = self.buffer { |
|
|
|
|
self.buffer = None; |
|
|
|
|
return Some(ch); |
|
|
|
|
} |
|
|
|
|
match self.iter.next()? { |
|
|
|
|
'\\' => match self.iter.next() { |
|
|
|
|
Some(ch) => { |
|
|
|
|
if let Some(replace) = self.replacement.get(ch) { |
|
|
|
|
Some(replace) |
|
|
|
|
} else { |
|
|
|
|
self.buffer = Some(ch); |
|
|
|
|
Some('\\') |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
None => Some('\\'), |
|
|
|
|
}, |
|
|
|
|
c => Some(c), |
|
|
|
|
Err("\\u escape sequence should be followed by hexadecimal digits") |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/// `char` to `char` lookup table backed by two parallel `'static` slices.
///
/// The key at index `i` of `keys` is associated with the value at index `i`
/// of `values`.
pub struct StaticCharSliceMap {
    keys: &'static [char],
    values: &'static [char],
}

impl StaticCharSliceMap {
    /// Wraps the two slices as given; usable in `const` items.
    ///
    /// The lengths are not compared here.
    pub const fn new(keys: &'static [char], values: &'static [char]) -> Self {
        Self { keys, values }
    }

    /// Returns the value paired with the first occurrence of `key`, or `None`
    /// when `key` is absent.
    ///
    /// # Panics
    ///
    /// Panics if `key` is found at an index that `values` does not have.
    pub fn get(&self, key: char) -> Option<char> {
        let index = self.keys.iter().position(|&candidate| candidate == key)?;
        Some(self.values[index])
    }
}
|
|
|
|
|
|
|
|
|
const UNESCAPE_CHARACTERS: [u8; 8] = [b't', b'b', b'n', b'r', b'f', b'"', b'\'', b'\\']; |
|
|
|
|
const UNESCAPE_REPLACEMENT: StaticCharSliceMap = StaticCharSliceMap::new( |
|
|
|
|
&['t', 'b', 'n', 'r', 'f', '"', '\'', '\\'], |
|
|
|
|
&[ |
|
|
|
|
'\u{0009}', '\u{0008}', '\u{000A}', '\u{000D}', '\u{000C}', '\u{0022}', '\u{0027}', |
|
|
|
|
'\u{005C}', |
|
|
|
|
], |
|
|
|
|
); |
|
|
|
|
|
|
|
|
|
fn unescape_echars(input: &str) -> Cow<'_, str> { |
|
|
|
|
unescape_characters(input, &UNESCAPE_CHARACTERS, &UNESCAPE_REPLACEMENT) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
const UNESCAPE_PN_CHARACTERS: [u8; 20] = [ |
|
|
|
|
b'_', b'~', b'.', b'-', b'!', b'$', b'&', b'\'', b'(', b')', b'*', b'+', b',', b';', b'=', |
|
|
|
|
b'/', b'?', b'#', b'@', b'%', |
|
|
|
|
]; |
|
|
|
|
const UNESCAPE_PN_REPLACEMENT: StaticCharSliceMap = StaticCharSliceMap::new( |
|
|
|
|
&[ |
|
|
|
|
'_', '~', '.', '-', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', '/', '?', '#', |
|
|
|
|
'@', '%', |
|
|
|
|
], |
|
|
|
|
&[ |
|
|
|
|
'_', '~', '.', '-', '!', '$', '&', '\'', '(', ')', '*', '+', ',', ';', '=', '/', '?', '#', |
|
|
|
|
'@', '%', |
|
|
|
|
], |
|
|
|
|
); |
|
|
|
|
|
|
|
|
|
pub fn unescape_pn_local(input: &str) -> Cow<'_, str> { |
|
|
|
|
unescape_characters(input, &UNESCAPE_PN_CHARACTERS, &UNESCAPE_PN_REPLACEMENT) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
fn variable() -> Variable { |
|
|
|
|
Variable::new_unchecked(format!("{:x}", random::<u128>())) |
|
|
|
@ -2143,7 +1996,7 @@ parser! { |
|
|
|
|
} / ANON() { BlankNode::default() } |
|
|
|
|
|
|
|
|
|
rule IRIREF() -> Iri<String> = "<" i:$((!['>'] [_])*) ">" {? |
|
|
|
|
state.parse_iri(i).map_err(|_| "IRI parsing failed") |
|
|
|
|
state.parse_iri(unescape_iriref(i)?).map_err(|_| "IRI parsing failed") |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
rule PNAME_NS() -> &'input str = ns:$(PN_PREFIX()?) ":" { |
|
|
|
@ -2152,8 +2005,11 @@ parser! { |
|
|
|
|
|
|
|
|
|
rule PNAME_LN() -> Iri<String> = ns:PNAME_NS() local:$(PN_LOCAL()) {? |
|
|
|
|
if let Some(base) = state.namespaces.get(ns) { |
|
|
|
|
let mut iri = base.clone(); |
|
|
|
|
iri.push_str(&unescape_pn_local(local)); |
|
|
|
|
let mut iri = String::with_capacity(base.len() + local.len()); |
|
|
|
|
iri.push_str(base); |
|
|
|
|
for chunk in local.split('\\') { // We remove \
|
|
|
|
|
iri.push_str(chunk); |
|
|
|
|
} |
|
|
|
|
Iri::parse(iri).map_err(|_| "IRI parsing failed") |
|
|
|
|
} else { |
|
|
|
|
Err("Prefix not found") |
|
|
|
@ -2192,29 +2048,31 @@ parser! { |
|
|
|
|
|
|
|
|
|
rule EXPONENT() = ['e' | 'E'] ['+' | '-']? ['0'..='9']+ |
|
|
|
|
|
|
|
|
|
rule STRING_LITERAL1() -> String = "'" l:$((STRING_LITERAL1_simple_char() / ECHAR())*) "'" { |
|
|
|
|
unescape_echars(l).to_string() |
|
|
|
|
rule STRING_LITERAL1() -> String = "'" l:$((STRING_LITERAL1_simple_char() / ECHAR() / UCHAR())*) "'" {? |
|
|
|
|
unescape_string(l) |
|
|
|
|
} |
|
|
|
|
rule STRING_LITERAL1_simple_char() = !['\u{27}' | '\u{5C}' | '\u{A}' | '\u{D}'] [_] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rule STRING_LITERAL2() -> String = "\"" l:$((STRING_LITERAL2_simple_char() / ECHAR())*) "\"" { |
|
|
|
|
unescape_echars(l).to_string() |
|
|
|
|
rule STRING_LITERAL2() -> String = "\"" l:$((STRING_LITERAL2_simple_char() / ECHAR() / UCHAR())*) "\"" {? |
|
|
|
|
unescape_string(l) |
|
|
|
|
} |
|
|
|
|
rule STRING_LITERAL2_simple_char() = !['\u{22}' | '\u{5C}' | '\u{A}' | '\u{D}'] [_] |
|
|
|
|
|
|
|
|
|
rule STRING_LITERAL_LONG1() -> String = "'''" l:$(STRING_LITERAL_LONG1_inner()*) "'''" { |
|
|
|
|
unescape_echars(l).to_string() |
|
|
|
|
rule STRING_LITERAL_LONG1() -> String = "'''" l:$(STRING_LITERAL_LONG1_inner()*) "'''" {? |
|
|
|
|
unescape_string(l) |
|
|
|
|
} |
|
|
|
|
rule STRING_LITERAL_LONG1_inner() = ("''" / "'")? (STRING_LITERAL_LONG1_simple_char() / ECHAR()) |
|
|
|
|
rule STRING_LITERAL_LONG1_inner() = ("''" / "'")? (STRING_LITERAL_LONG1_simple_char() / ECHAR() / UCHAR()) |
|
|
|
|
rule STRING_LITERAL_LONG1_simple_char() = !['\'' | '\\'] [_] |
|
|
|
|
|
|
|
|
|
rule STRING_LITERAL_LONG2() -> String = "\"\"\"" l:$(STRING_LITERAL_LONG2_inner()*) "\"\"\"" { |
|
|
|
|
unescape_echars(l).to_string() |
|
|
|
|
rule STRING_LITERAL_LONG2() -> String = "\"\"\"" l:$(STRING_LITERAL_LONG2_inner()*) "\"\"\"" {? |
|
|
|
|
unescape_string(l) |
|
|
|
|
} |
|
|
|
|
rule STRING_LITERAL_LONG2_inner() = ("\"\"" / "\"")? (STRING_LITERAL_LONG2_simple_char() / ECHAR()) |
|
|
|
|
rule STRING_LITERAL_LONG2_inner() = ("\"\"" / "\"")? (STRING_LITERAL_LONG2_simple_char() / ECHAR() / UCHAR()) |
|
|
|
|
rule STRING_LITERAL_LONG2_simple_char() = !['"' | '\\'] [_] |
|
|
|
|
|
|
|
|
|
rule UCHAR() = "\\u" HEX() HEX() HEX() HEX() / "\\U" HEX() HEX() HEX() HEX() HEX() HEX() HEX() HEX() |
|
|
|
|
|
|
|
|
|
rule ECHAR() = "\\" ['t' | 'b' | 'n' | 'r' | 'f' | '"' |'\'' | '\\'] |
|
|
|
|
|
|
|
|
|
rule NIL() = "(" WS()* ")" |
|
|
|
|