|
|
@ -6,12 +6,12 @@ use oxrdf::NamedNode; |
|
|
|
use std::borrow::Cow; |
|
|
|
use std::borrow::Cow; |
|
|
|
use std::cmp::min; |
|
|
|
use std::cmp::min; |
|
|
|
use std::collections::HashMap; |
|
|
|
use std::collections::HashMap; |
|
|
|
use std::ops::{Range, RangeInclusive}; |
|
|
|
use std::ops::Range; |
|
|
|
use std::str; |
|
|
|
use std::str; |
|
|
|
|
|
|
|
|
|
|
|
#[derive(Debug, PartialEq, Eq)] |
|
|
|
#[derive(Debug, PartialEq, Eq)] |
|
|
|
pub enum N3Token<'a> { |
|
|
|
pub enum N3Token<'a> { |
|
|
|
IriRef(Iri<String>), |
|
|
|
IriRef(String), |
|
|
|
PrefixedName { |
|
|
|
PrefixedName { |
|
|
|
prefix: &'a str, |
|
|
|
prefix: &'a str, |
|
|
|
local: Cow<'a, str>, |
|
|
|
local: Cow<'a, str>, |
|
|
@ -42,6 +42,7 @@ pub struct N3LexerOptions { |
|
|
|
|
|
|
|
|
|
|
|
pub struct N3Lexer { |
|
|
|
pub struct N3Lexer { |
|
|
|
mode: N3LexerMode, |
|
|
|
mode: N3LexerMode, |
|
|
|
|
|
|
|
unchecked: bool, |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// TODO: there are a lot of 'None' (missing data) returned even if the stream is ending!!!
|
|
|
|
// TODO: there are a lot of 'None' (missing data) returned even if the stream is ending!!!
|
|
|
@ -61,7 +62,7 @@ impl TokenRecognizer for N3Lexer { |
|
|
|
b'<' => match *data.get(1)? { |
|
|
|
b'<' => match *data.get(1)? { |
|
|
|
b'<' => Some((2, Ok(N3Token::Punctuation("<<")))), |
|
|
|
b'<' => Some((2, Ok(N3Token::Punctuation("<<")))), |
|
|
|
b'=' if self.mode == N3LexerMode::N3 => { |
|
|
|
b'=' if self.mode == N3LexerMode::N3 => { |
|
|
|
if let Some((consumed, result)) = Self::recognize_iri(data, options) { |
|
|
|
if let Some((consumed, result)) = self.recognize_iri(data, options) { |
|
|
|
Some(if let Ok(result) = result { |
|
|
|
Some(if let Ok(result) = result { |
|
|
|
(consumed, Ok(result)) |
|
|
|
(consumed, Ok(result)) |
|
|
|
} else { |
|
|
|
} else { |
|
|
@ -74,7 +75,7 @@ impl TokenRecognizer for N3Lexer { |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
b'-' if self.mode == N3LexerMode::N3 => { |
|
|
|
b'-' if self.mode == N3LexerMode::N3 => { |
|
|
|
if let Some((consumed, result)) = Self::recognize_iri(data, options) { |
|
|
|
if let Some((consumed, result)) = self.recognize_iri(data, options) { |
|
|
|
Some(if let Ok(result) = result { |
|
|
|
Some(if let Ok(result) = result { |
|
|
|
(consumed, Ok(result)) |
|
|
|
(consumed, Ok(result)) |
|
|
|
} else { |
|
|
|
} else { |
|
|
@ -86,7 +87,7 @@ impl TokenRecognizer for N3Lexer { |
|
|
|
None |
|
|
|
None |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
_ => Self::recognize_iri(data, options), |
|
|
|
_ => self.recognize_iri(data, options), |
|
|
|
}, |
|
|
|
}, |
|
|
|
b'>' => { |
|
|
|
b'>' => { |
|
|
|
if *data.get(1)? == b'>' { |
|
|
|
if *data.get(1)? == b'>' { |
|
|
@ -119,7 +120,7 @@ impl TokenRecognizer for N3Lexer { |
|
|
|
Self::recognize_string(data, b'\'') |
|
|
|
Self::recognize_string(data, b'\'') |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
b'@' => Self::recognize_lang_tag(data), |
|
|
|
b'@' => self.recognize_lang_tag(data), |
|
|
|
b'.' => match data.get(1) { |
|
|
|
b'.' => match data.get(1) { |
|
|
|
Some(b'0'..=b'9') => Self::recognize_number(data), |
|
|
|
Some(b'0'..=b'9') => Self::recognize_number(data), |
|
|
|
Some(_) => Some((1, Ok(N3Token::Punctuation(".")))), |
|
|
|
Some(_) => Some((1, Ok(N3Token::Punctuation(".")))), |
|
|
@ -162,18 +163,19 @@ impl TokenRecognizer for N3Lexer { |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
b'0'..=b'9' | b'+' | b'-' => Self::recognize_number(data), |
|
|
|
b'0'..=b'9' | b'+' | b'-' => Self::recognize_number(data), |
|
|
|
b'?' => Self::recognize_variable(data, is_ending), |
|
|
|
b'?' => self.recognize_variable(data, is_ending), |
|
|
|
_ => Self::recognize_pname_or_keyword(data, is_ending), |
|
|
|
_ => self.recognize_pname_or_keyword(data, is_ending), |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
impl N3Lexer { |
|
|
|
impl N3Lexer { |
|
|
|
pub fn new(mode: N3LexerMode) -> Self { |
|
|
|
pub fn new(mode: N3LexerMode, unchecked: bool) -> Self { |
|
|
|
Self { mode } |
|
|
|
Self { mode, unchecked } |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn recognize_iri( |
|
|
|
fn recognize_iri( |
|
|
|
|
|
|
|
&self, |
|
|
|
data: &[u8], |
|
|
|
data: &[u8], |
|
|
|
options: &N3LexerOptions, |
|
|
|
options: &N3LexerOptions, |
|
|
|
) -> Option<(usize, Result<N3Token<'static>, TokenRecognizerError>)> { |
|
|
|
) -> Option<(usize, Result<N3Token<'static>, TokenRecognizerError>)> { |
|
|
@ -186,7 +188,8 @@ impl N3Lexer { |
|
|
|
i += end; |
|
|
|
i += end; |
|
|
|
match data[i] { |
|
|
|
match data[i] { |
|
|
|
b'>' => { |
|
|
|
b'>' => { |
|
|
|
return Some((i + 1, Self::parse_iri(string, 0..=i, options))); |
|
|
|
#[allow(clippy::range_plus_one)] |
|
|
|
|
|
|
|
return Some((i + 1, self.parse_iri(string, 0..i + 1, options))); |
|
|
|
} |
|
|
|
} |
|
|
|
b'\\' => { |
|
|
|
b'\\' => { |
|
|
|
let (additional, c) = Self::recognize_escape(&data[i..], i, false)?; |
|
|
|
let (additional, c) = Self::recognize_escape(&data[i..], i, false)?; |
|
|
@ -205,29 +208,36 @@ impl N3Lexer { |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn parse_iri( |
|
|
|
fn parse_iri( |
|
|
|
|
|
|
|
&self, |
|
|
|
iri: Vec<u8>, |
|
|
|
iri: Vec<u8>, |
|
|
|
position: RangeInclusive<usize>, |
|
|
|
position: Range<usize>, |
|
|
|
options: &N3LexerOptions, |
|
|
|
options: &N3LexerOptions, |
|
|
|
) -> Result<N3Token<'static>, TokenRecognizerError> { |
|
|
|
) -> Result<N3Token<'static>, TokenRecognizerError> { |
|
|
|
let iri = String::from_utf8(iri).map_err(|e| { |
|
|
|
let iri = string_from_utf8(iri, position.clone())?; |
|
|
|
( |
|
|
|
Ok(N3Token::IriRef( |
|
|
|
position.clone(), |
|
|
|
if let Some(base_iri) = options.base_iri.as_ref() { |
|
|
|
format!("The IRI contains invalid UTF-8 characters: {e}"), |
|
|
|
if self.unchecked { |
|
|
|
) |
|
|
|
base_iri.resolve_unchecked(&iri) |
|
|
|
})?; |
|
|
|
} else { |
|
|
|
let iri = if let Some(base_iri) = options.base_iri.as_ref() { |
|
|
|
|
|
|
|
base_iri.resolve(&iri) |
|
|
|
base_iri.resolve(&iri) |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
.map_err(|e| (position, e.to_string()))? |
|
|
|
|
|
|
|
.into_inner() |
|
|
|
|
|
|
|
} else if self.unchecked { |
|
|
|
|
|
|
|
iri |
|
|
|
} else { |
|
|
|
} else { |
|
|
|
Iri::parse(iri) |
|
|
|
Iri::parse(iri) |
|
|
|
} |
|
|
|
.map_err(|e| (position, e.to_string()))? |
|
|
|
.map_err(|e| (position, e.to_string()))?; |
|
|
|
.into_inner() |
|
|
|
Ok(N3Token::IriRef(iri)) |
|
|
|
}, |
|
|
|
|
|
|
|
)) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn recognize_pname_or_keyword( |
|
|
|
fn recognize_pname_or_keyword<'a>( |
|
|
|
data: &[u8], |
|
|
|
&self, |
|
|
|
|
|
|
|
data: &'a [u8], |
|
|
|
is_ending: bool, |
|
|
|
is_ending: bool, |
|
|
|
) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> { |
|
|
|
) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> { |
|
|
|
// [139s] PNAME_NS ::= PN_PREFIX? ':'
|
|
|
|
// [139s] PNAME_NS ::= PN_PREFIX? ':'
|
|
|
|
// [140s] PNAME_LN ::= PNAME_NS PN_LOCAL
|
|
|
|
// [140s] PNAME_LN ::= PNAME_NS PN_LOCAL
|
|
|
|
|
|
|
|
|
|
|
@ -303,7 +313,8 @@ impl N3Lexer { |
|
|
|
)); |
|
|
|
)); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
let (consumed, pn_local_result) = Self::recognize_optional_pn_local(&data[i..], is_ending)?; |
|
|
|
let (consumed, pn_local_result) = |
|
|
|
|
|
|
|
self.recognize_optional_pn_local(&data[i..], is_ending)?; |
|
|
|
Some(( |
|
|
|
Some(( |
|
|
|
consumed + i, |
|
|
|
consumed + i, |
|
|
|
pn_local_result.map(|(local, might_be_invalid_iri)| N3Token::PrefixedName { |
|
|
|
pn_local_result.map(|(local, might_be_invalid_iri)| N3Token::PrefixedName { |
|
|
@ -314,12 +325,13 @@ impl N3Lexer { |
|
|
|
)) |
|
|
|
)) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn recognize_variable( |
|
|
|
fn recognize_variable<'a>( |
|
|
|
data: &[u8], |
|
|
|
&self, |
|
|
|
|
|
|
|
data: &'a [u8], |
|
|
|
is_ending: bool, |
|
|
|
is_ending: bool, |
|
|
|
) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> { |
|
|
|
) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> { |
|
|
|
// [36] QUICK_VAR_NAME ::= "?" PN_LOCAL
|
|
|
|
// [36] QUICK_VAR_NAME ::= "?" PN_LOCAL
|
|
|
|
let (consumed, result) = Self::recognize_optional_pn_local(&data[1..], is_ending)?; |
|
|
|
let (consumed, result) = self.recognize_optional_pn_local(&data[1..], is_ending)?; |
|
|
|
Some(( |
|
|
|
Some(( |
|
|
|
consumed + 1, |
|
|
|
consumed + 1, |
|
|
|
result.and_then(|(name, _)| { |
|
|
|
result.and_then(|(name, _)| { |
|
|
@ -332,10 +344,11 @@ impl N3Lexer { |
|
|
|
)) |
|
|
|
)) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn recognize_optional_pn_local( |
|
|
|
fn recognize_optional_pn_local<'a>( |
|
|
|
data: &[u8], |
|
|
|
&self, |
|
|
|
|
|
|
|
data: &'a [u8], |
|
|
|
is_ending: bool, |
|
|
|
is_ending: bool, |
|
|
|
) -> Option<(usize, Result<(Cow<'_, str>, bool), TokenRecognizerError>)> { |
|
|
|
) -> Option<(usize, Result<(Cow<'a, str>, bool), TokenRecognizerError>)> { |
|
|
|
// [168s] PN_LOCAL ::= (PN_CHARS_U | ':' | [0-9] | PLX) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX))?
|
|
|
|
// [168s] PN_LOCAL ::= (PN_CHARS_U | ':' | [0-9] | PLX) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX))?
|
|
|
|
let mut i = 0; |
|
|
|
let mut i = 0; |
|
|
|
let mut buffer = None; // Buffer if there are some escaped characters
|
|
|
|
let mut buffer = None; // Buffer if there are some escaped characters
|
|
|
@ -359,7 +372,8 @@ impl N3Lexer { |
|
|
|
} else if c == '\\' { |
|
|
|
} else if c == '\\' { |
|
|
|
i += 1; |
|
|
|
i += 1; |
|
|
|
let a = char::from(*data.get(i)?); |
|
|
|
let a = char::from(*data.get(i)?); |
|
|
|
if matches!( |
|
|
|
if self.unchecked |
|
|
|
|
|
|
|
|| matches!( |
|
|
|
a, |
|
|
|
a, |
|
|
|
'_' | '~' |
|
|
|
'_' | '~' |
|
|
|
| '.' |
|
|
|
| '.' |
|
|
@ -375,7 +389,8 @@ impl N3Lexer { |
|
|
|
| ',' |
|
|
|
| ',' |
|
|
|
| ';' |
|
|
|
| ';' |
|
|
|
| '=' |
|
|
|
| '=' |
|
|
|
) { |
|
|
|
) |
|
|
|
|
|
|
|
{ |
|
|
|
// ok to escape
|
|
|
|
// ok to escape
|
|
|
|
} else if matches!(a, '/' | '?' | '#' | '@' | '%') { |
|
|
|
} else if matches!(a, '/' | '?' | '#' | '@' | '%') { |
|
|
|
// ok to escape but requires IRI validation
|
|
|
|
// ok to escape but requires IRI validation
|
|
|
@ -406,12 +421,18 @@ impl N3Lexer { |
|
|
|
{ |
|
|
|
{ |
|
|
|
return Some((0, Ok((Cow::Borrowed(""), false)))); |
|
|
|
return Some((0, Ok((Cow::Borrowed(""), false)))); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if !self.unchecked { |
|
|
|
might_be_invalid_iri |= |
|
|
|
might_be_invalid_iri |= |
|
|
|
Self::is_possible_pn_chars_base_but_not_valid_iri(c) || c == ':'; |
|
|
|
Self::is_possible_pn_chars_base_but_not_valid_iri(c) |
|
|
|
|
|
|
|
|| c == ':'; |
|
|
|
|
|
|
|
} |
|
|
|
i += consumed; |
|
|
|
i += consumed; |
|
|
|
} else if Self::is_possible_pn_chars(c) || c == ':' || c == '.' { |
|
|
|
} else if Self::is_possible_pn_chars(c) || c == ':' || c == '.' { |
|
|
|
|
|
|
|
if !self.unchecked { |
|
|
|
might_be_invalid_iri |= |
|
|
|
might_be_invalid_iri |= |
|
|
|
Self::is_possible_pn_chars_base_but_not_valid_iri(c) || c == ':'; |
|
|
|
Self::is_possible_pn_chars_base_but_not_valid_iri(c) |
|
|
|
|
|
|
|
|| c == ':'; |
|
|
|
|
|
|
|
} |
|
|
|
i += consumed; |
|
|
|
i += consumed; |
|
|
|
} else { |
|
|
|
} else { |
|
|
|
let buffer = if let Some(mut buffer) = buffer { |
|
|
|
let buffer = if let Some(mut buffer) = buffer { |
|
|
@ -518,9 +539,10 @@ impl N3Lexer { |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn recognize_lang_tag( |
|
|
|
fn recognize_lang_tag<'a>( |
|
|
|
data: &[u8], |
|
|
|
&self, |
|
|
|
) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> { |
|
|
|
data: &'a [u8], |
|
|
|
|
|
|
|
) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> { |
|
|
|
// [144s] LANGTAG ::= '@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)*
|
|
|
|
// [144s] LANGTAG ::= '@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)*
|
|
|
|
let mut is_last_block_empty = true; |
|
|
|
let mut is_last_block_empty = true; |
|
|
|
for (i, c) in data[1..].iter().enumerate() { |
|
|
|
for (i, c) in data[1..].iter().enumerate() { |
|
|
@ -532,25 +554,29 @@ impl N3Lexer { |
|
|
|
Err((1..2, "A language code should always start with a letter").into()), |
|
|
|
Err((1..2, "A language code should always start with a letter").into()), |
|
|
|
)); |
|
|
|
)); |
|
|
|
} else if is_last_block_empty { |
|
|
|
} else if is_last_block_empty { |
|
|
|
return Some((i, Self::parse_lang_tag(&data[1..i], 1..i - 1))); |
|
|
|
return Some((i, self.parse_lang_tag(&data[1..i], 1..i - 1))); |
|
|
|
} else if *c == b'-' { |
|
|
|
} else if *c == b'-' { |
|
|
|
is_last_block_empty = true; |
|
|
|
is_last_block_empty = true; |
|
|
|
} else { |
|
|
|
} else { |
|
|
|
return Some((i + 1, Self::parse_lang_tag(&data[1..=i], 1..i))); |
|
|
|
return Some((i + 1, self.parse_lang_tag(&data[1..=i], 1..i))); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
None |
|
|
|
None |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn parse_lang_tag( |
|
|
|
fn parse_lang_tag<'a>( |
|
|
|
lang_tag: &[u8], |
|
|
|
&self, |
|
|
|
|
|
|
|
lang_tag: &'a [u8], |
|
|
|
position: Range<usize>, |
|
|
|
position: Range<usize>, |
|
|
|
) -> Result<N3Token<'_>, TokenRecognizerError> { |
|
|
|
) -> Result<N3Token<'a>, TokenRecognizerError> { |
|
|
|
Ok(N3Token::LangTag( |
|
|
|
let lang_tag = str_from_utf8(lang_tag, position.clone())?; |
|
|
|
LanguageTag::parse(str_from_utf8(lang_tag, position.clone())?) |
|
|
|
Ok(N3Token::LangTag(if self.unchecked { |
|
|
|
|
|
|
|
lang_tag |
|
|
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
LanguageTag::parse(lang_tag) |
|
|
|
.map_err(|e| (position.clone(), e.to_string()))? |
|
|
|
.map_err(|e| (position.clone(), e.to_string()))? |
|
|
|
.into_inner(), |
|
|
|
.into_inner() |
|
|
|
)) |
|
|
|
})) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
fn recognize_string( |
|
|
|
fn recognize_string( |
|
|
@ -933,3 +959,14 @@ fn str_from_utf8(data: &[u8], range: Range<usize>) -> Result<&str, TokenRecogniz |
|
|
|
.into() |
|
|
|
.into() |
|
|
|
}) |
|
|
|
}) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Converts an owned byte buffer into a `String`, mapping a UTF-8 decoding failure to a
/// `TokenRecognizerError`.
///
/// `data` is the raw bytes to decode; `range` is the span the error location is computed
/// against (presumably the position of `data` in the lexer input; confirm against callers).
///
/// On failure the error is built from a `(Range<usize>, String)` pair via `.into()`:
/// the range starts at `range.start` plus the number of leading valid bytes, and the message
/// is `"Invalid UTF-8: {e}"`.
fn string_from_utf8(data: Vec<u8>, range: Range<usize>) -> Result<String, TokenRecognizerError> { |
|
|
|
|
|
|
|
String::from_utf8(data).map_err(|e| { |
|
|
|
|
|
|
|
( |
|
|
|
|
|
|
|
// Start of the reported span: first byte that failed to decode, offset by `range.start`.
range.start + e.utf8_error().valid_up_to() |
|
|
|
|
|
|
|
// End of the reported span: at most 4 bytes further, clamped to `range.end`.
// NOTE(review): the 4 is presumably the maximum length of a UTF-8 sequence; confirm.
..min(range.end, range.start + e.utf8_error().valid_up_to() + 4), |
|
|
|
|
|
|
|
format!("Invalid UTF-8: {e}"), |
|
|
|
|
|
|
|
) |
|
|
|
|
|
|
|
.into() |
|
|
|
|
|
|
|
}) |
|
|
|
|
|
|
|
} |
|
|
|