- Compatible with async IO - Turtle/TriG parser recovery on simple errorspull/555/head
parent
a1cbfdf67d
commit
71b1768d28
@ -0,0 +1,28 @@ |
||||
#![no_main]

use libfuzzer_sys::fuzz_target;
use oxttl::N3Parser;

// Fuzzes the incremental (push-based) N3 parser: feed arbitrary chunks,
// drain all results, and check the parser reaches a clean end state.
fuzz_target!(|data: &[u8]| {
    let mut quads = Vec::new();
    let mut parser = N3Parser::new()
        .with_base_iri("http://example.com/")
        .unwrap()
        .parse();
    // 0xFF bytes split the fuzz input into chunks so that arbitrary chunk
    // boundaries inside tokens are exercised by the incremental API.
    for chunk in data.split(|c| *c == 0xFF) {
        parser.extend_from_slice(chunk);
        // Drain everything currently parseable. Parse errors are ignored on
        // purpose: the target only checks for panics/hangs, not validity.
        while let Some(result) = parser.read_next() {
            if let Ok(quad) = result {
                quads.push(quad);
            }
        }
    }
    parser.end();
    // Drain whatever the end-of-input signal unlocked.
    while let Some(result) = parser.read_next() {
        if let Ok(quad) = result {
            quads.push(quad);
        }
    }
    assert!(parser.is_end());
    //TODO: serialize
});
@ -0,0 +1,49 @@ |
||||
#![no_main] |
||||
|
||||
use libfuzzer_sys::fuzz_target; |
||||
use oxttl::{NQuadsParser, NQuadsSerializer}; |
||||
|
||||
fuzz_target!(|data: &[u8]| { |
||||
// We parse
|
||||
let mut quads = Vec::new(); |
||||
let mut parser = NQuadsParser::new().with_quoted_triples().parse(); |
||||
for chunk in data.split(|c| *c == 0xFF) { |
||||
parser.extend_from_slice(chunk); |
||||
while let Some(result) = parser.read_next() { |
||||
if let Ok(quad) = result { |
||||
quads.push(quad); |
||||
} |
||||
} |
||||
} |
||||
parser.end(); |
||||
while let Some(result) = parser.read_next() { |
||||
if let Ok(quad) = result { |
||||
quads.push(quad); |
||||
} |
||||
} |
||||
assert!(parser.is_end()); |
||||
|
||||
// We serialize
|
||||
let mut writer = NQuadsSerializer::new().serialize_to_write(Vec::new()); |
||||
for quad in &quads { |
||||
writer.write_quad(quad).unwrap(); |
||||
} |
||||
let new_serialization = writer.finish(); |
||||
|
||||
// We parse the serialization
|
||||
let new_quads = NQuadsParser::new() |
||||
.with_quoted_triples() |
||||
.parse_from_read(new_serialization.as_slice()) |
||||
.collect::<Result<Vec<_>, _>>() |
||||
.map_err(|e| { |
||||
format!( |
||||
"Error on {:?} from {quads:?} based on {:?}: {e}", |
||||
String::from_utf8_lossy(&new_serialization), |
||||
String::from_utf8_lossy(data) |
||||
) |
||||
}) |
||||
.unwrap(); |
||||
|
||||
// We check the roundtrip has not changed anything
|
||||
assert_eq!(new_quads, quads); |
||||
}); |
@ -0,0 +1,53 @@ |
||||
#![no_main]

use libfuzzer_sys::fuzz_target;
use oxttl::{TriGParser, TriGSerializer};

// Fuzzes the incremental TriG parser and checks that
// parse -> serialize -> parse leaves the quads unchanged.
fuzz_target!(|data: &[u8]| {
    // We parse
    let mut quads = Vec::new();
    let mut parser = TriGParser::new()
        .with_quoted_triples()
        .with_base_iri("http://example.com/")
        .unwrap()
        .parse();
    // 0xFF bytes split the input into chunks to exercise incremental feeding
    // with arbitrary boundaries; parse errors are ignored on purpose.
    for chunk in data.split(|c| *c == 0xFF) {
        parser.extend_from_slice(chunk);
        while let Some(result) = parser.read_next() {
            if let Ok(quad) = result {
                quads.push(quad);
            }
        }
    }
    parser.end();
    while let Some(result) = parser.read_next() {
        if let Ok(quad) = result {
            quads.push(quad);
        }
    }
    assert!(parser.is_end());

    // We serialize
    // (unlike the N-Quads writer, TriG's finish() is fallible, hence unwrap)
    let mut writer = TriGSerializer::new().serialize_to_write(Vec::new());
    for quad in &quads {
        writer.write_quad(quad).unwrap();
    }
    let new_serialization = writer.finish().unwrap();

    // We parse the serialization
    // (no base IRI set here — presumably the serialization only contains
    // absolute IRIs after resolution; TODO confirm)
    let new_quads = TriGParser::new()
        .with_quoted_triples()
        .parse_from_read(new_serialization.as_slice())
        .collect::<Result<Vec<_>, _>>()
        .map_err(|e| {
            format!(
                "Error on {:?} from {quads:?} based on {:?}: {e}",
                String::from_utf8_lossy(&new_serialization),
                String::from_utf8_lossy(data)
            )
        })
        .unwrap();

    // We check the roundtrip has not changed anything
    assert_eq!(new_quads, quads);
});
@ -0,0 +1,27 @@ |
||||
[package] |
||||
name = "oxttl" |
||||
version = "0.1.0" |
||||
authors = ["Tpt <thomas@pellissier-tanon.fr>"] |
||||
license = "MIT OR Apache-2.0" |
||||
readme = "README.md" |
||||
keywords = ["RDF"]
||||
repository = "https://github.com/oxigraph/oxigraph/tree/master/lib/oxttl" |
||||
homepage = "https://oxigraph.org/" |
||||
description = """
N-Triples, N-Quads, Turtle, TriG and N3 parsers and serializers
"""
||||
edition = "2021" |
||||
rust-version = "1.65" |
||||
|
||||
[features] |
||||
default = [] |
||||
rdf-star = ["oxrdf/rdf-star"] |
||||
|
||||
[dependencies] |
||||
memchr = "2" |
||||
oxrdf = { version = "0.2.0-alpha.1-dev", path = "../oxrdf" } |
||||
oxiri = "0.2" |
||||
oxilangtag = "0.1" |
||||
|
||||
[package.metadata.docs.rs] |
||||
all-features = true |
@ -0,0 +1,938 @@ |
||||
use crate::toolkit::{TokenRecognizer, TokenRecognizerError}; |
||||
use memchr::{memchr, memchr2}; |
||||
use oxilangtag::LanguageTag; |
||||
use oxiri::Iri; |
||||
use oxrdf::NamedNode; |
||||
use std::borrow::Cow; |
||||
use std::collections::HashMap; |
||||
use std::ops::{Range, RangeInclusive}; |
||||
use std::str; |
||||
|
||||
/// A token produced by the `N3Lexer`.
///
/// Tokens borrow from the input buffer (`&'a str` / `Cow<'a, str>`) when
/// possible and own their data only when unescaping or IRI resolution forced
/// an allocation.
#[derive(Debug, PartialEq, Eq)]
pub enum N3Token<'a> {
    /// An IRI reference, already resolved against the base IRI and validated.
    IriRef(Iri<String>),
    /// A prefixed name like `ex:foo`, not yet resolved against the prefix table.
    PrefixedName {
        prefix: &'a str,
        local: Cow<'a, str>,
        /// True when the local part contains characters or escapes that force
        /// the expanded IRI to be re-validated (see `resolve_local_name`).
        might_be_invalid_iri: bool,
    },
    /// An N3 quick variable name (the part after `?`).
    Variable(Cow<'a, str>),
    /// A blank node label (the part after `_:`).
    BlankNodeLabel(&'a str),
    /// A string literal with escape sequences already decoded.
    String(String),
    /// The raw lexical form of an integer literal.
    Integer(&'a str),
    /// The raw lexical form of a decimal literal.
    Decimal(&'a str),
    /// The raw lexical form of a double literal.
    Double(&'a str),
    /// A validated language tag (the part after `@`).
    LangTag(&'a str),
    /// Punctuation such as `.`, `;`, `<<` or `{|`.
    Punctuation(&'a str),
    /// A bare keyword, i.e. a name that is not followed by `:`.
    PlainKeyword(&'a str),
}
||||
|
||||
/// The syntax dialect the lexer should accept.
///
/// The mode toggles small differences between the Turtle-family languages,
/// e.g. long strings are rejected in N-Triples and `<=`/`<-` punctuation is
/// only recognized in N3.
//
// `Debug`, `Clone` and `Copy` are derived in addition to the original
// `Eq`/`PartialEq`: the enum is a fieldless mode flag that is compared by
// value (`self.mode == N3LexerMode::N3`), so copyability and debuggability
// are free, backward-compatible additions.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum N3LexerMode {
    NTriples,
    Turtle,
    N3,
}
||||
|
||||
/// Configuration for the `N3Lexer`.
#[derive(Default)]
pub struct N3LexerOptions {
    /// Base IRI used to resolve relative IRI references;
    /// when `None`, IRIs are parsed as absolute (see `N3Lexer::parse_iri`).
    pub base_iri: Option<Iri<String>>,
}
||||
|
||||
/// Tokenizer shared by the N-Triples, Turtle and N3 parsers.
pub struct N3Lexer {
    // Dialect to accept; e.g. long strings are disabled in NTriples mode and
    // `<=`/`<-` punctuation is only recognized in N3 mode.
    mode: N3LexerMode,
}
||||
|
||||
// TODO: there are a lot of 'None' (missing data) returned even if the stream is ending!!!
|
||||
// TODO: simplify by not giving is_end and fail with an "unexpected eof" is none is returned when is_end=true?
|
||||
|
||||
impl TokenRecognizer for N3Lexer {
    type Token<'a> = N3Token<'a>;
    type Options = N3LexerOptions;

    /// Tries to recognize the token at the start of `data`.
    ///
    /// Returns `None` when more input is required to decide, and
    /// `Some((consumed_bytes, token_or_error))` otherwise. `is_ending` signals
    /// that no more data will arrive, allowing tokens that would otherwise
    /// wait for one byte of lookahead (e.g. a final `.`) to be emitted.
    fn recognize_next_token<'a>(
        &mut self,
        data: &'a [u8],
        is_ending: bool,
        options: &N3LexerOptions,
    ) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> {
        // Dispatch on the first byte; multi-byte punctuation needs lookahead.
        match *data.first()? {
            b'<' => match *data.get(1)? {
                b'<' => Some((2, Ok(N3Token::Punctuation("<<")))),
                // In N3, '<=' may also start an IRI like '<=a>': try the IRI
                // first and fall back to the punctuation token on failure.
                b'=' if self.mode == N3LexerMode::N3 => {
                    if let Some((consumed, result)) = Self::recognize_iri(data, options) {
                        Some(if let Ok(result) = result {
                            (consumed, Ok(result))
                        } else {
                            (2, Ok(N3Token::Punctuation("<=")))
                        })
                    } else if is_ending {
                        Some((2, Ok(N3Token::Punctuation("<="))))
                    } else {
                        None
                    }
                }
                // Same ambiguity for '<-' in N3.
                b'-' if self.mode == N3LexerMode::N3 => {
                    if let Some((consumed, result)) = Self::recognize_iri(data, options) {
                        Some(if let Ok(result) = result {
                            (consumed, Ok(result))
                        } else {
                            (2, Ok(N3Token::Punctuation("<-")))
                        })
                    } else if is_ending {
                        Some((2, Ok(N3Token::Punctuation("<-"))))
                    } else {
                        None
                    }
                }
                _ => Self::recognize_iri(data, options),
            },
            b'>' => {
                if *data.get(1)? == b'>' {
                    Some((2, Ok(N3Token::Punctuation(">>"))))
                } else {
                    Some((1, Ok(N3Token::Punctuation(">"))))
                }
            }
            b'_' => match data.get(1)? {
                b':' => Self::recognize_blank_node_label(data),
                c => Some((
                    1,
                    Err((0, format!("Unexpected character '{}'", char::from(*c))).into()),
                )),
            },
            b'"' => {
                // Triple-quoted (long) strings exist in Turtle/N3 but not N-Triples.
                if self.mode != N3LexerMode::NTriples
                    && *data.get(1)? == b'"'
                    && *data.get(2)? == b'"'
                {
                    Self::recognize_long_string(data, b'"')
                } else {
                    Self::recognize_string(data, b'"')
                }
            }
            // Single-quoted strings are not part of N-Triples at all.
            b'\'' if self.mode != N3LexerMode::NTriples => {
                if *data.get(1)? == b'\'' && *data.get(2)? == b'\'' {
                    Self::recognize_long_string(data, b'\'')
                } else {
                    Self::recognize_string(data, b'\'')
                }
            }
            b'@' => Self::recognize_lang_tag(data),
            b'.' => match data.get(1) {
                // '.5' starts a number, not the statement terminator.
                Some(b'0'..=b'9') => Self::recognize_number(data),
                Some(_) => Some((1, Ok(N3Token::Punctuation(".")))),
                // A lone final '.' is only a token once we know the input ended.
                None => is_ending.then_some((1, Ok(N3Token::Punctuation(".")))),
            },
            b'^' => {
                if *data.get(1)? == b'^' {
                    Some((2, Ok(N3Token::Punctuation("^^"))))
                } else {
                    Some((1, Ok(N3Token::Punctuation("^"))))
                }
            }
            b'(' => Some((1, Ok(N3Token::Punctuation("(")))),
            b')' => Some((1, Ok(N3Token::Punctuation(")")))),
            b'[' => Some((1, Ok(N3Token::Punctuation("[")))),
            b']' => Some((1, Ok(N3Token::Punctuation("]")))),
            b'{' => {
                if *data.get(1)? == b'|' {
                    Some((2, Ok(N3Token::Punctuation("{|"))))
                } else {
                    Some((1, Ok(N3Token::Punctuation("{"))))
                }
            }
            b'}' => Some((1, Ok(N3Token::Punctuation("}")))),
            b',' => Some((1, Ok(N3Token::Punctuation(",")))),
            b';' => Some((1, Ok(N3Token::Punctuation(";")))),
            b'!' => Some((1, Ok(N3Token::Punctuation("!")))),
            b'|' => {
                if *data.get(1)? == b'}' {
                    Some((2, Ok(N3Token::Punctuation("|}"))))
                } else {
                    Some((1, Ok(N3Token::Punctuation("|"))))
                }
            }
            b'=' => {
                if *data.get(1)? == b'>' {
                    Some((2, Ok(N3Token::Punctuation("=>"))))
                } else {
                    Some((1, Ok(N3Token::Punctuation("="))))
                }
            }
            b'0'..=b'9' | b'+' | b'-' => Self::recognize_number(data),
            b'?' => Self::recognize_variable(data, is_ending),
            // Anything else is a prefixed name or a bare keyword.
            _ => Self::recognize_pname_or_keyword(data, is_ending),
        }
    }
}
||||
|
||||
impl N3Lexer { |
||||
pub fn new(mode: N3LexerMode) -> Self { |
||||
Self { mode } |
||||
} |
||||
|
||||
/// Recognizes an IRI reference starting at `data[0] == b'<'`, decoding
/// `\u`/`\U` escapes into an owned buffer and then resolving/validating it.
///
/// Returns `None` when the closing `>` (or the rest of an escape) has not
/// arrived yet.
fn recognize_iri(
    data: &[u8],
    options: &N3LexerOptions,
) -> Option<(usize, Result<N3Token<'static>, TokenRecognizerError>)> {
    // [18] IRIREF ::= '<' ([^#x00-#x20<>"{}|^`\] | UCHAR)* '>' /* #x00=NULL #01-#x1F=control codes #x20=space */
    let mut string = Vec::new();
    let mut i = 1; // skip the opening '<'
    loop {
        // Jump to the next '>' or '\\'; None means we need more data.
        let end = memchr2(b'>', b'\\', &data[i..])?;
        string.extend_from_slice(&data[i..i + end]);
        i += end;
        match data[i] {
            b'>' => {
                return Some((i + 1, Self::parse_iri(string, 0..=i, options)));
            }
            b'\\' => {
                // Only numeric escapes (UCHAR) are allowed in IRIs: with_echar=false.
                let (additional, c) = Self::recognize_escape(&data[i..], i, false)?;
                i += additional + 1;
                match c {
                    Ok(c) => {
                        let mut buf = [0; 4];
                        string.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
                    }
                    Err(e) => return Some((i, Err(e))),
                }
            }
            // memchr2 only stops on '>' or '\\'.
            _ => unreachable!(),
        }
    }
}
||||
|
||||
/// Builds an `IriRef` token from the unescaped IRI bytes: checks UTF-8, then
/// resolves against the configured base IRI (or parses as absolute when no
/// base is set). `position` locates the IRI in the input for error reporting.
fn parse_iri(
    iri: Vec<u8>,
    position: RangeInclusive<usize>,
    options: &N3LexerOptions,
) -> Result<N3Token<'static>, TokenRecognizerError> {
    let iri = String::from_utf8(iri).map_err(|e| {
        (
            position.clone(),
            format!("The IRI contains invalid UTF-8 characters: {e}"),
        )
    })?;
    let iri = if let Some(base_iri) = options.base_iri.as_ref() {
        base_iri.resolve(&iri)
    } else {
        Iri::parse(iri)
    }
    .map_err(|e| (position, e.to_string()))?;
    Ok(N3Token::IriRef(iri))
}
||||
|
||||
/// Recognizes either a prefixed name (`prefix:local`) or, when no `:` follows
/// the leading name characters, a bare keyword (`PlainKeyword`).
///
/// Returns `None` when more input is needed to decide where the name ends.
fn recognize_pname_or_keyword(
    data: &[u8],
    is_ending: bool,
) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> {
    // [139s] PNAME_NS ::= PN_PREFIX? ':'
    // [140s] PNAME_LN ::= PNAME_NS PN_LOCAL

    // [167s] PN_PREFIX ::= PN_CHARS_BASE ((PN_CHARS | '.')* PN_CHARS)?
    // First scan the prefix part, stopping at ':' (prefixed name) or at the
    // first character not allowed in a prefix (bare keyword).
    let mut i = 0;
    loop {
        if let Some(r) = Self::recognize_unicode_char(&data[i..], i) {
            match r {
                Ok((c, consumed)) => {
                    if c == ':' {
                        i += consumed;
                        break; // prefixed name: continue with the local part below
                    } else if i == 0 {
                        if !Self::is_possible_pn_chars_base(c) {
                            return Some((
                                consumed,
                                Err((
                                    0..consumed,
                                    format!(
                                        "'{c}' is not allowed at the beginning of a prefix name"
                                    ),
                                )
                                    .into()),
                            ));
                        }
                        i += consumed;
                    } else if Self::is_possible_pn_chars(c) || c == '.' {
                        i += consumed;
                    } else {
                        // Keyword ended: trailing dots belong to the next token.
                        while data[..i].ends_with(b".") {
                            i -= 1;
                        }
                        return Some((
                            i,
                            Ok(N3Token::PlainKeyword(str::from_utf8(&data[..i]).unwrap())),
                        ));
                    }
                }
                Err(e) => return Some((e.position.end, Err(e))),
            }
        } else if is_ending {
            // Input ended inside the name: emit what we have as a keyword.
            while data[..i].ends_with(b".") {
                i -= 1;
            }
            return Some(if i == 0 {
                (
                    1,
                    Err((0..1, format!("Unexpected byte {}", data[0])).into()),
                )
            } else {
                (
                    i,
                    Ok(N3Token::PlainKeyword(str::from_utf8(&data[..i]).unwrap())),
                )
            });
        } else {
            return None; // need more data
        }
    }
    // `i` now points just after the ':'; everything before it is the prefix.
    let pn_prefix = str::from_utf8(&data[..i - 1]).unwrap();
    if pn_prefix.ends_with('.') {
        return Some((
            i,
            Err((
                0..i,
                format!(
                    "'{pn_prefix}' is not a valid prefix: prefixes are not allowed to end with '.'"),
            )
                .into()),
        ));
    }

    // The local part is optional (`ex:` alone is valid).
    let (consumed, pn_local_result) = Self::recognize_optional_pn_local(&data[i..], is_ending)?;
    Some((
        consumed + i,
        pn_local_result.map(|(local, might_be_invalid_iri)| N3Token::PrefixedName {
            prefix: pn_prefix,
            local,
            might_be_invalid_iri,
        }),
    ))
}
||||
|
||||
/// Recognizes an N3 quick variable starting at `data[0] == b'?'`.
/// Unlike a prefixed-name local part, the variable name must be non-empty.
fn recognize_variable(
    data: &[u8],
    is_ending: bool,
) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> {
    // [36] QUICK_VAR_NAME ::= "?" PN_LOCAL
    let (consumed, result) = Self::recognize_optional_pn_local(&data[1..], is_ending)?;
    Some((
        consumed + 1,
        result.and_then(|(name, _)| {
            if name.is_empty() {
                Err((0..consumed, "A variable name is not allowed to be empty").into())
            } else {
                Ok(N3Token::Variable(name))
            }
        }),
    ))
}
||||
|
||||
/// Recognizes a (possibly empty) PN_LOCAL at the start of `data`.
///
/// Returns `(consumed, Ok((local, might_be_invalid_iri)))`. The local part is
/// borrowed from `data` unless `\`-escapes forced unescaping into an owned
/// buffer. `might_be_invalid_iri` is set when the local part contains
/// characters (`:`, escaped `/?#@%`, some Unicode specials) that can make the
/// expanded IRI invalid, so callers must re-validate it.
fn recognize_optional_pn_local(
    data: &[u8],
    is_ending: bool,
) -> Option<(usize, Result<(Cow<'_, str>, bool), TokenRecognizerError>)> {
    // [168s] PN_LOCAL ::= (PN_CHARS_U | ':' | [0-9] | PLX) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX))?
    let mut i = 0;
    let mut buffer = None; // Buffer if there are some escaped characters
    // Index up to which `data` has already been copied into `buffer`.
    let mut position_that_is_already_in_buffer = 0;
    let mut might_be_invalid_iri = false;
    loop {
        if let Some(r) = Self::recognize_unicode_char(&data[i..], i) {
            match r {
                Ok((c, consumed)) => {
                    if c == '%' {
                        // PLX percent escape: must be followed by two hex digits.
                        i += 1;
                        let a = char::from(*data.get(i)?);
                        i += 1;
                        let b = char::from(*data.get(i)?);
                        if !matches!(a, '0'..='9' | 'A'..='F' | 'a'..='f')
                            || !matches!(b, '0'..='9' | 'A'..='F' | 'a'..='f')
                        {
                            return Some((i + 1, Err((
                                i - 2..=i, format!("escapes in IRIs should be % followed by two hexadecimal characters, found '%{a}{b}'")
                            ).into())));
                        }
                        i += 1;
                    } else if c == '\\' {
                        // PN_LOCAL_ESC backslash escape: the escaped character
                        // is kept unescaped in the output, so switch to `buffer`.
                        i += 1;
                        let a = char::from(*data.get(i)?);
                        if matches!(
                            a,
                            '_' | '~'
                                | '.'
                                | '-'
                                | '!'
                                | '$'
                                | '&'
                                | '\''
                                | '('
                                | ')'
                                | '*'
                                | '+'
                                | ','
                                | ';'
                                | '='
                        ) {
                            // ok to escape
                        } else if matches!(a, '/' | '?' | '#' | '@' | '%') {
                            // ok to escape but requires IRI validation
                            might_be_invalid_iri = true;
                        } else {
                            return Some((i + 1, Err((
                                i..=i, format!("The character that are allowed to be escaped in IRIs are _~.-!$&'()*+,;=/?#@%, found '{a}'")
                            ).into())));
                        }
                        let buffer = buffer.get_or_insert_with(String::new);
                        // We add the missing bytes
                        if i - position_that_is_already_in_buffer > 1 {
                            buffer.push_str(
                                str::from_utf8(
                                    &data[position_that_is_already_in_buffer..i - 1],
                                )
                                .unwrap(),
                            )
                        }
                        buffer.push(a);
                        i += 1;
                        position_that_is_already_in_buffer = i;
                    } else if i == 0 {
                        // First character: an empty local part is valid, so a
                        // non-starter means "no local part" rather than an error.
                        if !(Self::is_possible_pn_chars_u(c) || c == ':' || c.is_ascii_digit())
                        {
                            return Some((0, Ok((Cow::Borrowed(""), false))));
                        }
                        might_be_invalid_iri |=
                            Self::is_possible_pn_chars_base_but_not_valid_iri(c) || c == ':';
                        i += consumed;
                    } else if Self::is_possible_pn_chars(c) || c == ':' || c == '.' {
                        might_be_invalid_iri |=
                            Self::is_possible_pn_chars_base_but_not_valid_iri(c) || c == ':';
                        i += consumed;
                    } else {
                        // End of the local part: flush and trim trailing dots,
                        // which belong to the next token.
                        let buffer = if let Some(mut buffer) = buffer {
                            buffer.push_str(
                                str::from_utf8(&data[position_that_is_already_in_buffer..i])
                                    .unwrap(),
                            );
                            // We do not include the last dot
                            while buffer.ends_with('.') {
                                buffer.pop();
                                i -= 1;
                            }
                            Cow::Owned(buffer)
                        } else {
                            let mut data = str::from_utf8(&data[..i]).unwrap();
                            // We do not include the last dot
                            while let Some(d) = data.strip_suffix('.') {
                                data = d;
                                i -= 1;
                            }
                            Cow::Borrowed(data)
                        };
                        return Some((i, Ok((buffer, might_be_invalid_iri))));
                    }
                }
                Err(e) => return Some((e.position.end, Err(e))),
            }
        } else if is_ending {
            // Input ended inside the local part: emit what we have.
            // NOTE(review): unlike the mid-input path above, this branch does
            // not flush data[position_that_is_already_in_buffer..i] into
            // `buffer` before returning — bytes after the last escape look
            // dropped at end of input; confirm against upstream.
            let buffer = if let Some(mut buffer) = buffer {
                // We do not include the last dot
                while buffer.ends_with('.') {
                    buffer.pop();
                    i -= 1;
                }
                Cow::Owned(buffer)
            } else {
                let mut data = str::from_utf8(&data[..i]).unwrap();
                // We do not include the last dot
                while let Some(d) = data.strip_suffix('.') {
                    data = d;
                    i -= 1;
                }
                Cow::Borrowed(data)
            };
            return Some((i, Ok((buffer, might_be_invalid_iri))));
        } else {
            return None; // need more data
        }
    }
}
||||
|
||||
/// Recognizes a blank node label; the caller has already checked that `data`
/// starts with `_:`. The returned token borrows the label without the `_:`.
fn recognize_blank_node_label(
    data: &[u8],
) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> {
    // [141s] BLANK_NODE_LABEL ::= '_:' (PN_CHARS_U | [0-9]) ((PN_CHARS | '.')* PN_CHARS)?
    let mut i = 2; // skip '_:'
    loop {
        match Self::recognize_unicode_char(&data[i..], i)? {
            Ok((c, consumed)) => {
                if (i == 2 && (Self::is_possible_pn_chars_u(c) || c.is_ascii_digit()))
                    || (i > 2 && Self::is_possible_pn_chars(c))
                {
                    // Ok
                } else if i > 2 && c == '.' {
                    // Dots are allowed inside a label but not doubled:
                    // on '..' end the label before the first dot.
                    if data[i - 1] == b'.' {
                        i -= 1;
                        return Some((
                            i,
                            Ok(N3Token::BlankNodeLabel(
                                str::from_utf8(&data[2..i]).unwrap(),
                            )),
                        ));
                    }
                    // Otherwise keep the dot tentatively (fall through to i += consumed).
                } else if i == 0 {
                    // NOTE(review): unreachable — `i` starts at 2 and is only
                    // ever decremented by 1 right before returning, so this
                    // "empty label" error can never fire. Confirm intent.
                    return Some((
                        i,
                        Err((0..i, "A blank node ID should not be empty").into()),
                    ));
                } else if data[i - 1] == b'.' {
                    // The label cannot end with '.': leave the dot for the next token.
                    i -= 1;
                    return Some((
                        i,
                        Ok(N3Token::BlankNodeLabel(
                            str::from_utf8(&data[2..i]).unwrap(),
                        )),
                    ));
                } else {
                    return Some((
                        i,
                        Ok(N3Token::BlankNodeLabel(
                            str::from_utf8(&data[2..i]).unwrap(),
                        )),
                    ));
                }
                i += consumed;
            }
            Err(e) => return Some((e.position.end, Err(e))),
        }
    }
}
||||
|
||||
/// Recognizes a language tag starting at `data[0] == b'@'`.
/// Returns `None` when the tag may continue past the end of the buffer.
fn recognize_lang_tag(
    data: &[u8],
) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> {
    // [144s] LANGTAG ::= '@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)*
    // True while no letter has been seen since the start or the last '-'.
    let mut is_last_block_empty = true;
    for (i, c) in data[1..].iter().enumerate() {
        if c.is_ascii_alphabetic() {
            is_last_block_empty = false;
        } else if i == 0 {
            return Some((
                1,
                Err((1..2, "A language code should always start with a letter").into()),
            ));
        } else if is_last_block_empty {
            // The tag ended on a dangling '-': exclude it from the tag.
            return Some((i, Self::parse_lang_tag(&data[1..i], 1..i - 1)));
        } else if *c == b'-' {
            is_last_block_empty = true;
        } else {
            // Any other byte terminates the tag (it is not consumed).
            return Some((i + 1, Self::parse_lang_tag(&data[1..=i], 1..i)));
        }
    }
    None // need more data to know where the tag ends
}
||||
|
||||
/// Validates the candidate tag bytes with `oxilangtag` and wraps them in a
/// `LangTag` token. `position` locates the tag in the input for errors.
fn parse_lang_tag(
    lang_tag: &[u8],
    position: Range<usize>,
) -> Result<N3Token<'_>, TokenRecognizerError> {
    Ok(N3Token::LangTag(
        // from_utf8 cannot fail: callers only pass ASCII letters, digits and '-'.
        LanguageTag::parse(str::from_utf8(lang_tag).unwrap())
            .map_err(|e| (position.clone(), e.to_string()))?
            .into_inner(),
    ))
}
||||
|
||||
/// Recognizes a single-line string literal delimited by `delimiter` (`"` or
/// `'`), decoding escapes into an owned `String`. Returns `None` when the
/// closing delimiter has not arrived yet.
fn recognize_string(
    data: &[u8],
    delimiter: u8,
) -> Option<(usize, Result<N3Token<'static>, TokenRecognizerError>)> {
    // [22] STRING_LITERAL_QUOTE ::= '"' ([^#x22#x5C#xA#xD] | ECHAR | UCHAR)* '"' /* #x22=" #x5C=\ #xA=new line #xD=carriage return */
    // [23] STRING_LITERAL_SINGLE_QUOTE ::= "'" ([^#x27#x5C#xA#xD] | ECHAR | UCHAR)* "'" /* #x27=' #x5C=\ #xA=new line #xD=carriage return */
    let mut string = String::new();
    let mut i = 1; // skip the opening delimiter
    loop {
        // Jump to the next delimiter or backslash; None = need more data.
        let end = memchr2(delimiter, b'\\', &data[i..])?;
        match str::from_utf8(&data[i..i + end]) {
            Ok(a) => string.push_str(a),
            Err(e) => {
                // NOTE(review): the consumed count here is `end`, not `i + end`,
                // unlike the other error paths — confirm this is intentional.
                return Some((
                    end,
                    Err((
                        i..i + end,
                        format!("The string contains invalid UTF-8 characters: {e}"),
                    )
                        .into()),
                ))
            }
        };
        i += end;
        match data[i] {
            c if c == delimiter => {
                return Some((i + 1, Ok(N3Token::String(string))));
            }
            b'\\' => {
                let (additional, c) = Self::recognize_escape(&data[i..], i, true)?;
                i += additional + 1;
                match c {
                    Ok(c) => {
                        string.push(c);
                    }
                    Err(e) => {
                        // We read until the end of string char
                        let end = memchr(delimiter, &data[i..])?;
                        return Some((i + end + 1, Err(e)));
                    }
                }
            }
            // memchr2 only stops on the delimiter or '\\'.
            _ => unreachable!(),
        }
    }
}
||||
|
||||
/// Recognizes a triple-quoted (long) string literal delimited by three
/// `delimiter` bytes; single and double delimiters may appear inside.
/// Returns `None` when the closing triple delimiter has not arrived yet.
fn recognize_long_string(
    data: &[u8],
    delimiter: u8,
) -> Option<(usize, Result<N3Token<'static>, TokenRecognizerError>)> {
    // [24] STRING_LITERAL_LONG_SINGLE_QUOTE ::= "'''" (("'" | "''")? ([^'\] | ECHAR | UCHAR))* "'''"
    // [25] STRING_LITERAL_LONG_QUOTE ::= '"""' (('"' | '""')? ([^"\] | ECHAR | UCHAR))* '"""'
    let mut string = String::new();
    let mut i = 3; // skip the opening triple delimiter
    loop {
        let end = memchr2(delimiter, b'\\', &data[i..])?;
        match str::from_utf8(&data[i..i + end]) {
            Ok(a) => string.push_str(a),
            Err(e) => {
                // NOTE(review): consumed count is `end`, not `i + end` —
                // same apparent inconsistency as in recognize_string.
                return Some((
                    end,
                    Err((
                        i..i + end,
                        format!("The string contains invalid UTF-8 characters: {e}"),
                    )
                        .into()),
                ))
            }
        };
        i += end;
        match data[i] {
            c if c == delimiter => {
                // Only three delimiters in a row terminate the string;
                // a lone delimiter is literal content.
                if *data.get(i + 1)? == delimiter && *data.get(i + 2)? == delimiter {
                    return Some((i + 3, Ok(N3Token::String(string))));
                }
                i += 1;
                string.push(char::from(delimiter));
            }
            b'\\' => {
                let (additional, c) = Self::recognize_escape(&data[i..], i, true)?;
                i += additional + 1;
                match c {
                    Ok(c) => {
                        string.push(c);
                    }
                    Err(e) => return Some((i, Err(e))),
                }
            }
            _ => unreachable!(),
        }
    }
}
||||
|
||||
/// Recognizes an integer, decimal or double literal and returns its raw
/// lexical form. Returns `None` when the number may continue past the end of
/// the buffer (every lookahead uses `data.get(i)?`).
fn recognize_number(data: &[u8]) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> {
    // [19] INTEGER ::= [+-]? [0-9]+
    // [20] DECIMAL ::= [+-]? [0-9]* '.' [0-9]+
    // [21] DOUBLE ::= [+-]? ([0-9]+ '.' [0-9]* EXPONENT | '.' [0-9]+ EXPONENT | [0-9]+ EXPONENT)
    // [154s] EXPONENT ::= [eE] [+-]? [0-9]+
    let mut i = 0;
    let c = *data.first()?;
    if matches!(c, b'+' | b'-') {
        i += 1;
    }
    // We read the digits before .
    let mut count_before: usize = 0;
    loop {
        let c = *data.get(i)?;
        if c.is_ascii_digit() {
            i += 1;
            count_before += 1;
        } else {
            break;
        }
    }

    // We read the digits after .
    // `count_after` is Some iff a '.' was consumed (possibly with 0 digits).
    #[allow(clippy::if_then_some_else_none)]
    let count_after = if *data.get(i)? == b'.' {
        i += 1;

        let mut count_after = 0;
        loop {
            let c = *data.get(i)?;
            if c.is_ascii_digit() {
                i += 1;
                count_after += 1;
            } else {
                break;
            }
        }
        Some(count_after)
    } else {
        None
    };

    // End
    let c = *data.get(i)?;
    if matches!(c, b'e' | b'E') {
        // An exponent makes this a double.
        i += 1;

        let c = *data.get(i)?;
        if matches!(c, b'+' | b'-') {
            i += 1;
        }

        let mut found = false;
        loop {
            let c = *data.get(i)?;
            if c.is_ascii_digit() {
                i += 1;
                found = true;
            } else {
                break;
            }
        }
        Some((
            i,
            if !found {
                Err((0..i, "A double exponent cannot be empty").into())
            } else if count_before == 0 && count_after.unwrap_or(0) == 0 {
                Err((0..i, "A double should not be empty").into())
            } else {
                Ok(N3Token::Double(str::from_utf8(&data[..i]).unwrap()))
            },
        ))
    } else if let Some(count_after) = count_after {
        if count_after == 0 {
            // '12.' with no following digits: the '.' is the statement
            // terminator, not part of the number.
            // We do not consume the '.' after all
            i -= 1;
            Some((
                i,
                if count_before == 0 {
                    Err((0..i, "An integer should not be empty").into())
                } else {
                    Ok(N3Token::Integer(str::from_utf8(&data[..i]).unwrap()))
                },
            ))
        } else {
            Some((i, Ok(N3Token::Decimal(str::from_utf8(&data[..i]).unwrap()))))
        }
    } else {
        Some((
            i,
            if count_before == 0 {
                Err((0..i, "An integer should not be empty").into())
            } else {
                Ok(N3Token::Integer(str::from_utf8(&data[..i]).unwrap()))
            },
        ))
    }
}
||||
|
||||
fn recognize_escape( |
||||
data: &[u8], |
||||
position: usize, |
||||
with_echar: bool, |
||||
) -> Option<(usize, Result<char, TokenRecognizerError>)> { |
||||
// [26] UCHAR ::= '\u' HEX HEX HEX HEX | '\U' HEX HEX HEX HEX HEX HEX HEX HEX
|
||||
// [159s] ECHAR ::= '\' [tbnrf"'\]
|
||||
match *data.get(1)? { |
||||
b'u' => match Self::recognize_hex_char(&data[2..], 4, 'u', position) { |
||||
Ok(c) => Some((5, Ok(c?))), |
||||
Err(e) => Some((5, Err(e))), |
||||
}, |
||||
b'U' => match Self::recognize_hex_char(&data[2..], 8, 'u', position) { |
||||
Ok(c) => Some((9, Ok(c?))), |
||||
Err(e) => Some((9, Err(e))), |
||||
}, |
||||
b't' if with_echar => Some((1, Ok('\t'))), |
||||
b'b' if with_echar => Some((1, Ok('\x08'))), |
||||
b'n' if with_echar => Some((1, Ok('\n'))), |
||||
b'r' if with_echar => Some((1, Ok('\r'))), |
||||
b'f' if with_echar => Some((1, Ok('\x0C'))), |
||||
b'"' if with_echar => Some((1, Ok('"'))), |
||||
b'\'' if with_echar => Some((1, Ok('\''))), |
||||
b'\\' if with_echar => Some((1, Ok('\\'))), |
||||
c => Some(( |
||||
1, |
||||
Err(( |
||||
position..position + 2, |
||||
format!("Unexpected escape character '\\{}'", char::from(c)), |
||||
) |
||||
.into()), |
||||
)), //TODO: read until end of string
|
||||
} |
||||
} |
||||
|
||||
/// Decodes `len` hexadecimal digits from the start of `data` into a Unicode
/// scalar value.
///
/// Returns `Ok(None)` when fewer than `len` bytes are available yet (the
/// caller should wait for more input). `escape_char` and `position` are only
/// used to build error messages.
fn recognize_hex_char(
    data: &[u8],
    len: usize,
    escape_char: char,
    position: usize,
) -> Result<Option<char>, TokenRecognizerError> {
    if data.len() < len {
        return Ok(None);
    }
    let val = str::from_utf8(&data[..len]).map_err(|e| {
        (
            // +2 accounts for the leading '\' and the escape letter.
            position..position + len + 2,
            format!("The escape sequence contains invalid UTF-8 characters: {e}"),
        )
    })?;
    let codepoint = u32::from_str_radix(val, 16).map_err(|e| {
        (
            position..position + len + 2,
            format!(
                "The escape sequence '\\{escape_char}{val}' is not a valid hexadecimal string: {e}"
            ),
        )
    })?;
    // from_u32 rejects surrogates and out-of-range codepoints.
    let c = char::from_u32(codepoint).ok_or_else(|| {
        (
            position..position + len + 2,
            format!(
                "The escape sequence '\\{escape_char}{val}' is encoding {codepoint:X} that is not a valid unicode character",
            ),
        )
    })?;
    Ok(Some(c))
}
||||
|
||||
/// Decodes the UTF-8 encoded character at the start of `data`.
///
/// Returns `None` when the buffer ends in the middle of a multi-byte
/// sequence, `Some(Ok((char, bytes_consumed)))` on success, and
/// `Some(Err(_))` on invalid UTF-8. The per-byte boundaries follow the
/// standard UTF-8 well-formedness table: the special first-continuation
/// ranges for 0xE0/0xED/0xF0/0xF4 reject overlong encodings, surrogates and
/// codepoints above U+10FFFF.
fn recognize_unicode_char(
    data: &[u8],
    position: usize,
) -> Option<Result<(char, usize), TokenRecognizerError>> {
    let mut code_point: u32;
    let bytes_needed: usize;
    // Valid range for the next continuation byte; tightened for the
    // special lead bytes below, reset to the generic 0x80..=0xBF after.
    let mut lower_boundary = 0x80;
    let mut upper_boundary = 0xBF;

    let byte = *data.first()?;
    match byte {
        // ASCII fast path.
        0x00..=0x7F => return Some(Ok((char::from(byte), 1))),
        0xC2..=0xDF => {
            bytes_needed = 1;
            code_point = u32::from(byte) & 0x1F;
        }
        0xE0..=0xEF => {
            if byte == 0xE0 {
                lower_boundary = 0xA0; // reject overlong 3-byte forms
            }
            if byte == 0xED {
                upper_boundary = 0x9F; // reject UTF-16 surrogates
            }
            bytes_needed = 2;
            code_point = u32::from(byte) & 0xF;
        }
        0xF0..=0xF4 => {
            if byte == 0xF0 {
                lower_boundary = 0x90; // reject overlong 4-byte forms
            }
            if byte == 0xF4 {
                upper_boundary = 0x8F; // reject codepoints > U+10FFFF
            }
            bytes_needed = 3;
            code_point = u32::from(byte) & 0x7;
        }
        // 0x80..=0xC1 and 0xF5..=0xFF can never start a character.
        _ => {
            return Some(Err((
                position..=position,
                "Invalid UTF-8 character encoding",
            )
                .into()))
        }
    }

    for i in 1..=bytes_needed {
        let byte = *data.get(i)?;
        if byte < lower_boundary || upper_boundary < byte {
            return Some(Err((
                position..=position + i,
                "Invalid UTF-8 character encoding",
            )
                .into()));
        }
        lower_boundary = 0x80;
        upper_boundary = 0xBF;
        code_point = (code_point << 6) | (u32::from(byte) & 0x3F);
    }

    Some(
        char::from_u32(code_point)
            .map(|c| (c, bytes_needed + 1))
            .ok_or_else(|| {
                (
                    position..=position + bytes_needed,
                    format!("The codepoint {code_point:X} is not a valid unicode character"),
                )
                    .into()
            }),
    )
}
||||
|
||||
// [157s] PN_CHARS_BASE ::= [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
/// Tests whether `c` may appear in a PN_CHARS_BASE position.
fn is_possible_pn_chars_base(c: char) -> bool {
    // ASCII letters first (the common case), then the Unicode ranges
    // straight from the grammar production above.
    c.is_ascii_alphabetic()
        || matches!(c,
            '\u{00C0}'..='\u{00D6}'
            | '\u{00D8}'..='\u{00F6}'
            | '\u{00F8}'..='\u{02FF}'
            | '\u{0370}'..='\u{037D}'
            | '\u{037F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}')
}
||||
|
||||
// [158s] PN_CHARS_U ::= PN_CHARS_BASE | '_' | ':'
|
||||
fn is_possible_pn_chars_u(c: char) -> bool { |
||||
Self::is_possible_pn_chars_base(c) || c == '_' |
||||
} |
||||
|
||||
// [160s] PN_CHARS ::= PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]
|
||||
fn is_possible_pn_chars(c: char) -> bool { |
||||
Self::is_possible_pn_chars_u(c) |
||||
|| matches!(c, |
||||
'-' | '0'..='9' | '\u{00B7}' | '\u{0300}'..='\u{036F}' | '\u{203F}'..='\u{2040}') |
||||
} |
||||
|
||||
/// Tests whether `c` is accepted by PN_CHARS_BASE but is nevertheless not
/// valid inside an IRI, so any IRI built from it must be re-validated.
///
/// These are the Unicode "Specials" range U+FFF0..=U+FFFD plus the
/// noncharacters U+nFFFE / U+nFFFF of every plane.
//
// Fix: the previous check used modulo (`u32::from(c) % 0xFFFE == 0`), which
// only matches U+FFFE/U+FFFF themselves (0x1FFFE % 0xFFFE == 2, etc.) and
// spuriously matched U+0000; noncharacters always have their low 16 bits
// equal to 0xFFFE or 0xFFFF, so test those bits instead.
fn is_possible_pn_chars_base_but_not_valid_iri(c: char) -> bool {
    matches!(c, '\u{FFF0}'..='\u{FFFD}')
        || matches!(u32::from(c) & 0xFFFF, 0xFFFE | 0xFFFF)
}
||||
} |
||||
|
||||
/// Expands a prefixed name (`prefix:local`) against the declared `prefixes`
/// into a `NamedNode`.
///
/// Concatenating a valid prefix IRI with a PN_LOCAL usually yields a valid
/// IRI, so the expensive re-validation is skipped unless the lexer flagged
/// `might_be_invalid_iri` or the prefix IRI has an empty path (in which case
/// the local part could be parsed as the IRI authority).
///
/// Returns a human-readable error when the prefix is undeclared or the built
/// IRI is invalid.
pub fn resolve_local_name(
    prefix: &str,
    local: &str,
    might_be_invalid_iri: bool,
    prefixes: &HashMap<String, Iri<String>>,
) -> Result<NamedNode, String> {
    if let Some(start) = prefixes.get(prefix) {
        let iri = format!("{start}{local}");
        if might_be_invalid_iri || start.path().is_empty() {
            // We validate again. We always validate if the local part might be the IRI authority.
            if let Err(e) = Iri::parse(iri.as_str()) {
                return Err(format!(
                    "The prefixed name {prefix}:{local} builds IRI {iri} that is invalid: {e}"
                ));
            }
        }
        Ok(NamedNode::new_unchecked(iri))
    } else {
        Err(format!("The prefix {prefix}: has not been declared"))
    }
}
@ -0,0 +1,19 @@ |
||||
mod lexer; |
||||
mod line_formats; |
||||
pub mod n3; |
||||
pub mod nquads; |
||||
pub mod ntriples; |
||||
mod terse; |
||||
mod toolkit; |
||||
pub mod trig; |
||||
pub mod turtle; |
||||
|
||||
pub use crate::n3::N3Parser; |
||||
pub use crate::nquads::{NQuadsParser, NQuadsSerializer}; |
||||
pub use crate::ntriples::{NTriplesParser, NTriplesSerializer}; |
||||
pub use crate::toolkit::{ParseError, ParseOrIoError}; |
||||
pub use crate::trig::{TriGParser, TriGSerializer}; |
||||
pub use crate::turtle::{TurtleParser, TurtleSerializer}; |
||||
|
||||
/// Initial/lower bound passed to the lexer buffer (see `NQuadsRecognizer::new_parser`).
pub(crate) const MIN_BUFFER_SIZE: usize = 4096;
/// Upper bound (16 MiB) for the lexer buffer — presumably caps the size of a
/// single token; TODO confirm the overflow behavior in `toolkit::Lexer`.
pub(crate) const MAX_BUFFER_SIZE: usize = 4096 * 4096;
@ -0,0 +1,305 @@ |
||||
//! Shared parser implementation for N-Triples and N-Quads.
|
||||
|
||||
use crate::lexer::{N3Lexer, N3LexerMode, N3LexerOptions, N3Token}; |
||||
use crate::toolkit::{Lexer, Parser, RuleRecognizer, RuleRecognizerError}; |
||||
use crate::{MAX_BUFFER_SIZE, MIN_BUFFER_SIZE}; |
||||
#[cfg(feature = "rdf-star")] |
||||
use oxrdf::Triple; |
||||
use oxrdf::{BlankNode, GraphName, Literal, NamedNode, Quad, Subject, Term}; |
||||
|
||||
/// Rule recognizer shared by the N-Triples and N-Quads parsers.
pub struct NQuadsRecognizer {
    /// Pending grammar states; the top of the stack is what the next token
    /// must satisfy.
    stack: Vec<NQuadsState>,
    /// `true` for N-Quads (a graph name may follow the object),
    /// `false` for N-Triples.
    with_graph_name: bool,
    /// Accept RDF-star quoted triples (`<< ... >>`).
    #[cfg(feature = "rdf-star")]
    with_quoted_triples: bool,
    /// Lexer configuration handed back through `lexer_options()`.
    lexer_options: N3LexerOptions,
    // The three stacks below hold the partially built terms; they are stacks
    // (not single slots) because quoted triples nest.
    subjects: Vec<Subject>,
    predicates: Vec<NamedNode>,
    objects: Vec<Term>,
}
||||
|
||||
/// States pushed on `NQuadsRecognizer::stack`.
enum NQuadsState {
    ExpectSubject,
    ExpectPredicate,
    ExpectedObject,
    /// After an object: a graph name, the final `.`, or — when the stack is
    /// not empty — the closing `>>` of a quoted triple.
    ExpectPossibleGraphOrEndOfQuotedTriple,
    ExpectDot,
    /// A string literal with lexical form `value` was read; a language tag,
    /// a `^^` datatype, a graph name or `.` may follow.
    ExpectLiteralAnnotationOrGraphNameOrDot {
        value: String,
    },
    /// After `^^`: expecting the datatype IRI of the literal `value`.
    ExpectLiteralDatatype {
        value: String,
    },
    /// The quoted triple just finished becomes the enclosing subject.
    #[cfg(feature = "rdf-star")]
    AfterQuotedSubject,
    /// The quoted triple just finished becomes the enclosing object.
    #[cfg(feature = "rdf-star")]
    AfterQuotedObject,
}
||||
|
||||
impl RuleRecognizer for NQuadsRecognizer {
    type TokenRecognizer = N3Lexer;
    type Output = Quad;

    /// Drops all in-flight state so that, after an error, parsing can resume
    /// cleanly at the next statement.
    fn error_recovery_state(mut self) -> Self {
        self.stack.clear();
        self.subjects.clear();
        self.predicates.clear();
        self.objects.clear();
        self
    }

    /// Consumes one token and advances the state machine.
    ///
    /// Recognized quads are appended to `results`, recoverable errors to
    /// `errors`. Takes and returns `self` by value because the recognizer is
    /// threaded through the shared `toolkit` parser.
    fn recognize_next(
        mut self,
        token: N3Token,
        results: &mut Vec<Quad>,
        errors: &mut Vec<RuleRecognizerError>,
    ) -> Self {
        if let Some(state) = self.stack.pop() {
            match state {
                NQuadsState::ExpectSubject => match token {
                    N3Token::IriRef(s) => {
                        self.subjects
                            .push(NamedNode::new_unchecked(s.into_inner()).into());
                        self.stack.push(NQuadsState::ExpectPredicate);
                        self
                    }
                    N3Token::BlankNodeLabel(s) => {
                        self.subjects.push(BlankNode::new_unchecked(s).into());
                        self.stack.push(NQuadsState::ExpectPredicate);
                        self
                    }
                    #[cfg(feature = "rdf-star")]
                    N3Token::Punctuation("<<") if self.with_quoted_triples => {
                        // Parse the inner triple first, then fold it into the
                        // enclosing subject (AfterQuotedSubject).
                        self.stack.push(NQuadsState::AfterQuotedSubject);
                        self.stack.push(NQuadsState::ExpectSubject);
                        self
                    }
                    token => self.error(
                        errors,
                        format!("The subject of a triple should be an IRI or a blank node, {token:?} found"),
                    ),
                },
                NQuadsState::ExpectPredicate => match token {
                    N3Token::IriRef(p) => {
                        self.predicates
                            .push(NamedNode::new_unchecked(p.into_inner()));
                        self.stack.push(NQuadsState::ExpectedObject);
                        self
                    }
                    token => self.error(
                        errors,
                        format!("The predicate of a triple should be an IRI, {token:?} found"),
                    ),
                },
                NQuadsState::ExpectedObject => match token {
                    N3Token::IriRef(o) => {
                        self.objects
                            .push(NamedNode::new_unchecked(o.into_inner()).into());
                        self.stack
                            .push(NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple);
                        self
                    }
                    N3Token::BlankNodeLabel(o) => {
                        self.objects.push(BlankNode::new_unchecked(o).into());
                        self.stack
                            .push(NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple);
                        self
                    }
                    N3Token::String(value) => {
                        // The literal is not complete yet: a language tag or
                        // datatype may still follow.
                        self.stack
                            .push(NQuadsState::ExpectLiteralAnnotationOrGraphNameOrDot { value });
                        self
                    }
                    #[cfg(feature = "rdf-star")]
                    N3Token::Punctuation("<<") if self.with_quoted_triples => {
                        self.stack.push(NQuadsState::AfterQuotedObject);
                        self.stack.push(NQuadsState::ExpectSubject);
                        self
                    }
                    token => self.error(
                        errors,
                        format!("The object of a triple should be an IRI, a blank node or a literal, {token:?} found"),
                    ),
                },
                NQuadsState::ExpectLiteralAnnotationOrGraphNameOrDot { value } => match token {
                    N3Token::LangTag(lang_tag) => {
                        self.objects.push(
                            Literal::new_language_tagged_literal_unchecked(
                                value,
                                lang_tag.to_ascii_lowercase(),
                            )
                            .into(),
                        );
                        self.stack
                            .push(NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple);
                        self
                    }
                    N3Token::Punctuation("^^") => {
                        self.stack
                            .push(NQuadsState::ExpectLiteralDatatype { value });
                        self
                    }
                    token => {
                        // No annotation: finalize a simple literal and
                        // re-dispatch the current token.
                        self.objects.push(Literal::new_simple_literal(value).into());
                        self.stack
                            .push(NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple);
                        self.recognize_next(token, results, errors)
                    }
                },
                NQuadsState::ExpectLiteralDatatype { value } => match token {
                    N3Token::IriRef(d) => {
                        self.objects.push(
                            Literal::new_typed_literal(
                                value,
                                NamedNode::new_unchecked(d.into_inner()),
                            )
                            .into(),
                        );
                        self.stack
                            .push(NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple);
                        self
                    }
                    token => self.error(errors, format!("A literal datatype must be an IRI, found {token:?}")),
                },
                NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple => {
                    // An empty stack means we are at the top level (not inside
                    // a quoted triple), so a graph name or dot may follow.
                    if self.stack.is_empty() {
                        match token {
                            N3Token::IriRef(g) if self.with_graph_name => {
                                self.emit_quad(
                                    results,
                                    NamedNode::new_unchecked(g.into_inner()).into(),
                                );
                                self.stack.push(NQuadsState::ExpectDot);
                                self
                            }
                            N3Token::BlankNodeLabel(g) if self.with_graph_name => {
                                self.emit_quad(results, BlankNode::new_unchecked(g).into());
                                self.stack.push(NQuadsState::ExpectDot);
                                self
                            }
                            token => {
                                // No graph name: emit into the default graph
                                // and re-dispatch the token (expected: '.').
                                self.emit_quad(results, GraphName::DefaultGraph);
                                self.stack.push(NQuadsState::ExpectDot);
                                self.recognize_next(token, results, errors)
                            }
                        }
                    } else if token == N3Token::Punctuation(">>") {
                        self
                    } else {
                        self.error(errors, "Expecting the end of a quoted triple '>>'")
                    }
                }
                NQuadsState::ExpectDot => match token {
                    N3Token::Punctuation(".") => {
                        self.stack.push(NQuadsState::ExpectSubject);
                        self
                    }
                    token => {
                        // Missing dot: report but keep parsing from the
                        // current token as a new statement.
                        errors.push("Quads should be followed by a dot".into());
                        self.stack.push(NQuadsState::ExpectSubject);
                        self.recognize_next(token, results, errors)
                    }
                },
                #[cfg(feature = "rdf-star")]
                NQuadsState::AfterQuotedSubject => {
                    // Fold the just-finished inner triple into the subject of
                    // the enclosing statement.
                    let triple = Triple {
                        subject: self.subjects.pop().unwrap(),
                        predicate: self.predicates.pop().unwrap(),
                        object: self.objects.pop().unwrap(),
                    };
                    self.subjects.push(triple.into());
                    self.stack.push(NQuadsState::ExpectPredicate);
                    self.recognize_next(token, results, errors)
                }
                #[cfg(feature = "rdf-star")]
                NQuadsState::AfterQuotedObject => {
                    // Same as above, but the quoted triple is the object.
                    let triple = Triple {
                        subject: self.subjects.pop().unwrap(),
                        predicate: self.predicates.pop().unwrap(),
                        object: self.objects.pop().unwrap(),
                    };
                    self.objects.push(triple.into());
                    self.stack
                        .push(NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple);
                    self.recognize_next(token, results, errors)
                }
            }
        } else if token == N3Token::Punctuation(".") {
            // Empty stack (after error recovery): a dot starts a fresh statement.
            self.stack.push(NQuadsState::ExpectSubject);
            self
        } else {
            // Still skipping tokens until the next '.' after an error.
            self
        }
    }

    /// Flushes the state at end of input, reporting statements that are
    /// missing their final dot but still emitting the quad when possible.
    fn recognize_end(mut self, results: &mut Vec<Quad>, errors: &mut Vec<RuleRecognizerError>) {
        match &*self.stack {
            [NQuadsState::ExpectSubject] | [] => (),
            [NQuadsState::ExpectDot] => errors.push("Triples should be followed by a dot".into()),
            [NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple] => {
                self.emit_quad(results, GraphName::DefaultGraph);
                errors.push("Triples should be followed by a dot".into())
            }
            [NQuadsState::ExpectLiteralAnnotationOrGraphNameOrDot { ref value }] => {
                self.objects.push(Literal::new_simple_literal(value).into());
                self.emit_quad(results, GraphName::DefaultGraph);
                errors.push("Triples should be followed by a dot".into())
            }
            _ => errors.push("Unexpected end".into()), //TODO
        }
    }

    /// Lexer configuration used by the shared tokenizer.
    fn lexer_options(&self) -> &N3LexerOptions {
        &self.lexer_options
    }
}
||||
|
||||
impl NQuadsRecognizer {
    /// Builds a `toolkit` parser around this recognizer.
    ///
    /// `with_graph_name` selects N-Quads (`true`, graph name allowed) vs
    /// N-Triples (`false`) behavior; `with_quoted_triples` enables the
    /// RDF-star `<< ... >>` syntax.
    pub fn new_parser(
        with_graph_name: bool,
        #[cfg(feature = "rdf-star")] with_quoted_triples: bool,
    ) -> Parser<Self> {
        Parser::new(
            Lexer::new(
                N3Lexer::new(N3LexerMode::NTriples),
                MIN_BUFFER_SIZE,
                MAX_BUFFER_SIZE,
                // NOTE(review): flags forwarded to the lexer — presumably
                // "line-delimited tokens" and the comment start byte `#`;
                // confirm against `toolkit::Lexer::new`.
                true,
                Some(b"#"),
            ),
            NQuadsRecognizer {
                stack: vec![NQuadsState::ExpectSubject],
                with_graph_name,
                #[cfg(feature = "rdf-star")]
                with_quoted_triples,
                lexer_options: N3LexerOptions::default(),
                subjects: Vec::new(),
                predicates: Vec::new(),
                objects: Vec::new(),
            },
        )
    }

    /// Records `msg` and drops all in-flight state; `recognize_next` then
    /// skips tokens until the next `.` (see its empty-stack branch).
    #[must_use]
    fn error(
        mut self,
        errors: &mut Vec<RuleRecognizerError>,
        msg: impl Into<RuleRecognizerError>,
    ) -> Self {
        errors.push(msg.into());
        self.stack.clear();
        self.subjects.clear();
        self.predicates.clear();
        self.objects.clear();
        self
    }

    /// Pops one subject/predicate/object and emits them as a quad in
    /// `graph_name`. Panics if a term stack is empty — callers only emit
    /// after all three terms have been pushed.
    fn emit_quad(&mut self, results: &mut Vec<Quad>, graph_name: GraphName) {
        results.push(Quad {
            subject: self.subjects.pop().unwrap(),
            predicate: self.predicates.pop().unwrap(),
            object: self.objects.pop().unwrap(),
            graph_name,
        })
    }
}
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,393 @@ |
||||
//! A [N-Quads](https://www.w3.org/TR/n-quads/) streaming parser implemented by [`NQuadsParser`].
|
||||
|
||||
use crate::line_formats::NQuadsRecognizer; |
||||
use crate::toolkit::{FromReadIterator, ParseError, ParseOrIoError, Parser}; |
||||
use oxrdf::{Quad, QuadRef}; |
||||
use std::io::{self, Read, Write}; |
||||
|
||||
/// A [N-Quads](https://www.w3.org/TR/n-quads/) streaming parser.
|
||||
///
|
||||
/// Support for [N-Quads-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#n-quads-star) is available behind the `rdf-star` feature and the [`NQuadsParser::with_quoted_triples`] option.
|
||||
///
|
||||
/// Count the number of people:
|
||||
/// ```
|
||||
/// use oxrdf::NamedNodeRef;
|
||||
/// use oxttl::{NQuadsParser, ParseError};
|
||||
///
|
||||
/// let file = b"<http://example.com/foo> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
|
||||
/// <http://example.com/foo> <http://schema.org/name> \"Foo\" .
|
||||
/// <http://example.com/bar> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
|
||||
/// <http://example.com/bar> <http://schema.org/name> \"Bar\" .";
|
||||
///
|
||||
/// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
|
||||
/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
|
||||
/// let mut count = 0;
|
||||
/// for quad in NQuadsParser::new().parse_from_read(file.as_ref()) {
|
||||
/// let quad = quad?;
|
||||
/// if quad.predicate == rdf_type && quad.object == schema_person.into() {
|
||||
/// count += 1;
|
||||
/// }
|
||||
/// }
|
||||
/// assert_eq!(2, count);
|
||||
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
|
||||
/// ```
|
||||
#[derive(Default)]
pub struct NQuadsParser {
    /// Whether `<< ... >>` quoted triples are accepted
    /// (set by [`NQuadsParser::with_quoted_triples`]).
    #[cfg(feature = "rdf-star")]
    with_quoted_triples: bool,
}
||||
|
||||
impl NQuadsParser {
    /// Builds a new [`NQuadsParser`].
    #[inline]
    pub fn new() -> Self {
        Self::default()
    }

    /// Enables [N-Quads-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#n-quads-star).
    #[cfg(feature = "rdf-star")]
    #[inline]
    #[must_use]
    pub fn with_quoted_triples(mut self) -> Self {
        self.with_quoted_triples = true;
        self
    }

    /// Parses a N-Quads file from a [`Read`] implementation.
    ///
    /// Count the number of people:
    /// ```
    /// use oxrdf::NamedNodeRef;
    /// use oxttl::{NQuadsParser, ParseError};
    ///
    /// let file = b"<http://example.com/foo> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
    /// <http://example.com/foo> <http://schema.org/name> \"Foo\" .
    /// <http://example.com/bar> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
    /// <http://example.com/bar> <http://schema.org/name> \"Bar\" .";
    ///
    /// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
    /// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
    /// let mut count = 0;
    /// for quad in NQuadsParser::new().parse_from_read(file.as_ref()) {
    ///     let quad = quad?;
    ///     if quad.predicate == rdf_type && quad.object == schema_person.into() {
    ///         count += 1;
    ///     }
    /// }
    /// assert_eq!(2, count);
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    pub fn parse_from_read<R: Read>(&self, read: R) -> FromReadNQuadsReader<R> {
        FromReadNQuadsReader {
            inner: self.parse().parser.parse_from_read(read),
        }
    }

    /// Allows to parse a N-Quads file by using a low-level API.
    ///
    /// Count the number of people:
    /// ```
    /// use oxrdf::NamedNodeRef;
    /// use oxttl::{NQuadsParser, ParseError};
    ///
    /// let file: [&[u8]; 4] = [
    ///     b"<http://example.com/foo> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
    ///     b"<http://example.com/foo> <http://schema.org/name> \"Foo\" .\n",
    ///     b"<http://example.com/bar> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
    ///     b"<http://example.com/bar> <http://schema.org/name> \"Bar\" .\n"
    /// ];
    ///
    /// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
    /// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
    /// let mut count = 0;
    /// let mut parser = NQuadsParser::new().parse();
    /// let mut file_chunks = file.iter();
    /// while !parser.is_end() {
    ///     // We feed more data to the parser
    ///     if let Some(chunk) = file_chunks.next() {
    ///         parser.extend_from_slice(chunk);
    ///     } else {
    ///         parser.end(); // It's finished
    ///     }
    ///     // We read as many quads from the parser as possible
    ///     while let Some(quad) = parser.read_next() {
    ///         let quad = quad?;
    ///         if quad.predicate == rdf_type && quad.object == schema_person.into() {
    ///             count += 1;
    ///         }
    ///     }
    /// }
    /// assert_eq!(2, count);
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    #[allow(clippy::unused_self)]
    pub fn parse(&self) -> LowLevelNQuadsReader {
        LowLevelNQuadsReader {
            parser: NQuadsRecognizer::new_parser(
                // `true`: N-Quads allows a fourth (graph name) term.
                true,
                #[cfg(feature = "rdf-star")]
                self.with_quoted_triples,
            ),
        }
    }
}
||||
|
||||
/// Parses a N-Quads file from a [`Read`] implementation. Can be built using [`NQuadsParser::parse_from_read`].
|
||||
///
|
||||
/// Count the number of people:
|
||||
/// ```
|
||||
/// use oxrdf::NamedNodeRef;
|
||||
/// use oxttl::{NQuadsParser, ParseError};
|
||||
///
|
||||
/// let file = b"<http://example.com/foo> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
|
||||
/// <http://example.com/foo> <http://schema.org/name> \"Foo\" .
|
||||
/// <http://example.com/bar> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
|
||||
/// <http://example.com/bar> <http://schema.org/name> \"Bar\" .";
|
||||
///
|
||||
/// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
|
||||
/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
|
||||
/// let mut count = 0;
|
||||
/// for quad in NQuadsParser::new().parse_from_read(file.as_ref()) {
|
||||
/// let quad = quad?;
|
||||
/// if quad.predicate == rdf_type && quad.object == schema_person.into() {
|
||||
/// count += 1;
|
||||
/// }
|
||||
/// }
|
||||
/// assert_eq!(2, count);
|
||||
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
|
||||
/// ```
|
||||
pub struct FromReadNQuadsReader<R: Read> {
    /// Streaming iterator driving the underlying reader through the shared
    /// N-Quads recognizer.
    inner: FromReadIterator<R, NQuadsRecognizer>,
}
||||
|
||||
impl<R: Read> Iterator for FromReadNQuadsReader<R> {
    type Item = Result<Quad, ParseOrIoError>;

    /// Yields the next parsed quad, a parse error, or an I/O error coming
    /// from the underlying reader.
    fn next(&mut self) -> Option<Result<Quad, ParseOrIoError>> {
        self.inner.next()
    }
}
||||
|
||||
/// Parses a N-Quads file by using a low-level API. Can be built using [`NQuadsParser::parse`].
|
||||
///
|
||||
/// Count the number of people:
|
||||
/// ```
|
||||
/// use oxrdf::NamedNodeRef;
|
||||
/// use oxttl::{NQuadsParser, ParseError};
|
||||
///
|
||||
/// let file: [&[u8]; 4] = [
|
||||
/// b"<http://example.com/foo> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
|
||||
/// b"<http://example.com/foo> <http://schema.org/name> \"Foo\" .\n",
|
||||
/// b"<http://example.com/bar> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
|
||||
/// b"<http://example.com/bar> <http://schema.org/name> \"Bar\" .\n"
|
||||
/// ];
|
||||
///
|
||||
/// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
|
||||
/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
|
||||
/// let mut count = 0;
|
||||
/// let mut parser = NQuadsParser::new().parse();
|
||||
/// let mut file_chunks = file.iter();
|
||||
/// while !parser.is_end() {
|
||||
/// // We feed more data to the parser
|
||||
/// if let Some(chunk) = file_chunks.next() {
|
||||
/// parser.extend_from_slice(chunk);
|
||||
/// } else {
|
||||
/// parser.end(); // It's finished
|
||||
/// }
|
||||
/// // We read as many quads from the parser as possible
|
||||
/// while let Some(quad) = parser.read_next() {
|
||||
/// let quad = quad?;
|
||||
/// if quad.predicate == rdf_type && quad.object == schema_person.into() {
|
||||
/// count += 1;
|
||||
/// }
|
||||
/// }
|
||||
/// }
|
||||
/// assert_eq!(2, count);
|
||||
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
|
||||
/// ```
|
||||
pub struct LowLevelNQuadsReader {
    /// Push-based parser over the shared N-Quads recognizer.
    parser: Parser<NQuadsRecognizer>,
}
||||
|
||||
// Thin delegations to the shared `toolkit::Parser`; this type only narrows
// the output to `Quad`.
impl LowLevelNQuadsReader {
    /// Adds some extra bytes to the parser. Should be called when [`read_next`](Self::read_next) returns [`None`] and there is still unread data.
    pub fn extend_from_slice(&mut self, other: &[u8]) {
        self.parser.extend_from_slice(other)
    }

    /// Tell the parser that the file is finished.
    ///
    /// This triggers the parsing of the final bytes and might lead [`read_next`](Self::read_next) to return some extra values.
    pub fn end(&mut self) {
        self.parser.end()
    }

    /// Returns if the parsing is finished i.e. [`end`](Self::end) has been called and [`read_next`](Self::read_next) is always going to return `None`.
    pub fn is_end(&self) -> bool {
        self.parser.is_end()
    }

    /// Attempt to parse a new quad from the already provided data.
    ///
    /// Returns [`None`] if the parsing is finished or more data is required.
    /// If it is the case more data should be fed using [`extend_from_slice`](Self::extend_from_slice).
    pub fn read_next(&mut self) -> Option<Result<Quad, ParseError>> {
        self.parser.read_next()
    }
}
||||
|
||||
/// A [N-Quads](https://www.w3.org/TR/n-quads/) serializer.
|
||||
///
|
||||
/// Support for [N-Quads-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#n-quads-star) is available behind the `rdf-star` feature.
|
||||
///
|
||||
/// ```
|
||||
/// use oxrdf::{NamedNodeRef, QuadRef};
|
||||
/// use oxttl::NQuadsSerializer;
|
||||
///
|
||||
/// let mut buf = Vec::new();
|
||||
/// let mut writer = NQuadsSerializer::new().serialize_to_write(buf);
|
||||
/// writer.write_quad(QuadRef::new(
|
||||
/// NamedNodeRef::new("http://example.com#me")?,
|
||||
/// NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
|
||||
/// NamedNodeRef::new("http://schema.org/Person")?,
|
||||
/// NamedNodeRef::new("http://example.com")?,
|
||||
/// ))?;
|
||||
/// assert_eq!(
|
||||
/// b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> <http://example.com> .\n",
|
||||
/// writer.finish().as_slice()
|
||||
/// );
|
||||
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
|
||||
/// ```
|
||||
#[derive(Default)]
// Zero-sized: N-Quads serialization is stateless, all work happens in the
// writers built from this type.
pub struct NQuadsSerializer;
||||
|
||||
impl NQuadsSerializer {
    /// Builds a new [`NQuadsSerializer`].
    #[inline]
    pub fn new() -> Self {
        Self
    }

    /// Writes a N-Quads file to a [`Write`] implementation.
    ///
    /// ```
    /// use oxrdf::{NamedNodeRef, QuadRef};
    /// use oxttl::NQuadsSerializer;
    ///
    /// let mut buf = Vec::new();
    /// let mut writer = NQuadsSerializer::new().serialize_to_write(buf);
    /// writer.write_quad(QuadRef::new(
    ///     NamedNodeRef::new("http://example.com#me")?,
    ///     NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
    ///     NamedNodeRef::new("http://schema.org/Person")?,
    ///     NamedNodeRef::new("http://example.com")?,
    /// ))?;
    /// assert_eq!(
    ///     b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> <http://example.com> .\n",
    ///     writer.finish().as_slice()
    /// );
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    pub fn serialize_to_write<W: Write>(&self, write: W) -> ToWriteNQuadsWriter<W> {
        ToWriteNQuadsWriter {
            write,
            writer: self.serialize(),
        }
    }

    /// Builds a low-level N-Quads writer.
    ///
    /// ```
    /// use oxrdf::{NamedNodeRef, QuadRef};
    /// use oxttl::NQuadsSerializer;
    ///
    /// let mut buf = Vec::new();
    /// let mut writer = NQuadsSerializer::new().serialize();
    /// writer.write_quad(QuadRef::new(
    ///     NamedNodeRef::new("http://example.com#me")?,
    ///     NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
    ///     NamedNodeRef::new("http://schema.org/Person")?,
    ///     NamedNodeRef::new("http://example.com")?,
    /// ), &mut buf)?;
    /// assert_eq!(
    ///     b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> <http://example.com> .\n",
    ///     buf.as_slice()
    /// );
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    #[allow(clippy::unused_self)]
    pub fn serialize(&self) -> LowLevelNQuadsWriter {
        LowLevelNQuadsWriter
    }
}
||||
|
||||
/// Writes a N-Quads file to a [`Write`] implementation. Can be built using [`NQuadsSerializer::serialize_to_write`].
|
||||
///
|
||||
/// ```
|
||||
/// use oxrdf::{NamedNodeRef, QuadRef};
|
||||
/// use oxttl::NQuadsSerializer;
|
||||
///
|
||||
/// let mut buf = Vec::new();
|
||||
/// let mut writer = NQuadsSerializer::new().serialize_to_write(buf);
|
||||
/// writer.write_quad(QuadRef::new(
|
||||
/// NamedNodeRef::new("http://example.com#me")?,
|
||||
/// NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
|
||||
/// NamedNodeRef::new("http://schema.org/Person")?,
|
||||
/// NamedNodeRef::new("http://example.com")?,
|
||||
/// ))?;
|
||||
/// assert_eq!(
|
||||
/// b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> <http://example.com> .\n",
|
||||
/// writer.finish().as_slice()
|
||||
/// );
|
||||
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
|
||||
/// ```
|
||||
pub struct ToWriteNQuadsWriter<W: Write> {
    /// Destination for the serialized quads.
    write: W,
    /// Stateless low-level writer doing the actual formatting.
    writer: LowLevelNQuadsWriter,
}
||||
|
||||
impl<W: Write> ToWriteNQuadsWriter<W> {
    /// Writes an extra quad.
    pub fn write_quad<'a>(&mut self, q: impl Into<QuadRef<'a>>) -> io::Result<()> {
        self.writer.write_quad(q, &mut self.write)
    }

    /// Ends the write process and returns the underlying [`Write`].
    ///
    /// N-Quads needs no trailing output, so this just hands back the writer.
    // NOTE(review): this does not flush; callers wrapping a `BufWriter` must
    // flush the returned writer themselves — confirm this is intended.
    pub fn finish(self) -> W {
        self.write
    }
}
||||
|
||||
/// Writes a N-Quads file by using a low-level API. Can be built using [`NQuadsSerializer::serialize`].
|
||||
///
|
||||
/// ```
|
||||
/// use oxrdf::{NamedNodeRef, QuadRef};
|
||||
/// use oxttl::NQuadsSerializer;
|
||||
///
|
||||
/// let mut buf = Vec::new();
|
||||
/// let mut writer = NQuadsSerializer::new().serialize();
|
||||
/// writer.write_quad(QuadRef::new(
|
||||
/// NamedNodeRef::new("http://example.com#me")?,
|
||||
/// NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
|
||||
/// NamedNodeRef::new("http://schema.org/Person")?,
|
||||
/// NamedNodeRef::new("http://example.com")?,
|
||||
/// ), &mut buf)?;
|
||||
/// assert_eq!(
|
||||
/// b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> <http://example.com> .\n",
|
||||
/// buf.as_slice()
|
||||
/// );
|
||||
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
|
||||
/// ```
|
||||
// Zero-sized marker: N-Quads serialization keeps no state between quads.
pub struct LowLevelNQuadsWriter;
||||
|
||||
impl LowLevelNQuadsWriter { |
||||
/// Writes an extra quad.
|
||||
#[allow(clippy::unused_self)] |
||||
pub fn write_quad<'a>( |
||||
&mut self, |
||||
q: impl Into<QuadRef<'a>>, |
||||
mut write: impl Write, |
||||
) -> io::Result<()> { |
||||
writeln!(write, "{} .", q.into()) |
||||
} |
||||
} |
@ -0,0 +1,389 @@ |
||||
//! A [N-Triples](https://www.w3.org/TR/n-triples/) streaming parser implemented by [`NTriplesParser`]
|
||||
//! and a serializer implemented by [`NTriplesSerializer`].
|
||||
|
||||
use crate::line_formats::NQuadsRecognizer; |
||||
use crate::toolkit::{FromReadIterator, ParseError, ParseOrIoError, Parser}; |
||||
use oxrdf::{Triple, TripleRef}; |
||||
use std::io::{self, Read, Write}; |
||||
|
||||
/// A [N-Triples](https://www.w3.org/TR/n-triples/) streaming parser.
|
||||
///
|
||||
/// Support for [N-Triples-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#n-triples-star) is available behind the `rdf-star` feature and the [`NTriplesParser::with_quoted_triples`] option.
|
||||
///
|
||||
/// Count the number of people:
|
||||
/// ```
|
||||
/// use oxrdf::NamedNodeRef;
|
||||
/// use oxttl::{NTriplesParser, ParseError};
|
||||
///
|
||||
/// let file = b"<http://example.com/foo> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
|
||||
/// <http://example.com/foo> <http://schema.org/name> \"Foo\" .
|
||||
/// <http://example.com/bar> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
|
||||
/// <http://example.com/bar> <http://schema.org/name> \"Bar\" .";
|
||||
///
|
||||
/// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
|
||||
/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
|
||||
/// let mut count = 0;
|
||||
/// for triple in NTriplesParser::new().parse_from_read(file.as_ref()) {
|
||||
/// let triple = triple?;
|
||||
/// if triple.predicate == rdf_type && triple.object == schema_person.into() {
|
||||
/// count += 1;
|
||||
/// }
|
||||
/// }
|
||||
/// assert_eq!(2, count);
|
||||
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
|
||||
/// ```
|
||||
#[derive(Default)]
pub struct NTriplesParser {
    /// Whether `<< ... >>` quoted triples are accepted
    /// (set by [`NTriplesParser::with_quoted_triples`]).
    #[cfg(feature = "rdf-star")]
    with_quoted_triples: bool,
}
||||
|
||||
impl NTriplesParser {
    /// Builds a new [`NTriplesParser`].
    #[inline]
    pub fn new() -> Self {
        Self::default()
    }

    /// Enables [N-Triples-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#n-triples-star).
    #[cfg(feature = "rdf-star")]
    #[inline]
    #[must_use]
    pub fn with_quoted_triples(mut self) -> Self {
        self.with_quoted_triples = true;
        self
    }

    /// Parses a N-Triples file from a [`Read`] implementation.
    ///
    /// Count the number of people:
    /// ```
    /// use oxrdf::NamedNodeRef;
    /// use oxttl::{NTriplesParser, ParseError};
    ///
    /// let file = b"<http://example.com/foo> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
    /// <http://example.com/foo> <http://schema.org/name> \"Foo\" .
    /// <http://example.com/bar> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
    /// <http://example.com/bar> <http://schema.org/name> \"Bar\" .";
    ///
    /// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
    /// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
    /// let mut count = 0;
    /// for triple in NTriplesParser::new().parse_from_read(file.as_ref()) {
    ///     let triple = triple?;
    ///     if triple.predicate == rdf_type && triple.object == schema_person.into() {
    ///         count += 1;
    ///     }
    /// }
    /// assert_eq!(2, count);
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    pub fn parse_from_read<R: Read>(&self, read: R) -> FromReadNTriplesReader<R> {
        FromReadNTriplesReader {
            inner: self.parse().parser.parse_from_read(read),
        }
    }

    /// Allows to parse a N-Triples file by using a low-level API.
    ///
    /// Count the number of people:
    /// ```
    /// use oxrdf::NamedNodeRef;
    /// use oxttl::{NTriplesParser, ParseError};
    ///
    /// let file: [&[u8]; 4] = [
    ///     b"<http://example.com/foo> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
    ///     b"<http://example.com/foo> <http://schema.org/name> \"Foo\" .\n",
    ///     b"<http://example.com/bar> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
    ///     b"<http://example.com/bar> <http://schema.org/name> \"Bar\" .\n"
    /// ];
    ///
    /// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
    /// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
    /// let mut count = 0;
    /// let mut parser = NTriplesParser::new().parse();
    /// let mut file_chunks = file.iter();
    /// while !parser.is_end() {
    ///     // We feed more data to the parser
    ///     if let Some(chunk) = file_chunks.next() {
    ///         parser.extend_from_slice(chunk);
    ///     } else {
    ///         parser.end(); // It's finished
    ///     }
    ///     // We read as many triples from the parser as possible
    ///     while let Some(triple) = parser.read_next() {
    ///         let triple = triple?;
    ///         if triple.predicate == rdf_type && triple.object == schema_person.into() {
    ///             count += 1;
    ///         }
    ///     }
    /// }
    /// assert_eq!(2, count);
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    #[allow(clippy::unused_self)]
    pub fn parse(&self) -> LowLevelNTriplesReader {
        LowLevelNTriplesReader {
            parser: NQuadsRecognizer::new_parser(
                // `false`: N-Triples never carries a graph name.
                false,
                #[cfg(feature = "rdf-star")]
                self.with_quoted_triples,
            ),
        }
    }
}
||||
|
||||
/// Parses a N-Triples file from a [`Read`] implementation. Can be built using [`NTriplesParser::parse_from_read`].
///
/// Count the number of people:
/// ```
/// use oxrdf::NamedNodeRef;
/// use oxttl::{NTriplesParser, ParseError};
///
/// let file = b"<http://example.com/foo> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
/// <http://example.com/foo> <http://schema.org/name> \"Foo\" .
/// <http://example.com/bar> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .
/// <http://example.com/bar> <http://schema.org/name> \"Bar\" .";
///
/// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
/// let mut count = 0;
/// for triple in NTriplesParser::new().parse_from_read(file.as_ref()) {
///     let triple = triple?;
///     if triple.predicate == rdf_type && triple.object == schema_person.into() {
///         count += 1;
///     }
/// }
/// assert_eq!(2, count);
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
pub struct FromReadNTriplesReader<R: Read> {
    // Shared streaming iterator over the N-Quads recognizer; its items are
    // converted to `Triple`s in this type's `Iterator::next` implementation.
    inner: FromReadIterator<R, NQuadsRecognizer>,
}
||||
|
||||
impl<R: Read> Iterator for FromReadNTriplesReader<R> { |
||||
type Item = Result<Triple, ParseOrIoError>; |
||||
|
||||
fn next(&mut self) -> Option<Result<Triple, ParseOrIoError>> { |
||||
Some(self.inner.next()?.map(Into::into)) |
||||
} |
||||
} |
||||
|
||||
/// Parses a N-Triples file by using a low-level API. Can be built using [`NTriplesParser::parse`].
///
/// Count the number of people:
/// ```
/// use oxrdf::NamedNodeRef;
/// use oxttl::{NTriplesParser, ParseError};
///
/// let file: [&[u8]; 4] = [
///     b"<http://example.com/foo> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
///     b"<http://example.com/foo> <http://schema.org/name> \"Foo\" .\n",
///     b"<http://example.com/bar> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
///     b"<http://example.com/bar> <http://schema.org/name> \"Bar\" .\n"
/// ];
///
/// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
/// let mut count = 0;
/// let mut parser = NTriplesParser::new().parse();
/// let mut file_chunks = file.iter();
/// while !parser.is_end() {
///     // We feed more data to the parser
///     if let Some(chunk) = file_chunks.next() {
///         parser.extend_from_slice(chunk);
///     } else {
///         parser.end(); // It's finished
///     }
///     // We read as many triples from the parser as possible
///     while let Some(triple) = parser.read_next() {
///         let triple = triple?;
///         if triple.predicate == rdf_type && triple.object == schema_person.into() {
///             count += 1;
///         }
///     }
/// }
/// assert_eq!(2, count);
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
pub struct LowLevelNTriplesReader {
    // N-Triples reuses the N-Quads recognizer (built with graph names disabled).
    parser: Parser<NQuadsRecognizer>,
}
||||
|
||||
impl LowLevelNTriplesReader { |
||||
/// Adds some extra bytes to the parser. Should be called when [`read_next`](Self::read_next) returns [`None`] and there is still unread data.
|
||||
pub fn extend_from_slice(&mut self, other: &[u8]) { |
||||
self.parser.extend_from_slice(other) |
||||
} |
||||
|
||||
/// Tell the parser that the file is finished.
|
||||
///
|
||||
/// This triggers the parsing of the final bytes and might lead [`read_next`](Self::read_next) to return some extra values.
|
||||
pub fn end(&mut self) { |
||||
self.parser.end() |
||||
} |
||||
|
||||
/// Returns if the parsing is finished i.e. [`end`](Self::end) has been called and [`read_next`](Self::read_next) is always going to return `None`.
|
||||
pub fn is_end(&self) -> bool { |
||||
self.parser.is_end() |
||||
} |
||||
|
||||
/// Attempt to parse a new triple from the already provided data.
|
||||
///
|
||||
/// Returns [`None`] if the parsing is finished or more data is required.
|
||||
/// If it is the case more data should be fed using [`extend_from_slice`](Self::extend_from_slice).
|
||||
pub fn read_next(&mut self) -> Option<Result<Triple, ParseError>> { |
||||
Some(self.parser.read_next()?.map(Into::into)) |
||||
} |
||||
} |
||||
|
||||
/// A [N-Triples](https://www.w3.org/TR/n-triples/) serializer.
///
/// Support for [N-Triples-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#n-triples-star) is available behind the `rdf-star` feature.
///
/// ```
/// use oxrdf::{NamedNodeRef, TripleRef};
/// use oxttl::NTriplesSerializer;
///
/// let mut buf = Vec::new();
/// let mut writer = NTriplesSerializer::new().serialize_to_write(buf);
/// writer.write_triple(TripleRef::new(
///     NamedNodeRef::new("http://example.com#me")?,
///     NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
///     NamedNodeRef::new("http://schema.org/Person")?,
/// ))?;
/// assert_eq!(
///     b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
///     writer.finish().as_slice()
/// );
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
// Stateless unit struct: N-Triples serialization needs no configuration.
#[derive(Default)]
pub struct NTriplesSerializer;
||||
|
||||
impl NTriplesSerializer { |
||||
/// Builds a new [`NTriplesSerializer`].
|
||||
#[inline] |
||||
pub fn new() -> Self { |
||||
Self |
||||
} |
||||
|
||||
/// Writes a N-Triples file to a [`Write`] implementation.
|
||||
///
|
||||
/// ```
|
||||
/// use oxrdf::{NamedNodeRef, TripleRef};
|
||||
/// use oxttl::NTriplesSerializer;
|
||||
///
|
||||
/// let mut buf = Vec::new();
|
||||
/// let mut writer = NTriplesSerializer::new().serialize_to_write(buf);
|
||||
/// writer.write_triple(TripleRef::new(
|
||||
/// NamedNodeRef::new("http://example.com#me")?,
|
||||
/// NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
|
||||
/// NamedNodeRef::new("http://schema.org/Person")?,
|
||||
/// ))?;
|
||||
/// assert_eq!(
|
||||
/// b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
|
||||
/// writer.finish().as_slice()
|
||||
/// );
|
||||
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
|
||||
/// ```
|
||||
pub fn serialize_to_write<W: Write>(&self, write: W) -> ToWriteNTriplesWriter<W> { |
||||
ToWriteNTriplesWriter { |
||||
write, |
||||
writer: self.serialize(), |
||||
} |
||||
} |
||||
|
||||
/// Builds a low-level N-Triples writer.
|
||||
///
|
||||
/// ```
|
||||
/// use oxrdf::{NamedNodeRef, TripleRef};
|
||||
/// use oxttl::NTriplesSerializer;
|
||||
///
|
||||
/// let mut buf = Vec::new();
|
||||
/// let mut writer = NTriplesSerializer::new().serialize();
|
||||
/// writer.write_triple(TripleRef::new(
|
||||
/// NamedNodeRef::new("http://example.com#me")?,
|
||||
/// NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
|
||||
/// NamedNodeRef::new("http://schema.org/Person")?,
|
||||
/// ), &mut buf)?;
|
||||
/// assert_eq!(
|
||||
/// b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
|
||||
/// buf.as_slice()
|
||||
/// );
|
||||
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
|
||||
/// ```
|
||||
#[allow(clippy::unused_self)] |
||||
pub fn serialize(&self) -> LowLevelNTriplesWriter { |
||||
LowLevelNTriplesWriter |
||||
} |
||||
} |
||||
|
||||
/// Writes a N-Triples file to a [`Write`] implementation. Can be built using [`NTriplesSerializer::serialize_to_write`].
///
/// ```
/// use oxrdf::{NamedNodeRef, TripleRef};
/// use oxttl::NTriplesSerializer;
///
/// let mut buf = Vec::new();
/// let mut writer = NTriplesSerializer::new().serialize_to_write(buf);
/// writer.write_triple(TripleRef::new(
///     NamedNodeRef::new("http://example.com#me")?,
///     NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
///     NamedNodeRef::new("http://schema.org/Person")?,
/// ))?;
/// assert_eq!(
///     b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
///     writer.finish().as_slice()
/// );
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
pub struct ToWriteNTriplesWriter<W: Write> {
    // Output sink; returned to the caller by `finish`.
    write: W,
    // Stateless formatting logic shared with the low-level API.
    writer: LowLevelNTriplesWriter,
}
||||
|
||||
impl<W: Write> ToWriteNTriplesWriter<W> { |
||||
/// Writes an extra triple.
|
||||
pub fn write_triple<'a>(&mut self, t: impl Into<TripleRef<'a>>) -> io::Result<()> { |
||||
self.writer.write_triple(t, &mut self.write) |
||||
} |
||||
|
||||
/// Ends the write process and returns the underlying [`Write`].
|
||||
pub fn finish(self) -> W { |
||||
self.write |
||||
} |
||||
} |
||||
|
||||
/// Writes a N-Triples file by using a low-level API. Can be built using [`NTriplesSerializer::serialize`].
///
/// ```
/// use oxrdf::{NamedNodeRef, TripleRef};
/// use oxttl::NTriplesSerializer;
///
/// let mut buf = Vec::new();
/// let mut writer = NTriplesSerializer::new().serialize();
/// writer.write_triple(TripleRef::new(
///     NamedNodeRef::new("http://example.com#me")?,
///     NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
///     NamedNodeRef::new("http://schema.org/Person")?,
/// ), &mut buf)?;
/// assert_eq!(
///     b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
///     buf.as_slice()
/// );
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
// Unit struct: all serialization state lives in the caller-provided `Write`.
pub struct LowLevelNTriplesWriter;
||||
|
||||
impl LowLevelNTriplesWriter { |
||||
/// Writes an extra triple.
|
||||
#[allow(clippy::unused_self)] |
||||
pub fn write_triple<'a>( |
||||
&mut self, |
||||
t: impl Into<TripleRef<'a>>, |
||||
mut write: impl Write, |
||||
) -> io::Result<()> { |
||||
writeln!(write, "{} .", t.into()) |
||||
} |
||||
} |
@ -0,0 +1,932 @@ |
||||
//! Shared parser implementation for Turtle and TriG.
|
||||
|
||||
use crate::lexer::{resolve_local_name, N3Lexer, N3LexerMode, N3LexerOptions, N3Token}; |
||||
use crate::toolkit::{Lexer, Parser, RuleRecognizer, RuleRecognizerError}; |
||||
use crate::{MAX_BUFFER_SIZE, MIN_BUFFER_SIZE}; |
||||
use oxiri::Iri; |
||||
#[cfg(feature = "rdf-star")] |
||||
use oxrdf::Triple; |
||||
use oxrdf::{ |
||||
vocab::{rdf, xsd}, |
||||
BlankNode, GraphName, Literal, NamedNode, NamedOrBlankNode, Quad, Subject, Term, |
||||
}; |
||||
use std::collections::HashMap; |
||||
|
||||
/// Rule recognizer shared by the Turtle and TriG parsers.
pub struct TriGRecognizer {
    // Pending grammar states, processed LIFO by `recognize_next`.
    stack: Vec<TriGState>,
    // Whether GRAPH blocks and '{ ... }' wrapped graphs are accepted
    // (presumably true for TriG, false for plain Turtle — set by the builder).
    with_graph_name: bool,
    // Whether '<< ... >>' quoted triples are accepted (RDF-star).
    #[cfg(feature = "rdf-star")]
    with_quoted_triples: bool,
    // Lexer configuration; its base IRI is updated by @base/BASE directives.
    lexer_options: N3LexerOptions,
    // Prefix bindings accumulated from @prefix/PREFIX directives.
    prefixes: HashMap<String, Iri<String>>,
    // Current subject/predicate/object are kept as stacks because nested
    // constructs (collections, blank node property lists, quoted triples)
    // push a new context and pop it when the construct ends.
    cur_subject: Vec<Subject>,
    cur_predicate: Vec<NamedNode>,
    cur_object: Vec<Term>,
    // Graph name applied to emitted quads; DefaultGraph outside graph blocks.
    cur_graph: GraphName,
}
||||
|
||||
impl RuleRecognizer for TriGRecognizer { |
||||
type TokenRecognizer = N3Lexer; |
||||
type Output = Quad; |
||||
|
||||
fn error_recovery_state(mut self) -> Self { |
||||
self.stack.clear(); |
||||
self.cur_subject.clear(); |
||||
self.cur_predicate.clear(); |
||||
self.cur_object.clear(); |
||||
self.cur_graph = GraphName::DefaultGraph; |
||||
self |
||||
} |
||||
|
||||
fn recognize_next( |
||||
mut self, |
||||
token: N3Token, |
||||
results: &mut Vec<Quad>, |
||||
errors: &mut Vec<RuleRecognizerError>, |
||||
) -> Self { |
||||
if let Some(rule) = self.stack.pop() { |
||||
match rule { |
||||
// [1g] trigDoc ::= (directive | block)*
|
||||
// [2g] block ::= triplesOrGraph | wrappedGraph | triples2 | "GRAPH" labelOrSubject wrappedGraph
|
||||
// [3] directive ::= prefixID | base | sparqlPrefix | sparqlBase
|
||||
// [4] prefixID ::= '@prefix' PNAME_NS IRIREF '.'
|
||||
// [5] base ::= '@base' IRIREF '.'
|
||||
// [5s] sparqlPrefix ::= "PREFIX" PNAME_NS IRIREF
|
||||
// [6s] sparqlBase ::= "BASE" IRIREF
|
||||
TriGState::TriGDoc => { |
||||
self.cur_graph = GraphName::DefaultGraph; |
||||
self.stack.push(TriGState::TriGDoc); |
||||
match token { |
||||
N3Token::PlainKeyword(k) if k.eq_ignore_ascii_case("base") => { |
||||
self.stack.push(TriGState::BaseExpectIri); |
||||
self |
||||
} |
||||
N3Token::PlainKeyword(k) if k.eq_ignore_ascii_case("prefix") => { |
||||
self.stack.push(TriGState::PrefixExpectPrefix); |
||||
self |
||||
} |
||||
N3Token::LangTag("prefix") => { |
||||
self.stack.push(TriGState::ExpectDot); |
||||
self.stack.push(TriGState::PrefixExpectPrefix); |
||||
self |
||||
} |
||||
N3Token::LangTag("base") => { |
||||
self.stack.push(TriGState::ExpectDot); |
||||
self.stack.push(TriGState::BaseExpectIri); |
||||
self |
||||
} |
||||
N3Token::PlainKeyword(k) if k.eq_ignore_ascii_case("graph") && self.with_graph_name => { |
||||
self.stack.push(TriGState::WrappedGraph); |
||||
self.stack.push(TriGState::GraphName); |
||||
self |
||||
} |
||||
token @ N3Token::Punctuation("{") if self.with_graph_name => { |
||||
self.stack.push(TriGState::WrappedGraph); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
token => { |
||||
self.stack.push(TriGState::TriplesOrGraph); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
} |
||||
}, |
||||
TriGState::ExpectDot => { |
||||
self.cur_subject.pop(); |
||||
if token == N3Token::Punctuation(".") { |
||||
self |
||||
} else { |
||||
errors.push("A dot is expected at the end of statements".into()); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
}, |
||||
TriGState::BaseExpectIri => match token { |
||||
N3Token::IriRef(iri) => { |
||||
self.lexer_options.base_iri = Some(iri); |
||||
self |
||||
} |
||||
_ => self.error(errors, "The BASE keyword should be followed by an IRI"), |
||||
}, |
||||
TriGState::PrefixExpectPrefix => match token { |
||||
N3Token::PrefixedName { prefix, local, .. } if local.is_empty() => { |
||||
self.stack.push(TriGState::PrefixExpectIri { name: prefix.to_owned() }); |
||||
self |
||||
} |
||||
_ => { |
||||
self.error(errors, "The PREFIX keyword should be followed by a prefix like 'ex:'") |
||||
} |
||||
}, |
||||
TriGState::PrefixExpectIri { name } => match token { |
||||
N3Token::IriRef(iri) => { |
||||
self.prefixes.insert(name, iri); |
||||
self |
||||
} |
||||
_ => self.error(errors, "The PREFIX declaration should be followed by a prefix and its value as an IRI"), |
||||
}, |
||||
// [3g] triplesOrGraph ::= labelOrSubject ( wrappedGraph | predicateObjectList '.' ) | quotedTriple predicateObjectList '.'
|
||||
// [4g] triples2 ::= blankNodePropertyList predicateObjectList? '.' | collection predicateObjectList '.'
|
||||
TriGState::TriplesOrGraph => match token { |
||||
N3Token::IriRef(iri) => { |
||||
self.stack.push(TriGState::WrappedGraphOrPredicateObjectList { |
||||
term: NamedNode::new_unchecked(iri.into_inner()).into() |
||||
}); |
||||
self |
||||
} |
||||
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &self.prefixes) { |
||||
Ok(t) => { |
||||
self.stack.push(TriGState::WrappedGraphOrPredicateObjectList { |
||||
term: t.into() |
||||
}); |
||||
self |
||||
}, |
||||
Err(e) => self.error(errors, e) |
||||
} |
||||
N3Token::BlankNodeLabel(label) => { |
||||
self.stack.push(TriGState::WrappedGraphOrPredicateObjectList { |
||||
term: BlankNode::new_unchecked(label).into() |
||||
}); |
||||
self |
||||
} |
||||
N3Token::Punctuation("[") => { |
||||
self.stack.push(TriGState::WrappedGraphBlankNodePropertyListCurrent); |
||||
self |
||||
} |
||||
N3Token::Punctuation("(") => { |
||||
self.stack.push(TriGState::ExpectDot); |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
self.stack.push(TriGState::SubjectCollectionBeginning); |
||||
self |
||||
} |
||||
#[cfg(feature = "rdf-star")] |
||||
N3Token::Punctuation("<<") if self.with_quoted_triples => { |
||||
self.stack.push(TriGState::ExpectDot); |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
self.stack.push(TriGState::SubjectQuotedTripleEnd); |
||||
self.stack.push(TriGState::QuotedObject); |
||||
self.stack.push(TriGState::Verb); |
||||
self.stack.push(TriGState::QuotedSubject); |
||||
self |
||||
} |
||||
token => { |
||||
self.error(errors, format!("The token {token:?} is not a valid subject or graph name")) |
||||
} |
||||
} |
||||
TriGState::WrappedGraphOrPredicateObjectList { term } => { |
||||
if token == N3Token::Punctuation("{") && self.with_graph_name { |
||||
self.cur_graph = term.into(); |
||||
self.stack.push(TriGState::WrappedGraph); |
||||
} else { |
||||
self.cur_subject.push(term.into()); |
||||
self.stack.push(TriGState::ExpectDot); |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
} |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
TriGState::WrappedGraphBlankNodePropertyListCurrent => if token == N3Token::Punctuation("]") { |
||||
self.stack.push(TriGState::WrappedGraphOrPredicateObjectList { |
||||
term: BlankNode::default().into() |
||||
}); |
||||
self |
||||
} else { |
||||
self.cur_subject.push(BlankNode::default().into()); |
||||
self.stack.push(TriGState::ExpectDot); |
||||
self.stack.push(TriGState::SubjectBlankNodePropertyListEnd); |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
TriGState::SubjectBlankNodePropertyListEnd => if token == N3Token::Punctuation("]") { |
||||
self.stack.push(TriGState::SubjectBlankNodePropertyListAfter ); |
||||
self |
||||
} else { |
||||
errors.push("blank node property lists should end with a ']'".into()); |
||||
self.stack.push(TriGState::SubjectBlankNodePropertyListAfter ); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
TriGState::SubjectBlankNodePropertyListAfter => if matches!(token, N3Token::Punctuation("." | "}")) { |
||||
self.recognize_next(token, results, errors) |
||||
} else { |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
TriGState::SubjectCollectionBeginning => { |
||||
match token { |
||||
N3Token::Punctuation(")") => { |
||||
self.cur_subject.push(rdf::NIL.into()); |
||||
self |
||||
} |
||||
token => { |
||||
let root = BlankNode::default(); |
||||
self.cur_subject.push(root.clone().into()); |
||||
self.cur_subject.push(root.into()); |
||||
self.cur_predicate.push(rdf::FIRST.into()); |
||||
self.stack.push(TriGState::SubjectCollectionPossibleEnd); |
||||
self.stack.push(TriGState::Object); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
} |
||||
}, |
||||
TriGState::SubjectCollectionPossibleEnd => { |
||||
let old = self.cur_subject.pop().unwrap(); |
||||
self.cur_object.pop(); |
||||
match token { |
||||
N3Token::Punctuation(")") => { |
||||
self.cur_predicate.pop(); |
||||
results.push(Quad::new( |
||||
old, |
||||
rdf::REST, |
||||
rdf::NIL, |
||||
self.cur_graph.clone() |
||||
)); |
||||
self |
||||
} |
||||
token => { |
||||
let new = BlankNode::default(); |
||||
results.push(Quad::new( |
||||
old, |
||||
rdf::REST, |
||||
new.clone(), |
||||
self.cur_graph.clone() |
||||
)); |
||||
self.cur_subject.push(new.into()); |
||||
self.stack.push(TriGState::ObjectCollectionPossibleEnd); |
||||
self.stack.push(TriGState::Object); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
} |
||||
} |
||||
// [5g] wrappedGraph ::= '{' triplesBlock? '}'
|
||||
// [6g] triplesBlock ::= triples ('.' triplesBlock?)?
|
||||
TriGState::WrappedGraph => if token == N3Token::Punctuation("{") { |
||||
self.stack.push(TriGState::WrappedGraphPossibleEnd); |
||||
self.stack.push(TriGState::Triples); |
||||
self |
||||
} else { |
||||
self.error(errors, "The GRAPH keyword should be followed by a graph name and a value in '{'") |
||||
}, |
||||
TriGState::WrappedGraphPossibleEnd => { |
||||
self.cur_subject.pop(); |
||||
match token { |
||||
N3Token::Punctuation("}") => { |
||||
self |
||||
} |
||||
N3Token::Punctuation(".") => { |
||||
self.stack.push(TriGState::WrappedGraphPossibleEnd); |
||||
self.stack.push(TriGState::Triples); |
||||
self |
||||
} |
||||
token => { |
||||
errors.push("A '}' or a '.' is expected at the end of a graph block".into()); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
} |
||||
} |
||||
// [6] triples ::= subject predicateObjectList | blankNodePropertyList predicateObjectList?
|
||||
// [10] subject ::= iri | BlankNode | collection | quotedTriple
|
||||
TriGState::Triples => match token { |
||||
N3Token::Punctuation("}") => { |
||||
self.recognize_next(token, results, errors) // Early end
|
||||
}, |
||||
N3Token::Punctuation("[") => { |
||||
self.cur_subject.push(BlankNode::default().into()); |
||||
self.stack.push(TriGState::TriplesBlankNodePropertyListCurrent); |
||||
self |
||||
} |
||||
N3Token::IriRef(iri) => { |
||||
self.cur_subject.push(NamedNode::new_unchecked(iri.into_inner()).into()); |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
self |
||||
} |
||||
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &self.prefixes) { |
||||
Ok(t) => { |
||||
self.cur_subject.push(t.into()); |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
self |
||||
}, |
||||
Err(e) => self.error(errors, e) |
||||
} |
||||
N3Token::BlankNodeLabel(label) => { |
||||
self.cur_subject.push(BlankNode::new_unchecked(label).into()); |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
self |
||||
} |
||||
N3Token::Punctuation("(") => { |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
self.stack.push(TriGState::SubjectCollectionBeginning); |
||||
self |
||||
} |
||||
#[cfg(feature = "rdf-star")] |
||||
N3Token::Punctuation("<<") if self.with_quoted_triples => { |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
self.stack.push(TriGState::SubjectQuotedTripleEnd); |
||||
self.stack.push(TriGState::QuotedObject); |
||||
self.stack.push(TriGState::Verb); |
||||
self.stack.push(TriGState::QuotedSubject); |
||||
self |
||||
} |
||||
token => { |
||||
self.error(errors, format!("The token {token:?} is not a valid RDF subject")) |
||||
} |
||||
}, |
||||
TriGState::TriplesBlankNodePropertyListCurrent => if token == N3Token::Punctuation("]") { |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
self |
||||
} else { |
||||
self.stack.push(TriGState::SubjectBlankNodePropertyListEnd); |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
// [7g] labelOrSubject ::= iri | BlankNode
|
||||
TriGState::GraphName => match token { |
||||
N3Token::IriRef(iri) => { |
||||
self.cur_graph = NamedNode::new_unchecked(iri.into_inner()).into(); |
||||
self |
||||
} |
||||
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &self.prefixes) { |
||||
Ok(t) => { |
||||
self.cur_graph = t.into(); |
||||
self |
||||
}, |
||||
Err(e) => self.error(errors, e) |
||||
} |
||||
N3Token::BlankNodeLabel(label) => { |
||||
self.cur_graph = BlankNode::new_unchecked(label).into(); |
||||
self |
||||
} |
||||
N3Token::Punctuation("[") => { |
||||
self.stack.push(TriGState::GraphNameAnonEnd); |
||||
self |
||||
} |
||||
token => { |
||||
self.error(errors, format!("The token {token:?} is not a valid graph name")) |
||||
} |
||||
} |
||||
TriGState::GraphNameAnonEnd => if token == N3Token::Punctuation("]") { |
||||
self.cur_graph = BlankNode::default().into(); |
||||
self |
||||
} else { |
||||
self.error(errors, "Anonymous blank node with a property list are not allowed as graph name") |
||||
} |
||||
// [7] predicateObjectList ::= verb objectList (';' (verb objectList)?)*
|
||||
TriGState::PredicateObjectList => { |
||||
self.stack.push(TriGState::PredicateObjectListEnd); |
||||
self.stack.push(TriGState::ObjectsList); |
||||
self.stack.push(TriGState::Verb); |
||||
self.recognize_next(token, results, errors) |
||||
}, |
||||
TriGState::PredicateObjectListEnd => { |
||||
self.cur_predicate.pop(); |
||||
if token == N3Token::Punctuation(";") { |
||||
self.stack.push(TriGState::PredicateObjectListPossibleContinuation); |
||||
self |
||||
} else { |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
}, |
||||
TriGState::PredicateObjectListPossibleContinuation => if token == N3Token::Punctuation(";") { |
||||
self.stack.push(TriGState::PredicateObjectListPossibleContinuation); |
||||
self |
||||
} else if matches!(token, N3Token::Punctuation("." | "}" | "]")) { |
||||
self.recognize_next(token, results, errors) |
||||
} else { |
||||
self.stack.push(TriGState::PredicateObjectListEnd); |
||||
self.stack.push(TriGState::ObjectsList); |
||||
self.stack.push(TriGState::Verb); |
||||
self.recognize_next(token, results, errors) |
||||
}, |
||||
// [8] objectList ::= object annotation? ( ',' object annotation? )*
|
||||
// [30t] annotation ::= '{|' predicateObjectList '|}'
|
||||
TriGState::ObjectsList => { |
||||
self.stack.push(TriGState::ObjectsListEnd); |
||||
self.stack.push(TriGState::Object); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
TriGState::ObjectsListEnd => { |
||||
match token { |
||||
N3Token::Punctuation(",") => { |
||||
self.cur_object.pop(); |
||||
self.stack.push(TriGState::ObjectsListEnd); |
||||
self.stack.push(TriGState::Object); |
||||
self |
||||
}, |
||||
#[cfg(feature = "rdf-star")] |
||||
N3Token::Punctuation("{|") => { |
||||
let triple = Triple::new( |
||||
self.cur_subject.last().unwrap().clone(), |
||||
self.cur_predicate.last().unwrap().clone(), |
||||
self.cur_object.pop().unwrap() |
||||
); |
||||
self.cur_subject.push(triple.into()); |
||||
self.stack.push(TriGState::AnnotationEnd); |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
self |
||||
} |
||||
token => { |
||||
self.cur_object.pop(); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
} |
||||
}, |
||||
#[cfg(feature = "rdf-star")] |
||||
TriGState::AnnotationEnd => { |
||||
self.cur_subject.pop(); |
||||
self.stack.push(TriGState::ObjectsListAfterAnnotation); |
||||
if token == N3Token::Punctuation("|}") { |
||||
self |
||||
} else { |
||||
self.error(errors, "Annotations should end with '|}'") |
||||
} |
||||
}, |
||||
#[cfg(feature = "rdf-star")] |
||||
TriGState::ObjectsListAfterAnnotation => if token == N3Token::Punctuation(",") { |
||||
self.stack.push(TriGState::ObjectsListEnd); |
||||
self.stack.push(TriGState::Object); |
||||
self |
||||
} else { |
||||
self.recognize_next(token, results, errors) |
||||
}, |
||||
// [9] verb ::= predicate | 'a'
|
||||
// [11] predicate ::= iri
|
||||
TriGState::Verb => match token { |
||||
N3Token::PlainKeyword("a") => { |
||||
self.cur_predicate.push(rdf::TYPE.into()); |
||||
self |
||||
} |
||||
N3Token::IriRef(iri) => { |
||||
self.cur_predicate.push(NamedNode::new_unchecked(iri.into_inner())); |
||||
self |
||||
} |
||||
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &self.prefixes) { |
||||
Ok(t) => { |
||||
self.cur_predicate.push(t); |
||||
self |
||||
}, |
||||
Err(e) => self.error(errors, e) |
||||
} |
||||
token => { |
||||
self.error(errors, format!("The token {token:?} is not a valid predicate")) |
||||
} |
||||
} |
||||
// [12] object ::= iri | BlankNode | collection | blankNodePropertyList | literal | quotedTriple
|
||||
// [13] literal ::= RDFLiteral | NumericLiteral | BooleanLiteral
|
||||
// [14] blank ::= BlankNode | collection
|
||||
// [15] blankNodePropertyList ::= '[' predicateObjectList ']'
|
||||
// [16] collection ::= '(' object* ')'
|
||||
// [17] NumericLiteral ::= INTEGER | DECIMAL | DOUBLE
|
||||
// [128s] RDFLiteral ::= String (LANGTAG | '^^' iri)?
|
||||
// [133s] BooleanLiteral ::= 'true' | 'false'
|
||||
// [18] String ::= STRING_LITERAL_QUOTE | STRING_LITERAL_SINGLE_QUOTE | STRING_LITERAL_LONG_SINGLE_QUOTE | STRING_LITERAL_LONG_QUOTE
|
||||
// [135s] iri ::= IRIREF | PrefixedName
|
||||
// [136s] PrefixedName ::= PNAME_LN | PNAME_NS
|
||||
// [137s] BlankNode ::= BLANK_NODE_LABEL | ANON
|
||||
TriGState::Object => match token { |
||||
N3Token::IriRef(iri) => { |
||||
self.cur_object.push(NamedNode::new_unchecked(iri.into_inner()).into()); |
||||
self.emit_quad(results); |
||||
self |
||||
} |
||||
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &self.prefixes) { |
||||
Ok(t) => { |
||||
self.cur_object.push(t.into()); |
||||
self.emit_quad(results); |
||||
self |
||||
}, |
||||
Err(e) => self.error(errors, e) |
||||
} |
||||
N3Token::BlankNodeLabel(label) => { |
||||
self.cur_object.push(BlankNode::new_unchecked(label).into()); |
||||
self.emit_quad(results); |
||||
self |
||||
} |
||||
N3Token::Punctuation("[") => { |
||||
self.stack.push(TriGState::ObjectBlankNodePropertyListCurrent); |
||||
self |
||||
} |
||||
N3Token::Punctuation("(") => { |
||||
self.stack.push(TriGState::ObjectCollectionBeginning); |
||||
self |
||||
} |
||||
N3Token::String(value) => { |
||||
self.stack.push(TriGState::LiteralPossibleSuffix { value, emit: true }); |
||||
self |
||||
} |
||||
N3Token::Integer(v) => { |
||||
self.cur_object.push(Literal::new_typed_literal(v, xsd::INTEGER).into()); |
||||
self.emit_quad(results); |
||||
self |
||||
} |
||||
N3Token::Decimal(v) => { |
||||
self.cur_object.push(Literal::new_typed_literal(v, xsd::DECIMAL).into()); |
||||
self.emit_quad(results); |
||||
self |
||||
} |
||||
N3Token::Double(v) => { |
||||
self.cur_object.push(Literal::new_typed_literal(v, xsd::DOUBLE).into()); |
||||
self.emit_quad(results); |
||||
self |
||||
} |
||||
N3Token::PlainKeyword("true") => { |
||||
self.cur_object.push(Literal::new_typed_literal("true", xsd::BOOLEAN).into()); |
||||
self.emit_quad(results); |
||||
self |
||||
} |
||||
N3Token::PlainKeyword("false") => { |
||||
self.cur_object.push(Literal::new_typed_literal("false", xsd::BOOLEAN).into()); |
||||
self.emit_quad(results); |
||||
self |
||||
} |
||||
#[cfg(feature = "rdf-star")] |
||||
N3Token::Punctuation("<<") if self.with_quoted_triples => { |
||||
self.stack.push(TriGState::ObjectQuotedTripleEnd { emit: true }); |
||||
self.stack.push(TriGState::QuotedObject); |
||||
self.stack.push(TriGState::Verb); |
||||
self.stack.push(TriGState::QuotedSubject); |
||||
self |
||||
} |
||||
token => { |
||||
self.error(errors, format!("This is not a valid RDF object: {token:?}")) |
||||
} |
||||
|
||||
} |
||||
TriGState::ObjectBlankNodePropertyListCurrent => if token == N3Token::Punctuation("]") { |
||||
self.cur_object.push(BlankNode::default().into()); |
||||
self.emit_quad(results); |
||||
self |
||||
} else { |
||||
self.cur_subject.push(BlankNode::default().into()); |
||||
self.stack.push(TriGState::ObjectBlankNodePropertyListEnd); |
||||
self.stack.push(TriGState::PredicateObjectList); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
TriGState::ObjectBlankNodePropertyListEnd => if token == N3Token::Punctuation("]") { |
||||
self.cur_object.push(self.cur_subject.pop().unwrap().into()); |
||||
self.emit_quad(results); |
||||
self |
||||
} else { |
||||
self.error(errors, "blank node property lists should end with a ']'") |
||||
} |
||||
TriGState::ObjectCollectionBeginning => match token { |
||||
N3Token::Punctuation(")") => { |
||||
self.cur_object.push(rdf::NIL.into()); |
||||
self.emit_quad(results); |
||||
self |
||||
} |
||||
token => { |
||||
let root = BlankNode::default(); |
||||
self.cur_object.push(root.clone().into()); |
||||
self.emit_quad(results); |
||||
self.cur_subject.push(root.into()); |
||||
self.cur_predicate.push(rdf::FIRST.into()); |
||||
self.stack.push(TriGState::ObjectCollectionPossibleEnd); |
||||
self.stack.push(TriGState::Object); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
}, |
||||
TriGState::ObjectCollectionPossibleEnd => { |
||||
let old = self.cur_subject.pop().unwrap(); |
||||
self.cur_object.pop(); |
||||
match token { |
||||
N3Token::Punctuation(")") => { |
||||
self.cur_predicate.pop(); |
||||
results.push(Quad::new(old, |
||||
rdf::REST, |
||||
rdf::NIL, |
||||
self.cur_graph.clone() |
||||
)); |
||||
self |
||||
} |
||||
token => { |
||||
let new = BlankNode::default(); |
||||
results.push(Quad::new(old, |
||||
rdf::REST, |
||||
new.clone(), |
||||
self.cur_graph.clone() |
||||
)); |
||||
self.cur_subject.push(new.into()); |
||||
self.stack.push(TriGState::ObjectCollectionPossibleEnd); |
||||
self.stack.push(TriGState::Object); |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
} |
||||
} |
||||
TriGState::LiteralPossibleSuffix { value, emit } => { |
||||
match token { |
||||
N3Token::LangTag(lang) => { |
||||
self.cur_object.push(Literal::new_language_tagged_literal_unchecked(value, lang.to_ascii_lowercase()).into()); |
||||
if emit { |
||||
self.emit_quad(results); |
||||
} |
||||
self |
||||
}, |
||||
N3Token::Punctuation("^^") => { |
||||
self.stack.push(TriGState::LiteralExpectDatatype { value, emit }); |
||||
self |
||||
} |
||||
token => { |
||||
self.cur_object.push(Literal::new_simple_literal(value).into()); |
||||
if emit { |
||||
self.emit_quad(results); |
||||
} |
||||
self.recognize_next(token, results, errors) |
||||
} |
||||
} |
||||
} |
||||
TriGState::LiteralExpectDatatype { value, emit } => { |
||||
match token { |
||||
N3Token::IriRef(datatype) => { |
||||
self.cur_object.push(Literal::new_typed_literal(value, NamedNode::new_unchecked(datatype.into_inner())).into()); |
||||
if emit { |
||||
self.emit_quad(results); |
||||
} |
||||
self |
||||
}, |
||||
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &self.prefixes) { |
||||
Ok(t) => { |
||||
self.cur_object.push(Literal::new_typed_literal(value, t).into()); |
||||
if emit { |
||||
self.emit_quad(results); |
||||
} |
||||
self |
||||
}, |
||||
Err(e) => self.error(errors, e) |
||||
} |
||||
token => { |
||||
self.error(errors, format!("Expecting a datatype IRI after '^^, found {token:?}")).recognize_next(token, results, errors) |
||||
} |
||||
} |
||||
} |
||||
// [27t] quotedTriple ::= '<<' qtSubject verb qtObject '>>'
|
||||
#[cfg(feature = "rdf-star")] |
||||
TriGState::SubjectQuotedTripleEnd => { |
||||
let triple = Triple::new( |
||||
self.cur_subject.pop().unwrap(), |
||||
self.cur_predicate.pop().unwrap(), |
||||
self.cur_object.pop().unwrap() |
||||
); |
||||
self.cur_subject.push(triple.into()); |
||||
if token == N3Token::Punctuation(">>") { |
||||
self |
||||
} else { |
||||
self.error(errors, format!("Expecting '>>' to close a quoted triple, found {token:?}")) |
||||
} |
||||
} |
||||
#[cfg(feature = "rdf-star")] |
||||
TriGState::ObjectQuotedTripleEnd { emit } => { |
||||
let triple = Triple::new( |
||||
self.cur_subject.pop().unwrap(), |
||||
self.cur_predicate.pop().unwrap(), |
||||
self.cur_object.pop().unwrap() |
||||
); |
||||
self.cur_object.push(triple.into()); |
||||
if emit { |
||||
self.emit_quad(results); |
||||
} |
||||
if token == N3Token::Punctuation(">>") { |
||||
self |
||||
} else { |
||||
self.error(errors, format!("Expecting '>>' to close a quoted triple, found {token:?}")) |
||||
} |
||||
} |
||||
// [28t] qtSubject ::= iri | BlankNode | quotedTriple
|
||||
#[cfg(feature = "rdf-star")] |
||||
TriGState::QuotedSubject => match token { |
||||
N3Token::Punctuation("[") => { |
||||
self.cur_subject.push(BlankNode::default().into()); |
||||
self.stack.push(TriGState::QuotedAnonEnd); |
||||
self |
||||
} |
||||
N3Token::IriRef(iri) => { |
||||
self.cur_subject.push(NamedNode::new_unchecked(iri.into_inner()).into()); |
||||
self |
||||
} |
||||
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &self.prefixes) { |
||||
Ok(t) => { |
||||
self.cur_subject.push(t.into()); |
||||
self |
||||
}, |
||||
Err(e) => self.error(errors, e) |
||||
} |
||||
N3Token::BlankNodeLabel(label) => { |
||||
self.cur_subject.push(BlankNode::new_unchecked(label).into()); |
||||
self |
||||
} |
||||
N3Token::Punctuation("<<") => { |
||||
self.stack.push(TriGState::SubjectQuotedTripleEnd); |
||||
self.stack.push(TriGState::QuotedObject); |
||||
self.stack.push(TriGState::Verb); |
||||
self.stack.push(TriGState::QuotedSubject); |
||||
self |
||||
} |
||||
token => self.error(errors, format!("This is not a valid RDF quoted triple subject: {token:?}")) |
||||
} |
||||
// [29t] qtObject ::= iri | BlankNode | literal | quotedTriple
|
||||
#[cfg(feature = "rdf-star")] |
||||
TriGState::QuotedObject => match token { |
||||
N3Token::Punctuation("[") => { |
||||
self.cur_object.push(BlankNode::default().into()); |
||||
self.stack.push(TriGState::QuotedAnonEnd); |
||||
self |
||||
} |
||||
N3Token::IriRef(iri) => { |
||||
self.cur_object.push(NamedNode::new_unchecked(iri.into_inner()).into()); |
||||
self |
||||
} |
||||
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &self.prefixes) { |
||||
Ok(t) => { |
||||
self.cur_object.push(t.into()); |
||||
self |
||||
}, |
||||
Err(e) => self.error(errors, e) |
||||
} |
||||
N3Token::BlankNodeLabel(label) => { |
||||
self.cur_object.push(BlankNode::new_unchecked(label).into()); |
||||
self |
||||
} |
||||
N3Token::String(value) => { |
||||
self.stack.push(TriGState::LiteralPossibleSuffix { value, emit: false }); |
||||
self |
||||
} |
||||
N3Token::Integer(v) => { |
||||
self.cur_object.push(Literal::new_typed_literal(v, xsd::INTEGER).into()); |
||||
self |
||||
} |
||||
N3Token::Decimal(v) => { |
||||
self.cur_object.push(Literal::new_typed_literal(v, xsd::DECIMAL).into()); |
||||
self |
||||
} |
||||
N3Token::Double(v) => { |
||||
self.cur_object.push(Literal::new_typed_literal(v, xsd::DOUBLE).into()); |
||||
self |
||||
} |
||||
N3Token::PlainKeyword("true") => { |
||||
self.cur_object.push(Literal::new_typed_literal("true", xsd::BOOLEAN).into()); |
||||
self |
||||
} |
||||
N3Token::PlainKeyword("false") => { |
||||
self.cur_object.push(Literal::new_typed_literal("false", xsd::BOOLEAN).into()); |
||||
self |
||||
} |
||||
N3Token::Punctuation("<<") => { |
||||
self.stack.push(TriGState::ObjectQuotedTripleEnd { emit: false }); |
||||
self.stack.push(TriGState::QuotedObject); |
||||
self.stack.push(TriGState::Verb); |
||||
self.stack.push(TriGState::QuotedSubject); |
||||
self |
||||
} |
||||
token => self.error(errors, format!("This is not a valid RDF quoted triple object: {token:?}")) |
||||
} |
||||
#[cfg(feature = "rdf-star")] |
||||
TriGState::QuotedAnonEnd => if token == N3Token::Punctuation("]") { |
||||
self |
||||
} else { |
||||
self.error(errors, "Anonymous blank node with a property list are not allowed in quoted triples") |
||||
} |
||||
} |
||||
} else if token == N3Token::Punctuation(".") || token == N3Token::Punctuation("}") { |
||||
//TODO: be smarter depending if we are in '{' or not
|
||||
self.stack.push(TriGState::TriGDoc); |
||||
self |
||||
} else { |
||||
self |
||||
} |
||||
} |
||||
|
||||
// Called once the input is exhausted: flushes pending state and reports
// statements that were not properly terminated.
fn recognize_end(
    mut self,
    results: &mut Vec<Self::Output>,
    errors: &mut Vec<RuleRecognizerError>,
) {
    match &*self.stack {
        // Clean end of document: nothing should be left on the term stacks.
        [] | [TriGState::TriGDoc] => {
            debug_assert!(self.cur_subject.is_empty());
            debug_assert!(self.cur_predicate.is_empty());
            debug_assert!(self.cur_object.is_empty());
        }
        // A string literal was pending at EOF (no language tag or datatype
        // will follow): emit it as a simple literal, then report the missing
        // terminating dot.
        [.., TriGState::LiteralPossibleSuffix { value, emit: true }] => {
            self.cur_object
                .push(Literal::new_simple_literal(value).into());
            self.emit_quad(results);
            errors.push("Triples should be followed by a dot".into())
        }
        // Any other leftover state means the document was cut short.
        _ => errors.push("Unexpected end".into()), //TODO
    }
}
||||
|
||||
// Options handed to the lexer on every call (holds the base IRI, if any).
fn lexer_options(&self) -> &N3LexerOptions {
    &self.lexer_options
}
||||
} |
||||
|
||||
impl TriGRecognizer {
    /// Builds a [`Parser`] recognizing TriG (or Turtle) documents.
    ///
    /// `with_graph_name` enables named graphs, `with_quoted_triples` enables
    /// RDF-star quoted triples (`<< ... >>`), `base_iri` is used to resolve
    /// relative IRIs and `prefixes` are the prefixes known before parsing starts.
    pub fn new_parser(
        with_graph_name: bool,
        #[cfg(feature = "rdf-star")] with_quoted_triples: bool,
        base_iri: Option<Iri<String>>,
        prefixes: HashMap<String, Iri<String>>,
    ) -> Parser<Self> {
        Parser::new(
            Lexer::new(
                N3Lexer::new(N3LexerMode::Turtle),
                MIN_BUFFER_SIZE,
                MAX_BUFFER_SIZE,
                true, // line jumps are plain whitespace in TriG/Turtle
                Some(b"#"), // '#' opens a line comment
            ),
            TriGRecognizer {
                stack: vec![TriGState::TriGDoc],
                with_graph_name,
                #[cfg(feature = "rdf-star")]
                with_quoted_triples,
                lexer_options: N3LexerOptions { base_iri },
                prefixes,
                cur_subject: Vec::new(),
                cur_predicate: Vec::new(),
                cur_object: Vec::new(),
                cur_graph: GraphName::DefaultGraph,
            },
        )
    }

    /// Records `msg` in `errors` and resets the recognizer to a clean state
    /// (empty stacks, default graph) so parsing can resume.
    #[must_use]
    fn error(
        mut self,
        errors: &mut Vec<RuleRecognizerError>,
        msg: impl Into<RuleRecognizerError>,
    ) -> Self {
        errors.push(msg.into());
        self.stack.clear();
        self.cur_subject.clear();
        self.cur_predicate.clear();
        self.cur_object.clear();
        self.cur_graph = GraphName::DefaultGraph;
        self
    }

    /// Pushes the quad formed by the current (top-of-stack) subject, predicate
    /// and object in the current graph. The stacks are left untouched.
    fn emit_quad(&mut self, results: &mut Vec<Quad>) {
        results.push(Quad::new(
            self.cur_subject.last().unwrap().clone(),
            self.cur_predicate.last().unwrap().clone(),
            self.cur_object.last().unwrap().clone(),
            self.cur_graph.clone(),
        ));
    }
}
||||
|
||||
/// States of the TriG/Turtle grammar recognizer stack.
///
/// `recognize_next` pops one state per token; a state may push several
/// follow-up states before returning. Variant names follow the TriG grammar
/// productions (quoted-triple variants only exist with the `rdf-star` feature).
#[derive(Debug)]
enum TriGState {
    TriGDoc,
    ExpectDot,
    BaseExpectIri,
    PrefixExpectPrefix,
    PrefixExpectIri {
        // Prefix label waiting for its IRI.
        name: String,
    },
    TriplesOrGraph,
    WrappedGraphBlankNodePropertyListCurrent,
    SubjectBlankNodePropertyListEnd,
    SubjectBlankNodePropertyListAfter,
    SubjectCollectionBeginning,
    SubjectCollectionPossibleEnd,
    WrappedGraphOrPredicateObjectList {
        // Term that is either a graph name or a triple subject.
        term: NamedOrBlankNode,
    },
    WrappedGraph,
    WrappedGraphPossibleEnd,
    GraphName,
    GraphNameAnonEnd,
    Triples,
    TriplesBlankNodePropertyListCurrent,
    PredicateObjectList,
    PredicateObjectListEnd,
    PredicateObjectListPossibleContinuation,
    ObjectsList,
    ObjectsListEnd,
    #[cfg(feature = "rdf-star")]
    AnnotationEnd,
    #[cfg(feature = "rdf-star")]
    ObjectsListAfterAnnotation,
    Verb,
    Object,
    ObjectBlankNodePropertyListCurrent,
    ObjectBlankNodePropertyListEnd,
    ObjectCollectionBeginning,
    ObjectCollectionPossibleEnd,
    // A string literal was read; a language tag or '^^' datatype may follow.
    LiteralPossibleSuffix {
        value: String,
        // Whether the completed literal should immediately emit a quad.
        emit: bool,
    },
    // '^^' was read; the datatype IRI is expected next.
    LiteralExpectDatatype {
        value: String,
        emit: bool,
    },
    #[cfg(feature = "rdf-star")]
    SubjectQuotedTripleEnd,
    #[cfg(feature = "rdf-star")]
    ObjectQuotedTripleEnd {
        emit: bool,
    },
    #[cfg(feature = "rdf-star")]
    QuotedSubject,
    #[cfg(feature = "rdf-star")]
    QuotedObject,
    #[cfg(feature = "rdf-star")]
    QuotedAnonEnd,
}
@ -0,0 +1,280 @@ |
||||
use memchr::memchr2; |
||||
use std::error::Error; |
||||
use std::fmt; |
||||
use std::io::{self, Read}; |
||||
use std::ops::{Range, RangeInclusive}; |
||||
|
||||
/// A tokenizer able to recognize the next token at the beginning of a byte slice.
pub trait TokenRecognizer {
    /// The token type, possibly borrowing from the input slice.
    type Token<'a>
    where
        Self: 'a;
    /// Tokenizer configuration (e.g. a base IRI for N3 lexing).
    type Options: Default;

    /// Tries to recognize the next token at the beginning of `data`.
    ///
    /// Returns `None` when more input is needed, or `Some((consumed, result))`
    /// where `consumed` is the number of bytes of `data` the token spans.
    /// `is_ending` is true when no more input will ever be added.
    fn recognize_next_token<'a>(
        &mut self,
        data: &'a [u8],
        is_ending: bool,
        config: &Self::Options,
    ) -> Option<(usize, Result<Self::Token<'a>, TokenRecognizerError>)>;
}
||||
|
||||
/// An error emitted by a [`TokenRecognizer`]: a message together with the
/// byte range of the offending input.
pub struct TokenRecognizerError {
    pub position: Range<usize>,
    pub message: String,
}

impl<S: Into<String>> From<(Range<usize>, S)> for TokenRecognizerError {
    fn from((position, message): (Range<usize>, S)) -> Self {
        let message = message.into();
        Self { position, message }
    }
}

#[allow(clippy::range_plus_one)]
impl<S: Into<String>> From<(RangeInclusive<usize>, S)> for TokenRecognizerError {
    fn from((position, message): (RangeInclusive<usize>, S)) -> Self {
        // Normalize the inclusive range to an exclusive one.
        Self {
            position: *position.start()..*position.end() + 1,
            message: message.into(),
        }
    }
}

impl<S: Into<String>> From<(usize, S)> for TokenRecognizerError {
    fn from((position, message): (usize, S)) -> Self {
        // A single byte offset becomes a one-byte range.
        Self {
            position: position..position + 1,
            message: message.into(),
        }
    }
}
||||
|
||||
/// A token together with the byte range it was read from.
pub struct TokenWithPosition<T> {
    pub token: T,
    pub position: Range<usize>,
}
||||
|
||||
/// A buffering lexer: accumulates input bytes and yields the tokens
/// recognized by `R`, keeping track of absolute byte positions.
pub struct Lexer<R: TokenRecognizer> {
    parser: R,
    // Input buffer; the not-yet-tokenized bytes are data[start..end].
    data: Vec<u8>,
    start: usize,
    end: usize,
    // True once the caller declared that no more input will be added.
    is_ending: bool,
    // Absolute byte offset of data[start] from the beginning of the whole input.
    position: usize,
    min_buffer_size: usize,
    max_buffer_size: usize,
    // If false, '\r' and '\n' are not skipped as whitespace.
    is_line_jump_whitespace: bool,
    // Byte sequence opening a line comment (e.g. b"#"), if comments exist.
    line_comment_start: Option<&'static [u8]>,
}
||||
|
||||
impl<R: TokenRecognizer> Lexer<R> {
    /// Builds a lexer around `parser`.
    ///
    /// `min_buffer_size` is the minimal free space requested per read and
    /// `max_buffer_size` the hard cap on the internal buffer.
    /// `is_line_jump_whitespace` tells whether '\r'/'\n' are skipped like
    /// spaces, and `line_comment_start` is the byte sequence opening a line
    /// comment (e.g. b"#"), if any.
    pub fn new(
        parser: R,
        min_buffer_size: usize,
        max_buffer_size: usize,
        is_line_jump_whitespace: bool,
        line_comment_start: Option<&'static [u8]>,
    ) -> Self {
        Self {
            parser,
            data: Vec::new(),
            start: 0,
            end: 0,
            is_ending: false,
            position: 0,
            min_buffer_size,
            max_buffer_size,
            is_line_jump_whitespace,
            line_comment_start,
        }
    }

    /// Appends input bytes to the buffer.
    pub fn extend_from_slice(&mut self, other: &[u8]) {
        self.shrink_if_useful();
        // Drop any zero padding left by extend_from_read before appending.
        self.data.truncate(self.end);
        self.data.extend_from_slice(other);
        self.end = self.data.len();
    }

    /// Declares that no more input will ever be added.
    pub fn end(&mut self) {
        self.is_ending = true;
    }

    /// Fills the buffer with a single `read` call.
    ///
    /// A 0-byte read is interpreted as end of input. Fails with
    /// `ErrorKind::OutOfMemory` when growing would exceed `max_buffer_size`.
    pub fn extend_from_read(&mut self, read: &mut impl Read) -> io::Result<()> {
        self.shrink_if_useful();
        let min_end = self.end + self.min_buffer_size;
        if min_end > self.max_buffer_size {
            return Err(io::Error::new(
                io::ErrorKind::OutOfMemory,
                format!(
                    "The buffer maximal size is {} < {min_end}",
                    self.max_buffer_size
                ),
            ));
        }
        if self.data.len() < min_end {
            self.data.resize(min_end, 0);
        }
        if self.data.len() < self.data.capacity() {
            // We keep extending to have as much space as available without reallocation
            self.data.resize(self.data.capacity(), 0);
        }
        let read = read.read(&mut self.data[self.end..])?;
        self.end += read;
        self.is_ending = read == 0;
        Ok(())
    }

    /// Returns the next token (with its absolute byte range), a lexing error,
    /// or `None` when more input is needed or everything has been consumed.
    pub fn read_next(
        &mut self,
        options: &R::Options,
    ) -> Option<Result<TokenWithPosition<R::Token<'_>>, LexerError>> {
        self.skip_whitespaces_and_comments()?;
        let (consumed, result) = if let Some(r) = self.parser.recognize_next_token(
            &self.data[self.start..self.end],
            self.is_ending,
            options,
        ) {
            r
        } else {
            return if self.is_ending {
                if self.start == self.end {
                    None // We have finished
                } else {
                    // Leftover bytes that can never form a complete token.
                    let error = LexerError {
                        position: self.position..self.position + (self.end - self.start),
                        message: "Unexpected end of file".into(),
                    };
                    self.end = self.start; // We consume everything
                    Some(Err(error))
                }
            } else {
                None
            };
        };
        debug_assert!(
            consumed > 0,
            "The lexer must consume at least one byte each time"
        );
        debug_assert!(
            self.start + consumed <= self.end,
            "The lexer tried to consumed {consumed} bytes but only {} bytes are readable",
            self.end - self.start
        );
        let old_position = self.position;
        self.start += consumed;
        self.position += consumed;
        Some(match result {
            Ok(token) => Ok(TokenWithPosition {
                token,
                position: old_position..self.position,
            }),
            // NOTE(review): the recognizer's error offsets are shifted by the
            // already-advanced self.position (i.e. anchored at the token end,
            // not at old_position) — confirm this is the intended anchor.
            Err(e) => Err(LexerError {
                position: e.position.start + self.position..e.position.end + self.position,
                message: e.message,
            }),
        })
    }

    /// True when the input is declared finished and fully consumed.
    pub fn is_end(&self) -> bool {
        self.is_ending && self.end == self.start
    }

    /// Advances past leading whitespace and line comments.
    ///
    /// Returns `None` when an unterminated comment needs more input.
    fn skip_whitespaces_and_comments(&mut self) -> Option<()> {
        loop {
            self.skip_whitespaces();

            let buf = &self.data[self.start..self.end];
            if let Some(line_comment_start) = self.line_comment_start {
                if buf.starts_with(line_comment_start) {
                    // Comment
                    if let Some(end) = memchr2(b'\r', b'\n', &buf[line_comment_start.len()..]) {
                        // Skip up to the line jump; the next loop turn skips
                        // the line jump itself as whitespace.
                        self.start += end + line_comment_start.len();
                        self.position += end + line_comment_start.len();
                        continue;
                    }
                    if self.is_ending {
                        self.end = self.start; // EOF
                        return Some(());
                    }
                    return None; // We need more data
                }
            }
            return Some(());
        }
    }

    /// Advances `start`/`position` past leading whitespace bytes.
    fn skip_whitespaces(&mut self) {
        if self.is_line_jump_whitespace {
            for (i, c) in self.data[self.start..self.end].iter().enumerate() {
                if !matches!(c, b' ' | b'\t' | b'\r' | b'\n') {
                    self.start += i;
                    self.position += i;
                    return;
                }
                //TODO: SIMD
            }
        } else {
            for (i, c) in self.data[self.start..self.end].iter().enumerate() {
                if !matches!(c, b' ' | b'\t') {
                    self.start += i;
                    self.position += i;
                    return;
                }
                //TODO: SIMD
            }
        }
        // We only have whitespaces
        self.position += self.end - self.start;
        self.end = self.start;
    }

    /// Moves the unread bytes to the front of the buffer once more than half
    /// of it has been consumed, freeing space for new input.
    fn shrink_if_useful(&mut self) {
        if self.start * 2 > self.data.len() {
            // We have read more than half of the buffer, let's move the data to the beginning
            self.data.copy_within(self.start..self.end, 0);
            self.end -= self.start;
            self.start = 0;
        }
    }
}
||||
|
||||
/// An error emitted by the lexer: a message plus the byte range it refers to.
#[derive(Debug)]
pub struct LexerError {
    position: Range<usize>,
    message: String,
}

impl LexerError {
    /// The byte range of the invalid input.
    pub fn position(&self) -> Range<usize> {
        self.position.clone()
    }

    /// The error message.
    pub fn message(&self) -> &str {
        &self.message
    }

    /// Consumes the error and returns its message.
    pub fn into_message(self) -> String {
        self.message
    }
}

impl fmt::Display for LexerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // A one-byte range is reported as a single offset.
        if self.position.start + 1 == self.position.end {
            write!(
                f,
                "Lexer error at byte {}: {}",
                self.position.start, self.message
            )
        } else {
            write!(
                f,
                "Lexer error between bytes {} and {}: {}",
                self.position.start, self.position.end, self.message
            )
        }
    }
}

// `Error::description` is deprecated (superseded by `Display`): rely on the
// trait's default implementations instead of overriding it.
impl Error for LexerError {}
@ -0,0 +1,11 @@ |
||||
//! oxttl parsing toolkit.
|
||||
//!
|
||||
//! Provides the basic code to write plain Rust lexers and parsers able to read files chunk by chunk.
|
||||
|
||||
mod lexer; |
||||
mod parser; |
||||
|
||||
pub use self::lexer::{Lexer, LexerError, TokenRecognizer, TokenRecognizerError}; |
||||
pub use self::parser::{ |
||||
FromReadIterator, ParseError, ParseOrIoError, Parser, RuleRecognizer, RuleRecognizerError, |
||||
}; |
@ -0,0 +1,244 @@ |
||||
use crate::toolkit::lexer::TokenWithPosition; |
||||
use crate::toolkit::{Lexer, LexerError, TokenRecognizer}; |
||||
use std::error::Error; |
||||
use std::io::Read; |
||||
use std::ops::Range; |
||||
use std::{fmt, io}; |
||||
|
||||
/// A grammar recognizer consuming the tokens of its `TokenRecognizer`.
pub trait RuleRecognizer: Sized {
    /// The tokenizer whose tokens this recognizer consumes.
    type TokenRecognizer: TokenRecognizer;
    /// The values produced (e.g. triples or quads).
    type Output;

    /// Returns the state to restart from after a lexing error.
    fn error_recovery_state(self) -> Self;

    /// Consumes one token, pushing produced values into `results`
    /// and problems into `errors`, and returns the next state.
    fn recognize_next(
        self,
        token: <Self::TokenRecognizer as TokenRecognizer>::Token<'_>,
        results: &mut Vec<Self::Output>,
        errors: &mut Vec<RuleRecognizerError>,
    ) -> Self;

    /// Called when the input is finished, to flush pending results and errors.
    fn recognize_end(self, results: &mut Vec<Self::Output>, errors: &mut Vec<RuleRecognizerError>);

    /// The options to pass to the lexer (may depend on the current state).
    fn lexer_options(&self) -> &<Self::TokenRecognizer as TokenRecognizer>::Options;
}
||||
|
||||
/// An error emitted by a [`RuleRecognizer`].
pub struct RuleRecognizerError {
    pub message: String,
}

impl<S: Into<String>> From<S> for RuleRecognizerError {
    fn from(message: S) -> Self {
        let message = message.into();
        Self { message }
    }
}
||||
|
||||
/// Drives a [`RuleRecognizer`] over the tokens produced by a [`Lexer`].
pub struct Parser<RR: RuleRecognizer> {
    lexer: Lexer<RR::TokenRecognizer>,
    // None once recognize_end has consumed the recognizer.
    state: Option<RR>,
    // Results and errors not yet handed to the caller (popped by read_next).
    results: Vec<RR::Output>,
    errors: Vec<RuleRecognizerError>,
    // Byte range of the last token, used to position rule errors.
    position: Range<usize>,
    // Fallback lexer options used once the recognizer is consumed.
    default_lexer_options: <RR::TokenRecognizer as TokenRecognizer>::Options,
}
||||
|
||||
impl<RR: RuleRecognizer> Parser<RR> {
    /// Builds a parser driving `recognizer` with the tokens of `lexer`.
    pub fn new(lexer: Lexer<RR::TokenRecognizer>, recognizer: RR) -> Self {
        Self {
            lexer,
            state: Some(recognizer),
            results: vec![],
            errors: vec![],
            position: 0..0,
            default_lexer_options: <RR::TokenRecognizer as TokenRecognizer>::Options::default(),
        }
    }

    /// Feeds more input bytes to the underlying lexer.
    pub fn extend_from_slice(&mut self, other: &[u8]) {
        self.lexer.extend_from_slice(other)
    }

    /// Declares that no more input will be added.
    pub fn end(&mut self) {
        self.lexer.end()
    }

    /// True once everything has been parsed and all results/errors drained.
    pub fn is_end(&self) -> bool {
        self.state.is_none() && self.results.is_empty() && self.errors.is_empty()
    }

    /// Returns the next result or error, or `None` when more input is needed
    /// (or everything has already been read).
    pub fn read_next(&mut self) -> Option<Result<RR::Output, ParseError>> {
        loop {
            // Pending errors are reported before pending results.
            if let Some(error) = self.errors.pop() {
                return Some(Err(ParseError {
                    position: self.position.clone(),
                    message: error.message,
                }));
            }
            if let Some(result) = self.results.pop() {
                return Some(Ok(result));
            }
            if let Some(result) = self.lexer.read_next(
                self.state
                    .as_ref()
                    .map_or(&self.default_lexer_options, |p| p.lexer_options()),
            ) {
                match result {
                    Ok(TokenWithPosition { token, position }) => {
                        // Remember the token range so rule errors can point at it.
                        self.position = position;
                        self.state = self.state.take().map(|state| {
                            state.recognize_next(token, &mut self.results, &mut self.errors)
                        });
                        continue;
                    }
                    Err(e) => {
                        // Lexing failed: switch the recognizer to its recovery
                        // state so parsing can continue after the bad token.
                        self.state = self.state.take().map(RR::error_recovery_state);
                        return Some(Err(e.into()));
                    }
                }
            }
            if self.lexer.is_end() {
                if let Some(state) = self.state.take() {
                    // Input exhausted: let the recognizer flush its final state.
                    state.recognize_end(&mut self.results, &mut self.errors)
                } else {
                    return None;
                }
            } else {
                return None;
            }
        }
    }

    /// Wraps this parser into an iterator pulling its input from `read`.
    pub fn parse_from_read<R: Read>(self, read: R) -> FromReadIterator<R, RR> {
        FromReadIterator { read, parser: self }
    }
}
||||
|
||||
/// An error from parsing.
|
||||
///
|
||||
/// It is composed of a message and a byte range in the input.
|
||||
/// An error from parsing.
///
/// It is composed of a message and a byte range in the input.
#[derive(Debug)]
pub struct ParseError {
    position: Range<usize>,
    message: String,
}

impl ParseError {
    /// The invalid byte range in the input.
    pub fn position(&self) -> Range<usize> {
        self.position.clone()
    }

    /// The error message.
    pub fn message(&self) -> &str {
        self.message.as_str()
    }

    /// Converts this error to an error message.
    pub fn into_message(self) -> String {
        self.message
    }
}

impl fmt::Display for ParseError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let (start, end) = (self.position.start, self.position.end);
        // A one-byte range is reported as a single offset.
        if end == start + 1 {
            write!(f, "Parser error at byte {}: {}", start, self.message)
        } else {
            write!(
                f,
                "Parser error between bytes {} and {}: {}",
                start, end, self.message
            )
        }
    }
}

impl Error for ParseError {}

impl From<ParseError> for io::Error {
    fn from(error: ParseError) -> Self {
        // Parse errors surface as invalid data when mapped to I/O errors.
        Self::new(io::ErrorKind::InvalidData, error)
    }
}
||||
|
||||
impl From<LexerError> for ParseError {
    fn from(e: LexerError) -> Self {
        // A lexer error keeps its byte range and message unchanged.
        Self {
            position: e.position(),
            message: e.into_message(),
        }
    }
}
||||
|
||||
/// The union of [`ParseError`] and [`std::io::Error`].
|
||||
#[derive(Debug)] |
||||
pub enum ParseOrIoError { |
||||
Parse(ParseError), |
||||
Io(io::Error), |
||||
} |
||||
|
||||
impl fmt::Display for ParseOrIoError { |
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
||||
match self { |
||||
Self::Parse(e) => e.fmt(f), |
||||
Self::Io(e) => e.fmt(f), |
||||
} |
||||
} |
||||
} |
||||
|
||||
impl Error for ParseOrIoError { |
||||
fn source(&self) -> Option<&(dyn Error + 'static)> { |
||||
Some(match self { |
||||
Self::Parse(e) => e, |
||||
Self::Io(e) => e, |
||||
}) |
||||
} |
||||
} |
||||
|
||||
impl From<ParseError> for ParseOrIoError { |
||||
fn from(error: ParseError) -> Self { |
||||
Self::Parse(error) |
||||
} |
||||
} |
||||
|
||||
impl From<io::Error> for ParseOrIoError { |
||||
fn from(error: io::Error) -> Self { |
||||
Self::Io(error) |
||||
} |
||||
} |
||||
|
||||
impl From<ParseOrIoError> for io::Error { |
||||
fn from(error: ParseOrIoError) -> Self { |
||||
match error { |
||||
ParseOrIoError::Parse(e) => e.into(), |
||||
ParseOrIoError::Io(e) => e, |
||||
} |
||||
} |
||||
} |
||||
|
||||
/// Iterator adapter: pulls bytes from a [`Read`] and yields parser outputs.
pub struct FromReadIterator<R: Read, RR: RuleRecognizer> {
    read: R,
    parser: Parser<RR>,
}

impl<R: Read, RR: RuleRecognizer> Iterator for FromReadIterator<R, RR> {
    type Item = Result<RR::Output, ParseOrIoError>;

    fn next(&mut self) -> Option<Self::Item> {
        while !self.parser.is_end() {
            // Drain what is already parsed before reading more bytes.
            if let Some(result) = self.parser.read_next() {
                return Some(result.map_err(ParseOrIoError::Parse));
            }
            // The parser is starved: fetch more input
            // (a 0-byte read marks the end of the input).
            if let Err(e) = self.parser.lexer.extend_from_read(&mut self.read) {
                return Some(Err(e.into()));
            }
        }
        None
    }
}
@ -0,0 +1,666 @@ |
||||
//! A [TriG](https://www.w3.org/TR/trig/) streaming parser implemented by [`TriGParser`].
|
||||
|
||||
use crate::terse::TriGRecognizer; |
||||
use crate::toolkit::{FromReadIterator, ParseError, ParseOrIoError, Parser}; |
||||
use oxiri::{Iri, IriParseError}; |
||||
use oxrdf::{vocab::xsd, GraphName, NamedNode, Quad, QuadRef, Subject, TermRef}; |
||||
use std::collections::HashMap; |
||||
use std::fmt; |
||||
use std::io::{self, Read, Write}; |
||||
|
||||
/// A [TriG](https://www.w3.org/TR/trig/) streaming parser.
|
||||
///
|
||||
/// Support for [TriG-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#trig-star) is available behind the `rdf-star` feature and the [`TriGParser::with_quoted_triples`] option.
|
||||
///
|
||||
/// Count the number of people:
|
||||
/// ```
|
||||
/// use oxrdf::NamedNodeRef;
|
||||
/// use oxttl::{TriGParser, ParseError};
|
||||
///
|
||||
/// let file = b"@base <http://example.com/> .
|
||||
/// @prefix schema: <http://schema.org/> .
|
||||
/// <foo> a schema:Person ;
|
||||
/// schema:name \"Foo\" .
|
||||
/// <bar> a schema:Person ;
|
||||
/// schema:name \"Bar\" .";
|
||||
///
|
||||
/// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
|
||||
/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
|
||||
/// let mut count = 0;
|
||||
/// for quad in TriGParser::new().parse_from_read(file.as_ref()) {
|
||||
/// let quad = quad?;
|
||||
/// if quad.predicate == rdf_type && quad.object == schema_person.into() {
|
||||
/// count += 1;
|
||||
/// }
|
||||
/// }
|
||||
/// assert_eq!(2, count);
|
||||
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
|
||||
/// ```
|
||||
#[derive(Default)]
pub struct TriGParser {
    // Base IRI used to resolve relative IRIs, if any.
    base: Option<Iri<String>>,
    // Prefixes declared upfront via `with_prefix`.
    prefixes: HashMap<String, Iri<String>>,
    // Enables parsing of RDF-star quoted triples ('<< ... >>').
    #[cfg(feature = "rdf-star")]
    with_quoted_triples: bool,
}
||||
|
||||
impl TriGParser {
    /// Builds a new [`TriGParser`].
    #[inline]
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the base IRI used to resolve the relative IRIs of the document.
    #[inline]
    pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
        self.base = Some(Iri::parse(base_iri.into())?);
        Ok(self)
    }

    /// Declares a prefix binding known before parsing starts.
    #[inline]
    pub fn with_prefix(
        mut self,
        prefix_name: impl Into<String>,
        prefix_iri: impl Into<String>,
    ) -> Result<Self, IriParseError> {
        self.prefixes
            .insert(prefix_name.into(), Iri::parse(prefix_iri.into())?);
        Ok(self)
    }

    /// Enables [TriG-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#trig-star).
    #[cfg(feature = "rdf-star")]
    #[inline]
    #[must_use]
    pub fn with_quoted_triples(mut self) -> Self {
        self.with_quoted_triples = true;
        self
    }

    /// Parses a TriG file from a [`Read`] implementation.
    ///
    /// Count the number of people:
    /// ```
    /// use oxrdf::NamedNodeRef;
    /// use oxttl::{TriGParser, ParseError};
    ///
    /// let file = b"@base <http://example.com/> .
    /// @prefix schema: <http://schema.org/> .
    /// <foo> a schema:Person ;
    ///     schema:name \"Foo\" .
    /// <bar> a schema:Person ;
    ///     schema:name \"Bar\" .";
    ///
    /// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
    /// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
    /// let mut count = 0;
    /// for quad in TriGParser::new().parse_from_read(file.as_ref()) {
    ///     let quad = quad?;
    ///     if quad.predicate == rdf_type && quad.object == schema_person.into() {
    ///         count += 1;
    ///     }
    /// }
    /// assert_eq!(2, count);
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    pub fn parse_from_read<R: Read>(&self, read: R) -> FromReadTriGReader<R> {
        FromReadTriGReader {
            inner: self.parse().parser.parse_from_read(read),
        }
    }

    /// Allows to parse a TriG file by using a low-level API.
    ///
    /// Count the number of people:
    /// ```
    /// use oxrdf::NamedNodeRef;
    /// use oxttl::{TriGParser, ParseError};
    ///
    /// let file: [&[u8]; 5] = [b"@base <http://example.com/>",
    ///     b". @prefix schema: <http://schema.org/> .",
    ///     b"<foo> a schema:Person",
    ///     b" ; schema:name \"Foo\" . <bar>",
    ///     b" a schema:Person ; schema:name \"Bar\" ."
    /// ];
    ///
    /// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
    /// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
    /// let mut count = 0;
    /// let mut parser = TriGParser::new().parse();
    /// let mut file_chunks = file.iter();
    /// while !parser.is_end() {
    ///     // We feed more data to the parser
    ///     if let Some(chunk) = file_chunks.next() {
    ///         parser.extend_from_slice(chunk);
    ///     } else {
    ///         parser.end(); // It's finished
    ///     }
    ///     // We read as many quads from the parser as possible
    ///     while let Some(quad) = parser.read_next() {
    ///         let quad = quad?;
    ///         if quad.predicate == rdf_type && quad.object == schema_person.into() {
    ///             count += 1;
    ///         }
    ///     }
    /// }
    /// assert_eq!(2, count);
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    pub fn parse(&self) -> LowLevelTriGReader {
        LowLevelTriGReader {
            parser: TriGRecognizer::new_parser(
                // TriG allows named graphs.
                true,
                #[cfg(feature = "rdf-star")]
                self.with_quoted_triples,
                self.base.clone(),
                self.prefixes.clone(),
            ),
        }
    }
}
||||
|
||||
/// Parses a TriG file from a [`Read`] implementation. Can be built using [`TriGParser::parse_from_read`].
///
/// Count the number of people:
/// ```
/// use oxrdf::NamedNodeRef;
/// use oxttl::{TriGParser, ParseError};
///
/// let file = b"@base <http://example.com/> .
/// @prefix schema: <http://schema.org/> .
/// <foo> a schema:Person ;
///     schema:name \"Foo\" .
/// <bar> a schema:Person ;
///     schema:name \"Bar\" .";
///
/// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
/// let mut count = 0;
/// for quad in TriGParser::new().parse_from_read(file.as_ref()) {
///     let quad = quad?;
///     if quad.predicate == rdf_type && quad.object == schema_person.into() {
///         count += 1;
///     }
/// }
/// assert_eq!(2, count);
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
pub struct FromReadTriGReader<R: Read> {
    // Shared TriG/Turtle state machine fed incrementally from the reader.
    inner: FromReadIterator<R, TriGRecognizer>,
}

impl<R: Read> Iterator for FromReadTriGReader<R> {
    type Item = Result<Quad, ParseOrIoError>;

    // Delegates directly: each item is either a parsed quad or a parse/IO error.
    fn next(&mut self) -> Option<Result<Quad, ParseOrIoError>> {
        self.inner.next()
    }
}
||||
|
||||
/// Parses a TriG file by using a low-level API. Can be built using [`TriGParser::parse`].
///
/// Count the number of people:
/// ```
/// use oxrdf::NamedNodeRef;
/// use oxttl::{TriGParser, ParseError};
///
/// let file: [&[u8]; 5] = [b"@base <http://example.com/>",
///     b". @prefix schema: <http://schema.org/> .",
///     b"<foo> a schema:Person",
///     b" ; schema:name \"Foo\" . <bar>",
///     b" a schema:Person ; schema:name \"Bar\" ."
/// ];
///
/// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
/// let mut count = 0;
/// let mut parser = TriGParser::new().parse();
/// let mut file_chunks = file.iter();
/// while !parser.is_end() {
///     // We feed more data to the parser
///     if let Some(chunk) = file_chunks.next() {
///         parser.extend_from_slice(chunk);
///     } else {
///         parser.end(); // It's finished
///     }
///     // We read as many quads from the parser as possible
///     while let Some(quad) = parser.read_next() {
///         let quad = quad?;
///         if quad.predicate == rdf_type && quad.object == schema_person.into() {
///             count += 1;
///         }
///     }
/// }
/// assert_eq!(2, count);
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
pub struct LowLevelTriGReader {
    // Push/pull parser around the shared TriG/Turtle recognizer.
    parser: Parser<TriGRecognizer>,
}

impl LowLevelTriGReader {
    /// Adds some extra bytes to the parser. Should be called when [`read_next`](Self::read_next) returns [`None`] and there is still unread data.
    pub fn extend_from_slice(&mut self, other: &[u8]) {
        self.parser.extend_from_slice(other)
    }

    /// Tell the parser that the file is finished.
    ///
    /// This triggers the parsing of the final bytes and might lead [`read_next`](Self::read_next) to return some extra values.
    pub fn end(&mut self) {
        self.parser.end()
    }

    /// Returns if the parsing is finished i.e. [`end`](Self::end) has been called and [`read_next`](Self::read_next) is always going to return `None`.
    pub fn is_end(&self) -> bool {
        self.parser.is_end()
    }

    /// Attempt to parse a new quad from the already provided data.
    ///
    /// Returns [`None`] if the parsing is finished or more data is required.
    /// If it is the case more data should be fed using [`extend_from_slice`](Self::extend_from_slice).
    pub fn read_next(&mut self) -> Option<Result<Quad, ParseError>> {
        self.parser.read_next()
    }
}
||||
|
||||
/// A [TriG](https://www.w3.org/TR/trig/) serializer.
///
/// Support for [TriG-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#trig-star) is available behind the `rdf-star` feature.
///
/// ```
/// use oxrdf::{NamedNodeRef, QuadRef};
/// use oxttl::TriGSerializer;
///
/// let mut buf = Vec::new();
/// let mut writer = TriGSerializer::new().serialize_to_write(buf);
/// writer.write_quad(QuadRef::new(
///     NamedNodeRef::new("http://example.com#me")?,
///     NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
///     NamedNodeRef::new("http://schema.org/Person")?,
///     NamedNodeRef::new("http://example.com")?,
/// ))?;
/// assert_eq!(
///     b"<http://example.com> {\n\t<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n}\n",
///     writer.finish()?.as_slice()
/// );
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
#[derive(Default)]
pub struct TriGSerializer;

impl TriGSerializer {
    /// Builds a new [`TriGSerializer`].
    #[inline]
    pub fn new() -> Self {
        Self
    }

    /// Writes a TriG file to a [`Write`] implementation.
    ///
    /// ```
    /// use oxrdf::{NamedNodeRef, QuadRef};
    /// use oxttl::TriGSerializer;
    ///
    /// let mut buf = Vec::new();
    /// let mut writer = TriGSerializer::new().serialize_to_write(buf);
    /// writer.write_quad(QuadRef::new(
    ///     NamedNodeRef::new("http://example.com#me")?,
    ///     NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
    ///     NamedNodeRef::new("http://schema.org/Person")?,
    ///     NamedNodeRef::new("http://example.com")?,
    /// ))?;
    /// assert_eq!(
    ///     b"<http://example.com> {\n\t<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n}\n",
    ///     writer.finish()?.as_slice()
    /// );
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    pub fn serialize_to_write<W: Write>(&self, write: W) -> ToWriteTriGWriter<W> {
        // Pairs the sink with a fresh low-level writer holding the serialization state.
        ToWriteTriGWriter {
            write,
            writer: self.serialize(),
        }
    }

    /// Builds a low-level TriG writer.
    ///
    /// ```
    /// use oxrdf::{NamedNodeRef, QuadRef};
    /// use oxttl::TriGSerializer;
    ///
    /// let mut buf = Vec::new();
    /// let mut writer = TriGSerializer::new().serialize();
    /// writer.write_quad(QuadRef::new(
    ///     NamedNodeRef::new("http://example.com#me")?,
    ///     NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
    ///     NamedNodeRef::new("http://schema.org/Person")?,
    ///     NamedNodeRef::new("http://example.com")?,
    /// ), &mut buf)?;
    /// writer.finish(&mut buf)?;
    /// assert_eq!(
    ///     b"<http://example.com> {\n\t<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n}\n",
    ///     buf.as_slice()
    /// );
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    #[allow(clippy::unused_self)]
    pub fn serialize(&self) -> LowLevelTriGWriter {
        // Initial state: default graph, no pending subject/predicate to abbreviate.
        LowLevelTriGWriter {
            current_graph_name: GraphName::DefaultGraph,
            current_subject_predicate: None,
        }
    }
}
||||
|
||||
/// Writes a TriG file to a [`Write`] implementation. Can be built using [`TriGSerializer::serialize_to_write`].
///
/// ```
/// use oxrdf::{NamedNodeRef, QuadRef};
/// use oxttl::TriGSerializer;
///
/// let mut buf = Vec::new();
/// let mut writer = TriGSerializer::new().serialize_to_write(buf);
/// writer.write_quad(QuadRef::new(
///     NamedNodeRef::new("http://example.com#me")?,
///     NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
///     NamedNodeRef::new("http://schema.org/Person")?,
///     NamedNodeRef::new("http://example.com")?,
/// ))?;
/// assert_eq!(
///     b"<http://example.com> {\n\t<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n}\n",
///     writer.finish()?.as_slice()
/// );
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
pub struct ToWriteTriGWriter<W: Write> {
    // Owned sink; returned to the caller by `finish`.
    write: W,
    // Stateful formatter shared with the low-level API.
    writer: LowLevelTriGWriter,
}

impl<W: Write> ToWriteTriGWriter<W> {
    /// Writes an extra quad.
    pub fn write_quad<'a>(&mut self, q: impl Into<QuadRef<'a>>) -> io::Result<()> {
        self.writer.write_quad(q, &mut self.write)
    }

    /// Ends the write process and returns the underlying [`Write`].
    pub fn finish(mut self) -> io::Result<W> {
        // Flushes the pending statement/graph terminators before handing back the sink.
        self.writer.finish(&mut self.write)?;
        Ok(self.write)
    }
}
||||
|
||||
/// Writes a TriG file by using a low-level API. Can be built using [`TriGSerializer::serialize`].
///
/// ```
/// use oxrdf::{NamedNodeRef, QuadRef};
/// use oxttl::TriGSerializer;
///
/// let mut buf = Vec::new();
/// let mut writer = TriGSerializer::new().serialize();
/// writer.write_quad(QuadRef::new(
///     NamedNodeRef::new("http://example.com#me")?,
///     NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
///     NamedNodeRef::new("http://schema.org/Person")?,
///     NamedNodeRef::new("http://example.com")?,
/// ), &mut buf)?;
/// writer.finish(&mut buf)?;
/// assert_eq!(
///     b"<http://example.com> {\n\t<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n}\n",
///     buf.as_slice()
/// );
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
pub struct LowLevelTriGWriter {
    // Graph currently open (a `GRAPH { ... }` block when not the default graph).
    current_graph_name: GraphName,
    // Subject/predicate of the last written statement, kept so consecutive quads
    // sharing them can be abbreviated with `;` and `,`. `None` means no statement
    // is pending (nothing emitted yet, or a graph block was just closed).
    current_subject_predicate: Option<(Subject, NamedNode)>,
}

impl LowLevelTriGWriter {
    /// Writes an extra quad.
    pub fn write_quad<'a>(
        &mut self,
        q: impl Into<QuadRef<'a>>,
        mut write: impl Write,
    ) -> io::Result<()> {
        let q = q.into();
        if q.graph_name == self.current_graph_name.as_ref() {
            // Same graph: try to abbreviate against the pending subject/predicate.
            if let Some((current_subject, current_predicate)) =
                self.current_subject_predicate.take()
            {
                if q.subject == current_subject.as_ref() {
                    if q.predicate == current_predicate {
                        // Same subject and predicate: object list (`, object`).
                        self.current_subject_predicate = Some((current_subject, current_predicate));
                        write!(write, " , {}", TurtleTerm(q.object))
                    } else {
                        // Same subject only: predicate list (`;` + extra indent).
                        self.current_subject_predicate =
                            Some((current_subject, q.predicate.into_owned()));
                        writeln!(write, " ;")?;
                        if !self.current_graph_name.is_default_graph() {
                            write!(write, "\t")?;
                        }
                        write!(write, "\t{} {}", q.predicate, TurtleTerm(q.object))
                    }
                } else {
                    // New subject: terminate the previous statement with ` .`.
                    self.current_subject_predicate =
                        Some((q.subject.into_owned(), q.predicate.into_owned()));
                    writeln!(write, " .")?;
                    if !self.current_graph_name.is_default_graph() {
                        write!(write, "\t")?;
                    }
                    write!(
                        write,
                        "{} {} {}",
                        TurtleTerm(q.subject.into()),
                        q.predicate,
                        TurtleTerm(q.object)
                    )
                }
            } else {
                // No pending statement in this graph: start a fresh triple.
                self.current_subject_predicate =
                    Some((q.subject.into_owned(), q.predicate.into_owned()));
                if !self.current_graph_name.is_default_graph() {
                    write!(write, "\t")?;
                }
                write!(
                    write,
                    "{} {} {}",
                    TurtleTerm(q.subject.into()),
                    q.predicate,
                    TurtleTerm(q.object)
                )
            }
        } else {
            // Graph changed: close the pending statement and graph block, then
            // open the new graph (named graphs get a `name {` header and tab indent).
            if self.current_subject_predicate.is_some() {
                writeln!(write, " .")?;
            }
            if !self.current_graph_name.is_default_graph() {
                writeln!(write, "}}")?;
            }
            self.current_graph_name = q.graph_name.into_owned();
            self.current_subject_predicate =
                Some((q.subject.into_owned(), q.predicate.into_owned()));
            if !self.current_graph_name.is_default_graph() {
                writeln!(write, "{} {{", q.graph_name)?;
                write!(write, "\t")?;
            }
            write!(
                write,
                "{} {} {}",
                TurtleTerm(q.subject.into()),
                q.predicate,
                TurtleTerm(q.object)
            )
        }
    }

    /// Finishes to write the file.
    pub fn finish(&mut self, mut write: impl Write) -> io::Result<()> {
        // Terminate the last statement and close an open named-graph block, if any.
        if self.current_subject_predicate.is_some() {
            writeln!(write, " .")?;
        }
        if !self.current_graph_name.is_default_graph() {
            writeln!(write, "}}")?;
        }
        Ok(())
    }
}
||||
|
||||
/// Display adapter writing a term in Turtle syntax, abbreviating literals whose
/// lexical form is already a valid Turtle shorthand (booleans, numbers).
struct TurtleTerm<'a>(TermRef<'a>);

impl<'a> fmt::Display for TurtleTerm<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self.0 {
            TermRef::NamedNode(v) => write!(f, "{v}"),
            TermRef::BlankNode(v) => write!(f, "{v}"),
            TermRef::Literal(v) => {
                let value = v.value();
                // A literal may be written bare (e.g. `42`, `true`) only when its
                // lexical form matches the corresponding Turtle grammar production;
                // otherwise it is quoted with an explicit datatype.
                let inline = match v.datatype() {
                    xsd::BOOLEAN => is_turtle_boolean(value),
                    xsd::INTEGER => is_turtle_integer(value),
                    xsd::DECIMAL => is_turtle_decimal(value),
                    xsd::DOUBLE => is_turtle_double(value),
                    _ => false,
                };
                if inline {
                    write!(f, "{value}")
                } else {
                    write!(f, "{v}")
                }
            }
            // Quoted triples (RDF-star) are serialized recursively as `<< s p o >>`.
            #[cfg(feature = "rdf-star")]
            TermRef::Triple(t) => {
                write!(
                    f,
                    "<< {} {} {} >>",
                    TurtleTerm(t.subject.as_ref().into()),
                    t.predicate,
                    TurtleTerm(t.object.as_ref())
                )
            }
        }
    }
}
||||
|
||||
/// Returns whether `value` is a valid bare Turtle boolean literal (`true`/`false`).
fn is_turtle_boolean(value: &str) -> bool {
    value == "true" || value == "false"
}
||||
|
||||
/// Returns whether `value` matches the Turtle INTEGER production,
/// i.e. can be written as a bare integer literal.
fn is_turtle_integer(value: &str) -> bool {
    // [19] INTEGER ::= [+-]? [0-9]+
    let digits = match value.as_bytes() {
        [b'+' | b'-', rest @ ..] => rest,
        all => all,
    };
    !digits.is_empty() && digits.iter().all(u8::is_ascii_digit)
}
||||
|
||||
/// Returns whether `value` matches the Turtle DECIMAL production,
/// i.e. can be written as a bare decimal literal.
fn is_turtle_decimal(value: &str) -> bool {
    // [20] DECIMAL ::= [+-]? [0-9]* '.' [0-9]+
    let mut rest = value.as_bytes();
    if let [b'+' | b'-', tail @ ..] = rest {
        rest = tail;
    }
    // Skip the (possibly empty) integer part.
    let int_len = rest.iter().take_while(|b| b.is_ascii_digit()).count();
    rest = &rest[int_len..];
    // A dot followed by at least one digit is mandatory.
    match rest {
        [b'.', frac @ ..] => !frac.is_empty() && frac.iter().all(u8::is_ascii_digit),
        _ => false,
    }
}
||||
|
||||
/// Returns whether `value` matches the Turtle DOUBLE production,
/// i.e. can be written as a bare floating-point literal with exponent.
fn is_turtle_double(value: &str) -> bool {
    // [21] DOUBLE ::= [+-]? ([0-9]+ '.' [0-9]* EXPONENT | '.' [0-9]+ EXPONENT | [0-9]+ EXPONENT)
    // [154s] EXPONENT ::= [eE] [+-]? [0-9]+
    fn skip_digits(bytes: &[u8]) -> (&[u8], bool) {
        let n = bytes.iter().take_while(|b| b.is_ascii_digit()).count();
        (&bytes[n..], n > 0)
    }

    let mut rest = value.as_bytes();
    if let [b'+' | b'-', tail @ ..] = rest {
        rest = tail;
    }
    // Mantissa: digits, optionally followed by '.' and more digits; at least one
    // digit must appear on one side of the dot.
    let (after_int, has_int_digits) = skip_digits(rest);
    rest = after_int;
    let mut has_frac_digits = false;
    if let [b'.', tail @ ..] = rest {
        let (after_frac, found) = skip_digits(tail);
        rest = after_frac;
        has_frac_digits = found;
    }
    // The exponent marker is mandatory for DOUBLE.
    match rest {
        [b'e' | b'E', tail @ ..] => rest = tail,
        _ => return false,
    }
    if let [b'+' | b'-', tail @ ..] = rest {
        rest = tail;
    }
    (has_int_digits || has_frac_digits)
        && !rest.is_empty()
        && rest.iter().all(u8::is_ascii_digit)
}
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use oxrdf::vocab::xsd;
    use oxrdf::{BlankNodeRef, GraphNameRef, LiteralRef, NamedNodeRef};

    // Exercises the TriG writer's abbreviation logic: object lists (`,`),
    // predicate lists (`;`), subject changes (`.`), graph-block open/close,
    // inline boolean literals, and the default graph.
    #[test]
    fn test_write() -> io::Result<()> {
        let mut writer = TriGSerializer::new().serialize_to_write(Vec::new());
        writer.write_quad(QuadRef::new(
            NamedNodeRef::new_unchecked("http://example.com/s"),
            NamedNodeRef::new_unchecked("http://example.com/p"),
            NamedNodeRef::new_unchecked("http://example.com/o"),
            NamedNodeRef::new_unchecked("http://example.com/g"),
        ))?;
        // Same subject and predicate: expects `, "foo"` abbreviation.
        writer.write_quad(QuadRef::new(
            NamedNodeRef::new_unchecked("http://example.com/s"),
            NamedNodeRef::new_unchecked("http://example.com/p"),
            LiteralRef::new_simple_literal("foo"),
            NamedNodeRef::new_unchecked("http://example.com/g"),
        ))?;
        // Same subject, new predicate: expects `;` abbreviation.
        writer.write_quad(QuadRef::new(
            NamedNodeRef::new_unchecked("http://example.com/s"),
            NamedNodeRef::new_unchecked("http://example.com/p2"),
            LiteralRef::new_language_tagged_literal_unchecked("foo", "en"),
            NamedNodeRef::new_unchecked("http://example.com/g"),
        ))?;
        // New subject in the same graph: previous statement terminated with `.`.
        writer.write_quad(QuadRef::new(
            BlankNodeRef::new_unchecked("b"),
            NamedNodeRef::new_unchecked("http://example.com/p2"),
            BlankNodeRef::new_unchecked("b2"),
            NamedNodeRef::new_unchecked("http://example.com/g"),
        ))?;
        // Default graph: no surrounding block, boolean written inline as `true`.
        writer.write_quad(QuadRef::new(
            BlankNodeRef::new_unchecked("b"),
            NamedNodeRef::new_unchecked("http://example.com/p2"),
            LiteralRef::new_typed_literal("true", xsd::BOOLEAN),
            GraphNameRef::DefaultGraph,
        ))?;
        // A second named graph opens a new block.
        writer.write_quad(QuadRef::new(
            BlankNodeRef::new_unchecked("b"),
            NamedNodeRef::new_unchecked("http://example.com/p2"),
            LiteralRef::new_typed_literal("false", xsd::BOOLEAN),
            NamedNodeRef::new_unchecked("http://example.com/g2"),
        ))?;
        assert_eq!(String::from_utf8(writer.finish()?).unwrap(), "<http://example.com/g> {\n\t<http://example.com/s> <http://example.com/p> <http://example.com/o> , \"foo\" ;\n\t\t<http://example.com/p2> \"foo\"@en .\n\t_:b <http://example.com/p2> _:b2 .\n}\n_:b <http://example.com/p2> true .\n<http://example.com/g2> {\n\t_:b <http://example.com/p2> false .\n}\n");
        Ok(())
    }
}
@ -0,0 +1,462 @@ |
||||
//! A [Turtle](https://www.w3.org/TR/turtle/) streaming parser implemented by [`TurtleParser`].
|
||||
|
||||
use crate::terse::TriGRecognizer; |
||||
use crate::toolkit::{FromReadIterator, ParseError, ParseOrIoError, Parser}; |
||||
use crate::trig::{LowLevelTriGWriter, ToWriteTriGWriter}; |
||||
use crate::TriGSerializer; |
||||
use oxiri::{Iri, IriParseError}; |
||||
use oxrdf::{GraphNameRef, Triple, TripleRef}; |
||||
use std::collections::HashMap; |
||||
use std::io::{self, Read, Write}; |
||||
|
||||
/// A [Turtle](https://www.w3.org/TR/turtle/) streaming parser.
///
/// Support for [Turtle-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#turtle-star) is available behind the `rdf-star` feature and the [`TurtleParser::with_quoted_triples`] option.
///
/// Count the number of people:
/// ```
/// use oxrdf::NamedNodeRef;
/// use oxttl::{TurtleParser, ParseError};
///
/// let file = b"@base <http://example.com/> .
/// @prefix schema: <http://schema.org/> .
/// <foo> a schema:Person ;
///     schema:name \"Foo\" .
/// <bar> a schema:Person ;
///     schema:name \"Bar\" .";
///
/// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
/// let mut count = 0;
/// for triple in TurtleParser::new().parse_from_read(file.as_ref()) {
///     let triple = triple?;
///     if triple.predicate == rdf_type && triple.object == schema_person.into() {
///         count += 1;
///     }
/// }
/// assert_eq!(2, count);
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
#[derive(Default)]
pub struct TurtleParser {
    // Base IRI handed to the recognizer (set via `with_base_iri`).
    base: Option<Iri<String>>,
    // Prefix declarations known before parsing starts (set via `with_prefix`).
    prefixes: HashMap<String, Iri<String>>,
    #[cfg(feature = "rdf-star")]
    with_quoted_triples: bool,
}

impl TurtleParser {
    /// Builds a new [`TurtleParser`].
    #[inline]
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the base IRI passed to the parser (presumably used to resolve
    /// relative IRIs, like an initial `@base` — confirm with `TriGRecognizer`).
    ///
    /// Fails if `base_iri` is not a valid IRI.
    #[inline]
    pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
        self.base = Some(Iri::parse(base_iri.into())?);
        Ok(self)
    }

    /// Declares a prefix known to the parser before any in-document
    /// `@prefix` declaration (precedence on conflict not shown here — verify).
    ///
    /// Fails if `prefix_iri` is not a valid IRI.
    #[inline]
    pub fn with_prefix(
        mut self,
        prefix_name: impl Into<String>,
        prefix_iri: impl Into<String>,
    ) -> Result<Self, IriParseError> {
        self.prefixes
            .insert(prefix_name.into(), Iri::parse(prefix_iri.into())?);
        Ok(self)
    }

    /// Enables [Turtle-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#turtle-star).
    #[cfg(feature = "rdf-star")]
    #[inline]
    #[must_use]
    pub fn with_quoted_triples(mut self) -> Self {
        self.with_quoted_triples = true;
        self
    }

    /// Parses a Turtle file from a [`Read`] implementation.
    ///
    /// Count the number of people:
    /// ```
    /// use oxrdf::NamedNodeRef;
    /// use oxttl::{TurtleParser, ParseError};
    ///
    /// let file = b"@base <http://example.com/> .
    /// @prefix schema: <http://schema.org/> .
    /// <foo> a schema:Person ;
    ///     schema:name \"Foo\" .
    /// <bar> a schema:Person ;
    ///     schema:name \"Bar\" .";
    ///
    /// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
    /// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
    /// let mut count = 0;
    /// for triple in TurtleParser::new().parse_from_read(file.as_ref()) {
    ///     let triple = triple?;
    ///     if triple.predicate == rdf_type && triple.object == schema_person.into() {
    ///         count += 1;
    ///     }
    /// }
    /// assert_eq!(2, count);
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    pub fn parse_from_read<R: Read>(&self, read: R) -> FromReadTurtleReader<R> {
        FromReadTurtleReader {
            inner: self.parse().parser.parse_from_read(read),
        }
    }

    /// Allows to parse a Turtle file by using a low-level API.
    ///
    /// Count the number of people:
    /// ```
    /// use oxrdf::NamedNodeRef;
    /// use oxttl::{TurtleParser, ParseError};
    ///
    /// let file: [&[u8]; 5] = [b"@base <http://example.com/>",
    ///     b". @prefix schema: <http://schema.org/> .",
    ///     b"<foo> a schema:Person",
    ///     b" ; schema:name \"Foo\" . <bar>",
    ///     b" a schema:Person ; schema:name \"Bar\" ."
    /// ];
    ///
    /// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
    /// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
    /// let mut count = 0;
    /// let mut parser = TurtleParser::new().parse();
    /// let mut file_chunks = file.iter();
    /// while !parser.is_end() {
    ///     // We feed more data to the parser
    ///     if let Some(chunk) = file_chunks.next() {
    ///         parser.extend_from_slice(chunk);
    ///     } else {
    ///         parser.end(); // It's finished
    ///     }
    ///     // We read as many triples from the parser as possible
    ///     while let Some(triple) = parser.read_next() {
    ///         let triple = triple?;
    ///         if triple.predicate == rdf_type && triple.object == schema_person.into() {
    ///             count += 1;
    ///         }
    ///     }
    /// }
    /// assert_eq!(2, count);
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    pub fn parse(&self) -> LowLevelTurtleReader {
        LowLevelTurtleReader {
            parser: TriGRecognizer::new_parser(
                // `false`: Turtle mode — the shared recognizer is reused, with the
                // TriG-only constructs disabled (contrast `TriGParser::parse`).
                false,
                #[cfg(feature = "rdf-star")]
                self.with_quoted_triples,
                self.base.clone(),
                self.prefixes.clone(),
            ),
        }
    }
}
||||
|
||||
/// Parses a Turtle file from a [`Read`] implementation. Can be built using [`TurtleParser::parse_from_read`].
///
/// Count the number of people:
/// ```
/// use oxrdf::NamedNodeRef;
/// use oxttl::{TurtleParser, ParseError};
///
/// let file = b"@base <http://example.com/> .
/// @prefix schema: <http://schema.org/> .
/// <foo> a schema:Person ;
///     schema:name \"Foo\" .
/// <bar> a schema:Person ;
///     schema:name \"Bar\" .";
///
/// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
/// let mut count = 0;
/// for triple in TurtleParser::new().parse_from_read(file.as_ref()) {
///     let triple = triple?;
///     if triple.predicate == rdf_type && triple.object == schema_person.into() {
///         count += 1;
///     }
/// }
/// assert_eq!(2, count);
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
pub struct FromReadTurtleReader<R: Read> {
    // Shared TriG/Turtle recognizer; in Turtle mode every quad it yields is in
    // the default graph, so items are converted to triples below.
    inner: FromReadIterator<R, TriGRecognizer>,
}

impl<R: Read> Iterator for FromReadTurtleReader<R> {
    type Item = Result<Triple, ParseOrIoError>;

    fn next(&mut self) -> Option<Result<Triple, ParseOrIoError>> {
        // The recognizer produces quads; `Into` drops the (default) graph name.
        Some(self.inner.next()?.map(Into::into))
    }
}
||||
|
||||
/// Parses a Turtle file by using a low-level API. Can be built using [`TurtleParser::parse`].
///
/// Count the number of people:
/// ```
/// use oxrdf::NamedNodeRef;
/// use oxttl::{TurtleParser, ParseError};
///
/// let file: [&[u8]; 5] = [b"@base <http://example.com/>",
///     b". @prefix schema: <http://schema.org/> .",
///     b"<foo> a schema:Person",
///     b" ; schema:name \"Foo\" . <bar>",
///     b" a schema:Person ; schema:name \"Bar\" ."
/// ];
///
/// let rdf_type = NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?;
/// let schema_person = NamedNodeRef::new("http://schema.org/Person")?;
/// let mut count = 0;
/// let mut parser = TurtleParser::new().parse();
/// let mut file_chunks = file.iter();
/// while !parser.is_end() {
///     // We feed more data to the parser
///     if let Some(chunk) = file_chunks.next() {
///         parser.extend_from_slice(chunk);
///     } else {
///         parser.end(); // It's finished
///     }
///     // We read as many triples from the parser as possible
///     while let Some(triple) = parser.read_next() {
///         let triple = triple?;
///         if triple.predicate == rdf_type && triple.object == schema_person.into() {
///             count += 1;
///         }
///     }
/// }
/// assert_eq!(2, count);
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
pub struct LowLevelTurtleReader {
    // Push/pull parser around the shared TriG/Turtle recognizer (Turtle mode).
    parser: Parser<TriGRecognizer>,
}

impl LowLevelTurtleReader {
    /// Adds some extra bytes to the parser. Should be called when [`read_next`](Self::read_next) returns [`None`] and there is still unread data.
    pub fn extend_from_slice(&mut self, other: &[u8]) {
        self.parser.extend_from_slice(other)
    }

    /// Tell the parser that the file is finished.
    ///
    /// This triggers the parsing of the final bytes and might lead [`read_next`](Self::read_next) to return some extra values.
    pub fn end(&mut self) {
        self.parser.end()
    }

    /// Returns if the parsing is finished i.e. [`end`](Self::end) has been called and [`read_next`](Self::read_next) is always going to return `None`.
    pub fn is_end(&self) -> bool {
        self.parser.is_end()
    }

    /// Attempt to parse a new triple from the already provided data.
    ///
    /// Returns [`None`] if the parsing is finished or more data is required.
    /// If it is the case more data should be fed using [`extend_from_slice`](Self::extend_from_slice).
    pub fn read_next(&mut self) -> Option<Result<Triple, ParseError>> {
        // The recognizer yields quads; `Into` drops the (default) graph name.
        Some(self.parser.read_next()?.map(Into::into))
    }
}
||||
|
||||
/// A [Turtle](https://www.w3.org/TR/turtle/) serializer.
///
/// Support for [Turtle-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#turtle-star) is available behind the `rdf-star` feature.
///
/// ```
/// use oxrdf::{NamedNodeRef, TripleRef};
/// use oxttl::TurtleSerializer;
///
/// let mut buf = Vec::new();
/// let mut writer = TurtleSerializer::new().serialize_to_write(buf);
/// writer.write_triple(TripleRef::new(
///     NamedNodeRef::new("http://example.com#me")?,
///     NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
///     NamedNodeRef::new("http://schema.org/Person")?,
/// ))?;
/// assert_eq!(
///     b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
///     writer.finish()?.as_slice()
/// );
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
#[derive(Default)]
pub struct TurtleSerializer {
    // Turtle output is produced by the TriG serializer restricted to the
    // default graph (triples are written without any graph block).
    inner: TriGSerializer,
}

impl TurtleSerializer {
    /// Builds a new [`TurtleSerializer`].
    #[inline]
    pub fn new() -> Self {
        Self::default()
    }

    /// Writes a Turtle file to a [`Write`] implementation.
    ///
    /// ```
    /// use oxrdf::{NamedNodeRef, TripleRef};
    /// use oxttl::TurtleSerializer;
    ///
    /// let mut buf = Vec::new();
    /// let mut writer = TurtleSerializer::new().serialize_to_write(buf);
    /// writer.write_triple(TripleRef::new(
    ///     NamedNodeRef::new("http://example.com#me")?,
    ///     NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
    ///     NamedNodeRef::new("http://schema.org/Person")?,
    /// ))?;
    /// assert_eq!(
    ///     b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
    ///     writer.finish()?.as_slice()
    /// );
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    pub fn serialize_to_write<W: Write>(&self, write: W) -> ToWriteTurtleWriter<W> {
        ToWriteTurtleWriter {
            inner: self.inner.serialize_to_write(write),
        }
    }

    /// Builds a low-level Turtle writer.
    ///
    /// ```
    /// use oxrdf::{NamedNodeRef, TripleRef};
    /// use oxttl::TurtleSerializer;
    ///
    /// let mut buf = Vec::new();
    /// let mut writer = TurtleSerializer::new().serialize();
    /// writer.write_triple(TripleRef::new(
    ///     NamedNodeRef::new("http://example.com#me")?,
    ///     NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
    ///     NamedNodeRef::new("http://schema.org/Person")?,
    /// ), &mut buf)?;
    /// writer.finish(&mut buf)?;
    /// assert_eq!(
    ///     b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
    ///     buf.as_slice()
    /// );
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    pub fn serialize(&self) -> LowLevelTurtleWriter {
        LowLevelTurtleWriter {
            inner: self.inner.serialize(),
        }
    }
}
||||
|
||||
/// Writes a Turtle file to a [`Write`] implementation. Can be built using [`TurtleSerializer::serialize_to_write`].
|
||||
///
|
||||
/// ```
|
||||
/// use oxrdf::{NamedNodeRef, TripleRef};
|
||||
/// use oxttl::TurtleSerializer;
|
||||
///
|
||||
/// let mut buf = Vec::new();
|
||||
/// let mut writer = TurtleSerializer::new().serialize_to_write(buf);
|
||||
/// writer.write_triple(TripleRef::new(
|
||||
/// NamedNodeRef::new("http://example.com#me")?,
|
||||
/// NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
|
||||
/// NamedNodeRef::new("http://schema.org/Person")?,
|
||||
/// ))?;
|
||||
/// assert_eq!(
|
||||
/// b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
|
||||
/// writer.finish()?.as_slice()
|
||||
/// );
|
||||
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
|
||||
/// ```
|
||||
pub struct ToWriteTurtleWriter<W: Write> { |
||||
inner: ToWriteTriGWriter<W>, |
||||
} |
||||
|
||||
impl<W: Write> ToWriteTurtleWriter<W> { |
||||
/// Writes an extra triple.
|
||||
pub fn write_triple<'a>(&mut self, t: impl Into<TripleRef<'a>>) -> io::Result<()> { |
||||
self.inner |
||||
.write_quad(t.into().in_graph(GraphNameRef::DefaultGraph)) |
||||
} |
||||
|
||||
/// Ends the write process and returns the underlying [`Write`].
|
||||
pub fn finish(self) -> io::Result<W> { |
||||
self.inner.finish() |
||||
} |
||||
} |
||||
|
||||
/// Writes a Turtle file by using a low-level API. Can be built using [`TurtleSerializer::serialize`].
|
||||
///
|
||||
/// ```
|
||||
/// use oxrdf::{NamedNodeRef, TripleRef};
|
||||
/// use oxttl::TurtleSerializer;
|
||||
///
|
||||
/// let mut buf = Vec::new();
|
||||
/// let mut writer = TurtleSerializer::new().serialize();
|
||||
/// writer.write_triple(TripleRef::new(
|
||||
/// NamedNodeRef::new("http://example.com#me")?,
|
||||
/// NamedNodeRef::new("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")?,
|
||||
/// NamedNodeRef::new("http://schema.org/Person")?,
|
||||
/// ), &mut buf)?;
|
||||
/// writer.finish(&mut buf)?;
|
||||
/// assert_eq!(
|
||||
/// b"<http://example.com#me> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> .\n",
|
||||
/// buf.as_slice()
|
||||
/// );
|
||||
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
|
||||
/// ```
|
||||
pub struct LowLevelTurtleWriter { |
||||
inner: LowLevelTriGWriter, |
||||
} |
||||
|
||||
impl LowLevelTurtleWriter { |
||||
/// Writes an extra triple.
|
||||
pub fn write_triple<'a>( |
||||
&mut self, |
||||
t: impl Into<TripleRef<'a>>, |
||||
write: impl Write, |
||||
) -> io::Result<()> { |
||||
self.inner |
||||
.write_quad(t.into().in_graph(GraphNameRef::DefaultGraph), write) |
||||
} |
||||
|
||||
/// Finishes to write the file.
|
||||
pub fn finish(&mut self, write: impl Write) -> io::Result<()> { |
||||
self.inner.finish(write) |
||||
} |
||||
} |
||||
|
||||
#[cfg(test)] |
||||
mod tests { |
||||
use super::*; |
||||
use oxrdf::{BlankNodeRef, LiteralRef, NamedNodeRef}; |
||||
|
||||
#[test] |
||||
fn test_write() -> io::Result<()> { |
||||
let mut writer = TurtleSerializer::new().serialize_to_write(Vec::new()); |
||||
writer.write_triple(TripleRef::new( |
||||
NamedNodeRef::new_unchecked("http://example.com/s"), |
||||
NamedNodeRef::new_unchecked("http://example.com/p"), |
||||
NamedNodeRef::new_unchecked("http://example.com/o"), |
||||
))?; |
||||
writer.write_triple(TripleRef::new( |
||||
NamedNodeRef::new_unchecked("http://example.com/s"), |
||||
NamedNodeRef::new_unchecked("http://example.com/p"), |
||||
LiteralRef::new_simple_literal("foo"), |
||||
))?; |
||||
writer.write_triple(TripleRef::new( |
||||
NamedNodeRef::new_unchecked("http://example.com/s"), |
||||
NamedNodeRef::new_unchecked("http://example.com/p2"), |
||||
LiteralRef::new_language_tagged_literal_unchecked("foo", "en"), |
||||
))?; |
||||
writer.write_triple(TripleRef::new( |
||||
BlankNodeRef::new_unchecked("b"), |
||||
NamedNodeRef::new_unchecked("http://example.com/p2"), |
||||
BlankNodeRef::new_unchecked("b2"), |
||||
))?; |
||||
assert_eq!(String::from_utf8(writer.finish()?).unwrap(), "<http://example.com/s> <http://example.com/p> <http://example.com/o> , \"foo\" ;\n\t<http://example.com/p2> \"foo\"@en .\n_:b <http://example.com/p2> _:b2 .\n"); |
||||
Ok(()) |
||||
} |
||||
} |
@ -0,0 +1 @@ |
||||
Subproject commit 5fa35bf602669a467cfd0ab24cc732fe49f2b927 |
@ -0,0 +1,194 @@ |
||||
use anyhow::Result; |
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; |
||||
use oxigraph_testsuite::files::read_file; |
||||
use oxigraph_testsuite::manifest::TestManifest; |
||||
use rio_api::parser::*; |
||||
use rio_turtle::*; |
||||
use std::io::Read; |
||||
|
||||
fn test_data_from_testsuite(manifest_uri: String, include_tests_types: &[&str]) -> Result<Vec<u8>> { |
||||
let manifest = TestManifest::new([manifest_uri]); |
||||
let mut data = Vec::default(); |
||||
for test in manifest { |
||||
let test = test?; |
||||
if include_tests_types.contains(&test.kind.as_str()) { |
||||
read_file(&test.action.unwrap())?.read_to_end(&mut data)?; |
||||
data.push(b'\n'); |
||||
} |
||||
} |
||||
Ok(data) |
||||
} |
||||
|
||||
fn ntriples_test_data() -> Result<Vec<u8>> { |
||||
test_data_from_testsuite( |
||||
"http://w3c.github.io/rdf-tests/ntriples/manifest.ttl".to_owned(), |
||||
&["http://www.w3.org/ns/rdftest#TestNTriplesPositiveSyntax"], |
||||
) |
||||
} |
||||
|
||||
fn turtle_test_data() -> Result<Vec<u8>> { |
||||
test_data_from_testsuite( |
||||
"http://w3c.github.io/rdf-tests/turtle/manifest.ttl".to_owned(), |
||||
&[ |
||||
"http://www.w3.org/ns/rdftest#TestTurtlePositiveSyntax", |
||||
"http://www.w3.org/ns/rdftest#TestTurtleEval", |
||||
], |
||||
) |
||||
} |
||||
|
||||
fn parse_bench( |
||||
c: &mut Criterion, |
||||
parser_name: &str, |
||||
data_name: &str, |
||||
data: Vec<u8>, |
||||
bench: impl Fn(&[u8]), |
||||
) { |
||||
let mut group = c.benchmark_group(parser_name); |
||||
group.throughput(Throughput::Bytes(data.len() as u64)); |
||||
group.bench_with_input(BenchmarkId::from_parameter(data_name), &data, |b, data| { |
||||
b.iter(|| bench(data)) |
||||
}); |
||||
group.finish(); |
||||
} |
||||
|
||||
fn parse_oxttl_ntriples(c: &mut Criterion, name: &str, data: Vec<u8>) { |
||||
parse_bench(c, "oxttl ntriples", name, data, |data| { |
||||
let mut parser = oxttl::NTriplesParser::new().parse(); |
||||
parser.extend_from_slice(data); |
||||
parser.end(); |
||||
while let Some(result) = parser.read_next() { |
||||
result.unwrap(); |
||||
} |
||||
}); |
||||
} |
||||
|
||||
fn parse_oxttl_turtle(c: &mut Criterion, name: &str, data: Vec<u8>) { |
||||
parse_bench(c, "oxttl turtle", name, data, |data| { |
||||
let mut parser = oxttl::TurtleParser::new().parse(); |
||||
parser.extend_from_slice(data); |
||||
parser.end(); |
||||
while let Some(result) = parser.read_next() { |
||||
result.unwrap(); |
||||
} |
||||
}); |
||||
} |
||||
|
||||
fn parse_rio_ntriples(c: &mut Criterion, name: &str, data: Vec<u8>) { |
||||
parse_bench(c, "rio ntriples", name, data, |data| { |
||||
let mut count: u64 = 0; |
||||
NTriplesParser::new(data) |
||||
.parse_all(&mut |_| { |
||||
count += 1; |
||||
Ok(()) as Result<(), TurtleError> |
||||
}) |
||||
.unwrap(); |
||||
}); |
||||
} |
||||
|
||||
fn parse_rio_turtle(c: &mut Criterion, name: &str, data: Vec<u8>) { |
||||
parse_bench(c, "rio turtle", name, data, |data| { |
||||
let mut count: u64 = 0; |
||||
TurtleParser::new(data, None) |
||||
.parse_all(&mut |_| { |
||||
count += 1; |
||||
Ok(()) as Result<(), TurtleError> |
||||
}) |
||||
.unwrap(); |
||||
}); |
||||
} |
||||
|
||||
fn bench_parse_oxttl_ntriples_with_ntriples(c: &mut Criterion) { |
||||
parse_oxttl_ntriples( |
||||
c, |
||||
"ntriples", |
||||
match ntriples_test_data() { |
||||
Ok(d) => d, |
||||
Err(e) => { |
||||
eprintln!("{e}"); |
||||
return; |
||||
} |
||||
}, |
||||
) |
||||
} |
||||
|
||||
fn bench_parse_oxttl_ntriples_with_turtle(c: &mut Criterion) { |
||||
parse_oxttl_turtle( |
||||
c, |
||||
"ntriples", |
||||
match ntriples_test_data() { |
||||
Ok(d) => d, |
||||
Err(e) => { |
||||
eprintln!("{e}"); |
||||
return; |
||||
} |
||||
}, |
||||
) |
||||
} |
||||
|
||||
fn bench_parse_oxttl_turtle_with_turtle(c: &mut Criterion) { |
||||
parse_oxttl_turtle( |
||||
c, |
||||
"turtle", |
||||
match turtle_test_data() { |
||||
Ok(d) => d, |
||||
Err(e) => { |
||||
eprintln!("{e}"); |
||||
return; |
||||
} |
||||
}, |
||||
) |
||||
} |
||||
|
||||
fn bench_parse_rio_ntriples_with_ntriples(c: &mut Criterion) { |
||||
parse_rio_ntriples( |
||||
c, |
||||
"ntriples", |
||||
match ntriples_test_data() { |
||||
Ok(d) => d, |
||||
Err(e) => { |
||||
eprintln!("{e}"); |
||||
return; |
||||
} |
||||
}, |
||||
) |
||||
} |
||||
|
||||
fn bench_parse_rio_ntriples_with_turtle(c: &mut Criterion) { |
||||
parse_rio_turtle( |
||||
c, |
||||
"ntriples", |
||||
match ntriples_test_data() { |
||||
Ok(d) => d, |
||||
Err(e) => { |
||||
eprintln!("{e}"); |
||||
return; |
||||
} |
||||
}, |
||||
) |
||||
} |
||||
|
||||
fn bench_parse_rio_turtle_with_turtle(c: &mut Criterion) { |
||||
parse_rio_turtle( |
||||
c, |
||||
"turtle", |
||||
match turtle_test_data() { |
||||
Ok(d) => d, |
||||
Err(e) => { |
||||
eprintln!("{e}"); |
||||
return; |
||||
} |
||||
}, |
||||
) |
||||
} |
||||
|
||||
criterion_group!( |
||||
w3c_testsuite, |
||||
bench_parse_rio_ntriples_with_ntriples, |
||||
bench_parse_rio_ntriples_with_turtle, |
||||
bench_parse_rio_turtle_with_turtle, |
||||
bench_parse_oxttl_ntriples_with_ntriples, |
||||
bench_parse_oxttl_ntriples_with_turtle, |
||||
bench_parse_oxttl_turtle_with_turtle |
||||
); |
||||
|
||||
criterion_main!(w3c_testsuite); |
@ -0,0 +1,2 @@ |
||||
_:` <http://example.com/pb> <http://example.com/o> . |
||||
<http://example.com/s> <http://example.com/p> <http://example.com/o> . |
@ -0,0 +1,2 @@ |
||||
<http:// /s> <http://example.com/p> <http://example.com/o> . |
||||
<http://example.com/s> <http://example.com/p> <http://example.com/o> . |
@ -0,0 +1,2 @@ |
||||
<http://example.com/s> <http://example.com/p> "\a" . |
||||
<http://example.com/s> <http://example.com/p> <http://example.com/o> . |
@ -0,0 +1,2 @@ |
||||
<http://example.com/s> <http://example.com/p> <http://example.com/o> . |
||||
<http://example.com/s> <http://example.com/p> <http://example.com/o2> . |
@ -0,0 +1,2 @@ |
||||
<http://example.com/s> <http://example.com/p> "o" . |
||||
<http://example.com/s> <http://example.com/p> <http://example.com/o> . |
@ -0,0 +1 @@ |
||||
<http://example.com/s> <http://example.com/p> <http://example.com/o> . |
@ -0,0 +1 @@ |
||||
<http://example.com/s> <http://example.com/p> "o" . |
@ -0,0 +1,129 @@ |
||||
@prefix mf: <http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#> . |
||||
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . |
||||
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . |
||||
@prefix ox: <https://github.com/oxigraph/oxigraph/tests#> . |
||||
|
||||
<> |
||||
rdf:type mf:Manifest ; |
||||
rdfs:comment "Oxigraph parser recovery test cases" ; |
||||
mf:entries ( |
||||
<#invalid_iri_nt> |
||||
<#invalid_iri_ttl> |
||||
<#invalid_iri_n3> |
||||
<#invalid_bnode_nt> |
||||
<#invalid_bnode_ttl> |
||||
<#invalid_bnode_n3> |
||||
<#invalid_string_nt> |
||||
<#invalid_string_ttl> |
||||
<#invalid_string_n3> |
||||
<#missing_dot_at_end_of_triple_with_iri_middle_nt> |
||||
<#missing_dot_at_end_of_triple_with_iri_middle_ttl> |
||||
<#missing_dot_at_end_of_triple_with_iri_end_nt> |
||||
<#missing_dot_at_end_of_triple_with_iri_end_ttl> |
||||
<#missing_dot_at_end_of_triple_with_string_middle_nt> |
||||
<#missing_dot_at_end_of_triple_with_string_middle_ttl> |
||||
<#missing_dot_at_end_of_triple_with_string_end_nt> |
||||
<#missing_dot_at_end_of_triple_with_string_end_ttl> |
||||
) . |
||||
|
||||
<#invalid_iri_nt> |
||||
rdf:type ox:TestNTripleRecovery ; |
||||
mf:name "IRI with space" ; |
||||
mf:action <invalid_iri.nt> ; |
||||
mf:result <iri_spo.nt> . |
||||
|
||||
<#invalid_iri_ttl> |
||||
rdf:type ox:TestTurtleRecovery ; |
||||
mf:name "IRI with space" ; |
||||
mf:action <invalid_iri.nt> ; |
||||
mf:result <iri_spo.nt> . |
||||
|
||||
<#invalid_iri_n3> |
||||
rdf:type ox:TestN3Recovery ; |
||||
mf:name "IRI with space" ; |
||||
mf:action <invalid_iri.nt> ; |
||||
mf:result <iri_spo.nt> . |
||||
|
||||
<#invalid_bnode_nt> |
||||
rdf:type ox:TestNTripleRecovery ; |
||||
mf:name "bad character in blank node" ; |
||||
mf:action <invalid_bnode.nt> ; |
||||
mf:result <iri_spo.nt> . |
||||
|
||||
<#invalid_bnode_ttl> |
||||
rdf:type ox:TestTurtleRecovery ; |
||||
mf:name "bad character in blank node" ; |
||||
mf:action <invalid_bnode.nt> ; |
||||
mf:result <iri_spo.nt> . |
||||
|
||||
<#invalid_bnode_n3> |
||||
rdf:type ox:TestN3Recovery ; |
||||
mf:name "bad character in blank node" ; |
||||
mf:action <invalid_bnode.nt> ; |
||||
mf:result <iri_spo.nt> . |
||||
|
||||
<#invalid_string_nt> |
||||
rdf:type ox:TestNTripleRecovery ; |
||||
mf:name "invalid escape sequence in string" ; |
||||
mf:action <invalid_string.nt> ; |
||||
mf:result <iri_spo.nt> . |
||||
|
||||
<#invalid_string_ttl> |
||||
rdf:type ox:TestTurtleRecovery ; |
||||
mf:name "invalid escape sequence in string" ; |
||||
mf:action <invalid_string.nt> ; |
||||
mf:result <iri_spo.nt> . |
||||
|
||||
<#invalid_string_n3> |
||||
rdf:type ox:TestN3Recovery ; |
||||
mf:name "invalid escape sequence in string" ; |
||||
mf:action <invalid_string.nt> ; |
||||
mf:result <iri_spo.nt> . |
||||
|
||||
<#missing_dot_at_end_of_triple_with_iri_middle_nt> |
||||
rdf:type ox:TestNTripleRecovery ; |
||||
mf:name "missing dot at the end of a triple" ; |
||||
mf:action <missing_dot_at_end_of_triple_with_iri_middle.nt> ; |
||||
mf:result <iri2_spo.nt> . |
||||
|
||||
<#missing_dot_at_end_of_triple_with_iri_middle_ttl> |
||||
rdf:type ox:TestTurtleRecovery ; |
||||
mf:name "missing dot at the end of a triple" ; |
||||
mf:action <missing_dot_at_end_of_triple_with_iri_middle.nt> ; |
||||
mf:result <iri2_spo.nt> . |
||||
|
||||
<#missing_dot_at_end_of_triple_with_iri_end_nt> |
||||
rdf:type ox:TestNTripleRecovery ; |
||||
mf:name "missing dot at the end of a triple" ; |
||||
mf:action <missing_dot_at_end_of_triple_with_iri_end.nt> ; |
||||
mf:result <iri_spo.nt> . |
||||
|
||||
<#missing_dot_at_end_of_triple_with_iri_end_ttl> |
||||
rdf:type ox:TestTurtleRecovery ; |
||||
mf:name "missing dot at the end of a triple" ; |
||||
mf:action <missing_dot_at_end_of_triple_with_iri_end.nt> ; |
||||
mf:result <iri_spo.nt> . |
||||
|
||||
<#missing_dot_at_end_of_triple_with_string_middle_nt> |
||||
rdf:type ox:TestNTripleRecovery ; |
||||
mf:name "missing dot at the end of a triple" ; |
||||
mf:action <missing_dot_at_end_of_triple_with_string_middle.nt> ; |
||||
mf:result <iri2_string_spo.nt> . |
||||
|
||||
<#missing_dot_at_end_of_triple_with_string_middle_ttl> |
||||
rdf:type ox:TestTurtleRecovery ; |
||||
mf:name "missing dot at the end of a triple" ; |
||||
mf:action <missing_dot_at_end_of_triple_with_string_middle.nt> ; |
||||
mf:result <iri2_string_spo.nt> . |
||||
|
||||
<#missing_dot_at_end_of_triple_with_string_end_nt> |
||||
rdf:type ox:TestNTripleRecovery ; |
||||
mf:name "missing dot at the end of a triple" ; |
||||
mf:action <missing_dot_at_end_of_triple_with_string_end.nt> ; |
||||
mf:result <iri_string_spo.nt> . |
||||
|
||||
<#missing_dot_at_end_of_triple_with_string_end_ttl> |
||||
rdf:type ox:TestTurtleRecovery ; |
||||
mf:name "missing dot at the end of a triple" ; |
||||
mf:action <missing_dot_at_end_of_triple_with_string_end.nt> ; |
||||
mf:result <iri_string_spo.nt> . |
@ -0,0 +1 @@ |
||||
<http://example.com/s> <http://example.com/p> <http://example.com/o> |
@ -0,0 +1,2 @@ |
||||
<http://example.com/s> <http://example.com/p> <http://example.com/o> |
||||
<http://example.com/s> <http://example.com/p> <http://example.com/o2> . |
@ -0,0 +1 @@ |
||||
<http://example.com/s> <http://example.com/p> "o" |
@ -0,0 +1,2 @@ |
||||
<http://example.com/s> <http://example.com/p> "o" |
||||
<http://example.com/s> <http://example.com/p> <http://example.com/o> . |
@ -0,0 +1,2 @@ |
||||
<http://example.com/s> <http://example.com/p> "foo"@base . |
||||
<http://example.com/s> <http://example.com/p> "bar"@prefix . |
@ -0,0 +1,3 @@ |
||||
@prefix : <http://example.com/> . |
||||
|
||||
:s :p "foo"@base , "bar"@prefix . |
@ -0,0 +1 @@ |
||||
<http://foo> <http://foo> "foo"@badlanguagetag . |
@ -0,0 +1,2 @@ |
||||
((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((
(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((( |
||||
|
@ -0,0 +1,2 @@ |
||||
<urn:zamaudio:ZaMultiComp#preset001> <http://lv2plug.in/ns/ext/state#state> _:1 . |
||||
|
@ -0,0 +1,6 @@ |
||||
@prefix state: <http://lv2plug.in/ns/ext/state#> . |
||||
|
||||
<urn:zamaudio:ZaMultiComp#preset001> |
||||
state:state [ |
||||
] . |
||||
|
@ -0,0 +1,3 @@ |
||||
<http://example.com/prefix/s> <http://example.com/prefix/p> <http://example.com/true/o> . |
||||
<http://example.com/base/s> <http://example.com/base/p> <http://example.com/false/o> . |
||||
<http://example.com/graph/s> <http://example.com/graph/p> <http://example.com/graph/o> <http://example.com/graph/g> . |
@ -0,0 +1,2 @@ |
||||
<http://example.com/prefix/s> <http://example.com/prefix/p> <http://example.com/true/o> . |
||||
<http://example.com/base/s> <http://example.com/base/p> <http://example.com/false/o> . |
@ -0,0 +1,10 @@ |
||||
base <http://example.com/> |
||||
prefix prefix: <prefix/> |
||||
prefix base: <base/> |
||||
prefix graph: <graph/> |
||||
prefix true: <true/> |
||||
prefix false: <false/> |
||||
|
||||
prefix:s prefix:p true:o . |
||||
base:s base:p false:o . |
||||
graph:g { graph:s graph:p graph:o . } |
@ -0,0 +1,8 @@ |
||||
base <http://example.com/> |
||||
prefix prefix: <prefix/> |
||||
prefix base: <base/> |
||||
prefix true: <true/> |
||||
prefix false: <false/> |
||||
|
||||
prefix:s prefix:p true:o . |
||||
base:s base:p false:o . |
@ -0,0 +1 @@ |
||||
<http://foo> <http://foo> "foo"@en-us . |
@ -0,0 +1,4 @@ |
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:foo="http://foo"> |
||||
<rdf:Description rdf:about="http://foo" xml:lang="en-US" foo:="foo"> |
||||
</rdf:Description> |
||||
</rdf:RDF> |
@ -0,0 +1 @@ |
||||
<http://foo> <http://foo> "foo"@en-US-US . |
@ -0,0 +1 @@ |
||||
<http://example.com/foo> <http://www.w3.org/1999/02/22-rdf-syntax-ns#value> " bar\n" . |
@ -0,0 +1,7 @@ |
||||
<?xml version="1.0"?> |
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> |
||||
<rdf:Description rdf:about="http://example.com/foo"> |
||||
<rdf:value> bar |
||||
</rdf:value> |
||||
</rdf:Description> |
||||
</rdf:RDF> |
@ -0,0 +1,90 @@ |
||||
@prefix mf: <http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#> . |
||||
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . |
||||
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . |
||||
@prefix rdft: <http://www.w3.org/ns/rdftest#> . |
||||
|
||||
<> |
||||
rdf:type mf:Manifest ; |
||||
rdfs:comment "Oxigraph parsers test case" ; |
||||
mf:entries ( |
||||
<#blank_node_with_linebreak> |
||||
<#bad_lang> |
||||
<#language_normalization_ttl> |
||||
<#language_normalization_xml> |
||||
<#xml_entities> |
||||
<#xml_nested_entities> |
||||
<#literal_value_space> |
||||
<#bad_parentheses> |
||||
<#keyword_vs_prefix_ttl> |
||||
<#keyword_vs_prefix_trig> |
||||
<#at_keywords_as_lang_tag> |
||||
) . |
||||
|
||||
<#no_end_line_jump> |
||||
rdf:type rdft:TestNTriplesPositiveSyntax ; |
||||
mf:name "No line jump at the end of the file" ; |
||||
mf:action <no_end_line_jump.nt> . |
||||
|
||||
<#blank_node_with_linebreak> |
||||
rdf:type rdft:TestTurtleEval ; |
||||
mf:name "blank node with linebreak" ; |
||||
mf:action <blank_node_with_linebreak.ttl> ; |
||||
mf:result <blank_node_with_linebreak.nt> . |
||||
|
||||
<#language_normalization_ttl> |
||||
rdf:type rdft:TestTurtleEval ; |
||||
mf:name "language case normalization" ; |
||||
mf:action <language_normalization.ttl> ; |
||||
mf:result <language_normalization.nt> . |
||||
|
||||
<#language_normalization_xml> |
||||
rdf:type rdft:TestXMLEval ; |
||||
mf:name "language case normalization" ; |
||||
mf:action <language_normalization.rdf> ; |
||||
mf:result <language_normalization.nt> . |
||||
|
||||
<#bad_lang> |
||||
rdf:type rdft:TestTurtleNegativeSyntax ; |
||||
mf:name "bad language tag" ; |
||||
mf:action <bad_lang.ttl> . |
||||
|
||||
<#xml_entities> |
||||
rdf:type rdft:TestXMLEval ; |
||||
mf:name "custom XML entities" ; |
||||
mf:action <xml_entities.rdf> ; |
||||
mf:result <xml_entities.nt> . |
||||
|
||||
<#xml_nested_entities> |
||||
rdf:type rdft:TestXMLEval ; |
||||
mf:name "custom XML entities with nested definitions" ; |
||||
mf:action <xml_nested_entities.rdf> ; |
||||
mf:result <xml_nested_entities.nt> . |
||||
|
||||
<#literal_value_space> |
||||
rdf:type rdft:TestXMLEval ; |
||||
mf:name "spaces in literal values" ; |
||||
mf:action <literal_value_space.rdf> ; |
||||
mf:result <literal_value_space.nt> . |
||||
|
||||
<#bad_parentheses> |
||||
rdf:type rdft:TestTurtleNegativeSyntax ; |
||||
mf:name "a lot of parentheses that might generate a stack overflow" ; |
||||
mf:action <bad_parentheses.ttl> . |
||||
|
||||
<#keyword_vs_prefix_ttl> |
||||
rdf:type rdft:TestTurtleEval ; |
||||
mf:name "usage of keywords as prefix" ; |
||||
mf:action <keyword_vs_prefix.ttl> ; |
||||
mf:result <keyword_vs_prefix.nt> . |
||||
|
||||
<#keyword_vs_prefix_trig> |
||||
rdf:type rdft:TestTrigEval ; |
||||
mf:name "usage of keywords as prefix" ; |
||||
mf:action <keyword_vs_prefix.trig> ; |
||||
mf:result <keyword_vs_prefix.nq> . |
||||
|
||||
<#at_keywords_as_lang_tag> |
||||
rdf:type rdft:TestTurtleEval ; |
||||
mf:name "usage of at keywords as language tags" ; |
||||
mf:action <at_keywords_as_lang_tag.ttl> ; |
||||
mf:result <at_keywords_as_lang_tag.nt> . |
@ -0,0 +1 @@ |
||||
<http://example.com> <http://example.com> <http://example.com> . |
@ -0,0 +1 @@ |
||||
<http://example.com/foo> <http://example.com/2/test> "bar"^^<http://www.w3.org/2001/XMLSchema#string> . |
@ -0,0 +1,10 @@ |
||||
<?xml version="1.0"?> |
||||
<!DOCTYPE rdf:RDF [ |
||||
<!ENTITY xsd "http://www.w3.org/2001/XMLSchema#" > |
||||
<!ENTITY ex "http://example.com/"> |
||||
]> |
||||
<rdf:RDF xmlns:ex2="&ex;2/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"> |
||||
<rdf:Description rdf:about="&ex;foo"> |
||||
<ex2:test rdf:datatype="&xsd;string">bar</ex2:test> |
||||
</rdf:Description> |
||||
</rdf:RDF> |
@ -0,0 +1 @@ |
||||
<http://example.com/foo> <http://example.com/2/test> "bar"^^<http://www.w3.org/2001/XMLSchema#string> . |
@ -0,0 +1,15 @@ |
||||
<?xml version="1.0"?> |
||||
|
||||
<!DOCTYPE rdf:RDF [ |
||||
<!ENTITY ex "http://example.com/"> |
||||
<!ENTITY w3 "http://www.w3.org"> |
||||
<!ENTITY rdf "&w3;/1999/02/22-rdf-syntax-ns#"> |
||||
<!ENTITY rdfs "&w3;/2000/01/rdf-schema#"> |
||||
<!ENTITY xsd "&w3;/2001/XMLSchema#"> |
||||
]> |
||||
|
||||
<rdf:RDF xmlns:ex2="&ex;2/" xmlns:rdf="&rdf;"> |
||||
<rdf:Description rdf:about="&ex;foo"> |
||||
<ex2:test rdf:datatype="&xsd;string">bar</ex2:test> |
||||
</rdf:Description> |
||||
</rdf:RDF> |
@ -0,0 +1,13 @@ |
||||
Copyright 2011-2022 David Robillard <d@drobilla.net> |
||||
|
||||
Permission to use, copy, modify, and/or distribute this software for any |
||||
purpose with or without fee is hereby granted, provided that the above |
||||
copyright notice and this permission notice appear in all copies. |
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH |
||||
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
||||
FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, |
||||
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM |
||||
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR |
||||
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR |
||||
PERFORMANCE OF THIS SOFTWARE. |
@ -0,0 +1 @@ |
||||
Testsuite from [Serd](https://drobilla.net/software/serd) project. |
@ -0,0 +1,2 @@ |
||||
# prefix name must end in a : |
||||
@prefix a <#> . |
@ -0,0 +1,3 @@ |
||||
# Forbidden by RDF - predicate cannot be blank |
||||
@prefix : <http://example.org/base#> . |
||||
:a [ :b :c ] :d . |
@ -0,0 +1,3 @@ |
||||
# Forbidden by RDF - predicate cannot be blank |
||||
@prefix : <http://example.org/base#> . |
||||
:a [] :b . |
@ -0,0 +1,3 @@ |
||||
# 'a' only allowed as a predicate |
||||
@prefix : <http://example.org/base#> . |
||||
a :a :b . |
@ -0,0 +1,3 @@ |
||||
# No comma is allowed in collections |
||||
@prefix : <http://example.org/stuff/1.0/> . |
||||
:a :b ( "apple", "banana" ) . |
@ -0,0 +1,4 @@ |
||||
# N3 {}s are not in Turtle |
||||
@prefix : <http://example.org/stuff/1.0/> . |
||||
{ :a :b :c . } :d :e . |
||||
|
@ -0,0 +1,3 @@ |
||||
# is and of are not in turtle |
||||
@prefix : <http://example.org/stuff/1.0/> . |
||||
:a is :b of :c . |
@ -0,0 +1,4 @@ |
||||
# paths are not in turtle |
||||
@prefix : <http://example.org/stuff/1.0/> . |
||||
:a.:b.:c . |
||||
:a^:b^:c . |
@ -0,0 +1,2 @@ |
||||
@keywords something. |
||||
# @keywords is not in turtle |
@ -0,0 +1,3 @@ |
||||
# implies is not in turtle |
||||
@prefix : <http://example.org/stuff/1.0/> . |
||||
:a => :b . |
@ -0,0 +1,3 @@ |
||||
# equivalence is not in turtle |
||||
@prefix : <http://example.org/stuff/1.0/> . |
||||
:a = :b . |
@ -0,0 +1,3 @@ |
||||
# @forAll is not in turtle |
||||
@prefix : <http://example.org/stuff/1.0/> . |
||||
@forAll :x . |
@ -0,0 +1,3 @@ |
||||
# @forSome is not in turtle |
||||
@prefix : <http://example.org/stuff/1.0/> . |
||||
@forSome :x . |
@ -0,0 +1,3 @@ |
||||
# <= is not in turtle |
||||
@prefix : <http://example.org/stuff/1.0/> . |
||||
:a <= :b . |
@ -0,0 +1,6 @@ |
||||
# Test long literals with missing end |
||||
@prefix : <http://example.org/ex#> . |
||||
:a :b """a long |
||||
literal |
||||
with |
||||
newlines |
@ -0,0 +1 @@ |
||||
@base "I'm quite certain this is not a URI" . |
@ -0,0 +1 @@ |
||||
<http://example.org/s> <http://example.org/p> _|invalid . |
@ -0,0 +1,3 @@ |
||||
@prefix eg: <http://example.org/> . |
||||
|
||||
_:.bad a eg:Thing . |
@ -0,0 +1,3 @@ |
||||
ﻴ# This file starts with the first two bytes of the UTF-8 Byte Order Mark |
||||
|
||||
<http://example.org/thing> a <http://example.org/Thing> . |
@ -0,0 +1,3 @@ |
||||
@prefix eg: <http://example.org/> . |
||||
|
||||
eg:†bad <http://example.org/p> <http://example.org/o> . |
@ -0,0 +1 @@ |
||||
bad†:s <http://example.org/p> <http://example.org/o> . |
@ -0,0 +1 @@ |
||||
<http://example.org/s> <http://example.org/p> "value"^<http://example.org/t> . |
@ -0,0 +1 @@ |
||||
<> <http://example.org/pred> "hello"^^"not-a-uri" . |
@ -0,0 +1 @@ |
||||
<http://example.org/s> . <http://example.org/p> <http://example.org/o> . |
@ -0,0 +1 @@ |
||||
[ <http://example.org/p> (1. |
@ -0,0 +1,3 @@ |
||||
@prefix eg: <http://example.org/> . |
||||
|
||||
<> eg:comment "" |
@ -0,0 +1,3 @@ |
||||
@prefix eg: <http://example.org/> . |
||||
|
||||
<> eg:comment " |
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue