Parsers: adds "unchecked" option for faster parsing

Does not validate IRIs and language tags
pull/714/head
Tpt 12 months ago committed by Thomas Tanon
parent d170b53609
commit a84b898fda
  1. 4
      Cargo.lock
  2. 2
      cli/Cargo.toml
  3. 11
      cli/src/main.rs
  4. 45
      fuzz/fuzz_targets/nquads.rs
  5. 66
      fuzz/fuzz_targets/trig.rs
  6. 2
      lib/Cargo.toml
  7. 10
      lib/benches/store.rs
  8. 2
      lib/oxrdf/Cargo.toml
  9. 72
      lib/oxrdfio/src/parser.rs
  10. 2
      lib/oxrdfxml/Cargo.toml
  11. 130
      lib/oxrdfxml/src/parser.rs
  12. 2
      lib/oxttl/Cargo.toml
  13. 175
      lib/oxttl/src/lexer.rs
  14. 13
      lib/oxttl/src/line_formats.rs
  15. 41
      lib/oxttl/src/n3.rs
  16. 13
      lib/oxttl/src/nquads.rs
  17. 36
      lib/oxttl/src/ntriples.rs
  18. 38
      lib/oxttl/src/terse.rs
  19. 13
      lib/oxttl/src/trig.rs
  20. 13
      lib/oxttl/src/turtle.rs
  21. 2
      lib/spargebra/Cargo.toml
  22. 12
      lib/src/store.rs
  23. 2
      lints/test_debian_compatibility.py

4
Cargo.lock generated

@ -1113,9 +1113,9 @@ checksum = "8d91edf4fbb970279443471345a4e8c491bf05bb283b3e6c88e4e606fd8c181b"
[[package]] [[package]]
name = "oxiri" name = "oxiri"
version = "0.2.2" version = "0.2.3-alpha.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb175ec8981211357b7b379869c2f8d555881c55ea62311428ec0de46d89bd5c" checksum = "b225dad32cfaa43a960b93f01fa7f87528ac07e794b80f6d9a0153e0222557e2"
[[package]] [[package]]
name = "oxrdf" name = "oxrdf"

@ -32,7 +32,7 @@ clap = { version = "4.0", features = ["derive"] }
oxigraph = { version = "0.4.0-alpha.1-dev", path = "../lib" } oxigraph = { version = "0.4.0-alpha.1-dev", path = "../lib" }
rand = "0.8" rand = "0.8"
url = "2.4" url = "2.4"
oxiri = "0.2" oxiri = "0.2.3-alpha.1"
flate2 = "1.0" flate2 = "1.0"
rayon-core = "1.11" rayon-core = "1.11"

@ -121,6 +121,8 @@ enum Command {
destination: PathBuf, destination: PathBuf,
}, },
/// Load file(s) into the store. /// Load file(s) into the store.
///
/// Feel free to enable the --lenient option if you know your input is valid to get better performance.
Load { Load {
/// Directory in which Oxigraph data are persisted. /// Directory in which Oxigraph data are persisted.
#[arg(short, long, value_hint = ValueHint::DirPath)] #[arg(short, long, value_hint = ValueHint::DirPath)]
@ -143,6 +145,8 @@ enum Command {
#[arg(long, value_hint = ValueHint::Url)] #[arg(long, value_hint = ValueHint::Url)]
base: Option<String>, base: Option<String>,
/// Attempt to keep loading even if the data file is invalid. /// Attempt to keep loading even if the data file is invalid.
///
/// This disables most of the validation of RDF content.
#[arg(long)] #[arg(long)]
lenient: bool, lenient: bool,
/// Name of the graph to load the data to. /// Name of the graph to load the data to.
@ -391,6 +395,7 @@ pub fn main() -> anyhow::Result<()> {
format.context("The --format option must be set when loading from stdin")?, format.context("The --format option must be set when loading from stdin")?,
base.as_deref(), base.as_deref(),
graph, graph,
lenient,
) )
} else { } else {
ThreadPoolBuilder::new() ThreadPoolBuilder::new()
@ -444,6 +449,7 @@ pub fn main() -> anyhow::Result<()> {
}), }),
base.as_deref(), base.as_deref(),
graph, graph,
lenient,
) )
} else { } else {
bulk_load( bulk_load(
@ -454,6 +460,7 @@ pub fn main() -> anyhow::Result<()> {
}), }),
base.as_deref(), base.as_deref(),
graph, graph,
lenient,
) )
} }
} { } {
@ -784,6 +791,7 @@ fn bulk_load(
format: RdfFormat, format: RdfFormat,
base_iri: Option<&str>, base_iri: Option<&str>,
to_graph_name: Option<NamedNode>, to_graph_name: Option<NamedNode>,
lenient: bool,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let mut parser = RdfParser::from_format(format); let mut parser = RdfParser::from_format(format);
if let Some(to_graph_name) = to_graph_name { if let Some(to_graph_name) = to_graph_name {
@ -794,6 +802,9 @@ fn bulk_load(
.with_base_iri(base_iri) .with_base_iri(base_iri)
.with_context(|| format!("Invalid base IRI {base_iri}"))?; .with_context(|| format!("Invalid base IRI {base_iri}"))?;
} }
if lenient {
parser = parser.unchecked();
}
loader.load_from_read(parser, read)?; loader.load_from_read(parser, read)?;
Ok(()) Ok(())
} }

@ -4,43 +4,60 @@ use libfuzzer_sys::fuzz_target;
use oxrdf::Quad; use oxrdf::Quad;
use oxttl::{NQuadsParser, NQuadsSerializer}; use oxttl::{NQuadsParser, NQuadsSerializer};
fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<String>) { fn parse<'a>(
chunks: impl IntoIterator<Item = &'a [u8]>,
unchecked: bool,
) -> (Vec<Quad>, Vec<String>) {
let mut quads = Vec::new(); let mut quads = Vec::new();
let mut errors = Vec::new(); let mut errors = Vec::new();
let mut parser = NQuadsParser::new().with_quoted_triples().parse(); let mut parser = NQuadsParser::new().with_quoted_triples();
if unchecked {
parser = parser.unchecked();
}
let mut reader = parser.parse();
for chunk in chunks { for chunk in chunks {
parser.extend_from_slice(chunk); reader.extend_from_slice(chunk);
while let Some(result) = parser.read_next() { while let Some(result) = reader.read_next() {
match result { match result {
Ok(quad) => quads.push(quad), Ok(quad) => quads.push(quad),
Err(error) => errors.push(error.to_string()), Err(error) => errors.push(error.to_string()),
} }
} }
} }
parser.end(); reader.end();
while let Some(result) = parser.read_next() { while let Some(result) = reader.read_next() {
match result { match result {
Ok(quad) => quads.push(quad), Ok(quad) => quads.push(quad),
Err(error) => errors.push(error.to_string()), Err(error) => errors.push(error.to_string()),
} }
} }
assert!(parser.is_end()); assert!(reader.is_end());
(quads, errors) (quads, errors)
} }
fuzz_target!(|data: &[u8]| { fuzz_target!(|data: &[u8]| {
// We parse with splitting // We parse with splitting
let (quads, errors) = parse(data.split(|c| *c == 0xFF)); let (quads, errors) = parse(data.split(|c| *c == 0xFF), false);
// We parse without splitting // We parse without splitting
let (quads_without_split, errors_without_split) = parse([data let (quads_without_split, errors_without_split) = parse(
.iter() [data
.copied() .iter()
.filter(|c| *c != 0xFF) .copied()
.collect::<Vec<_>>() .filter(|c| *c != 0xFF)
.as_slice()]); .collect::<Vec<_>>()
.as_slice()],
false,
);
assert_eq!(quads, quads_without_split); assert_eq!(quads, quads_without_split);
assert_eq!(errors, errors_without_split); assert_eq!(errors, errors_without_split);
// We test also unchecked if valid
if errors.is_empty() {
let (quads_unchecked, errors_unchecked) = parse(data.split(|c| *c == 0xFF), true);
assert!(errors_unchecked.is_empty());
assert_eq!(quads, quads_unchecked);
}
// We serialize // We serialize
let mut writer = NQuadsSerializer::new().serialize_to_write(Vec::new()); let mut writer = NQuadsSerializer::new().serialize_to_write(Vec::new());
for quad in &quads { for quad in &quads {

@ -4,31 +4,37 @@ use libfuzzer_sys::fuzz_target;
use oxrdf::{Dataset, GraphName, Quad, Subject, Term, Triple}; use oxrdf::{Dataset, GraphName, Quad, Subject, Term, Triple};
use oxttl::{TriGParser, TriGSerializer}; use oxttl::{TriGParser, TriGSerializer};
fn parse<'a>(chunks: impl IntoIterator<Item = &'a [u8]>) -> (Vec<Quad>, Vec<String>) { fn parse<'a>(
chunks: impl IntoIterator<Item = &'a [u8]>,
unchecked: bool,
) -> (Vec<Quad>, Vec<String>) {
let mut quads = Vec::new(); let mut quads = Vec::new();
let mut errors = Vec::new(); let mut errors = Vec::new();
let mut parser = TriGParser::new() let mut parser = TriGParser::new()
.with_quoted_triples() .with_quoted_triples()
.with_base_iri("http://example.com/") .with_base_iri("http://example.com/")
.unwrap() .unwrap();
.parse(); if unchecked {
parser = parser.unchecked();
}
let mut reader = parser.parse();
for chunk in chunks { for chunk in chunks {
parser.extend_from_slice(chunk); reader.extend_from_slice(chunk);
while let Some(result) = parser.read_next() { while let Some(result) = reader.read_next() {
match result { match result {
Ok(quad) => quads.push(quad), Ok(quad) => quads.push(quad),
Err(error) => errors.push(error.to_string()), Err(error) => errors.push(error.to_string()),
} }
} }
} }
parser.end(); reader.end();
while let Some(result) = parser.read_next() { while let Some(result) = reader.read_next() {
match result { match result {
Ok(quad) => quads.push(quad), Ok(quad) => quads.push(quad),
Err(error) => errors.push(error.to_string()), Err(error) => errors.push(error.to_string()),
} }
} }
assert!(parser.is_end()); assert!(reader.is_end());
(quads, errors) (quads, errors)
} }
@ -66,14 +72,22 @@ fn serialize_quads(quads: &[Quad]) -> Vec<u8> {
fuzz_target!(|data: &[u8]| { fuzz_target!(|data: &[u8]| {
// We parse with splitting // We parse with splitting
let (quads, errors) = parse(data.split(|c| *c == 0xFF)); let (quads, errors) = parse(data.split(|c| *c == 0xFF), false);
// We parse without splitting // We parse without splitting
let (quads_without_split, errors_without_split) = parse([data let (quads_without_split, errors_without_split) = parse(
.iter() [data
.copied() .iter()
.filter(|c| *c != 0xFF) .copied()
.collect::<Vec<_>>() .filter(|c| *c != 0xFF)
.as_slice()]); .collect::<Vec<_>>()
.as_slice()],
false,
);
let (quads_unchecked, errors_unchecked) = parse(data.split(|c| *c == 0xFF), true);
if errors.is_empty() {
assert!(errors_unchecked.is_empty());
}
let bnodes_count = quads.iter().map(count_quad_blank_nodes).sum::<usize>(); let bnodes_count = quads.iter().map(count_quad_blank_nodes).sum::<usize>();
if bnodes_count == 0 { if bnodes_count == 0 {
assert_eq!( assert_eq!(
@ -83,6 +97,15 @@ fuzz_target!(|data: &[u8]| {
String::from_utf8_lossy(&serialize_quads(&quads)), String::from_utf8_lossy(&serialize_quads(&quads)),
String::from_utf8_lossy(&serialize_quads(&quads_without_split)) String::from_utf8_lossy(&serialize_quads(&quads_without_split))
); );
if errors.is_empty() {
assert_eq!(
quads,
quads_unchecked,
"Validating:\n{}\nUnchecked:\n{}",
String::from_utf8_lossy(&serialize_quads(&quads)),
String::from_utf8_lossy(&serialize_quads(&quads_unchecked))
);
}
} else if bnodes_count <= 4 { } else if bnodes_count <= 4 {
let mut dataset_with_split = quads.iter().collect::<Dataset>(); let mut dataset_with_split = quads.iter().collect::<Dataset>();
let mut dataset_without_split = quads_without_split.iter().collect::<Dataset>(); let mut dataset_without_split = quads_without_split.iter().collect::<Dataset>();
@ -95,6 +118,19 @@ fuzz_target!(|data: &[u8]| {
String::from_utf8_lossy(&serialize_quads(&quads)), String::from_utf8_lossy(&serialize_quads(&quads)),
String::from_utf8_lossy(&serialize_quads(&quads_without_split)) String::from_utf8_lossy(&serialize_quads(&quads_without_split))
); );
if errors.is_empty() {
if errors.is_empty() {
let mut dataset_unchecked = quads_unchecked.iter().collect::<Dataset>();
dataset_unchecked.canonicalize();
assert_eq!(
dataset_with_split,
dataset_unchecked,
"Validating:\n{}\nUnchecked:\n{}",
String::from_utf8_lossy(&serialize_quads(&quads)),
String::from_utf8_lossy(&serialize_quads(&quads_unchecked))
);
}
}
} }
assert_eq!(errors, errors_without_split); assert_eq!(errors, errors_without_split);

@ -31,7 +31,7 @@ hex = "0.4"
json-event-parser = "0.2.0-alpha.2" json-event-parser = "0.2.0-alpha.2"
md-5 = "0.10" md-5 = "0.10"
oxilangtag = "0.1" oxilangtag = "0.1"
oxiri = "0.2" oxiri = "0.2.3-alpha.1"
oxrdf = { version = "0.2.0-alpha.1-dev", path = "oxrdf", features = ["rdf-star", "oxsdatatypes"] } oxrdf = { version = "0.2.0-alpha.1-dev", path = "oxrdf", features = ["rdf-star", "oxsdatatypes"] }
oxrdfio = { version = "0.1.0-alpha.1-dev", path = "oxrdfio", features = ["rdf-star"] } oxrdfio = { version = "0.1.0-alpha.1-dev", path = "oxrdfio", features = ["rdf-star"] }
oxsdatatypes = { version = "0.2.0-alpha.1-dev", path="oxsdatatypes" } oxsdatatypes = { version = "0.2.0-alpha.1-dev", path="oxsdatatypes" }

@ -24,6 +24,16 @@ fn parse_nt(c: &mut Criterion) {
} }
}) })
}); });
group.bench_function("parse BSBM explore 1000 unchecked", |b| {
b.iter(|| {
for r in RdfParser::from_format(RdfFormat::NTriples)
.unchecked()
.parse_read(data.as_slice())
{
r.unwrap();
}
})
});
} }
fn store_load(c: &mut Criterion) { fn store_load(c: &mut Criterion) {

@ -21,7 +21,7 @@ rdf-star = []
[dependencies] [dependencies]
rand = "0.8" rand = "0.8"
oxilangtag = "0.1" oxilangtag = "0.1"
oxiri = "0.2" oxiri = "0.2.3-alpha.1"
oxsdatatypes = { version = "0.2.0-alpha.1-dev", path="../oxsdatatypes", optional = true } oxsdatatypes = { version = "0.2.0-alpha.1-dev", path="../oxsdatatypes", optional = true }
[lints] [lints]

@ -158,20 +158,16 @@ impl RdfParser {
/// # Result::<_,Box<dyn std::error::Error>>::Ok(()) /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ``` /// ```
#[inline] #[inline]
pub fn with_base_iri(self, base_iri: impl Into<String>) -> Result<Self, IriParseError> { pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
Ok(Self { self.inner = match self.inner {
inner: match self.inner { RdfParserKind::N3(p) => RdfParserKind::N3(p),
RdfParserKind::N3(p) => RdfParserKind::N3(p), RdfParserKind::NTriples(p) => RdfParserKind::NTriples(p),
RdfParserKind::NTriples(p) => RdfParserKind::NTriples(p), RdfParserKind::NQuads(p) => RdfParserKind::NQuads(p),
RdfParserKind::NQuads(p) => RdfParserKind::NQuads(p), RdfParserKind::RdfXml(p) => RdfParserKind::RdfXml(p.with_base_iri(base_iri)?),
RdfParserKind::RdfXml(p) => RdfParserKind::RdfXml(p.with_base_iri(base_iri)?), RdfParserKind::TriG(p) => RdfParserKind::TriG(p.with_base_iri(base_iri)?),
RdfParserKind::TriG(p) => RdfParserKind::TriG(p.with_base_iri(base_iri)?), RdfParserKind::Turtle(p) => RdfParserKind::Turtle(p.with_base_iri(base_iri)?),
RdfParserKind::Turtle(p) => RdfParserKind::Turtle(p.with_base_iri(base_iri)?), };
}, Ok(self)
default_graph: self.default_graph,
without_named_graphs: self.without_named_graphs,
rename_blank_nodes: self.rename_blank_nodes,
})
} }
/// Provides the name graph name that should replace the default graph in the returned quads. /// Provides the name graph name that should replace the default graph in the returned quads.
@ -190,13 +186,9 @@ impl RdfParser {
/// # Result::<_,Box<dyn std::error::Error>>::Ok(()) /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ``` /// ```
#[inline] #[inline]
pub fn with_default_graph(self, default_graph: impl Into<GraphName>) -> Self { pub fn with_default_graph(mut self, default_graph: impl Into<GraphName>) -> Self {
Self { self.default_graph = default_graph.into();
inner: self.inner, self
default_graph: default_graph.into(),
without_named_graphs: self.without_named_graphs,
rename_blank_nodes: self.rename_blank_nodes,
}
} }
/// Sets that the parser must fail if parsing a named graph. /// Sets that the parser must fail if parsing a named graph.
@ -212,13 +204,9 @@ impl RdfParser {
/// assert!(parser.parse_read(file.as_bytes()).next().unwrap().is_err()); /// assert!(parser.parse_read(file.as_bytes()).next().unwrap().is_err());
/// ``` /// ```
#[inline] #[inline]
pub fn without_named_graphs(self) -> Self { pub fn without_named_graphs(mut self) -> Self {
Self { self.without_named_graphs = true;
inner: self.inner, self
default_graph: self.default_graph,
without_named_graphs: true,
rename_blank_nodes: self.rename_blank_nodes,
}
} }
/// Renames the blank nodes ids from the ones set in the serialization to random ids. /// Renames the blank nodes ids from the ones set in the serialization to random ids.
@ -240,13 +228,27 @@ impl RdfParser {
/// # Result::<_,Box<dyn std::error::Error>>::Ok(()) /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ``` /// ```
#[inline] #[inline]
pub fn rename_blank_nodes(self) -> Self { pub fn rename_blank_nodes(mut self) -> Self {
Self { self.rename_blank_nodes = true;
inner: self.inner, self
default_graph: self.default_graph, }
without_named_graphs: self.without_named_graphs,
rename_blank_nodes: true, /// Assumes the file is valid to make parsing faster.
} ///
/// It will skip some validations.
///
/// Note that if the file is actually not valid, then broken RDF might be emitted by the parser.
#[inline]
pub fn unchecked(mut self) -> Self {
self.inner = match self.inner {
RdfParserKind::N3(p) => RdfParserKind::N3(p.unchecked()),
RdfParserKind::NTriples(p) => RdfParserKind::NTriples(p.unchecked()),
RdfParserKind::NQuads(p) => RdfParserKind::NQuads(p.unchecked()),
RdfParserKind::RdfXml(p) => RdfParserKind::RdfXml(p.unchecked()),
RdfParserKind::TriG(p) => RdfParserKind::TriG(p.unchecked()),
RdfParserKind::Turtle(p) => RdfParserKind::Turtle(p.unchecked()),
};
self
} }
/// Parses from a [`Read`] implementation and returns an iterator of quads. /// Parses from a [`Read`] implementation and returns an iterator of quads.

@ -21,7 +21,7 @@ async-tokio = ["dep:tokio", "quick-xml/async-tokio"]
[dependencies] [dependencies]
oxrdf = { version = "0.2.0-alpha.1-dev", path = "../oxrdf" } oxrdf = { version = "0.2.0-alpha.1-dev", path = "../oxrdf" }
oxilangtag = "0.1" oxilangtag = "0.1"
oxiri = "0.2" oxiri = "0.2.3-alpha.1"
quick-xml = ">=0.29, <0.32" quick-xml = ">=0.29, <0.32"
tokio = { version = "1.29", optional = true, features = ["io-util"] } tokio = { version = "1.29", optional = true, features = ["io-util"] }

@ -52,6 +52,7 @@ use tokio::io::{AsyncRead, BufReader as AsyncBufReader};
#[derive(Default)] #[derive(Default)]
#[must_use] #[must_use]
pub struct RdfXmlParser { pub struct RdfXmlParser {
unchecked: bool,
base: Option<Iri<String>>, base: Option<Iri<String>>,
} }
@ -62,6 +63,17 @@ impl RdfXmlParser {
Self::default() Self::default()
} }
/// Assumes the file is valid to make parsing faster.
///
/// It will skip some validations.
///
/// Note that if the file is actually not valid, then broken RDF might be emitted by the parser.
#[inline]
pub fn unchecked(mut self) -> Self {
self.unchecked = true;
self
}
#[inline] #[inline]
pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> { pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
self.base = Some(Iri::parse(base_iri.into())?); self.base = Some(Iri::parse(base_iri.into())?);
@ -158,6 +170,7 @@ impl RdfXmlParser {
in_literal_depth: 0, in_literal_depth: 0,
known_rdf_id: HashSet::default(), known_rdf_id: HashSet::default(),
is_end: false, is_end: false,
unchecked: self.unchecked,
} }
} }
} }
@ -414,6 +427,7 @@ struct RdfXmlReader<R> {
in_literal_depth: usize, in_literal_depth: usize,
known_rdf_id: HashSet<String>, known_rdf_id: HashSet<String>,
is_end: bool, is_end: bool,
unchecked: bool,
} }
impl<R> RdfXmlReader<R> { impl<R> RdfXmlReader<R> {
@ -551,19 +565,28 @@ impl<R> RdfXmlReader<R> {
let attribute = attribute.map_err(Error::InvalidAttr)?; let attribute = attribute.map_err(Error::InvalidAttr)?;
if attribute.key.as_ref().starts_with(b"xml") { if attribute.key.as_ref().starts_with(b"xml") {
if attribute.key.as_ref() == b"xml:lang" { if attribute.key.as_ref() == b"xml:lang" {
let tag = self.convert_attribute(&attribute)?; let tag = self.convert_attribute(&attribute)?.to_ascii_lowercase();
language = Some( language = Some(if self.unchecked {
tag
} else {
LanguageTag::parse(tag.to_ascii_lowercase()) LanguageTag::parse(tag.to_ascii_lowercase())
.map_err(|error| SyntaxError { .map_err(|error| SyntaxError {
inner: SyntaxErrorKind::InvalidLanguageTag { tag, error }, inner: SyntaxErrorKind::InvalidLanguageTag { tag, error },
})? })?
.into_inner(), .into_inner()
); });
} else if attribute.key.as_ref() == b"xml:base" { } else if attribute.key.as_ref() == b"xml:base" {
let iri = self.convert_attribute(&attribute)?; let iri = self.convert_attribute(&attribute)?;
base_iri = Some(Iri::parse(iri.clone()).map_err(|error| SyntaxError { base_iri = Some(
inner: SyntaxErrorKind::InvalidIri { iri, error }, if self.unchecked {
})?) Iri::parse_unchecked(iri.clone())
} else {
Iri::parse(iri.clone())
}
.map_err(|error| SyntaxError {
inner: SyntaxErrorKind::InvalidIri { iri, error },
})?,
)
} else { } else {
// We ignore other xml attributes // We ignore other xml attributes
} }
@ -622,12 +645,7 @@ impl<R> RdfXmlReader<R> {
.into()); .into());
} else { } else {
property_attrs.push(( property_attrs.push((
NamedNode::new(attribute_url.clone()).map_err(|error| SyntaxError { self.parse_iri(attribute_url)?,
inner: SyntaxErrorKind::InvalidIri {
iri: attribute_url,
error,
},
})?,
self.convert_attribute(&attribute)?, self.convert_attribute(&attribute)?,
)); ));
} }
@ -637,7 +655,7 @@ impl<R> RdfXmlReader<R> {
//Parsing with the base URI //Parsing with the base URI
let id_attr = match id_attr { let id_attr = match id_attr {
Some(iri) => { Some(iri) => {
let iri = resolve(&base_iri, iri)?; let iri = self.resolve_iri(&base_iri, iri)?;
if self.known_rdf_id.contains(iri.as_str()) { if self.known_rdf_id.contains(iri.as_str()) {
return Err(SyntaxError::msg(format!( return Err(SyntaxError::msg(format!(
"{} has already been used as rdf:ID value", "{} has already been used as rdf:ID value",
@ -701,12 +719,7 @@ impl<R> RdfXmlReader<R> {
.into()); .into());
} else { } else {
Self::build_node_elt( Self::build_node_elt(
NamedNode::new(tag_name.clone()).map_err(|error| SyntaxError { self.parse_iri(tag_name)?,
inner: SyntaxErrorKind::InvalidIri {
iri: tag_name,
error,
},
})?,
base_iri, base_iri,
language, language,
id_attr, id_attr,
@ -727,12 +740,7 @@ impl<R> RdfXmlReader<R> {
.into()); .into());
} }
Self::build_node_elt( Self::build_node_elt(
NamedNode::new(tag_name.clone()).map_err(|error| SyntaxError { self.parse_iri(tag_name)?,
inner: SyntaxErrorKind::InvalidIri {
iri: tag_name,
error,
},
})?,
base_iri, base_iri,
language, language,
id_attr, id_attr,
@ -766,12 +774,7 @@ impl<R> RdfXmlReader<R> {
)) ))
.into()); .into());
} else { } else {
NamedNode::new(tag_name.clone()).map_err(|error| SyntaxError { self.parse_iri(tag_name)?
inner: SyntaxErrorKind::InvalidIri {
iri: tag_name,
error,
},
})?
}; };
match parse_type { match parse_type {
RdfXmlParseType::Default => { RdfXmlParseType::Default => {
@ -1156,32 +1159,51 @@ impl<R> RdfXmlReader<R> {
base_iri: &Option<Iri<String>>, base_iri: &Option<Iri<String>>,
attribute: &Attribute<'_>, attribute: &Attribute<'_>,
) -> Result<NamedNode, ParseError> { ) -> Result<NamedNode, ParseError> {
Ok(resolve(base_iri, self.convert_attribute(attribute)?)?) Ok(self.resolve_iri(base_iri, self.convert_attribute(attribute)?)?)
} }
fn resolve_entity(&self, e: &str) -> Option<&str> { fn resolve_iri(
self.custom_entities.get(e).map(String::as_str) &self,
base_iri: &Option<Iri<String>>,
relative_iri: String,
) -> Result<NamedNode, SyntaxError> {
if let Some(base_iri) = base_iri {
Ok(NamedNode::new_unchecked(
if self.unchecked {
base_iri.resolve_unchecked(&relative_iri)
} else {
base_iri.resolve(&relative_iri)
}
.map_err(|error| SyntaxError {
inner: SyntaxErrorKind::InvalidIri {
iri: relative_iri,
error,
},
})?
.into_inner(),
))
} else {
self.parse_iri(relative_iri)
}
} }
}
fn resolve(base_iri: &Option<Iri<String>>, relative_iri: String) -> Result<NamedNode, SyntaxError> { fn parse_iri(&self, relative_iri: String) -> Result<NamedNode, SyntaxError> {
if let Some(base_iri) = base_iri { Ok(NamedNode::new_unchecked(if self.unchecked {
Ok(base_iri relative_iri
.resolve(&relative_iri) } else {
.map_err(|error| SyntaxError { Iri::parse(relative_iri.clone())
inner: SyntaxErrorKind::InvalidIri { .map_err(|error| SyntaxError {
iri: relative_iri, inner: SyntaxErrorKind::InvalidIri {
error, iri: relative_iri,
}, error,
})? },
.into()) })?
} else { .into_inner()
NamedNode::new(relative_iri.clone()).map_err(|error| SyntaxError { }))
inner: SyntaxErrorKind::InvalidIri { }
iri: relative_iri,
error, fn resolve_entity(&self, e: &str) -> Option<&str> {
}, self.custom_entities.get(e).map(String::as_str)
})
} }
} }

@ -22,7 +22,7 @@ async-tokio = ["dep:tokio"]
[dependencies] [dependencies]
memchr = "2.5" memchr = "2.5"
oxrdf = { version = "0.2.0-alpha.1-dev", path = "../oxrdf" } oxrdf = { version = "0.2.0-alpha.1-dev", path = "../oxrdf" }
oxiri = "0.2" oxiri = "0.2.3-alpha.1"
oxilangtag = "0.1" oxilangtag = "0.1"
tokio = { version = "1.29", optional = true, features = ["io-util"] } tokio = { version = "1.29", optional = true, features = ["io-util"] }

@ -6,12 +6,12 @@ use oxrdf::NamedNode;
use std::borrow::Cow; use std::borrow::Cow;
use std::cmp::min; use std::cmp::min;
use std::collections::HashMap; use std::collections::HashMap;
use std::ops::{Range, RangeInclusive}; use std::ops::Range;
use std::str; use std::str;
#[derive(Debug, PartialEq, Eq)] #[derive(Debug, PartialEq, Eq)]
pub enum N3Token<'a> { pub enum N3Token<'a> {
IriRef(Iri<String>), IriRef(String),
PrefixedName { PrefixedName {
prefix: &'a str, prefix: &'a str,
local: Cow<'a, str>, local: Cow<'a, str>,
@ -42,6 +42,7 @@ pub struct N3LexerOptions {
pub struct N3Lexer { pub struct N3Lexer {
mode: N3LexerMode, mode: N3LexerMode,
unchecked: bool,
} }
// TODO: there are a lot of 'None' (missing data) returned even if the stream is ending!!! // TODO: there are a lot of 'None' (missing data) returned even if the stream is ending!!!
@ -61,7 +62,7 @@ impl TokenRecognizer for N3Lexer {
b'<' => match *data.get(1)? { b'<' => match *data.get(1)? {
b'<' => Some((2, Ok(N3Token::Punctuation("<<")))), b'<' => Some((2, Ok(N3Token::Punctuation("<<")))),
b'=' if self.mode == N3LexerMode::N3 => { b'=' if self.mode == N3LexerMode::N3 => {
if let Some((consumed, result)) = Self::recognize_iri(data, options) { if let Some((consumed, result)) = self.recognize_iri(data, options) {
Some(if let Ok(result) = result { Some(if let Ok(result) = result {
(consumed, Ok(result)) (consumed, Ok(result))
} else { } else {
@ -74,7 +75,7 @@ impl TokenRecognizer for N3Lexer {
} }
} }
b'-' if self.mode == N3LexerMode::N3 => { b'-' if self.mode == N3LexerMode::N3 => {
if let Some((consumed, result)) = Self::recognize_iri(data, options) { if let Some((consumed, result)) = self.recognize_iri(data, options) {
Some(if let Ok(result) = result { Some(if let Ok(result) = result {
(consumed, Ok(result)) (consumed, Ok(result))
} else { } else {
@ -86,7 +87,7 @@ impl TokenRecognizer for N3Lexer {
None None
} }
} }
_ => Self::recognize_iri(data, options), _ => self.recognize_iri(data, options),
}, },
b'>' => { b'>' => {
if *data.get(1)? == b'>' { if *data.get(1)? == b'>' {
@ -119,7 +120,7 @@ impl TokenRecognizer for N3Lexer {
Self::recognize_string(data, b'\'') Self::recognize_string(data, b'\'')
} }
} }
b'@' => Self::recognize_lang_tag(data), b'@' => self.recognize_lang_tag(data),
b'.' => match data.get(1) { b'.' => match data.get(1) {
Some(b'0'..=b'9') => Self::recognize_number(data), Some(b'0'..=b'9') => Self::recognize_number(data),
Some(_) => Some((1, Ok(N3Token::Punctuation(".")))), Some(_) => Some((1, Ok(N3Token::Punctuation(".")))),
@ -162,18 +163,19 @@ impl TokenRecognizer for N3Lexer {
} }
} }
b'0'..=b'9' | b'+' | b'-' => Self::recognize_number(data), b'0'..=b'9' | b'+' | b'-' => Self::recognize_number(data),
b'?' => Self::recognize_variable(data, is_ending), b'?' => self.recognize_variable(data, is_ending),
_ => Self::recognize_pname_or_keyword(data, is_ending), _ => self.recognize_pname_or_keyword(data, is_ending),
} }
} }
} }
impl N3Lexer { impl N3Lexer {
pub fn new(mode: N3LexerMode) -> Self { pub fn new(mode: N3LexerMode, unchecked: bool) -> Self {
Self { mode } Self { mode, unchecked }
} }
fn recognize_iri( fn recognize_iri(
&self,
data: &[u8], data: &[u8],
options: &N3LexerOptions, options: &N3LexerOptions,
) -> Option<(usize, Result<N3Token<'static>, TokenRecognizerError>)> { ) -> Option<(usize, Result<N3Token<'static>, TokenRecognizerError>)> {
@ -186,7 +188,8 @@ impl N3Lexer {
i += end; i += end;
match data[i] { match data[i] {
b'>' => { b'>' => {
return Some((i + 1, Self::parse_iri(string, 0..=i, options))); #[allow(clippy::range_plus_one)]
return Some((i + 1, self.parse_iri(string, 0..i + 1, options)));
} }
b'\\' => { b'\\' => {
let (additional, c) = Self::recognize_escape(&data[i..], i, false)?; let (additional, c) = Self::recognize_escape(&data[i..], i, false)?;
@ -205,29 +208,36 @@ impl N3Lexer {
} }
fn parse_iri( fn parse_iri(
&self,
iri: Vec<u8>, iri: Vec<u8>,
position: RangeInclusive<usize>, position: Range<usize>,
options: &N3LexerOptions, options: &N3LexerOptions,
) -> Result<N3Token<'static>, TokenRecognizerError> { ) -> Result<N3Token<'static>, TokenRecognizerError> {
let iri = String::from_utf8(iri).map_err(|e| { let iri = string_from_utf8(iri, position.clone())?;
( Ok(N3Token::IriRef(
position.clone(), if let Some(base_iri) = options.base_iri.as_ref() {
format!("The IRI contains invalid UTF-8 characters: {e}"), if self.unchecked {
) base_iri.resolve_unchecked(&iri)
})?; } else {
let iri = if let Some(base_iri) = options.base_iri.as_ref() { base_iri.resolve(&iri)
base_iri.resolve(&iri) }
} else { .map_err(|e| (position, e.to_string()))?
Iri::parse(iri) .into_inner()
} } else if self.unchecked {
.map_err(|e| (position, e.to_string()))?; iri
Ok(N3Token::IriRef(iri)) } else {
Iri::parse(iri)
.map_err(|e| (position, e.to_string()))?
.into_inner()
},
))
} }
fn recognize_pname_or_keyword( fn recognize_pname_or_keyword<'a>(
data: &[u8], &self,
data: &'a [u8],
is_ending: bool, is_ending: bool,
) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> { ) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> {
// [139s] PNAME_NS ::= PN_PREFIX? ':' // [139s] PNAME_NS ::= PN_PREFIX? ':'
// [140s] PNAME_LN ::= PNAME_NS PN_LOCAL // [140s] PNAME_LN ::= PNAME_NS PN_LOCAL
@ -303,7 +313,8 @@ impl N3Lexer {
)); ));
} }
let (consumed, pn_local_result) = Self::recognize_optional_pn_local(&data[i..], is_ending)?; let (consumed, pn_local_result) =
self.recognize_optional_pn_local(&data[i..], is_ending)?;
Some(( Some((
consumed + i, consumed + i,
pn_local_result.map(|(local, might_be_invalid_iri)| N3Token::PrefixedName { pn_local_result.map(|(local, might_be_invalid_iri)| N3Token::PrefixedName {
@ -314,12 +325,13 @@ impl N3Lexer {
)) ))
} }
fn recognize_variable( fn recognize_variable<'a>(
data: &[u8], &self,
data: &'a [u8],
is_ending: bool, is_ending: bool,
) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> { ) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> {
// [36] QUICK_VAR_NAME ::= "?" PN_LOCAL // [36] QUICK_VAR_NAME ::= "?" PN_LOCAL
let (consumed, result) = Self::recognize_optional_pn_local(&data[1..], is_ending)?; let (consumed, result) = self.recognize_optional_pn_local(&data[1..], is_ending)?;
Some(( Some((
consumed + 1, consumed + 1,
result.and_then(|(name, _)| { result.and_then(|(name, _)| {
@ -332,10 +344,11 @@ impl N3Lexer {
)) ))
} }
fn recognize_optional_pn_local( fn recognize_optional_pn_local<'a>(
data: &[u8], &self,
data: &'a [u8],
is_ending: bool, is_ending: bool,
) -> Option<(usize, Result<(Cow<'_, str>, bool), TokenRecognizerError>)> { ) -> Option<(usize, Result<(Cow<'a, str>, bool), TokenRecognizerError>)> {
// [168s] PN_LOCAL ::= (PN_CHARS_U | ':' | [0-9] | PLX) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX))? // [168s] PN_LOCAL ::= (PN_CHARS_U | ':' | [0-9] | PLX) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX))?
let mut i = 0; let mut i = 0;
let mut buffer = None; // Buffer if there are some escaped characters let mut buffer = None; // Buffer if there are some escaped characters
@ -359,23 +372,25 @@ impl N3Lexer {
} else if c == '\\' { } else if c == '\\' {
i += 1; i += 1;
let a = char::from(*data.get(i)?); let a = char::from(*data.get(i)?);
if matches!( if self.unchecked
a, || matches!(
'_' | '~' a,
| '.' '_' | '~'
| '-' | '.'
| '!' | '-'
| '$' | '!'
| '&' | '$'
| '\'' | '&'
| '(' | '\''
| ')' | '('
| '*' | ')'
| '+' | '*'
| ',' | '+'
| ';' | ','
| '=' | ';'
) { | '='
)
{
// ok to escape // ok to escape
} else if matches!(a, '/' | '?' | '#' | '@' | '%') { } else if matches!(a, '/' | '?' | '#' | '@' | '%') {
// ok to escape but requires IRI validation // ok to escape but requires IRI validation
@ -406,12 +421,18 @@ impl N3Lexer {
{ {
return Some((0, Ok((Cow::Borrowed(""), false)))); return Some((0, Ok((Cow::Borrowed(""), false))));
} }
might_be_invalid_iri |= if !self.unchecked {
Self::is_possible_pn_chars_base_but_not_valid_iri(c) || c == ':'; might_be_invalid_iri |=
Self::is_possible_pn_chars_base_but_not_valid_iri(c)
|| c == ':';
}
i += consumed; i += consumed;
} else if Self::is_possible_pn_chars(c) || c == ':' || c == '.' { } else if Self::is_possible_pn_chars(c) || c == ':' || c == '.' {
might_be_invalid_iri |= if !self.unchecked {
Self::is_possible_pn_chars_base_but_not_valid_iri(c) || c == ':'; might_be_invalid_iri |=
Self::is_possible_pn_chars_base_but_not_valid_iri(c)
|| c == ':';
}
i += consumed; i += consumed;
} else { } else {
let buffer = if let Some(mut buffer) = buffer { let buffer = if let Some(mut buffer) = buffer {
@ -518,9 +539,10 @@ impl N3Lexer {
} }
} }
fn recognize_lang_tag( fn recognize_lang_tag<'a>(
data: &[u8], &self,
) -> Option<(usize, Result<N3Token<'_>, TokenRecognizerError>)> { data: &'a [u8],
) -> Option<(usize, Result<N3Token<'a>, TokenRecognizerError>)> {
// [144s] LANGTAG ::= '@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)* // [144s] LANGTAG ::= '@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)*
let mut is_last_block_empty = true; let mut is_last_block_empty = true;
for (i, c) in data[1..].iter().enumerate() { for (i, c) in data[1..].iter().enumerate() {
@ -532,25 +554,29 @@ impl N3Lexer {
Err((1..2, "A language code should always start with a letter").into()), Err((1..2, "A language code should always start with a letter").into()),
)); ));
} else if is_last_block_empty { } else if is_last_block_empty {
return Some((i, Self::parse_lang_tag(&data[1..i], 1..i - 1))); return Some((i, self.parse_lang_tag(&data[1..i], 1..i - 1)));
} else if *c == b'-' { } else if *c == b'-' {
is_last_block_empty = true; is_last_block_empty = true;
} else { } else {
return Some((i + 1, Self::parse_lang_tag(&data[1..=i], 1..i))); return Some((i + 1, self.parse_lang_tag(&data[1..=i], 1..i)));
} }
} }
None None
} }
fn parse_lang_tag( fn parse_lang_tag<'a>(
lang_tag: &[u8], &self,
lang_tag: &'a [u8],
position: Range<usize>, position: Range<usize>,
) -> Result<N3Token<'_>, TokenRecognizerError> { ) -> Result<N3Token<'a>, TokenRecognizerError> {
Ok(N3Token::LangTag( let lang_tag = str_from_utf8(lang_tag, position.clone())?;
LanguageTag::parse(str_from_utf8(lang_tag, position.clone())?) Ok(N3Token::LangTag(if self.unchecked {
lang_tag
} else {
LanguageTag::parse(lang_tag)
.map_err(|e| (position.clone(), e.to_string()))? .map_err(|e| (position.clone(), e.to_string()))?
.into_inner(), .into_inner()
)) }))
} }
fn recognize_string( fn recognize_string(
@ -933,3 +959,14 @@ fn str_from_utf8(data: &[u8], range: Range<usize>) -> Result<&str, TokenRecogniz
.into() .into()
}) })
} }
fn string_from_utf8(data: Vec<u8>, range: Range<usize>) -> Result<String, TokenRecognizerError> {
String::from_utf8(data).map_err(|e| {
(
range.start + e.utf8_error().valid_up_to()
..min(range.end, range.start + e.utf8_error().valid_up_to() + 4),
format!("Invalid UTF-8: {e}"),
)
.into()
})
}

@ -63,7 +63,7 @@ impl RuleRecognizer for NQuadsRecognizer {
NQuadsState::ExpectSubject => match token { NQuadsState::ExpectSubject => match token {
N3Token::IriRef(s) => { N3Token::IriRef(s) => {
self.subjects self.subjects
.push(NamedNode::from(s).into()); .push(NamedNode::new_unchecked(s).into());
self.stack.push(NQuadsState::ExpectPredicate); self.stack.push(NQuadsState::ExpectPredicate);
self self
} }
@ -86,7 +86,7 @@ impl RuleRecognizer for NQuadsRecognizer {
NQuadsState::ExpectPredicate => match token { NQuadsState::ExpectPredicate => match token {
N3Token::IriRef(p) => { N3Token::IriRef(p) => {
self.predicates self.predicates
.push(p.into()); .push(NamedNode::new_unchecked(p));
self.stack.push(NQuadsState::ExpectedObject); self.stack.push(NQuadsState::ExpectedObject);
self self
} }
@ -98,7 +98,7 @@ impl RuleRecognizer for NQuadsRecognizer {
NQuadsState::ExpectedObject => match token { NQuadsState::ExpectedObject => match token {
N3Token::IriRef(o) => { N3Token::IriRef(o) => {
self.objects self.objects
.push(NamedNode::from(o).into()); .push(NamedNode::new_unchecked(o).into());
self.stack self.stack
.push(NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple); .push(NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple);
self self
@ -155,7 +155,7 @@ impl RuleRecognizer for NQuadsRecognizer {
self.objects.push( self.objects.push(
Literal::new_typed_literal( Literal::new_typed_literal(
value, value,
d NamedNode::new_unchecked(d)
) )
.into(), .into(),
); );
@ -171,7 +171,7 @@ impl RuleRecognizer for NQuadsRecognizer {
N3Token::IriRef(g) if context.with_graph_name => { N3Token::IriRef(g) if context.with_graph_name => {
self.emit_quad( self.emit_quad(
results, results,
NamedNode::from(g).into(), NamedNode::new_unchecked(g).into(),
); );
self.stack.push(NQuadsState::ExpectDot); self.stack.push(NQuadsState::ExpectDot);
self self
@ -264,10 +264,11 @@ impl NQuadsRecognizer {
pub fn new_parser( pub fn new_parser(
with_graph_name: bool, with_graph_name: bool,
#[cfg(feature = "rdf-star")] with_quoted_triples: bool, #[cfg(feature = "rdf-star")] with_quoted_triples: bool,
unchecked: bool,
) -> Parser<Self> { ) -> Parser<Self> {
Parser::new( Parser::new(
Lexer::new( Lexer::new(
N3Lexer::new(N3LexerMode::NTriples), N3Lexer::new(N3LexerMode::NTriples, unchecked),
MIN_BUFFER_SIZE, MIN_BUFFER_SIZE,
MAX_BUFFER_SIZE, MAX_BUFFER_SIZE,
true, true,

@ -206,6 +206,7 @@ impl From<Quad> for N3Quad {
#[derive(Default)] #[derive(Default)]
#[must_use] #[must_use]
pub struct N3Parser { pub struct N3Parser {
unchecked: bool,
base: Option<Iri<String>>, base: Option<Iri<String>>,
prefixes: HashMap<String, Iri<String>>, prefixes: HashMap<String, Iri<String>>,
} }
@ -217,6 +218,17 @@ impl N3Parser {
Self::default() Self::default()
} }
/// Assumes the file is valid to make parsing faster.
///
/// It will skip some validations.
///
/// Note that if the file is actually not valid, then broken RDF might be emitted by the parser.
#[inline]
pub fn unchecked(mut self) -> Self {
self.unchecked = true;
self
}
#[inline] #[inline]
pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> { pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
self.base = Some(Iri::parse(base_iri.into())?); self.base = Some(Iri::parse(base_iri.into())?);
@ -345,7 +357,7 @@ impl N3Parser {
/// ``` /// ```
pub fn parse(self) -> LowLevelN3Reader { pub fn parse(self) -> LowLevelN3Reader {
LowLevelN3Reader { LowLevelN3Reader {
parser: N3Recognizer::new_parser(self.base, self.prefixes), parser: N3Recognizer::new_parser(self.unchecked, self.base, self.prefixes),
} }
} }
} }
@ -665,8 +677,13 @@ impl RuleRecognizer for N3Recognizer {
} }
N3State::BaseExpectIri => return match token { N3State::BaseExpectIri => return match token {
N3Token::IriRef(iri) => { N3Token::IriRef(iri) => {
context.lexer_options.base_iri = Some(iri); match Iri::parse_unchecked(iri) {
self Ok(iri) => {
context.lexer_options.base_iri = Some(iri);
self
}
Err(e) => self.error(errors, format!("Invalid base IRI: {e}"))
}
} }
_ => self.error(errors, "The BASE keyword should be followed by an IRI"), _ => self.error(errors, "The BASE keyword should be followed by an IRI"),
}, },
@ -681,8 +698,13 @@ impl RuleRecognizer for N3Recognizer {
}, },
N3State::PrefixExpectIri { name } => return match token { N3State::PrefixExpectIri { name } => return match token {
N3Token::IriRef(iri) => { N3Token::IriRef(iri) => {
context.prefixes.insert(name, iri); match Iri::parse_unchecked(iri) {
self Ok(iri) => {
context.prefixes.insert(name, iri);
self
}
Err(e) => self.error(errors, format!("Invalid prefix IRI: {e}"))
}
} }
_ => self.error(errors, "The PREFIX declaration should be followed by a prefix and its value as an IRI"), _ => self.error(errors, "The PREFIX declaration should be followed by a prefix and its value as an IRI"),
}, },
@ -843,7 +865,7 @@ impl RuleRecognizer for N3Recognizer {
N3State::PathItem => { N3State::PathItem => {
return match token { return match token {
N3Token::IriRef(iri) => { N3Token::IriRef(iri) => {
self.terms.push(NamedNode::from(iri).into()); self.terms.push(NamedNode::new_unchecked(iri).into());
self self
} }
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) { N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) {
@ -925,7 +947,7 @@ impl RuleRecognizer for N3Recognizer {
} }
N3State::IriPropertyList => return match token { N3State::IriPropertyList => return match token {
N3Token::IriRef(id) => { N3Token::IriRef(id) => {
self.terms.push(NamedNode::from(id).into()); self.terms.push(NamedNode::new_unchecked(id).into());
self.stack.push(N3State::PropertyListEnd); self.stack.push(N3State::PropertyListEnd);
self.stack.push(N3State::PredicateObjectList); self.stack.push(N3State::PredicateObjectList);
self self
@ -999,7 +1021,7 @@ impl RuleRecognizer for N3Recognizer {
N3State::LiteralExpectDatatype { value } => { N3State::LiteralExpectDatatype { value } => {
match token { match token {
N3Token::IriRef(datatype) => { N3Token::IriRef(datatype) => {
self.terms.push(Literal::new_typed_literal(value, datatype).into()); self.terms.push(Literal::new_typed_literal(value, NamedNode::new_unchecked(datatype)).into());
return self; return self;
} }
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) { N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) {
@ -1096,12 +1118,13 @@ impl RuleRecognizer for N3Recognizer {
impl N3Recognizer { impl N3Recognizer {
pub fn new_parser( pub fn new_parser(
unchecked: bool,
base_iri: Option<Iri<String>>, base_iri: Option<Iri<String>>,
prefixes: HashMap<String, Iri<String>>, prefixes: HashMap<String, Iri<String>>,
) -> Parser<Self> { ) -> Parser<Self> {
Parser::new( Parser::new(
Lexer::new( Lexer::new(
N3Lexer::new(N3LexerMode::N3), N3Lexer::new(N3LexerMode::N3, unchecked),
MIN_BUFFER_SIZE, MIN_BUFFER_SIZE,
MAX_BUFFER_SIZE, MAX_BUFFER_SIZE,
true, true,

@ -37,6 +37,7 @@ use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
#[derive(Default)] #[derive(Default)]
#[must_use] #[must_use]
pub struct NQuadsParser { pub struct NQuadsParser {
unchecked: bool,
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
with_quoted_triples: bool, with_quoted_triples: bool,
} }
@ -48,6 +49,17 @@ impl NQuadsParser {
Self::default() Self::default()
} }
/// Assumes the file is valid to make parsing faster.
///
/// It will skip some validations.
///
/// Note that if the file is actually not valid, then broken RDF might be emitted by the parser.
#[inline]
pub fn unchecked(mut self) -> Self {
self.unchecked = true;
self
}
/// Enables [N-Quads-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#n-quads-star). /// Enables [N-Quads-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#n-quads-star).
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
#[inline] #[inline]
@ -165,6 +177,7 @@ impl NQuadsParser {
true, true,
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
self.with_quoted_triples, self.with_quoted_triples,
self.unchecked,
), ),
} }
} }

@ -38,6 +38,7 @@ use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
#[derive(Default)] #[derive(Default)]
#[must_use] #[must_use]
pub struct NTriplesParser { pub struct NTriplesParser {
unchecked: bool,
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
with_quoted_triples: bool, with_quoted_triples: bool,
} }
@ -49,6 +50,17 @@ impl NTriplesParser {
Self::default() Self::default()
} }
/// Assumes the file is valid to make parsing faster.
///
/// It will skip some validations.
///
/// Note that if the file is actually not valid, then broken RDF might be emitted by the parser.
#[inline]
pub fn unchecked(mut self) -> Self {
self.unchecked = true;
self
}
/// Enables [N-Triples-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#n-triples-star). /// Enables [N-Triples-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#n-triples-star).
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
#[inline] #[inline]
@ -166,6 +178,7 @@ impl NTriplesParser {
false, false,
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
self.with_quoted_triples, self.with_quoted_triples,
self.unchecked,
), ),
} }
} }
@ -542,3 +555,26 @@ impl LowLevelNTriplesWriter {
writeln!(write, "{} .", t.into()) writeln!(write, "{} .", t.into())
} }
} }
#[cfg(test)]
mod tests {
use super::*;
use oxrdf::{Literal, NamedNode};
#[test]
fn unchecked_parsing() {
let triples = NTriplesParser::new()
.unchecked()
.parse_read("<foo> <bar> \"baz\"@toolonglangtag .".as_bytes())
.collect::<Result<Vec<_>, _>>()
.unwrap();
assert_eq!(
triples,
[Triple::new(
NamedNode::new_unchecked("foo"),
NamedNode::new_unchecked("bar"),
Literal::new_language_tagged_literal_unchecked("baz", "toolonglangtag"),
)]
)
}
}

@ -107,8 +107,13 @@ impl RuleRecognizer for TriGRecognizer {
}, },
TriGState::BaseExpectIri => match token { TriGState::BaseExpectIri => match token {
N3Token::IriRef(iri) => { N3Token::IriRef(iri) => {
context.lexer_options.base_iri = Some(iri); match Iri::parse_unchecked(iri) {
self Ok(iri) => {
context.lexer_options.base_iri = Some(iri);
self
}
Err(e) => self.error(errors, format!("Invalid base IRI: {e}"))
}
} }
_ => self.error(errors, "The BASE keyword should be followed by an IRI"), _ => self.error(errors, "The BASE keyword should be followed by an IRI"),
}, },
@ -123,9 +128,13 @@ impl RuleRecognizer for TriGRecognizer {
}, },
TriGState::PrefixExpectIri { name } => match token { TriGState::PrefixExpectIri { name } => match token {
N3Token::IriRef(iri) => { N3Token::IriRef(iri) => {
context.prefixes.insert(name, iri); match Iri::parse_unchecked(iri) {
self Ok(iri) => {
} context.prefixes.insert(name, iri);
self
}
Err(e) => self.error(errors, format!("Invalid prefix IRI: {e}"))
} }
_ => self.error(errors, "The PREFIX declaration should be followed by a prefix and its value as an IRI"), _ => self.error(errors, "The PREFIX declaration should be followed by a prefix and its value as an IRI"),
}, },
// [3g] triplesOrGraph ::= labelOrSubject ( wrappedGraph | predicateObjectList '.' ) | quotedTriple predicateObjectList '.' // [3g] triplesOrGraph ::= labelOrSubject ( wrappedGraph | predicateObjectList '.' ) | quotedTriple predicateObjectList '.'
@ -133,7 +142,7 @@ impl RuleRecognizer for TriGRecognizer {
TriGState::TriplesOrGraph => match token { TriGState::TriplesOrGraph => match token {
N3Token::IriRef(iri) => { N3Token::IriRef(iri) => {
self.stack.push(TriGState::WrappedGraphOrPredicateObjectList { self.stack.push(TriGState::WrappedGraphOrPredicateObjectList {
term: NamedNode::from(iri).into() term: NamedNode::new_unchecked(iri).into()
}); });
self self
} }
@ -291,7 +300,7 @@ impl RuleRecognizer for TriGRecognizer {
self self
} }
N3Token::IriRef(iri) => { N3Token::IriRef(iri) => {
self.cur_subject.push(NamedNode::from(iri).into()); self.cur_subject.push(NamedNode::new_unchecked(iri).into());
self.stack.push(TriGState::PredicateObjectList); self.stack.push(TriGState::PredicateObjectList);
self self
} }
@ -337,7 +346,7 @@ impl RuleRecognizer for TriGRecognizer {
// [7g] labelOrSubject ::= iri | BlankNode // [7g] labelOrSubject ::= iri | BlankNode
TriGState::GraphName => match token { TriGState::GraphName => match token {
N3Token::IriRef(iri) => { N3Token::IriRef(iri) => {
self.cur_graph = NamedNode::from(iri).into(); self.cur_graph = NamedNode::new_unchecked(iri).into();
self self
} }
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) { N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) {
@ -451,7 +460,7 @@ impl RuleRecognizer for TriGRecognizer {
self self
} }
N3Token::IriRef(iri) => { N3Token::IriRef(iri) => {
self.cur_predicate.push(NamedNode::from(iri)); self.cur_predicate.push(NamedNode::new_unchecked(iri));
self self
} }
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) { N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) {
@ -479,7 +488,7 @@ impl RuleRecognizer for TriGRecognizer {
// [137s] BlankNode ::= BLANK_NODE_LABEL | ANON // [137s] BlankNode ::= BLANK_NODE_LABEL | ANON
TriGState::Object => match token { TriGState::Object => match token {
N3Token::IriRef(iri) => { N3Token::IriRef(iri) => {
self.cur_object.push(NamedNode::from(iri).into()); self.cur_object.push(NamedNode::new_unchecked(iri).into());
self.emit_quad(results); self.emit_quad(results);
self self
} }
@ -626,7 +635,7 @@ impl RuleRecognizer for TriGRecognizer {
TriGState::LiteralExpectDatatype { value, emit } => { TriGState::LiteralExpectDatatype { value, emit } => {
match token { match token {
N3Token::IriRef(datatype) => { N3Token::IriRef(datatype) => {
self.cur_object.push(Literal::new_typed_literal(value, datatype).into()); self.cur_object.push(Literal::new_typed_literal(value, NamedNode::new_unchecked(datatype)).into());
if emit { if emit {
self.emit_quad(results); self.emit_quad(results);
} }
@ -688,7 +697,7 @@ impl RuleRecognizer for TriGRecognizer {
self self
} }
N3Token::IriRef(iri) => { N3Token::IriRef(iri) => {
self.cur_subject.push(NamedNode::from(iri).into()); self.cur_subject.push(NamedNode::new_unchecked(iri).into());
self self
} }
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) { N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) {
@ -720,7 +729,7 @@ impl RuleRecognizer for TriGRecognizer {
self self
} }
N3Token::IriRef(iri) => { N3Token::IriRef(iri) => {
self.cur_object.push(NamedNode::from(iri).into()); self.cur_object.push(NamedNode::new_unchecked(iri).into());
self self
} }
N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) { N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) {
@ -823,12 +832,13 @@ impl TriGRecognizer {
pub fn new_parser( pub fn new_parser(
with_graph_name: bool, with_graph_name: bool,
#[cfg(feature = "rdf-star")] with_quoted_triples: bool, #[cfg(feature = "rdf-star")] with_quoted_triples: bool,
unchecked: bool,
base_iri: Option<Iri<String>>, base_iri: Option<Iri<String>>,
prefixes: HashMap<String, Iri<String>>, prefixes: HashMap<String, Iri<String>>,
) -> Parser<Self> { ) -> Parser<Self> {
Parser::new( Parser::new(
Lexer::new( Lexer::new(
N3Lexer::new(N3LexerMode::Turtle), N3Lexer::new(N3LexerMode::Turtle, unchecked),
MIN_BUFFER_SIZE, MIN_BUFFER_SIZE,
MAX_BUFFER_SIZE, MAX_BUFFER_SIZE,
true, true,

@ -42,6 +42,7 @@ use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
#[derive(Default)] #[derive(Default)]
#[must_use] #[must_use]
pub struct TriGParser { pub struct TriGParser {
unchecked: bool,
base: Option<Iri<String>>, base: Option<Iri<String>>,
prefixes: HashMap<String, Iri<String>>, prefixes: HashMap<String, Iri<String>>,
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
@ -55,6 +56,17 @@ impl TriGParser {
Self::default() Self::default()
} }
/// Assumes the file is valid to make parsing faster.
///
/// It will skip some validations.
///
/// Note that if the file is actually not valid, then broken RDF might be emitted by the parser.
#[inline]
pub fn unchecked(mut self) -> Self {
self.unchecked = true;
self
}
#[inline] #[inline]
pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> { pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
self.base = Some(Iri::parse(base_iri.into())?); self.base = Some(Iri::parse(base_iri.into())?);
@ -192,6 +204,7 @@ impl TriGParser {
true, true,
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
self.with_quoted_triples, self.with_quoted_triples,
self.unchecked,
self.base, self.base,
self.prefixes, self.prefixes,
), ),

@ -44,6 +44,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
#[derive(Default)] #[derive(Default)]
#[must_use] #[must_use]
pub struct TurtleParser { pub struct TurtleParser {
unchecked: bool,
base: Option<Iri<String>>, base: Option<Iri<String>>,
prefixes: HashMap<String, Iri<String>>, prefixes: HashMap<String, Iri<String>>,
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
@ -57,6 +58,17 @@ impl TurtleParser {
Self::default() Self::default()
} }
/// Assumes the file is valid to make parsing faster.
///
/// It will skip some validations.
///
/// Note that if the file is actually not valid, then broken RDF might be emitted by the parser.
#[inline]
pub fn unchecked(mut self) -> Self {
self.unchecked = true;
self
}
#[inline] #[inline]
pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> { pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
self.base = Some(Iri::parse(base_iri.into())?); self.base = Some(Iri::parse(base_iri.into())?);
@ -194,6 +206,7 @@ impl TurtleParser {
false, false,
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
self.with_quoted_triples, self.with_quoted_triples,
self.unchecked,
self.base, self.base,
self.prefixes, self.prefixes,
), ),

@ -23,7 +23,7 @@ sep-0006 = []
[dependencies] [dependencies]
peg = "0.8" peg = "0.8"
rand = "0.8" rand = "0.8"
oxiri = "0.2" oxiri = "0.2.3-alpha.1"
oxilangtag = "0.1" oxilangtag = "0.1"
oxrdf = { version = "0.2.0-alpha.1-dev", path="../oxrdf" } oxrdf = { version = "0.2.0-alpha.1-dev", path="../oxrdf" }

@ -1053,7 +1053,6 @@ impl<'a> Transaction<'a> {
/// Retrieves quads with a filter on each quad component. /// Retrieves quads with a filter on each quad component.
/// ///
/// Usage example: /// Usage example:
/// Usage example:
/// ``` /// ```
/// use oxigraph::store::{StorageError, Store}; /// use oxigraph::store::{StorageError, Store};
/// use oxigraph::model::*; /// use oxigraph::model::*;
@ -1601,19 +1600,22 @@ impl BulkLoader {
/// ///
/// <div class="warning">This method is optimized for speed. See [the struct](BulkLoader) documentation for more details.</div> /// <div class="warning">This method is optimized for speed. See [the struct](BulkLoader) documentation for more details.</div>
/// ///
/// Usage example: /// To get better speed on valid datasets, consider enabling [`RdfParser::unchecked`] option to skip some validations.
///
/// Usage example: /// Usage example:
/// ``` /// ```
/// use oxigraph::store::Store; /// use oxigraph::store::Store;
/// use oxigraph::io::RdfFormat; /// use oxigraph::io::{RdfParser, RdfFormat};
/// use oxigraph::model::*; /// use oxigraph::model::*;
/// use oxrdfio::RdfParser;
/// ///
/// let store = Store::new()?; /// let store = Store::new()?;
/// ///
/// // insert a dataset file (former load_dataset method) /// // insert a dataset file (former load_dataset method)
/// let file = b"<http://example.com> <http://example.com> <http://example.com> <http://example.com/g> ."; /// let file = b"<http://example.com> <http://example.com> <http://example.com> <http://example.com/g> .";
/// store.bulk_loader().load_from_read(RdfFormat::NQuads, file.as_ref())?; /// store.bulk_loader().load_from_read(
/// RdfParser::from_format(RdfFormat::NQuads).unchecked(), // we inject a custom parser with options
/// file.as_ref()
/// )?;
/// ///
/// // insert a graph file (former load_graph method) /// // insert a graph file (former load_graph method)
/// let file = b"<> <> <> ."; /// let file = b"<> <> <> .";

@ -5,7 +5,7 @@ from urllib.request import urlopen
TARGET_DEBIAN_VERSIONS = ["sid"] TARGET_DEBIAN_VERSIONS = ["sid"]
IGNORE_PACKAGES = {"oxigraph-js", "oxigraph-testsuite", "pyoxigraph", "sparql-smith"} IGNORE_PACKAGES = {"oxigraph-js", "oxigraph-testsuite", "pyoxigraph", "sparql-smith"}
ALLOWED_MISSING_PACKAGES = {"codspeed-criterion-compat", "escargot", "json-event-parser", "oxhttp", "quick-xml"} ALLOWED_MISSING_PACKAGES = {"codspeed-criterion-compat", "escargot", "json-event-parser", "oxhttp", "oxiri", "quick-xml"}
base_path = Path(__file__).parent.parent base_path = Path(__file__).parent.parent

Loading…
Cancel
Save