diff --git a/Cargo.lock b/Cargo.lock index 809ca10e..e49add4e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1113,9 +1113,9 @@ checksum = "8d91edf4fbb970279443471345a4e8c491bf05bb283b3e6c88e4e606fd8c181b" [[package]] name = "oxiri" -version = "0.2.2" +version = "0.2.3-alpha.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb175ec8981211357b7b379869c2f8d555881c55ea62311428ec0de46d89bd5c" +checksum = "b225dad32cfaa43a960b93f01fa7f87528ac07e794b80f6d9a0153e0222557e2" [[package]] name = "oxrdf" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index f60ce1ca..ff4dac7a 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -32,7 +32,7 @@ clap = { version = "4.0", features = ["derive"] } oxigraph = { version = "0.4.0-alpha.1-dev", path = "../lib" } rand = "0.8" url = "2.4" -oxiri = "0.2" +oxiri = "0.2.3-alpha.1" flate2 = "1.0" rayon-core = "1.11" diff --git a/cli/src/main.rs b/cli/src/main.rs index 4ab05892..b0f7a6fa 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -121,6 +121,8 @@ enum Command { destination: PathBuf, }, /// Load file(s) into the store. + /// + /// Feel free to enable the --lenient option if you know your input is valid to get better performances. Load { /// Directory in which Oxigraph data are persisted. #[arg(short, long, value_hint = ValueHint::DirPath)] @@ -143,6 +145,8 @@ enum Command { #[arg(long, value_hint = ValueHint::Url)] base: Option, /// Attempt to keep loading even if the data file is invalid. + /// + /// This disables most of validation on RDF content. #[arg(long)] lenient: bool, /// Name of the graph to load the data to. 
@@ -391,6 +395,7 @@ pub fn main() -> anyhow::Result<()> { format.context("The --format option must be set when loading from stdin")?, base.as_deref(), graph, + lenient, ) } else { ThreadPoolBuilder::new() @@ -444,6 +449,7 @@ pub fn main() -> anyhow::Result<()> { }), base.as_deref(), graph, + lenient, ) } else { bulk_load( @@ -454,6 +460,7 @@ pub fn main() -> anyhow::Result<()> { }), base.as_deref(), graph, + lenient, ) } } { @@ -784,6 +791,7 @@ fn bulk_load( format: RdfFormat, base_iri: Option<&str>, to_graph_name: Option, + lenient: bool, ) -> anyhow::Result<()> { let mut parser = RdfParser::from_format(format); if let Some(to_graph_name) = to_graph_name { @@ -794,6 +802,9 @@ fn bulk_load( .with_base_iri(base_iri) .with_context(|| format!("Invalid base IRI {base_iri}"))?; } + if lenient { + parser = parser.unchecked(); + } loader.load_from_read(parser, read)?; Ok(()) } diff --git a/fuzz/fuzz_targets/nquads.rs b/fuzz/fuzz_targets/nquads.rs index c964e229..8852343c 100644 --- a/fuzz/fuzz_targets/nquads.rs +++ b/fuzz/fuzz_targets/nquads.rs @@ -4,43 +4,60 @@ use libfuzzer_sys::fuzz_target; use oxrdf::Quad; use oxttl::{NQuadsParser, NQuadsSerializer}; -fn parse<'a>(chunks: impl IntoIterator) -> (Vec, Vec) { +fn parse<'a>( + chunks: impl IntoIterator, + unchecked: bool, +) -> (Vec, Vec) { let mut quads = Vec::new(); let mut errors = Vec::new(); - let mut parser = NQuadsParser::new().with_quoted_triples().parse(); + let mut parser = NQuadsParser::new().with_quoted_triples(); + if unchecked { + parser = parser.unchecked(); + } + let mut reader = parser.parse(); for chunk in chunks { - parser.extend_from_slice(chunk); - while let Some(result) = parser.read_next() { + reader.extend_from_slice(chunk); + while let Some(result) = reader.read_next() { match result { Ok(quad) => quads.push(quad), Err(error) => errors.push(error.to_string()), } } } - parser.end(); - while let Some(result) = parser.read_next() { + reader.end(); + while let Some(result) = reader.read_next() { match 
result { Ok(quad) => quads.push(quad), Err(error) => errors.push(error.to_string()), } } - assert!(parser.is_end()); + assert!(reader.is_end()); (quads, errors) } fuzz_target!(|data: &[u8]| { // We parse with splitting - let (quads, errors) = parse(data.split(|c| *c == 0xFF)); + let (quads, errors) = parse(data.split(|c| *c == 0xFF), false); // We parse without splitting - let (quads_without_split, errors_without_split) = parse([data - .iter() - .copied() - .filter(|c| *c != 0xFF) - .collect::>() - .as_slice()]); + let (quads_without_split, errors_without_split) = parse( + [data + .iter() + .copied() + .filter(|c| *c != 0xFF) + .collect::>() + .as_slice()], + false, + ); assert_eq!(quads, quads_without_split); assert_eq!(errors, errors_without_split); + // We test also unchecked if valid + if errors.is_empty() { + let (quads_unchecked, errors_unchecked) = parse(data.split(|c| *c == 0xFF), true); + assert!(errors_unchecked.is_empty()); + assert_eq!(quads, quads_unchecked); + } + // We serialize let mut writer = NQuadsSerializer::new().serialize_to_write(Vec::new()); for quad in &quads { diff --git a/fuzz/fuzz_targets/trig.rs b/fuzz/fuzz_targets/trig.rs index c0713e69..1ce03d1b 100644 --- a/fuzz/fuzz_targets/trig.rs +++ b/fuzz/fuzz_targets/trig.rs @@ -4,31 +4,37 @@ use libfuzzer_sys::fuzz_target; use oxrdf::{Dataset, GraphName, Quad, Subject, Term, Triple}; use oxttl::{TriGParser, TriGSerializer}; -fn parse<'a>(chunks: impl IntoIterator) -> (Vec, Vec) { +fn parse<'a>( + chunks: impl IntoIterator, + unchecked: bool, +) -> (Vec, Vec) { let mut quads = Vec::new(); let mut errors = Vec::new(); let mut parser = TriGParser::new() .with_quoted_triples() .with_base_iri("http://example.com/") - .unwrap() - .parse(); + .unwrap(); + if unchecked { + parser = parser.unchecked(); + } + let mut reader = parser.parse(); for chunk in chunks { - parser.extend_from_slice(chunk); - while let Some(result) = parser.read_next() { + reader.extend_from_slice(chunk); + while let Some(result) 
= reader.read_next() { match result { Ok(quad) => quads.push(quad), Err(error) => errors.push(error.to_string()), } } } - parser.end(); - while let Some(result) = parser.read_next() { + reader.end(); + while let Some(result) = reader.read_next() { match result { Ok(quad) => quads.push(quad), Err(error) => errors.push(error.to_string()), } } - assert!(parser.is_end()); + assert!(reader.is_end()); (quads, errors) } @@ -66,14 +72,22 @@ fn serialize_quads(quads: &[Quad]) -> Vec { fuzz_target!(|data: &[u8]| { // We parse with splitting - let (quads, errors) = parse(data.split(|c| *c == 0xFF)); + let (quads, errors) = parse(data.split(|c| *c == 0xFF), false); // We parse without splitting - let (quads_without_split, errors_without_split) = parse([data - .iter() - .copied() - .filter(|c| *c != 0xFF) - .collect::>() - .as_slice()]); + let (quads_without_split, errors_without_split) = parse( + [data + .iter() + .copied() + .filter(|c| *c != 0xFF) + .collect::>() + .as_slice()], + false, + ); + let (quads_unchecked, errors_unchecked) = parse(data.split(|c| *c == 0xFF), true); + if errors.is_empty() { + assert!(errors_unchecked.is_empty()); + } + let bnodes_count = quads.iter().map(count_quad_blank_nodes).sum::(); if bnodes_count == 0 { assert_eq!( @@ -83,6 +97,15 @@ fuzz_target!(|data: &[u8]| { String::from_utf8_lossy(&serialize_quads(&quads)), String::from_utf8_lossy(&serialize_quads(&quads_without_split)) ); + if errors.is_empty() { + assert_eq!( + quads, + quads_unchecked, + "Validating:\n{}\nUnchecked:\n{}", + String::from_utf8_lossy(&serialize_quads(&quads)), + String::from_utf8_lossy(&serialize_quads(&quads_unchecked)) + ); + } } else if bnodes_count <= 4 { let mut dataset_with_split = quads.iter().collect::(); let mut dataset_without_split = quads_without_split.iter().collect::(); @@ -95,6 +118,19 @@ fuzz_target!(|data: &[u8]| { String::from_utf8_lossy(&serialize_quads(&quads)), String::from_utf8_lossy(&serialize_quads(&quads_without_split)) ); + if errors.is_empty() 
{ + if errors.is_empty() { + let mut dataset_unchecked = quads_unchecked.iter().collect::(); + dataset_unchecked.canonicalize(); + assert_eq!( + dataset_with_split, + dataset_unchecked, + "Validating:\n{}\nUnchecked:\n{}", + String::from_utf8_lossy(&serialize_quads(&quads)), + String::from_utf8_lossy(&serialize_quads(&quads_unchecked)) + ); + } + } } assert_eq!(errors, errors_without_split); diff --git a/lib/Cargo.toml b/lib/Cargo.toml index 3df87dfe..fbb25452 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -31,7 +31,7 @@ hex = "0.4" json-event-parser = "0.2.0-alpha.2" md-5 = "0.10" oxilangtag = "0.1" -oxiri = "0.2" +oxiri = "0.2.3-alpha.1" oxrdf = { version = "0.2.0-alpha.1-dev", path = "oxrdf", features = ["rdf-star", "oxsdatatypes"] } oxrdfio = { version = "0.1.0-alpha.1-dev", path = "oxrdfio", features = ["rdf-star"] } oxsdatatypes = { version = "0.2.0-alpha.1-dev", path="oxsdatatypes" } diff --git a/lib/benches/store.rs b/lib/benches/store.rs index 22e03e26..0a606135 100644 --- a/lib/benches/store.rs +++ b/lib/benches/store.rs @@ -24,6 +24,16 @@ fn parse_nt(c: &mut Criterion) { } }) }); + group.bench_function("parse BSBM explore 1000 unchecked", |b| { + b.iter(|| { + for r in RdfParser::from_format(RdfFormat::NTriples) + .unchecked() + .parse_read(data.as_slice()) + { + r.unwrap(); + } + }) + }); } fn store_load(c: &mut Criterion) { diff --git a/lib/oxrdf/Cargo.toml b/lib/oxrdf/Cargo.toml index 2196bc9a..1eb030b8 100644 --- a/lib/oxrdf/Cargo.toml +++ b/lib/oxrdf/Cargo.toml @@ -21,7 +21,7 @@ rdf-star = [] [dependencies] rand = "0.8" oxilangtag = "0.1" -oxiri = "0.2" +oxiri = "0.2.3-alpha.1" oxsdatatypes = { version = "0.2.0-alpha.1-dev", path="../oxsdatatypes", optional = true } [lints] diff --git a/lib/oxrdfio/src/parser.rs b/lib/oxrdfio/src/parser.rs index 1b69f955..2046a784 100644 --- a/lib/oxrdfio/src/parser.rs +++ b/lib/oxrdfio/src/parser.rs @@ -158,20 +158,16 @@ impl RdfParser { /// # Result::<_,Box>::Ok(()) /// ``` #[inline] - pub fn 
with_base_iri(self, base_iri: impl Into) -> Result { - Ok(Self { - inner: match self.inner { - RdfParserKind::N3(p) => RdfParserKind::N3(p), - RdfParserKind::NTriples(p) => RdfParserKind::NTriples(p), - RdfParserKind::NQuads(p) => RdfParserKind::NQuads(p), - RdfParserKind::RdfXml(p) => RdfParserKind::RdfXml(p.with_base_iri(base_iri)?), - RdfParserKind::TriG(p) => RdfParserKind::TriG(p.with_base_iri(base_iri)?), - RdfParserKind::Turtle(p) => RdfParserKind::Turtle(p.with_base_iri(base_iri)?), - }, - default_graph: self.default_graph, - without_named_graphs: self.without_named_graphs, - rename_blank_nodes: self.rename_blank_nodes, - }) + pub fn with_base_iri(mut self, base_iri: impl Into) -> Result { + self.inner = match self.inner { + RdfParserKind::N3(p) => RdfParserKind::N3(p), + RdfParserKind::NTriples(p) => RdfParserKind::NTriples(p), + RdfParserKind::NQuads(p) => RdfParserKind::NQuads(p), + RdfParserKind::RdfXml(p) => RdfParserKind::RdfXml(p.with_base_iri(base_iri)?), + RdfParserKind::TriG(p) => RdfParserKind::TriG(p.with_base_iri(base_iri)?), + RdfParserKind::Turtle(p) => RdfParserKind::Turtle(p.with_base_iri(base_iri)?), + }; + Ok(self) } /// Provides the name graph name that should replace the default graph in the returned quads. @@ -190,13 +186,9 @@ impl RdfParser { /// # Result::<_,Box>::Ok(()) /// ``` #[inline] - pub fn with_default_graph(self, default_graph: impl Into) -> Self { - Self { - inner: self.inner, - default_graph: default_graph.into(), - without_named_graphs: self.without_named_graphs, - rename_blank_nodes: self.rename_blank_nodes, - } + pub fn with_default_graph(mut self, default_graph: impl Into) -> Self { + self.default_graph = default_graph.into(); + self } /// Sets that the parser must fail if parsing a named graph. 
@@ -212,13 +204,9 @@ impl RdfParser { /// assert!(parser.parse_read(file.as_bytes()).next().unwrap().is_err()); /// ``` #[inline] - pub fn without_named_graphs(self) -> Self { - Self { - inner: self.inner, - default_graph: self.default_graph, - without_named_graphs: true, - rename_blank_nodes: self.rename_blank_nodes, - } + pub fn without_named_graphs(mut self) -> Self { + self.without_named_graphs = true; + self } /// Renames the blank nodes ids from the ones set in the serialization to random ids. @@ -240,13 +228,27 @@ impl RdfParser { /// # Result::<_,Box>::Ok(()) /// ``` #[inline] - pub fn rename_blank_nodes(self) -> Self { - Self { - inner: self.inner, - default_graph: self.default_graph, - without_named_graphs: self.without_named_graphs, - rename_blank_nodes: true, - } + pub fn rename_blank_nodes(mut self) -> Self { + self.rename_blank_nodes = true; + self + } + + /// Assumes the file is valid to make parsing faster. + /// + /// It will skip some validations. + /// + /// Note that if the file is actually not valid, then broken RDF might be emitted by the parser. + #[inline] + pub fn unchecked(mut self) -> Self { + self.inner = match self.inner { + RdfParserKind::N3(p) => RdfParserKind::N3(p.unchecked()), + RdfParserKind::NTriples(p) => RdfParserKind::NTriples(p.unchecked()), + RdfParserKind::NQuads(p) => RdfParserKind::NQuads(p.unchecked()), + RdfParserKind::RdfXml(p) => RdfParserKind::RdfXml(p.unchecked()), + RdfParserKind::TriG(p) => RdfParserKind::TriG(p.unchecked()), + RdfParserKind::Turtle(p) => RdfParserKind::Turtle(p.unchecked()), + }; + self } /// Parses from a [`Read`] implementation and returns an iterator of quads. 
diff --git a/lib/oxrdfxml/Cargo.toml b/lib/oxrdfxml/Cargo.toml index bc5e8993..1b309626 100644 --- a/lib/oxrdfxml/Cargo.toml +++ b/lib/oxrdfxml/Cargo.toml @@ -21,7 +21,7 @@ async-tokio = ["dep:tokio", "quick-xml/async-tokio"] [dependencies] oxrdf = { version = "0.2.0-alpha.1-dev", path = "../oxrdf" } oxilangtag = "0.1" -oxiri = "0.2" +oxiri = "0.2.3-alpha.1" quick-xml = ">=0.29, <0.32" tokio = { version = "1.29", optional = true, features = ["io-util"] } diff --git a/lib/oxrdfxml/src/parser.rs b/lib/oxrdfxml/src/parser.rs index ebbe80d1..ba0b27a1 100644 --- a/lib/oxrdfxml/src/parser.rs +++ b/lib/oxrdfxml/src/parser.rs @@ -52,6 +52,7 @@ use tokio::io::{AsyncRead, BufReader as AsyncBufReader}; #[derive(Default)] #[must_use] pub struct RdfXmlParser { + unchecked: bool, base: Option>, } @@ -62,6 +63,17 @@ impl RdfXmlParser { Self::default() } + /// Assumes the file is valid to make parsing faster. + /// + /// It will skip some validations. + /// + /// Note that if the file is actually not valid, then broken RDF might be emitted by the parser. 
+ #[inline] + pub fn unchecked(mut self) -> Self { + self.unchecked = true; + self + } + #[inline] pub fn with_base_iri(mut self, base_iri: impl Into) -> Result { self.base = Some(Iri::parse(base_iri.into())?); @@ -158,6 +170,7 @@ impl RdfXmlParser { in_literal_depth: 0, known_rdf_id: HashSet::default(), is_end: false, + unchecked: self.unchecked, } } } @@ -414,6 +427,7 @@ struct RdfXmlReader { in_literal_depth: usize, known_rdf_id: HashSet, is_end: bool, + unchecked: bool, } impl RdfXmlReader { @@ -551,19 +565,28 @@ impl RdfXmlReader { let attribute = attribute.map_err(Error::InvalidAttr)?; if attribute.key.as_ref().starts_with(b"xml") { if attribute.key.as_ref() == b"xml:lang" { - let tag = self.convert_attribute(&attribute)?; - language = Some( + let tag = self.convert_attribute(&attribute)?.to_ascii_lowercase(); + language = Some(if self.unchecked { + tag + } else { LanguageTag::parse(tag.to_ascii_lowercase()) .map_err(|error| SyntaxError { inner: SyntaxErrorKind::InvalidLanguageTag { tag, error }, })? - .into_inner(), - ); + .into_inner() + }); } else if attribute.key.as_ref() == b"xml:base" { let iri = self.convert_attribute(&attribute)?; - base_iri = Some(Iri::parse(iri.clone()).map_err(|error| SyntaxError { - inner: SyntaxErrorKind::InvalidIri { iri, error }, - })?) 
+ base_iri = Some( + if self.unchecked { + Iri::parse_unchecked(iri.clone()) + } else { + Iri::parse(iri.clone()) + } + .map_err(|error| SyntaxError { + inner: SyntaxErrorKind::InvalidIri { iri, error }, + })?, + ) } else { // We ignore other xml attributes } @@ -622,12 +645,7 @@ impl RdfXmlReader { .into()); } else { property_attrs.push(( - NamedNode::new(attribute_url.clone()).map_err(|error| SyntaxError { - inner: SyntaxErrorKind::InvalidIri { - iri: attribute_url, - error, - }, - })?, + self.parse_iri(attribute_url)?, self.convert_attribute(&attribute)?, )); } @@ -637,7 +655,7 @@ impl RdfXmlReader { //Parsing with the base URI let id_attr = match id_attr { Some(iri) => { - let iri = resolve(&base_iri, iri)?; + let iri = self.resolve_iri(&base_iri, iri)?; if self.known_rdf_id.contains(iri.as_str()) { return Err(SyntaxError::msg(format!( "{} has already been used as rdf:ID value", @@ -701,12 +719,7 @@ impl RdfXmlReader { .into()); } else { Self::build_node_elt( - NamedNode::new(tag_name.clone()).map_err(|error| SyntaxError { - inner: SyntaxErrorKind::InvalidIri { - iri: tag_name, - error, - }, - })?, + self.parse_iri(tag_name)?, base_iri, language, id_attr, @@ -727,12 +740,7 @@ impl RdfXmlReader { .into()); } Self::build_node_elt( - NamedNode::new(tag_name.clone()).map_err(|error| SyntaxError { - inner: SyntaxErrorKind::InvalidIri { - iri: tag_name, - error, - }, - })?, + self.parse_iri(tag_name)?, base_iri, language, id_attr, @@ -766,12 +774,7 @@ impl RdfXmlReader { )) .into()); } else { - NamedNode::new(tag_name.clone()).map_err(|error| SyntaxError { - inner: SyntaxErrorKind::InvalidIri { - iri: tag_name, - error, - }, - })? + self.parse_iri(tag_name)? }; match parse_type { RdfXmlParseType::Default => { @@ -1156,32 +1159,51 @@ impl RdfXmlReader { base_iri: &Option>, attribute: &Attribute<'_>, ) -> Result { - Ok(resolve(base_iri, self.convert_attribute(attribute)?)?) + Ok(self.resolve_iri(base_iri, self.convert_attribute(attribute)?)?) 
} - fn resolve_entity(&self, e: &str) -> Option<&str> { - self.custom_entities.get(e).map(String::as_str) + fn resolve_iri( + &self, + base_iri: &Option>, + relative_iri: String, + ) -> Result { + if let Some(base_iri) = base_iri { + Ok(NamedNode::new_unchecked( + if self.unchecked { + base_iri.resolve_unchecked(&relative_iri) + } else { + base_iri.resolve(&relative_iri) + } + .map_err(|error| SyntaxError { + inner: SyntaxErrorKind::InvalidIri { + iri: relative_iri, + error, + }, + })? + .into_inner(), + )) + } else { + self.parse_iri(relative_iri) + } } -} -fn resolve(base_iri: &Option>, relative_iri: String) -> Result { - if let Some(base_iri) = base_iri { - Ok(base_iri - .resolve(&relative_iri) - .map_err(|error| SyntaxError { - inner: SyntaxErrorKind::InvalidIri { - iri: relative_iri, - error, - }, - })? - .into()) - } else { - NamedNode::new(relative_iri.clone()).map_err(|error| SyntaxError { - inner: SyntaxErrorKind::InvalidIri { - iri: relative_iri, - error, - }, - }) + fn parse_iri(&self, relative_iri: String) -> Result { + Ok(NamedNode::new_unchecked(if self.unchecked { + relative_iri + } else { + Iri::parse(relative_iri.clone()) + .map_err(|error| SyntaxError { + inner: SyntaxErrorKind::InvalidIri { + iri: relative_iri, + error, + }, + })? 
+ .into_inner() + })) + } + + fn resolve_entity(&self, e: &str) -> Option<&str> { + self.custom_entities.get(e).map(String::as_str) } } diff --git a/lib/oxttl/Cargo.toml b/lib/oxttl/Cargo.toml index eeeb10a3..3301fe0e 100644 --- a/lib/oxttl/Cargo.toml +++ b/lib/oxttl/Cargo.toml @@ -22,7 +22,7 @@ async-tokio = ["dep:tokio"] [dependencies] memchr = "2.5" oxrdf = { version = "0.2.0-alpha.1-dev", path = "../oxrdf" } -oxiri = "0.2" +oxiri = "0.2.3-alpha.1" oxilangtag = "0.1" tokio = { version = "1.29", optional = true, features = ["io-util"] } diff --git a/lib/oxttl/src/lexer.rs b/lib/oxttl/src/lexer.rs index 6dd1d024..ef95938e 100644 --- a/lib/oxttl/src/lexer.rs +++ b/lib/oxttl/src/lexer.rs @@ -6,12 +6,12 @@ use oxrdf::NamedNode; use std::borrow::Cow; use std::cmp::min; use std::collections::HashMap; -use std::ops::{Range, RangeInclusive}; +use std::ops::Range; use std::str; #[derive(Debug, PartialEq, Eq)] pub enum N3Token<'a> { - IriRef(Iri), + IriRef(String), PrefixedName { prefix: &'a str, local: Cow<'a, str>, @@ -42,6 +42,7 @@ pub struct N3LexerOptions { pub struct N3Lexer { mode: N3LexerMode, + unchecked: bool, } // TODO: there are a lot of 'None' (missing data) returned even if the stream is ending!!! @@ -61,7 +62,7 @@ impl TokenRecognizer for N3Lexer { b'<' => match *data.get(1)? 
{ b'<' => Some((2, Ok(N3Token::Punctuation("<<")))), b'=' if self.mode == N3LexerMode::N3 => { - if let Some((consumed, result)) = Self::recognize_iri(data, options) { + if let Some((consumed, result)) = self.recognize_iri(data, options) { Some(if let Ok(result) = result { (consumed, Ok(result)) } else { @@ -74,7 +75,7 @@ impl TokenRecognizer for N3Lexer { } } b'-' if self.mode == N3LexerMode::N3 => { - if let Some((consumed, result)) = Self::recognize_iri(data, options) { + if let Some((consumed, result)) = self.recognize_iri(data, options) { Some(if let Ok(result) = result { (consumed, Ok(result)) } else { @@ -86,7 +87,7 @@ impl TokenRecognizer for N3Lexer { None } } - _ => Self::recognize_iri(data, options), + _ => self.recognize_iri(data, options), }, b'>' => { if *data.get(1)? == b'>' { @@ -119,7 +120,7 @@ impl TokenRecognizer for N3Lexer { Self::recognize_string(data, b'\'') } } - b'@' => Self::recognize_lang_tag(data), + b'@' => self.recognize_lang_tag(data), b'.' => match data.get(1) { Some(b'0'..=b'9') => Self::recognize_number(data), Some(_) => Some((1, Ok(N3Token::Punctuation(".")))), @@ -162,18 +163,19 @@ impl TokenRecognizer for N3Lexer { } } b'0'..=b'9' | b'+' | b'-' => Self::recognize_number(data), - b'?' => Self::recognize_variable(data, is_ending), - _ => Self::recognize_pname_or_keyword(data, is_ending), + b'?' 
=> self.recognize_variable(data, is_ending), + _ => self.recognize_pname_or_keyword(data, is_ending), } } } impl N3Lexer { - pub fn new(mode: N3LexerMode) -> Self { - Self { mode } + pub fn new(mode: N3LexerMode, unchecked: bool) -> Self { + Self { mode, unchecked } } fn recognize_iri( + &self, data: &[u8], options: &N3LexerOptions, ) -> Option<(usize, Result, TokenRecognizerError>)> { @@ -186,7 +188,8 @@ impl N3Lexer { i += end; match data[i] { b'>' => { - return Some((i + 1, Self::parse_iri(string, 0..=i, options))); + #[allow(clippy::range_plus_one)] + return Some((i + 1, self.parse_iri(string, 0..i + 1, options))); } b'\\' => { let (additional, c) = Self::recognize_escape(&data[i..], i, false)?; @@ -205,29 +208,36 @@ impl N3Lexer { } fn parse_iri( + &self, iri: Vec, - position: RangeInclusive, + position: Range, options: &N3LexerOptions, ) -> Result, TokenRecognizerError> { - let iri = String::from_utf8(iri).map_err(|e| { - ( - position.clone(), - format!("The IRI contains invalid UTF-8 characters: {e}"), - ) - })?; - let iri = if let Some(base_iri) = options.base_iri.as_ref() { - base_iri.resolve(&iri) - } else { - Iri::parse(iri) - } - .map_err(|e| (position, e.to_string()))?; - Ok(N3Token::IriRef(iri)) + let iri = string_from_utf8(iri, position.clone())?; + Ok(N3Token::IriRef( + if let Some(base_iri) = options.base_iri.as_ref() { + if self.unchecked { + base_iri.resolve_unchecked(&iri) + } else { + base_iri.resolve(&iri) + } + .map_err(|e| (position, e.to_string()))? + .into_inner() + } else if self.unchecked { + iri + } else { + Iri::parse(iri) + .map_err(|e| (position, e.to_string()))? + .into_inner() + }, + )) } - fn recognize_pname_or_keyword( - data: &[u8], + fn recognize_pname_or_keyword<'a>( + &self, + data: &'a [u8], is_ending: bool, - ) -> Option<(usize, Result, TokenRecognizerError>)> { + ) -> Option<(usize, Result, TokenRecognizerError>)> { // [139s] PNAME_NS ::= PN_PREFIX? 
':' // [140s] PNAME_LN ::= PNAME_NS PN_LOCAL @@ -303,7 +313,8 @@ impl N3Lexer { )); } - let (consumed, pn_local_result) = Self::recognize_optional_pn_local(&data[i..], is_ending)?; + let (consumed, pn_local_result) = + self.recognize_optional_pn_local(&data[i..], is_ending)?; Some(( consumed + i, pn_local_result.map(|(local, might_be_invalid_iri)| N3Token::PrefixedName { @@ -314,12 +325,13 @@ impl N3Lexer { )) } - fn recognize_variable( - data: &[u8], + fn recognize_variable<'a>( + &self, + data: &'a [u8], is_ending: bool, - ) -> Option<(usize, Result, TokenRecognizerError>)> { + ) -> Option<(usize, Result, TokenRecognizerError>)> { // [36] QUICK_VAR_NAME ::= "?" PN_LOCAL - let (consumed, result) = Self::recognize_optional_pn_local(&data[1..], is_ending)?; + let (consumed, result) = self.recognize_optional_pn_local(&data[1..], is_ending)?; Some(( consumed + 1, result.and_then(|(name, _)| { @@ -332,10 +344,11 @@ impl N3Lexer { )) } - fn recognize_optional_pn_local( - data: &[u8], + fn recognize_optional_pn_local<'a>( + &self, + data: &'a [u8], is_ending: bool, - ) -> Option<(usize, Result<(Cow<'_, str>, bool), TokenRecognizerError>)> { + ) -> Option<(usize, Result<(Cow<'a, str>, bool), TokenRecognizerError>)> { // [168s] PN_LOCAL ::= (PN_CHARS_U | ':' | [0-9] | PLX) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX))? let mut i = 0; let mut buffer = None; // Buffer if there are some escaped characters @@ -359,23 +372,25 @@ impl N3Lexer { } else if c == '\\' { i += 1; let a = char::from(*data.get(i)?); - if matches!( - a, - '_' | '~' - | '.' - | '-' - | '!' - | '$' - | '&' - | '\'' - | '(' - | ')' - | '*' - | '+' - | ',' - | ';' - | '=' - ) { + if self.unchecked + || matches!( + a, + '_' | '~' + | '.' + | '-' + | '!' + | '$' + | '&' + | '\'' + | '(' + | ')' + | '*' + | '+' + | ',' + | ';' + | '=' + ) + { // ok to escape } else if matches!(a, '/' | '?' 
| '#' | '@' | '%') { // ok to escape but requires IRI validation @@ -406,12 +421,18 @@ impl N3Lexer { { return Some((0, Ok((Cow::Borrowed(""), false)))); } - might_be_invalid_iri |= - Self::is_possible_pn_chars_base_but_not_valid_iri(c) || c == ':'; + if !self.unchecked { + might_be_invalid_iri |= + Self::is_possible_pn_chars_base_but_not_valid_iri(c) + || c == ':'; + } i += consumed; } else if Self::is_possible_pn_chars(c) || c == ':' || c == '.' { - might_be_invalid_iri |= - Self::is_possible_pn_chars_base_but_not_valid_iri(c) || c == ':'; + if !self.unchecked { + might_be_invalid_iri |= + Self::is_possible_pn_chars_base_but_not_valid_iri(c) + || c == ':'; + } i += consumed; } else { let buffer = if let Some(mut buffer) = buffer { @@ -518,9 +539,10 @@ impl N3Lexer { } } - fn recognize_lang_tag( - data: &[u8], - ) -> Option<(usize, Result, TokenRecognizerError>)> { + fn recognize_lang_tag<'a>( + &self, + data: &'a [u8], + ) -> Option<(usize, Result, TokenRecognizerError>)> { // [144s] LANGTAG ::= '@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)* let mut is_last_block_empty = true; for (i, c) in data[1..].iter().enumerate() { @@ -532,25 +554,29 @@ impl N3Lexer { Err((1..2, "A language code should always start with a letter").into()), )); } else if is_last_block_empty { - return Some((i, Self::parse_lang_tag(&data[1..i], 1..i - 1))); + return Some((i, self.parse_lang_tag(&data[1..i], 1..i - 1))); } else if *c == b'-' { is_last_block_empty = true; } else { - return Some((i + 1, Self::parse_lang_tag(&data[1..=i], 1..i))); + return Some((i + 1, self.parse_lang_tag(&data[1..=i], 1..i))); } } None } - fn parse_lang_tag( - lang_tag: &[u8], + fn parse_lang_tag<'a>( + &self, + lang_tag: &'a [u8], position: Range, - ) -> Result, TokenRecognizerError> { - Ok(N3Token::LangTag( - LanguageTag::parse(str_from_utf8(lang_tag, position.clone())?) 
+ ) -> Result, TokenRecognizerError> { + let lang_tag = str_from_utf8(lang_tag, position.clone())?; + Ok(N3Token::LangTag(if self.unchecked { + lang_tag + } else { + LanguageTag::parse(lang_tag) .map_err(|e| (position.clone(), e.to_string()))? - .into_inner(), - )) + .into_inner() + })) } fn recognize_string( @@ -933,3 +959,14 @@ fn str_from_utf8(data: &[u8], range: Range) -> Result<&str, TokenRecogniz .into() }) } + +fn string_from_utf8(data: Vec, range: Range) -> Result { + String::from_utf8(data).map_err(|e| { + ( + range.start + e.utf8_error().valid_up_to() + ..min(range.end, range.start + e.utf8_error().valid_up_to() + 4), + format!("Invalid UTF-8: {e}"), + ) + .into() + }) +} diff --git a/lib/oxttl/src/line_formats.rs b/lib/oxttl/src/line_formats.rs index ec439a57..fc48cd53 100644 --- a/lib/oxttl/src/line_formats.rs +++ b/lib/oxttl/src/line_formats.rs @@ -63,7 +63,7 @@ impl RuleRecognizer for NQuadsRecognizer { NQuadsState::ExpectSubject => match token { N3Token::IriRef(s) => { self.subjects - .push(NamedNode::from(s).into()); + .push(NamedNode::new_unchecked(s).into()); self.stack.push(NQuadsState::ExpectPredicate); self } @@ -86,7 +86,7 @@ impl RuleRecognizer for NQuadsRecognizer { NQuadsState::ExpectPredicate => match token { N3Token::IriRef(p) => { self.predicates - .push(p.into()); + .push(NamedNode::new_unchecked(p)); self.stack.push(NQuadsState::ExpectedObject); self } @@ -98,7 +98,7 @@ impl RuleRecognizer for NQuadsRecognizer { NQuadsState::ExpectedObject => match token { N3Token::IriRef(o) => { self.objects - .push(NamedNode::from(o).into()); + .push(NamedNode::new_unchecked(o).into()); self.stack .push(NQuadsState::ExpectPossibleGraphOrEndOfQuotedTriple); self @@ -155,7 +155,7 @@ impl RuleRecognizer for NQuadsRecognizer { self.objects.push( Literal::new_typed_literal( value, - d + NamedNode::new_unchecked(d) ) .into(), ); @@ -171,7 +171,7 @@ impl RuleRecognizer for NQuadsRecognizer { N3Token::IriRef(g) if context.with_graph_name => { self.emit_quad( 
results, - NamedNode::from(g).into(), + NamedNode::new_unchecked(g).into(), ); self.stack.push(NQuadsState::ExpectDot); self @@ -264,10 +264,11 @@ impl NQuadsRecognizer { pub fn new_parser( with_graph_name: bool, #[cfg(feature = "rdf-star")] with_quoted_triples: bool, + unchecked: bool, ) -> Parser { Parser::new( Lexer::new( - N3Lexer::new(N3LexerMode::NTriples), + N3Lexer::new(N3LexerMode::NTriples, unchecked), MIN_BUFFER_SIZE, MAX_BUFFER_SIZE, true, diff --git a/lib/oxttl/src/n3.rs b/lib/oxttl/src/n3.rs index a81318d3..72db1611 100644 --- a/lib/oxttl/src/n3.rs +++ b/lib/oxttl/src/n3.rs @@ -206,6 +206,7 @@ impl From for N3Quad { #[derive(Default)] #[must_use] pub struct N3Parser { + unchecked: bool, base: Option>, prefixes: HashMap>, } @@ -217,6 +218,17 @@ impl N3Parser { Self::default() } + /// Assumes the file is valid to make parsing faster. + /// + /// It will skip some validations. + /// + /// Note that if the file is actually not valid, then broken RDF might be emitted by the parser. 
+ #[inline] + pub fn unchecked(mut self) -> Self { + self.unchecked = true; + self + } + #[inline] pub fn with_base_iri(mut self, base_iri: impl Into) -> Result { self.base = Some(Iri::parse(base_iri.into())?); @@ -345,7 +357,7 @@ impl N3Parser { /// ``` pub fn parse(self) -> LowLevelN3Reader { LowLevelN3Reader { - parser: N3Recognizer::new_parser(self.base, self.prefixes), + parser: N3Recognizer::new_parser(self.unchecked, self.base, self.prefixes), } } } @@ -665,8 +677,13 @@ impl RuleRecognizer for N3Recognizer { } N3State::BaseExpectIri => return match token { N3Token::IriRef(iri) => { - context.lexer_options.base_iri = Some(iri); - self + match Iri::parse_unchecked(iri) { + Ok(iri) => { + context.lexer_options.base_iri = Some(iri); + self + } + Err(e) => self.error(errors, format!("Invalid base IRI: {e}")) + } } _ => self.error(errors, "The BASE keyword should be followed by an IRI"), }, @@ -681,8 +698,13 @@ impl RuleRecognizer for N3Recognizer { }, N3State::PrefixExpectIri { name } => return match token { N3Token::IriRef(iri) => { - context.prefixes.insert(name, iri); - self + match Iri::parse_unchecked(iri) { + Ok(iri) => { + context.prefixes.insert(name, iri); + self + } + Err(e) => self.error(errors, format!("Invalid prefix IRI: {e}")) + } } _ => self.error(errors, "The PREFIX declaration should be followed by a prefix and its value as an IRI"), }, @@ -843,7 +865,7 @@ impl RuleRecognizer for N3Recognizer { N3State::PathItem => { return match token { N3Token::IriRef(iri) => { - self.terms.push(NamedNode::from(iri).into()); + self.terms.push(NamedNode::new_unchecked(iri).into()); self } N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) { @@ -925,7 +947,7 @@ impl RuleRecognizer for N3Recognizer { } N3State::IriPropertyList => return match token { N3Token::IriRef(id) => { - self.terms.push(NamedNode::from(id).into()); + 
self.terms.push(NamedNode::new_unchecked(id).into()); self.stack.push(N3State::PropertyListEnd); self.stack.push(N3State::PredicateObjectList); self @@ -999,7 +1021,7 @@ impl RuleRecognizer for N3Recognizer { N3State::LiteralExpectDatatype { value } => { match token { N3Token::IriRef(datatype) => { - self.terms.push(Literal::new_typed_literal(value, datatype).into()); + self.terms.push(Literal::new_typed_literal(value, NamedNode::new_unchecked(datatype)).into()); return self; } N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) { @@ -1096,12 +1118,13 @@ impl RuleRecognizer for N3Recognizer { impl N3Recognizer { pub fn new_parser( + unchecked: bool, base_iri: Option>, prefixes: HashMap>, ) -> Parser { Parser::new( Lexer::new( - N3Lexer::new(N3LexerMode::N3), + N3Lexer::new(N3LexerMode::N3, unchecked), MIN_BUFFER_SIZE, MAX_BUFFER_SIZE, true, diff --git a/lib/oxttl/src/nquads.rs b/lib/oxttl/src/nquads.rs index c2bf35cd..f150b2d2 100644 --- a/lib/oxttl/src/nquads.rs +++ b/lib/oxttl/src/nquads.rs @@ -37,6 +37,7 @@ use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; #[derive(Default)] #[must_use] pub struct NQuadsParser { + unchecked: bool, #[cfg(feature = "rdf-star")] with_quoted_triples: bool, } @@ -48,6 +49,17 @@ impl NQuadsParser { Self::default() } + /// Assumes the file is valid to make parsing faster. + /// + /// It will skip some validations. + /// + /// Note that if the file is actually not valid, then broken RDF might be emitted by the parser. + #[inline] + pub fn unchecked(mut self) -> Self { + self.unchecked = true; + self + } + /// Enables [N-Quads-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#n-quads-star). 
#[cfg(feature = "rdf-star")] #[inline] @@ -165,6 +177,7 @@ impl NQuadsParser { true, #[cfg(feature = "rdf-star")] self.with_quoted_triples, + self.unchecked, ), } } diff --git a/lib/oxttl/src/ntriples.rs b/lib/oxttl/src/ntriples.rs index 674dc27a..995643bc 100644 --- a/lib/oxttl/src/ntriples.rs +++ b/lib/oxttl/src/ntriples.rs @@ -38,6 +38,7 @@ use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; #[derive(Default)] #[must_use] pub struct NTriplesParser { + unchecked: bool, #[cfg(feature = "rdf-star")] with_quoted_triples: bool, } @@ -49,6 +50,17 @@ impl NTriplesParser { Self::default() } + /// Assumes the file is valid to make parsing faster. + /// + /// It will skip some validations. + /// + /// Note that if the file is actually not valid, then broken RDF might be emitted by the parser. + #[inline] + pub fn unchecked(mut self) -> Self { + self.unchecked = true; + self + } + /// Enables [N-Triples-star](https://w3c.github.io/rdf-star/cg-spec/2021-12-17.html#n-triples-star). #[cfg(feature = "rdf-star")] #[inline] @@ -166,6 +178,7 @@ impl NTriplesParser { false, #[cfg(feature = "rdf-star")] self.with_quoted_triples, + self.unchecked, ), } } @@ -542,3 +555,26 @@ impl LowLevelNTriplesWriter { writeln!(write, "{} .", t.into()) } } + +#[cfg(test)] +mod tests { + use super::*; + use oxrdf::{Literal, NamedNode}; + + #[test] + fn unchecked_parsing() { + let triples = NTriplesParser::new() + .unchecked() + .parse_read(" \"baz\"@toolonglangtag .".as_bytes()) + .collect::, _>>() + .unwrap(); + assert_eq!( + triples, + [Triple::new( + NamedNode::new_unchecked("foo"), + NamedNode::new_unchecked("bar"), + Literal::new_language_tagged_literal_unchecked("baz", "toolonglangtag"), + )] + ) + } +} diff --git a/lib/oxttl/src/terse.rs b/lib/oxttl/src/terse.rs index 46dcd740..6c83fb71 100644 --- a/lib/oxttl/src/terse.rs +++ b/lib/oxttl/src/terse.rs @@ -107,8 +107,13 @@ impl RuleRecognizer for TriGRecognizer { }, TriGState::BaseExpectIri => match token { N3Token::IriRef(iri) => { -
context.lexer_options.base_iri = Some(iri); - self + match Iri::parse_unchecked(iri) { + Ok(iri) => { + context.lexer_options.base_iri = Some(iri); + self + } + Err(e) => self.error(errors, format!("Invalid base IRI: {e}")) + } } _ => self.error(errors, "The BASE keyword should be followed by an IRI"), }, @@ -123,9 +128,13 @@ impl RuleRecognizer for TriGRecognizer { }, TriGState::PrefixExpectIri { name } => match token { N3Token::IriRef(iri) => { - context.prefixes.insert(name, iri); - self - } + match Iri::parse_unchecked(iri) { + Ok(iri) => { + context.prefixes.insert(name, iri); + self + } + Err(e) => self.error(errors, format!("Invalid prefix IRI: {e}")) + } } _ => self.error(errors, "The PREFIX declaration should be followed by a prefix and its value as an IRI"), }, // [3g] triplesOrGraph ::= labelOrSubject ( wrappedGraph | predicateObjectList '.' ) | quotedTriple predicateObjectList '.' @@ -133,7 +142,7 @@ impl RuleRecognizer for TriGRecognizer { TriGState::TriplesOrGraph => match token { N3Token::IriRef(iri) => { self.stack.push(TriGState::WrappedGraphOrPredicateObjectList { - term: NamedNode::from(iri).into() + term: NamedNode::new_unchecked(iri).into() }); self } @@ -291,7 +300,7 @@ impl RuleRecognizer for TriGRecognizer { self } N3Token::IriRef(iri) => { - self.cur_subject.push(NamedNode::from(iri).into()); + self.cur_subject.push(NamedNode::new_unchecked(iri).into()); self.stack.push(TriGState::PredicateObjectList); self } @@ -337,7 +346,7 @@ impl RuleRecognizer for TriGRecognizer { // [7g] labelOrSubject ::= iri | BlankNode TriGState::GraphName => match token { N3Token::IriRef(iri) => { - self.cur_graph = NamedNode::from(iri).into(); + self.cur_graph = NamedNode::new_unchecked(iri).into(); self } N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) { @@ -451,7 +460,7 @@ impl RuleRecognizer for TriGRecognizer { self } N3Token::IriRef(iri) => { - 
self.cur_predicate.push(NamedNode::from(iri)); + self.cur_predicate.push(NamedNode::new_unchecked(iri)); self } N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) { @@ -479,7 +488,7 @@ impl RuleRecognizer for TriGRecognizer { // [137s] BlankNode ::= BLANK_NODE_LABEL | ANON TriGState::Object => match token { N3Token::IriRef(iri) => { - self.cur_object.push(NamedNode::from(iri).into()); + self.cur_object.push(NamedNode::new_unchecked(iri).into()); self.emit_quad(results); self } @@ -626,7 +635,7 @@ impl RuleRecognizer for TriGRecognizer { TriGState::LiteralExpectDatatype { value, emit } => { match token { N3Token::IriRef(datatype) => { - self.cur_object.push(Literal::new_typed_literal(value, datatype).into()); + self.cur_object.push(Literal::new_typed_literal(value, NamedNode::new_unchecked(datatype)).into()); if emit { self.emit_quad(results); } @@ -688,7 +697,7 @@ impl RuleRecognizer for TriGRecognizer { self } N3Token::IriRef(iri) => { - self.cur_subject.push(NamedNode::from(iri).into()); + self.cur_subject.push(NamedNode::new_unchecked(iri).into()); self } N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) { @@ -720,7 +729,7 @@ impl RuleRecognizer for TriGRecognizer { self } N3Token::IriRef(iri) => { - self.cur_object.push(NamedNode::from(iri).into()); + self.cur_object.push(NamedNode::new_unchecked(iri).into()); self } N3Token::PrefixedName { prefix, local, might_be_invalid_iri } => match resolve_local_name(prefix, &local, might_be_invalid_iri, &context.prefixes) { @@ -823,12 +832,13 @@ impl TriGRecognizer { pub fn new_parser( with_graph_name: bool, #[cfg(feature = "rdf-star")] with_quoted_triples: bool, + unchecked: bool, base_iri: Option>, prefixes: HashMap>, ) -> Parser { Parser::new( Lexer::new( - N3Lexer::new(N3LexerMode::Turtle), + N3Lexer::new(N3LexerMode::Turtle, 
unchecked), MIN_BUFFER_SIZE, MAX_BUFFER_SIZE, true, diff --git a/lib/oxttl/src/trig.rs b/lib/oxttl/src/trig.rs index 748e3d6a..61cff3af 100644 --- a/lib/oxttl/src/trig.rs +++ b/lib/oxttl/src/trig.rs @@ -42,6 +42,7 @@ use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; #[derive(Default)] #[must_use] pub struct TriGParser { + unchecked: bool, base: Option>, prefixes: HashMap>, #[cfg(feature = "rdf-star")] @@ -55,6 +56,17 @@ impl TriGParser { Self::default() } + /// Assumes the file is valid to make parsing faster. + /// + /// It will skip some validations. + /// + /// Note that if the file is actually not valid, then broken RDF might be emitted by the parser. + #[inline] + pub fn unchecked(mut self) -> Self { + self.unchecked = true; + self + } + #[inline] pub fn with_base_iri(mut self, base_iri: impl Into) -> Result { self.base = Some(Iri::parse(base_iri.into())?); @@ -192,6 +204,7 @@ impl TriGParser { true, #[cfg(feature = "rdf-star")] self.with_quoted_triples, + self.unchecked, self.base, self.prefixes, ), diff --git a/lib/oxttl/src/turtle.rs b/lib/oxttl/src/turtle.rs index 2272e9f8..f9d31232 100644 --- a/lib/oxttl/src/turtle.rs +++ b/lib/oxttl/src/turtle.rs @@ -44,6 +44,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; #[derive(Default)] #[must_use] pub struct TurtleParser { + unchecked: bool, base: Option>, prefixes: HashMap>, #[cfg(feature = "rdf-star")] @@ -57,6 +58,17 @@ impl TurtleParser { Self::default() } + /// Assumes the file is valid to make parsing faster. + /// + /// It will skip some validations. + /// + /// Note that if the file is actually not valid, then broken RDF might be emitted by the parser. 
+ #[inline] + pub fn unchecked(mut self) -> Self { + self.unchecked = true; + self + } + #[inline] pub fn with_base_iri(mut self, base_iri: impl Into) -> Result { self.base = Some(Iri::parse(base_iri.into())?); @@ -194,6 +206,7 @@ impl TurtleParser { false, #[cfg(feature = "rdf-star")] self.with_quoted_triples, + self.unchecked, self.base, self.prefixes, ), diff --git a/lib/spargebra/Cargo.toml b/lib/spargebra/Cargo.toml index 42a14b20..e75cfbea 100644 --- a/lib/spargebra/Cargo.toml +++ b/lib/spargebra/Cargo.toml @@ -23,7 +23,7 @@ sep-0006 = [] [dependencies] peg = "0.8" rand = "0.8" -oxiri = "0.2" +oxiri = "0.2.3-alpha.1" oxilangtag = "0.1" oxrdf = { version = "0.2.0-alpha.1-dev", path="../oxrdf" } diff --git a/lib/src/store.rs b/lib/src/store.rs index 988befed..cec9baae 100644 --- a/lib/src/store.rs +++ b/lib/src/store.rs @@ -1053,7 +1053,6 @@ impl<'a> Transaction<'a> { /// Retrieves quads with a filter on each quad component. /// /// Usage example: - /// Usage example: /// ``` /// use oxigraph::store::{StorageError, Store}; /// use oxigraph::model::*; @@ -1601,19 +1600,22 @@ impl BulkLoader { /// ///
This method is optimized for speed. See [the struct](BulkLoader) documentation for more details.
/// - /// Usage example: + /// To get better speed on valid datasets, consider enabling the [`RdfParser::unchecked`] option to skip some validations. + /// /// Usage example: /// ``` /// use oxigraph::store::Store; - /// use oxigraph::io::RdfFormat; + /// use oxigraph::io::{RdfParser, RdfFormat}; /// use oxigraph::model::*; - /// use oxrdfio::RdfParser; /// /// let store = Store::new()?; /// /// // insert a dataset file (former load_dataset method) /// let file = b" ."; - /// store.bulk_loader().load_from_read(RdfFormat::NQuads, file.as_ref())?; + /// store.bulk_loader().load_from_read( + /// RdfParser::from_format(RdfFormat::NQuads).unchecked(), // we inject a custom parser with options + /// file.as_ref() + /// )?; /// /// // insert a graph file (former load_graph method) /// let file = b"<> <> <> ."; diff --git a/lints/test_debian_compatibility.py b/lints/test_debian_compatibility.py index 27530f96..ef00126f 100644 --- a/lints/test_debian_compatibility.py +++ b/lints/test_debian_compatibility.py @@ -5,7 +5,7 @@ from urllib.request import urlopen TARGET_DEBIAN_VERSIONS = ["sid"] IGNORE_PACKAGES = {"oxigraph-js", "oxigraph-testsuite", "pyoxigraph", "sparql-smith"} -ALLOWED_MISSING_PACKAGES = {"codspeed-criterion-compat", "escargot", "json-event-parser", "oxhttp", "quick-xml"} +ALLOWED_MISSING_PACKAGES = {"codspeed-criterion-compat", "escargot", "json-event-parser", "oxhttp", "oxiri", "quick-xml"} base_path = Path(__file__).parent.parent