Adds an enum for CanonicalizationAlgorithm

Enables implementing other algorithms like RDFC-1.0
pull/819/head
Tpt 10 months ago committed by Thomas Tanon
parent f5b975e4d1
commit 0ac70e73dc
  1. 7
      fuzz/fuzz_targets/trig.rs
  2. 154
      lib/oxrdf/src/dataset.rs
  3. 18
      lib/oxrdf/src/graph.rs
  4. 9
      testsuite/src/parser_evaluator.rs
  5. 7
      testsuite/src/sparql_evaluator.rs

@ -1,6 +1,7 @@
#![no_main] #![no_main]
use libfuzzer_sys::fuzz_target; use libfuzzer_sys::fuzz_target;
use oxrdf::graph::CanonicalizationAlgorithm;
use oxrdf::{Dataset, GraphName, Quad, Subject, Term, Triple}; use oxrdf::{Dataset, GraphName, Quad, Subject, Term, Triple};
use oxttl::{TriGParser, TriGSerializer}; use oxttl::{TriGParser, TriGSerializer};
@ -120,8 +121,8 @@ fuzz_target!(|data: &[u8]| {
} else if bnodes_count <= 4 { } else if bnodes_count <= 4 {
let mut dataset_with_split = quads.iter().collect::<Dataset>(); let mut dataset_with_split = quads.iter().collect::<Dataset>();
let mut dataset_without_split = quads_without_split.iter().collect::<Dataset>(); let mut dataset_without_split = quads_without_split.iter().collect::<Dataset>();
dataset_with_split.canonicalize(); dataset_with_split.canonicalize(CanonicalizationAlgorithm::Unstable);
dataset_without_split.canonicalize(); dataset_without_split.canonicalize(CanonicalizationAlgorithm::Unstable);
assert_eq!( assert_eq!(
dataset_with_split, dataset_with_split,
dataset_without_split, dataset_without_split,
@ -131,7 +132,7 @@ fuzz_target!(|data: &[u8]| {
); );
if errors.is_empty() { if errors.is_empty() {
let mut dataset_unchecked = quads_unchecked.iter().collect::<Dataset>(); let mut dataset_unchecked = quads_unchecked.iter().collect::<Dataset>();
dataset_unchecked.canonicalize(); dataset_unchecked.canonicalize(CanonicalizationAlgorithm::Unstable);
assert_eq!( assert_eq!(
dataset_with_split, dataset_with_split,
dataset_unchecked, dataset_unchecked,

@ -31,7 +31,6 @@
use crate::interning::*; use crate::interning::*;
use crate::*; use crate::*;
use std::cmp::min;
use std::collections::hash_map::DefaultHasher; use std::collections::hash_map::DefaultHasher;
use std::collections::{BTreeSet, HashMap, HashSet}; use std::collections::{BTreeSet, HashMap, HashSet};
use std::fmt; use std::fmt;
@ -510,6 +509,7 @@ impl Dataset {
/// ///
/// Usage example ([Dataset isomorphism](https://www.w3.org/TR/rdf11-concepts/#dfn-dataset-isomorphism)): /// Usage example ([Dataset isomorphism](https://www.w3.org/TR/rdf11-concepts/#dfn-dataset-isomorphism)):
/// ``` /// ```
/// use oxrdf::dataset::CanonicalizationAlgorithm;
/// use oxrdf::*; /// use oxrdf::*;
/// ///
/// let iri = NamedNodeRef::new("http://example.com")?; /// let iri = NamedNodeRef::new("http://example.com")?;
@ -527,29 +527,56 @@ impl Dataset {
/// graph2.insert(QuadRef::new(&bnode2, iri, iri, &g2)); /// graph2.insert(QuadRef::new(&bnode2, iri, iri, &g2));
/// ///
/// assert_ne!(graph1, graph2); /// assert_ne!(graph1, graph2);
/// graph1.canonicalize(); /// graph1.canonicalize(CanonicalizationAlgorithm::Unstable);
/// graph2.canonicalize(); /// graph2.canonicalize(CanonicalizationAlgorithm::Unstable);
/// assert_eq!(graph1, graph2); /// assert_eq!(graph1, graph2);
/// # Result::<_,Box<dyn std::error::Error>>::Ok(()) /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ``` /// ```
/// ///
/// Warning 1: Blank node ids depends on the current shape of the graph. Adding a new quad might change the ids of a lot of blank nodes. /// <div class="warning">Blank node ids depend on the current shape of the graph. Adding a new quad might change the ids of a lot of blank nodes.
/// Hence, this canonization might not be suitable for diffs. /// Hence, this canonicalization might not be suitable for diffs.</div>
/// ///
/// Warning 2: The canonicalization algorithm is not stable and canonical blank node ids might change between Oxigraph version. /// <div class="warning">This implementation's worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset.</div>
pub fn canonicalize(&mut self, algorithm: CanonicalizationAlgorithm) {
let bnode_mapping = self.canonicalize_interned_blank_nodes(algorithm);
let new_quads = self.map_blank_nodes(&bnode_mapping);
self.clear();
for quad in new_quads {
self.insert_encoded(quad);
}
}
/// Returns a map between the current dataset blank nodes and the canonicalized blank nodes
/// to create a canonical dataset.
/// ///
/// Warning 3: This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset. /// See also [`canonicalize`](Self::canonicalize).
pub fn canonicalize(&mut self) { pub fn canonicalize_blank_nodes(
&self,
algorithm: CanonicalizationAlgorithm,
) -> HashMap<BlankNodeRef<'_>, BlankNode> {
self.canonicalize_interned_blank_nodes(algorithm)
.into_iter()
.map(|(from, to)| (from.decode_from(&self.interner), to))
.collect()
}
fn canonicalize_interned_blank_nodes(
&self,
algorithm: CanonicalizationAlgorithm,
) -> HashMap<InternedBlankNode, BlankNode> {
match algorithm {
CanonicalizationAlgorithm::Unstable => {
let bnodes = self.blank_nodes(); let bnodes = self.blank_nodes();
let quads_per_blank_node = self.quads_per_blank_nodes(); let quads_per_blank_node = self.quads_per_blank_nodes();
let (hash, partition) = self.hash_bnodes( let (hash, partition) = self.hash_bnodes(
bnodes.into_iter().map(|bnode| (bnode, 0)).collect(), bnodes.into_iter().map(|bnode| (bnode, 0)).collect(),
&quads_per_blank_node, &quads_per_blank_node,
); );
let new_quads = self.distinguish(&hash, &partition, &quads_per_blank_node); self.distinguish(hash, &partition, &quads_per_blank_node)
self.clear(); .into_iter()
for quad in new_quads { .map(|(from, to)| (from, BlankNode::new_from_unique_id(to.into())))
self.insert_encoded(quad); .collect()
}
} }
} }
@ -781,16 +808,11 @@ impl Dataset {
} }
fn distinguish( fn distinguish(
&mut self, &self,
hash: &HashMap<InternedBlankNode, u64>, hash: HashMap<InternedBlankNode, u64>,
partition: &[(u64, Vec<InternedBlankNode>)], partition: &[(u64, Vec<InternedBlankNode>)],
quads_per_blank_node: &QuadsPerBlankNode, quads_per_blank_node: &QuadsPerBlankNode,
) -> Vec<( ) -> HashMap<InternedBlankNode, u64> {
InternedSubject,
InternedNamedNode,
InternedTerm,
InternedGraphName,
)> {
let b_prime = partition.iter().map(|(_, b)| b).find(|b| b.len() > 1); let b_prime = partition.iter().map(|(_, b)| b).find(|b| b.len() > 1);
if let Some(b_prime) = b_prime { if let Some(b_prime) = b_prime {
b_prime b_prime
@ -800,19 +822,29 @@ impl Dataset {
hash_prime.insert(*b, Self::hash_tuple((hash_prime[b], 22))); hash_prime.insert(*b, Self::hash_tuple((hash_prime[b], 22)));
let (hash_prime_prime, partition_prime) = let (hash_prime_prime, partition_prime) =
self.hash_bnodes(hash_prime, quads_per_blank_node); self.hash_bnodes(hash_prime, quads_per_blank_node);
self.distinguish(&hash_prime_prime, &partition_prime, quads_per_blank_node) self.distinguish(hash_prime_prime, &partition_prime, quads_per_blank_node)
})
.reduce(|a, b| {
let mut a_hashes = a.values().collect::<Vec<_>>();
a_hashes.sort();
let mut b_hashes = b.values().collect::<Vec<_>>();
b_hashes.sort();
if a_hashes <= b_hashes {
a
} else {
b
}
}) })
.reduce(min)
.unwrap_or_default() .unwrap_or_default()
} else { } else {
self.label(hash) hash
} }
} }
#[allow(clippy::needless_collect)] #[allow(clippy::needless_collect)]
fn label( fn map_blank_nodes(
&mut self, &mut self,
hashes: &HashMap<InternedBlankNode, u64>, bnode_mapping: &HashMap<InternedBlankNode, BlankNode>,
) -> Vec<( ) -> Vec<(
InternedSubject, InternedSubject,
InternedNamedNode, InternedNamedNode,
@ -820,19 +852,22 @@ impl Dataset {
InternedGraphName, InternedGraphName,
)> { )> {
let old_quads: Vec<_> = self.spog.iter().cloned().collect(); let old_quads: Vec<_> = self.spog.iter().cloned().collect();
let mut quads: Vec<_> = old_quads old_quads
.into_iter() .into_iter()
.map(|(s, p, o, g)| { .map(|(s, p, o, g)| {
( (
match s { match s {
InternedSubject::NamedNode(_) => s, InternedSubject::NamedNode(_) => s,
InternedSubject::BlankNode(bnode) => { InternedSubject::BlankNode(bnode) => {
InternedSubject::BlankNode(self.map_bnode(bnode, hashes)) InternedSubject::BlankNode(InternedBlankNode::encoded_into(
bnode_mapping[&bnode].as_ref(),
&mut self.interner,
))
} }
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
InternedSubject::Triple(triple) => { InternedSubject::Triple(triple) => {
InternedSubject::Triple(Box::new(InternedTriple::encoded_into( InternedSubject::Triple(Box::new(InternedTriple::encoded_into(
self.label_triple(&triple, hashes).as_ref(), self.map_triple_blank_nodes(&triple, bnode_mapping).as_ref(),
&mut self.interner, &mut self.interner,
))) )))
} }
@ -841,12 +876,15 @@ impl Dataset {
match o { match o {
InternedTerm::NamedNode(_) | InternedTerm::Literal(_) => o, InternedTerm::NamedNode(_) | InternedTerm::Literal(_) => o,
InternedTerm::BlankNode(bnode) => { InternedTerm::BlankNode(bnode) => {
InternedTerm::BlankNode(self.map_bnode(bnode, hashes)) InternedTerm::BlankNode(InternedBlankNode::encoded_into(
bnode_mapping[&bnode].as_ref(),
&mut self.interner,
))
} }
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
InternedTerm::Triple(triple) => { InternedTerm::Triple(triple) => {
InternedTerm::Triple(Box::new(InternedTriple::encoded_into( InternedTerm::Triple(Box::new(InternedTriple::encoded_into(
self.label_triple(&triple, hashes).as_ref(), self.map_triple_blank_nodes(&triple, bnode_mapping).as_ref(),
&mut self.interner, &mut self.interner,
))) )))
} }
@ -854,58 +892,41 @@ impl Dataset {
match g { match g {
InternedGraphName::NamedNode(_) | InternedGraphName::DefaultGraph => g, InternedGraphName::NamedNode(_) | InternedGraphName::DefaultGraph => g,
InternedGraphName::BlankNode(bnode) => { InternedGraphName::BlankNode(bnode) => {
InternedGraphName::BlankNode(self.map_bnode(bnode, hashes)) InternedGraphName::BlankNode(InternedBlankNode::encoded_into(
bnode_mapping[&bnode].as_ref(),
&mut self.interner,
))
} }
}, },
) )
}) })
.collect(); .collect()
quads.sort_unstable();
quads
} }
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
fn label_triple( fn map_triple_blank_nodes(
&mut self, &mut self,
triple: &InternedTriple, triple: &InternedTriple,
hashes: &HashMap<InternedBlankNode, u64>, bnode_mapping: &HashMap<InternedBlankNode, BlankNode>,
) -> Triple { ) -> Triple {
Triple { Triple {
subject: if let InternedSubject::BlankNode(bnode) = &triple.subject { subject: if let InternedSubject::BlankNode(bnode) = &triple.subject {
Self::gen_bnode(*bnode, hashes).into() bnode_mapping[bnode].clone().into()
} else if let InternedSubject::Triple(t) = &triple.subject { } else if let InternedSubject::Triple(t) = &triple.subject {
self.label_triple(t, hashes).into() self.map_triple_blank_nodes(t, bnode_mapping).into()
} else { } else {
triple.subject.decode_from(&self.interner).into_owned() triple.subject.decode_from(&self.interner).into_owned()
}, },
predicate: triple.predicate.decode_from(&self.interner).into_owned(), predicate: triple.predicate.decode_from(&self.interner).into_owned(),
object: if let InternedTerm::BlankNode(bnode) = &triple.object { object: if let InternedTerm::BlankNode(bnode) = &triple.object {
Self::gen_bnode(*bnode, hashes).into() bnode_mapping[bnode].clone().into()
} else if let InternedTerm::Triple(t) = &triple.object { } else if let InternedTerm::Triple(t) = &triple.object {
self.label_triple(t, hashes).into() self.map_triple_blank_nodes(t, bnode_mapping).into()
} else { } else {
triple.object.decode_from(&self.interner).into_owned() triple.object.decode_from(&self.interner).into_owned()
}, },
} }
} }
fn map_bnode(
&mut self,
old_bnode: InternedBlankNode,
hashes: &HashMap<InternedBlankNode, u64>,
) -> InternedBlankNode {
InternedBlankNode::encoded_into(
Self::gen_bnode(old_bnode, hashes).as_ref(),
&mut self.interner,
)
}
fn gen_bnode(
old_bnode: InternedBlankNode,
hashes: &HashMap<InternedBlankNode, u64>,
) -> BlankNode {
BlankNode::new_from_unique_id(hashes[&old_bnode].into())
}
} }
impl PartialEq for Dataset { impl PartialEq for Dataset {
@ -1569,6 +1590,19 @@ type QuadsPerBlankNode = HashMap<
)>, )>,
>; >;
/// An algorithm used to canonicalize graphs and datasets.
///
/// See [`Graph::canonicalize`] and [`Dataset::canonicalize`].
#[derive(Default, Debug, Clone, Copy, Eq, PartialEq, Hash)]
#[non_exhaustive]
pub enum CanonicalizationAlgorithm {
/// The algorithm preferred by OxRDF.
///
/// <div class="warning">The canonicalization algorithm is not stable and canonical blank node ids might change between Oxigraph versions.</div>
#[default]
Unstable,
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@ -1588,7 +1622,7 @@ mod tests {
BlankNode::default().as_ref(), BlankNode::default().as_ref(),
GraphNameRef::DefaultGraph, GraphNameRef::DefaultGraph,
)); ));
dataset.canonicalize(); dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
let mut dataset2 = Dataset::new(); let mut dataset2 = Dataset::new();
dataset2.insert(QuadRef::new( dataset2.insert(QuadRef::new(
BlankNode::default().as_ref(), BlankNode::default().as_ref(),
@ -1602,7 +1636,7 @@ mod tests {
BlankNode::default().as_ref(), BlankNode::default().as_ref(),
GraphNameRef::DefaultGraph, GraphNameRef::DefaultGraph,
)); ));
dataset2.canonicalize(); dataset2.canonicalize(CanonicalizationAlgorithm::Unstable);
assert_eq!(dataset, dataset2); assert_eq!(dataset, dataset2);
} }
} }

@ -25,6 +25,7 @@
//! //!
//! See also [`Dataset`] if you want to get support of multiple RDF graphs at the same time. //! See also [`Dataset`] if you want to get support of multiple RDF graphs at the same time.
pub use crate::dataset::CanonicalizationAlgorithm;
use crate::dataset::*; use crate::dataset::*;
use crate::*; use crate::*;
use std::fmt; use std::fmt;
@ -188,6 +189,7 @@ impl Graph {
/// ///
/// Usage example ([Graph isomorphism](https://www.w3.org/TR/rdf11-concepts/#dfn-graph-isomorphism)): /// Usage example ([Graph isomorphism](https://www.w3.org/TR/rdf11-concepts/#dfn-graph-isomorphism)):
/// ``` /// ```
/// use oxrdf::graph::CanonicalizationAlgorithm;
/// use oxrdf::*; /// use oxrdf::*;
/// ///
/// let iri = NamedNodeRef::new("http://example.com")?; /// let iri = NamedNodeRef::new("http://example.com")?;
@ -203,20 +205,18 @@ impl Graph {
/// graph2.insert(TripleRef::new(&bnode2, iri, iri)); /// graph2.insert(TripleRef::new(&bnode2, iri, iri));
/// ///
/// assert_ne!(graph1, graph2); /// assert_ne!(graph1, graph2);
/// graph1.canonicalize(); /// graph1.canonicalize(CanonicalizationAlgorithm::Unstable);
/// graph2.canonicalize(); /// graph2.canonicalize(CanonicalizationAlgorithm::Unstable);
/// assert_eq!(graph1, graph2); /// assert_eq!(graph1, graph2);
/// # Result::<_,Box<dyn std::error::Error>>::Ok(()) /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ``` /// ```
/// ///
/// Warning 1: Blank node ids depends on the current shape of the graph. Adding a new triple might change the ids of a lot of blank nodes. /// <div class="warning">Blank node ids depend on the current shape of the graph. Adding a new triple might change the ids of a lot of blank nodes.
/// Hence, this canonization might not be suitable for diffs. /// Hence, this canonicalization might not be suitable for diffs.</div>
/// ///
/// Warning 2: The canonicalization algorithm is not stable and canonical blank node Ids might change between Oxigraph version. /// <div class="warning">This implementation's worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input graph.</div>
/// pub fn canonicalize(&mut self, algorithm: CanonicalizationAlgorithm) {
/// Warning 3: This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input graph. self.dataset.canonicalize(algorithm)
pub fn canonicalize(&mut self) {
self.dataset.canonicalize()
} }
} }

@ -4,6 +4,7 @@ use crate::manifest::Test;
use crate::report::{dataset_diff, format_diff}; use crate::report::{dataset_diff, format_diff};
use anyhow::{bail, ensure, Context, Result}; use anyhow::{bail, ensure, Context, Result};
use oxigraph::io::RdfFormat; use oxigraph::io::RdfFormat;
use oxigraph::model::graph::CanonicalizationAlgorithm;
use oxigraph::model::{BlankNode, Dataset, Quad}; use oxigraph::model::{BlankNode, Dataset, Quad};
use oxttl::n3::{N3Quad, N3Term}; use oxttl::n3::{N3Quad, N3Term};
@ -138,11 +139,11 @@ fn evaluate_eval_test(test: &Test, format: RdfFormat, ignore_errors: bool) -> Re
let action = test.action.as_deref().context("No action found")?; let action = test.action.as_deref().context("No action found")?;
let mut actual_dataset = load_dataset(action, format, ignore_errors) let mut actual_dataset = load_dataset(action, format, ignore_errors)
.with_context(|| format!("Parse error on file {action}"))?; .with_context(|| format!("Parse error on file {action}"))?;
actual_dataset.canonicalize(); actual_dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
let results = test.result.as_ref().context("No tests result found")?; let results = test.result.as_ref().context("No tests result found")?;
let mut expected_dataset = load_dataset(results, guess_rdf_format(results)?, false) let mut expected_dataset = load_dataset(results, guess_rdf_format(results)?, false)
.with_context(|| format!("Parse error on file {results}"))?; .with_context(|| format!("Parse error on file {results}"))?;
expected_dataset.canonicalize(); expected_dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
ensure!( ensure!(
expected_dataset == actual_dataset, expected_dataset == actual_dataset,
"The two files are not isomorphic. Diff:\n{}", "The two files are not isomorphic. Diff:\n{}",
@ -156,12 +157,12 @@ fn evaluate_n3_eval_test(test: &Test, ignore_errors: bool) -> Result<()> {
let mut actual_dataset = n3_to_dataset( let mut actual_dataset = n3_to_dataset(
load_n3(action, ignore_errors).with_context(|| format!("Parse error on file {action}"))?, load_n3(action, ignore_errors).with_context(|| format!("Parse error on file {action}"))?,
); );
actual_dataset.canonicalize(); actual_dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
let results = test.result.as_ref().context("No tests result found")?; let results = test.result.as_ref().context("No tests result found")?;
let mut expected_dataset = n3_to_dataset( let mut expected_dataset = n3_to_dataset(
load_n3(results, false).with_context(|| format!("Parse error on file {results}"))?, load_n3(results, false).with_context(|| format!("Parse error on file {results}"))?,
); );
expected_dataset.canonicalize(); expected_dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
ensure!( ensure!(
expected_dataset == actual_dataset, expected_dataset == actual_dataset,
"The two files are not isomorphic. Diff:\n{}", "The two files are not isomorphic. Diff:\n{}",

@ -5,6 +5,7 @@ use crate::report::{dataset_diff, format_diff};
use crate::vocab::*; use crate::vocab::*;
use anyhow::{bail, ensure, Context, Error, Result}; use anyhow::{bail, ensure, Context, Error, Result};
use oxigraph::io::RdfParser; use oxigraph::io::RdfParser;
use oxigraph::model::dataset::CanonicalizationAlgorithm;
use oxigraph::model::vocab::*; use oxigraph::model::vocab::*;
use oxigraph::model::*; use oxigraph::model::*;
use oxigraph::sparql::results::QueryResultsFormat; use oxigraph::sparql::results::QueryResultsFormat;
@ -235,9 +236,9 @@ fn evaluate_update_evaluation_test(test: &Test) -> Result<()> {
store.update(update).context("Failure to execute update")?; store.update(update).context("Failure to execute update")?;
let mut store_dataset: Dataset = store.iter().collect::<Result<_, _>>()?; let mut store_dataset: Dataset = store.iter().collect::<Result<_, _>>()?;
store_dataset.canonicalize(); store_dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
let mut result_store_dataset: Dataset = result_store.iter().collect::<Result<_, _>>()?; let mut result_store_dataset: Dataset = result_store.iter().collect::<Result<_, _>>()?;
result_store_dataset.canonicalize(); result_store_dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
ensure!( ensure!(
store_dataset == result_store_dataset, store_dataset == result_store_dataset,
"Not isomorphic result dataset.\nDiff:\n{}\nParsed update:\n{}\n", "Not isomorphic result dataset.\nDiff:\n{}\nParsed update:\n{}\n",
@ -533,7 +534,7 @@ impl StaticQueryResults {
}) })
} }
} else { } else {
graph.canonicalize(); graph.canonicalize(CanonicalizationAlgorithm::Unstable);
Ok(Self::Graph(graph)) Ok(Self::Graph(graph))
} }
} }

Loading…
Cancel
Save