From 0ac70e73dcc484a29b82f7174b20820d4e5cc580 Mon Sep 17 00:00:00 2001 From: Tpt Date: Sun, 3 Mar 2024 21:35:18 +0100 Subject: [PATCH] Adds an enum for CanonicalizationAlgorithm Enables implementing other algorithms like RDFC-1.0 --- fuzz/fuzz_targets/trig.rs | 7 +- lib/oxrdf/src/dataset.rs | 162 ++++++++++++++++++------------ lib/oxrdf/src/graph.rs | 18 ++-- testsuite/src/parser_evaluator.rs | 9 +- testsuite/src/sparql_evaluator.rs | 7 +- 5 files changed, 120 insertions(+), 83 deletions(-) diff --git a/fuzz/fuzz_targets/trig.rs b/fuzz/fuzz_targets/trig.rs index cd17488a..d5bac93f 100644 --- a/fuzz/fuzz_targets/trig.rs +++ b/fuzz/fuzz_targets/trig.rs @@ -1,6 +1,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; +use oxrdf::graph::CanonicalizationAlgorithm; use oxrdf::{Dataset, GraphName, Quad, Subject, Term, Triple}; use oxttl::{TriGParser, TriGSerializer}; @@ -120,8 +121,8 @@ fuzz_target!(|data: &[u8]| { } else if bnodes_count <= 4 { let mut dataset_with_split = quads.iter().collect::(); let mut dataset_without_split = quads_without_split.iter().collect::(); - dataset_with_split.canonicalize(); - dataset_without_split.canonicalize(); + dataset_with_split.canonicalize(CanonicalizationAlgorithm::Unstable); + dataset_without_split.canonicalize(CanonicalizationAlgorithm::Unstable); assert_eq!( dataset_with_split, dataset_without_split, @@ -131,7 +132,7 @@ fuzz_target!(|data: &[u8]| { ); if errors.is_empty() { let mut dataset_unchecked = quads_unchecked.iter().collect::(); - dataset_unchecked.canonicalize(); + dataset_unchecked.canonicalize(CanonicalizationAlgorithm::Unstable); assert_eq!( dataset_with_split, dataset_unchecked, diff --git a/lib/oxrdf/src/dataset.rs b/lib/oxrdf/src/dataset.rs index ed6249a4..3e6592a3 100644 --- a/lib/oxrdf/src/dataset.rs +++ b/lib/oxrdf/src/dataset.rs @@ -31,7 +31,6 @@ use crate::interning::*; use crate::*; -use std::cmp::min; use std::collections::hash_map::DefaultHasher; use std::collections::{BTreeSet, HashMap, HashSet}; use std::fmt; 
@@ -510,6 +509,7 @@ impl Dataset { /// /// Usage example ([Dataset isomorphism](https://www.w3.org/TR/rdf11-concepts/#dfn-dataset-isomorphism)): /// ``` + /// use oxrdf::dataset::CanonicalizationAlgorithm; /// use oxrdf::*; /// /// let iri = NamedNodeRef::new("http://example.com")?; @@ -527,32 +527,59 @@ impl Dataset { /// graph2.insert(QuadRef::new(&bnode2, iri, iri, &g2)); /// /// assert_ne!(graph1, graph2); - /// graph1.canonicalize(); - /// graph2.canonicalize(); + /// graph1.canonicalize(CanonicalizationAlgorithm::Unstable); + /// graph2.canonicalize(CanonicalizationAlgorithm::Unstable); /// assert_eq!(graph1, graph2); /// # Result::<_,Box>::Ok(()) /// ``` /// - /// Warning 1: Blank node ids depends on the current shape of the graph. Adding a new quad might change the ids of a lot of blank nodes. - /// Hence, this canonization might not be suitable for diffs. + ///
<div class="warning">Blank node ids depend on the current shape of the graph. Adding a new quad might change the ids of a lot of blank nodes. + /// Hence, this canonicalization might not be suitable for diffs.</div>
/// - /// Warning 2: The canonicalization algorithm is not stable and canonical blank node ids might change between Oxigraph version. - /// - /// Warning 3: This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset. - pub fn canonicalize(&mut self) { - let bnodes = self.blank_nodes(); - let quads_per_blank_node = self.quads_per_blank_nodes(); - let (hash, partition) = self.hash_bnodes( - bnodes.into_iter().map(|bnode| (bnode, 0)).collect(), - &quads_per_blank_node, - ); - let new_quads = self.distinguish(&hash, &partition, &quads_per_blank_node); + ///
<div class="warning">This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset.</div>
+ pub fn canonicalize(&mut self, algorithm: CanonicalizationAlgorithm) { + let bnode_mapping = self.canonicalize_interned_blank_nodes(algorithm); + let new_quads = self.map_blank_nodes(&bnode_mapping); self.clear(); for quad in new_quads { self.insert_encoded(quad); } } + /// Returns a map between the current dataset blank node and the canonicalized blank node + /// to create a canonical dataset. + /// + /// See also [`canonicalize`](Self::canonicalize). + pub fn canonicalize_blank_nodes( + &self, + algorithm: CanonicalizationAlgorithm, + ) -> HashMap, BlankNode> { + self.canonicalize_interned_blank_nodes(algorithm) + .into_iter() + .map(|(from, to)| (from.decode_from(&self.interner), to)) + .collect() + } + + fn canonicalize_interned_blank_nodes( + &self, + algorithm: CanonicalizationAlgorithm, + ) -> HashMap { + match algorithm { + CanonicalizationAlgorithm::Unstable => { + let bnodes = self.blank_nodes(); + let quads_per_blank_node = self.quads_per_blank_nodes(); + let (hash, partition) = self.hash_bnodes( + bnodes.into_iter().map(|bnode| (bnode, 0)).collect(), + &quads_per_blank_node, + ); + self.distinguish(hash, &partition, &quads_per_blank_node) + .into_iter() + .map(|(from, to)| (from, BlankNode::new_from_unique_id(to.into()))) + .collect() + } + } + } + fn blank_nodes(&self) -> HashSet { let mut bnodes = HashSet::new(); for (g, s, _, o) in &self.gspo { @@ -781,16 +808,11 @@ impl Dataset { } fn distinguish( - &mut self, - hash: &HashMap, + &self, + hash: HashMap, partition: &[(u64, Vec)], quads_per_blank_node: &QuadsPerBlankNode, - ) -> Vec<( - InternedSubject, - InternedNamedNode, - InternedTerm, - InternedGraphName, - )> { + ) -> HashMap { let b_prime = partition.iter().map(|(_, b)| b).find(|b| b.len() > 1); if let Some(b_prime) = b_prime { b_prime @@ -800,19 +822,29 @@ impl Dataset { hash_prime.insert(*b, Self::hash_tuple((hash_prime[b], 22))); let (hash_prime_prime, partition_prime) = self.hash_bnodes(hash_prime, quads_per_blank_node); - 
self.distinguish(&hash_prime_prime, &partition_prime, quads_per_blank_node) + self.distinguish(hash_prime_prime, &partition_prime, quads_per_blank_node) + }) + .reduce(|a, b| { + let mut a_hashes = a.values().collect::<Vec<_>>(); + a_hashes.sort(); + let mut b_hashes = b.values().collect::<Vec<_>>(); + b_hashes.sort(); + if a_hashes <= b_hashes { + a + } else { + b + } }) - .reduce(min) .unwrap_or_default() } else { - self.label(hash) + hash } } #[allow(clippy::needless_collect)] - fn label( + fn map_blank_nodes( &mut self, - hashes: &HashMap<InternedBlankNode, u64>, + bnode_mapping: &HashMap<InternedBlankNode, BlankNode>, ) -> Vec<( InternedSubject, InternedNamedNode, InternedTerm, InternedGraphName, )> { let old_quads: Vec<_> = self.spog.iter().cloned().collect(); - let mut quads: Vec<_> = old_quads + old_quads .into_iter() .map(|(s, p, o, g)| { ( match s { InternedSubject::NamedNode(_) => s, InternedSubject::BlankNode(bnode) => { - InternedSubject::BlankNode(self.map_bnode(bnode, hashes)) + InternedSubject::BlankNode(InternedBlankNode::encoded_into( + bnode_mapping[&bnode].as_ref(), + &mut self.interner, + )) } #[cfg(feature = "rdf-star")] InternedSubject::Triple(triple) => { InternedSubject::Triple(Box::new(InternedTriple::encoded_into( - self.label_triple(&triple, hashes).as_ref(), + self.map_triple_blank_nodes(&triple, bnode_mapping).as_ref(), &mut self.interner, ))) } }, p, match o { InternedTerm::NamedNode(_) | InternedTerm::Literal(_) => o, InternedTerm::BlankNode(bnode) => { - InternedTerm::BlankNode(self.map_bnode(bnode, hashes)) + InternedTerm::BlankNode(InternedBlankNode::encoded_into( + bnode_mapping[&bnode].as_ref(), + &mut self.interner, + )) } #[cfg(feature = "rdf-star")] InternedTerm::Triple(triple) => { InternedTerm::Triple(Box::new(InternedTriple::encoded_into( - self.label_triple(&triple, hashes).as_ref(), + self.map_triple_blank_nodes(&triple, bnode_mapping).as_ref(), &mut self.interner, ))) } }, match g {
InternedGraphName::NamedNode(_) | InternedGraphName::DefaultGraph => g, InternedGraphName::BlankNode(bnode) => { - InternedGraphName::BlankNode(self.map_bnode(bnode, hashes)) + InternedGraphName::BlankNode(InternedBlankNode::encoded_into( + bnode_mapping[&bnode].as_ref(), + &mut self.interner, + )) } }, ) }) - .collect(); - quads.sort_unstable(); - quads + .collect() } #[cfg(feature = "rdf-star")] - fn label_triple( + fn map_triple_blank_nodes( &mut self, triple: &InternedTriple, - hashes: &HashMap, + bnode_mapping: &HashMap, ) -> Triple { Triple { subject: if let InternedSubject::BlankNode(bnode) = &triple.subject { - Self::gen_bnode(*bnode, hashes).into() + bnode_mapping[bnode].clone().into() } else if let InternedSubject::Triple(t) = &triple.subject { - self.label_triple(t, hashes).into() + self.map_triple_blank_nodes(t, bnode_mapping).into() } else { triple.subject.decode_from(&self.interner).into_owned() }, predicate: triple.predicate.decode_from(&self.interner).into_owned(), object: if let InternedTerm::BlankNode(bnode) = &triple.object { - Self::gen_bnode(*bnode, hashes).into() + bnode_mapping[bnode].clone().into() } else if let InternedTerm::Triple(t) = &triple.object { - self.label_triple(t, hashes).into() + self.map_triple_blank_nodes(t, bnode_mapping).into() } else { triple.object.decode_from(&self.interner).into_owned() }, } } - - fn map_bnode( - &mut self, - old_bnode: InternedBlankNode, - hashes: &HashMap, - ) -> InternedBlankNode { - InternedBlankNode::encoded_into( - Self::gen_bnode(old_bnode, hashes).as_ref(), - &mut self.interner, - ) - } - - fn gen_bnode( - old_bnode: InternedBlankNode, - hashes: &HashMap, - ) -> BlankNode { - BlankNode::new_from_unique_id(hashes[&old_bnode].into()) - } } impl PartialEq for Dataset { @@ -1569,6 +1590,19 @@ type QuadsPerBlankNode = HashMap< )>, >; +/// An algorithm used to canonicalize graph and datasets. +/// +/// See [`Graph::canonicalize`] and [`Dataset::canonicalize`]. 
+#[derive(Default, Debug, Clone, Copy, Eq, PartialEq, Hash)] +#[non_exhaustive] +pub enum CanonicalizationAlgorithm { + /// The algorithm preferred by OxRDF. + /// + ///
<div class="warning">The canonicalization algorithm is not stable and canonical blank node ids might change between Oxigraph versions.</div>
+ #[default] + Unstable, +} + #[cfg(test)] mod tests { use super::*; @@ -1588,7 +1622,7 @@ mod tests { BlankNode::default().as_ref(), GraphNameRef::DefaultGraph, )); - dataset.canonicalize(); + dataset.canonicalize(CanonicalizationAlgorithm::Unstable); let mut dataset2 = Dataset::new(); dataset2.insert(QuadRef::new( BlankNode::default().as_ref(), @@ -1602,7 +1636,7 @@ mod tests { BlankNode::default().as_ref(), GraphNameRef::DefaultGraph, )); - dataset2.canonicalize(); + dataset2.canonicalize(CanonicalizationAlgorithm::Unstable); assert_eq!(dataset, dataset2); } } diff --git a/lib/oxrdf/src/graph.rs b/lib/oxrdf/src/graph.rs index 5459b65c..b6002ea2 100644 --- a/lib/oxrdf/src/graph.rs +++ b/lib/oxrdf/src/graph.rs @@ -25,6 +25,7 @@ //! //! See also [`Dataset`] if you want to get support of multiple RDF graphs at the same time. +pub use crate::dataset::CanonicalizationAlgorithm; use crate::dataset::*; use crate::*; use std::fmt; @@ -188,6 +189,7 @@ impl Graph { /// /// Usage example ([Graph isomorphism](https://www.w3.org/TR/rdf11-concepts/#dfn-graph-isomorphism)): /// ``` + /// use oxrdf::graph::CanonicalizationAlgorithm; /// use oxrdf::*; /// /// let iri = NamedNodeRef::new("http://example.com")?; @@ -203,20 +205,18 @@ impl Graph { /// graph2.insert(TripleRef::new(&bnode2, iri, iri)); /// /// assert_ne!(graph1, graph2); - /// graph1.canonicalize(); - /// graph2.canonicalize(); + /// graph1.canonicalize(CanonicalizationAlgorithm::Unstable); + /// graph2.canonicalize(CanonicalizationAlgorithm::Unstable); /// assert_eq!(graph1, graph2); /// # Result::<_,Box>::Ok(()) /// ``` /// - /// Warning 1: Blank node ids depends on the current shape of the graph. Adding a new triple might change the ids of a lot of blank nodes. - /// Hence, this canonization might not be suitable for diffs. + ///
<div class="warning">Blank node ids depend on the current shape of the graph. Adding a new triple might change the ids of a lot of blank nodes. + /// Hence, this canonicalization might not be suitable for diffs.</div>
/// - /// Warning 2: The canonicalization algorithm is not stable and canonical blank node Ids might change between Oxigraph version. - /// - /// Warning 3: This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input graph. - pub fn canonicalize(&mut self) { - self.dataset.canonicalize() + ///
<div class="warning">This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input graph.</div>
+ pub fn canonicalize(&mut self, algorithm: CanonicalizationAlgorithm) { + self.dataset.canonicalize(algorithm) } } diff --git a/testsuite/src/parser_evaluator.rs b/testsuite/src/parser_evaluator.rs index 830607b0..98d20815 100644 --- a/testsuite/src/parser_evaluator.rs +++ b/testsuite/src/parser_evaluator.rs @@ -4,6 +4,7 @@ use crate::manifest::Test; use crate::report::{dataset_diff, format_diff}; use anyhow::{bail, ensure, Context, Result}; use oxigraph::io::RdfFormat; +use oxigraph::model::graph::CanonicalizationAlgorithm; use oxigraph::model::{BlankNode, Dataset, Quad}; use oxttl::n3::{N3Quad, N3Term}; @@ -138,11 +139,11 @@ fn evaluate_eval_test(test: &Test, format: RdfFormat, ignore_errors: bool) -> Re let action = test.action.as_deref().context("No action found")?; let mut actual_dataset = load_dataset(action, format, ignore_errors) .with_context(|| format!("Parse error on file {action}"))?; - actual_dataset.canonicalize(); + actual_dataset.canonicalize(CanonicalizationAlgorithm::Unstable); let results = test.result.as_ref().context("No tests result found")?; let mut expected_dataset = load_dataset(results, guess_rdf_format(results)?, false) .with_context(|| format!("Parse error on file {results}"))?; - expected_dataset.canonicalize(); + expected_dataset.canonicalize(CanonicalizationAlgorithm::Unstable); ensure!( expected_dataset == actual_dataset, "The two files are not isomorphic. 
Diff:\n{}", @@ -156,12 +157,12 @@ fn evaluate_n3_eval_test(test: &Test, ignore_errors: bool) -> Result<()> { let mut actual_dataset = n3_to_dataset( load_n3(action, ignore_errors).with_context(|| format!("Parse error on file {action}"))?, ); - actual_dataset.canonicalize(); + actual_dataset.canonicalize(CanonicalizationAlgorithm::Unstable); let results = test.result.as_ref().context("No tests result found")?; let mut expected_dataset = n3_to_dataset( load_n3(results, false).with_context(|| format!("Parse error on file {results}"))?, ); - expected_dataset.canonicalize(); + expected_dataset.canonicalize(CanonicalizationAlgorithm::Unstable); ensure!( expected_dataset == actual_dataset, "The two files are not isomorphic. Diff:\n{}", diff --git a/testsuite/src/sparql_evaluator.rs b/testsuite/src/sparql_evaluator.rs index b68c4c2b..8f6c3729 100644 --- a/testsuite/src/sparql_evaluator.rs +++ b/testsuite/src/sparql_evaluator.rs @@ -5,6 +5,7 @@ use crate::report::{dataset_diff, format_diff}; use crate::vocab::*; use anyhow::{bail, ensure, Context, Error, Result}; use oxigraph::io::RdfParser; +use oxigraph::model::dataset::CanonicalizationAlgorithm; use oxigraph::model::vocab::*; use oxigraph::model::*; use oxigraph::sparql::results::QueryResultsFormat; @@ -235,9 +236,9 @@ fn evaluate_update_evaluation_test(test: &Test) -> Result<()> { store.update(update).context("Failure to execute update")?; let mut store_dataset: Dataset = store.iter().collect::>()?; - store_dataset.canonicalize(); + store_dataset.canonicalize(CanonicalizationAlgorithm::Unstable); let mut result_store_dataset: Dataset = result_store.iter().collect::>()?; - result_store_dataset.canonicalize(); + result_store_dataset.canonicalize(CanonicalizationAlgorithm::Unstable); ensure!( store_dataset == result_store_dataset, "Not isomorphic result dataset.\nDiff:\n{}\nParsed update:\n{}\n", @@ -533,7 +534,7 @@ impl StaticQueryResults { }) } } else { - graph.canonicalize(); + 
graph.canonicalize(CanonicalizationAlgorithm::Unstable); Ok(Self::Graph(graph)) } }