@@ -31,7 +31,6 @@
 use crate::interning::*;
 use crate::*;
-use std::cmp::min;
 use std::collections::hash_map::DefaultHasher;
 use std::collections::{BTreeSet, HashMap, HashSet};
 use std::fmt;
@@ -510,6 +509,7 @@ impl Dataset {
     ///
     /// Usage example ([Dataset isomorphism](https://www.w3.org/TR/rdf11-concepts/#dfn-dataset-isomorphism)):
     /// ```
+    /// use oxrdf::dataset::CanonicalizationAlgorithm;
     /// use oxrdf::*;
     ///
     /// let iri = NamedNodeRef::new("http://example.com")?;
@@ -527,32 +527,59 @@ impl Dataset {
     /// graph2.insert(QuadRef::new(&bnode2, iri, iri, &g2));
     ///
     /// assert_ne!(graph1, graph2);
-    /// graph1.canonicalize();
-    /// graph2.canonicalize();
+    /// graph1.canonicalize(CanonicalizationAlgorithm::Unstable);
+    /// graph2.canonicalize(CanonicalizationAlgorithm::Unstable);
     /// assert_eq!(graph1, graph2);
     /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
     /// ```
     ///
-    /// Warning 1: Blank node ids depends on the current shape of the graph. Adding a new quad might change the ids of a lot of blank nodes.
-    /// Hence, this canonization might not be suitable for diffs.
/// <div class="warning">Blank node ids depends on the current shape of the graph. Adding a new quad might change the ids of a lot of blank nodes.
|
|
|
|
|
/// Hence, this canonization might not be suitable for diffs.</div>
|
|
|
|
|
///
|
|
|
|
|
/// Warning 2: The canonicalization algorithm is not stable and canonical blank node ids might change between Oxigraph version.
|
|
|
|
|
///
|
|
|
|
|
/// Warning 3: This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset.
|
|
|
|
|
-    pub fn canonicalize(&mut self) {
-        let bnodes = self.blank_nodes();
-        let quads_per_blank_node = self.quads_per_blank_nodes();
-        let (hash, partition) = self.hash_bnodes(
-            bnodes.into_iter().map(|bnode| (bnode, 0)).collect(),
-            &quads_per_blank_node,
-        );
-        let new_quads = self.distinguish(&hash, &partition, &quads_per_blank_node);
/// <div class="warning">This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset.</div>
|
|
|
|
|
+    pub fn canonicalize(&mut self, algorithm: CanonicalizationAlgorithm) {
+        let bnode_mapping = self.canonicalize_interned_blank_nodes(algorithm);
+        let new_quads = self.map_blank_nodes(&bnode_mapping);
         self.clear();
         for quad in new_quads {
             self.insert_encoded(quad);
         }
     }
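    // The rewritten body delegates the work in two steps: canonicalize_interned_blank_nodes()
    // computes the blank node -> canonical blank node mapping for the requested algorithm, and
    // map_blank_nodes() rewrites every quad with it before the quads are re-inserted so that
    // all indexes are rebuilt with the new blank node ids.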
 
+    /// Returns a map from each blank node of the current dataset to the canonical blank node
+    /// it would be renamed to when building a canonical dataset.
+    ///
+    /// See also [`canonicalize`](Self::canonicalize).
+    pub fn canonicalize_blank_nodes(
+        &self,
+        algorithm: CanonicalizationAlgorithm,
+    ) -> HashMap<BlankNodeRef<'_>, BlankNode> {
+        self.canonicalize_interned_blank_nodes(algorithm)
+            .into_iter()
+            .map(|(from, to)| (from.decode_from(&self.interner), to))
+            .collect()
+    }
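    // Possible usage of the new canonicalize_blank_nodes(); a sketch only, the IRI and variable
    // names are illustrative and not taken from this change. Since it borrows `&self`, the
    // dataset itself is left untouched:
    let mut dataset = Dataset::new();
    let bnode = BlankNode::default();
    let iri = NamedNodeRef::new("http://example.com").unwrap();
    dataset.insert(QuadRef::new(&bnode, iri, iri, GraphNameRef::DefaultGraph));
    for (from, to) in dataset.canonicalize_blank_nodes(CanonicalizationAlgorithm::Unstable) {
        println!("{from} would be canonically labelled {to}");
    }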
+
+    fn canonicalize_interned_blank_nodes(
+        &self,
+        algorithm: CanonicalizationAlgorithm,
+    ) -> HashMap<InternedBlankNode, BlankNode> {
+        match algorithm {
+            CanonicalizationAlgorithm::Unstable => {
+                let bnodes = self.blank_nodes();
+                let quads_per_blank_node = self.quads_per_blank_nodes();
+                let (hash, partition) = self.hash_bnodes(
+                    bnodes.into_iter().map(|bnode| (bnode, 0)).collect(),
+                    &quads_per_blank_node,
+                );
+                self.distinguish(hash, &partition, &quads_per_blank_node)
+                    .into_iter()
+                    .map(|(from, to)| (from, BlankNode::new_from_unique_id(to.into())))
+                    .collect()
+            }
+        }
+    }
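    // The Unstable branch reuses the existing machinery: hash_bnodes() iteratively refines a
    // hash-based partition of the blank nodes, and distinguish() recursively tries every member
    // of each partition class that still contains more than one blank node, which is where the
    // *O(b!)* worst case mentioned in the doc comment comes from.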
+
     fn blank_nodes(&self) -> HashSet<InternedBlankNode> {
         let mut bnodes = HashSet::new();
         for (g, s, _, o) in &self.gspo {
@@ -781,16 +808,11 @@ impl Dataset {
     }
 
     fn distinguish(
-        &mut self,
-        hash: &HashMap<InternedBlankNode, u64>,
+        &self,
+        hash: HashMap<InternedBlankNode, u64>,
         partition: &[(u64, Vec<InternedBlankNode>)],
         quads_per_blank_node: &QuadsPerBlankNode,
-    ) -> Vec<(
-        InternedSubject,
-        InternedNamedNode,
-        InternedTerm,
-        InternedGraphName,
-    )> {
+    ) -> HashMap<InternedBlankNode, u64> {
         let b_prime = partition.iter().map(|(_, b)| b).find(|b| b.len() > 1);
         if let Some(b_prime) = b_prime {
             b_prime
@@ -800,19 +822,29 @@ impl Dataset {
                     hash_prime.insert(*b, Self::hash_tuple((hash_prime[b], 22)));
                     let (hash_prime_prime, partition_prime) =
                         self.hash_bnodes(hash_prime, quads_per_blank_node);
-                    self.distinguish(&hash_prime_prime, &partition_prime, quads_per_blank_node)
+                    self.distinguish(hash_prime_prime, &partition_prime, quads_per_blank_node)
                 })
+                .reduce(|a, b| {
+                    let mut a_hashes = a.values().collect::<Vec<_>>();
+                    a_hashes.sort();
+                    let mut b_hashes = b.values().collect::<Vec<_>>();
+                    b_hashes.sort();
+                    if a_hashes <= b_hashes {
+                        a
+                    } else {
+                        b
+                    }
+                })
-                .reduce(min)
                 .unwrap_or_default()
         } else {
-            self.label(hash)
+            hash
         }
     }
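    // Why `.reduce(min)` disappears: distinguish() used to return sorted quad lists, which could
    // be compared directly, so min() picked a canonical candidate. It now returns a HashMap of
    // blank node hashes, which has no intrinsic order, so the closure above compares the sorted
    // hash values of both candidates and keeps the smaller one to stay deterministic.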
 
     #[allow(clippy::needless_collect)]
-    fn label(
+    fn map_blank_nodes(
         &mut self,
-        hashes: &HashMap<InternedBlankNode, u64>,
+        bnode_mapping: &HashMap<InternedBlankNode, BlankNode>,
     ) -> Vec<(
         InternedSubject,
         InternedNamedNode,
@@ -820,19 +852,22 @@ impl Dataset {
         InternedGraphName,
     )> {
         let old_quads: Vec<_> = self.spog.iter().cloned().collect();
-        let mut quads: Vec<_> = old_quads
+        old_quads
             .into_iter()
             .map(|(s, p, o, g)| {
                 (
                     match s {
                         InternedSubject::NamedNode(_) => s,
                         InternedSubject::BlankNode(bnode) => {
-                            InternedSubject::BlankNode(self.map_bnode(bnode, hashes))
+                            InternedSubject::BlankNode(InternedBlankNode::encoded_into(
+                                bnode_mapping[&bnode].as_ref(),
+                                &mut self.interner,
+                            ))
                         }
                         #[cfg(feature = "rdf-star")]
                         InternedSubject::Triple(triple) => {
                             InternedSubject::Triple(Box::new(InternedTriple::encoded_into(
-                                self.label_triple(&triple, hashes).as_ref(),
+                                self.map_triple_blank_nodes(&triple, bnode_mapping).as_ref(),
                                 &mut self.interner,
                             )))
                         }
@@ -841,12 +876,15 @@ impl Dataset {
                     match o {
                         InternedTerm::NamedNode(_) | InternedTerm::Literal(_) => o,
                         InternedTerm::BlankNode(bnode) => {
-                            InternedTerm::BlankNode(self.map_bnode(bnode, hashes))
+                            InternedTerm::BlankNode(InternedBlankNode::encoded_into(
+                                bnode_mapping[&bnode].as_ref(),
+                                &mut self.interner,
+                            ))
                         }
                         #[cfg(feature = "rdf-star")]
                         InternedTerm::Triple(triple) => {
                             InternedTerm::Triple(Box::new(InternedTriple::encoded_into(
-                                self.label_triple(&triple, hashes).as_ref(),
+                                self.map_triple_blank_nodes(&triple, bnode_mapping).as_ref(),
                                 &mut self.interner,
                             )))
                         }
@@ -854,58 +892,41 @@ impl Dataset {
                     match g {
                         InternedGraphName::NamedNode(_) | InternedGraphName::DefaultGraph => g,
                         InternedGraphName::BlankNode(bnode) => {
-                            InternedGraphName::BlankNode(self.map_bnode(bnode, hashes))
+                            InternedGraphName::BlankNode(InternedBlankNode::encoded_into(
+                                bnode_mapping[&bnode].as_ref(),
+                                &mut self.interner,
+                            ))
                         }
                     },
                 )
             })
-            .collect();
-        quads.sort_unstable();
-        quads
+            .collect()
     }
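    // The old label() sorted the rewritten quads before returning them, presumably so that
    // `.reduce(min)` in distinguish() compared candidates consistently; map_blank_nodes() can
    // skip the sort because candidates are now compared on their hashes and the quads are
    // re-indexed anyway when canonicalize() inserts them back.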
 
     #[cfg(feature = "rdf-star")]
-    fn label_triple(
+    fn map_triple_blank_nodes(
         &mut self,
         triple: &InternedTriple,
-        hashes: &HashMap<InternedBlankNode, u64>,
+        bnode_mapping: &HashMap<InternedBlankNode, BlankNode>,
     ) -> Triple {
         Triple {
             subject: if let InternedSubject::BlankNode(bnode) = &triple.subject {
-                Self::gen_bnode(*bnode, hashes).into()
+                bnode_mapping[bnode].clone().into()
             } else if let InternedSubject::Triple(t) = &triple.subject {
-                self.label_triple(t, hashes).into()
+                self.map_triple_blank_nodes(t, bnode_mapping).into()
             } else {
                 triple.subject.decode_from(&self.interner).into_owned()
             },
             predicate: triple.predicate.decode_from(&self.interner).into_owned(),
             object: if let InternedTerm::BlankNode(bnode) = &triple.object {
-                Self::gen_bnode(*bnode, hashes).into()
+                bnode_mapping[bnode].clone().into()
             } else if let InternedTerm::Triple(t) = &triple.object {
-                self.label_triple(t, hashes).into()
+                self.map_triple_blank_nodes(t, bnode_mapping).into()
             } else {
                 triple.object.decode_from(&self.interner).into_owned()
             },
         }
     }
-
-    fn map_bnode(
-        &mut self,
-        old_bnode: InternedBlankNode,
-        hashes: &HashMap<InternedBlankNode, u64>,
-    ) -> InternedBlankNode {
-        InternedBlankNode::encoded_into(
-            Self::gen_bnode(old_bnode, hashes).as_ref(),
-            &mut self.interner,
-        )
-    }
-
-    fn gen_bnode(
-        old_bnode: InternedBlankNode,
-        hashes: &HashMap<InternedBlankNode, u64>,
-    ) -> BlankNode {
-        BlankNode::new_from_unique_id(hashes[&old_bnode].into())
-    }
 }
 
 impl PartialEq for Dataset {
@@ -1569,6 +1590,19 @@ type QuadsPerBlankNode = HashMap<
     )>,
 >;
 
+/// An algorithm used to canonicalize graphs and datasets.
+///
+/// See [`Graph::canonicalize`] and [`Dataset::canonicalize`].
+#[derive(Default, Debug, Clone, Copy, Eq, PartialEq, Hash)]
+#[non_exhaustive]
+pub enum CanonicalizationAlgorithm {
+    /// The algorithm preferred by OxRDF.
+    ///
+    /// <div class="warning">The canonicalization algorithm is not stable and canonical blank node ids might change between Oxigraph versions.</div>
+    #[default]
+    Unstable,
+}
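// Since the enum derives Default and Unstable is #[default], the two calls below are equivalent
// today, and #[non_exhaustive] lets future algorithms be added without breaking callers.
// Sketch only; `dataset` is an assumed `Dataset` value, not something defined in this change:
dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
dataset.canonicalize(CanonicalizationAlgorithm::default());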
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -1588,7 +1622,7 @@ mod tests {
             BlankNode::default().as_ref(),
             GraphNameRef::DefaultGraph,
         ));
-        dataset.canonicalize();
+        dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
         let mut dataset2 = Dataset::new();
         dataset2.insert(QuadRef::new(
             BlankNode::default().as_ref(),
@@ -1602,7 +1636,7 @@ mod tests {
             BlankNode::default().as_ref(),
             GraphNameRef::DefaultGraph,
         ));
-        dataset2.canonicalize();
+        dataset2.canonicalize(CanonicalizationAlgorithm::Unstable);
         assert_eq!(dataset, dataset2);
     }
 }