Adds an enum for CanonicalizationAlgorithm

Enables implementing other algorithms like RDFC-1.0
pull/819/head
Tpt 10 months ago committed by Thomas Tanon
parent f5b975e4d1
commit 0ac70e73dc
  1. 7
      fuzz/fuzz_targets/trig.rs
  2. 162
      lib/oxrdf/src/dataset.rs
  3. 18
      lib/oxrdf/src/graph.rs
  4. 9
      testsuite/src/parser_evaluator.rs
  5. 7
      testsuite/src/sparql_evaluator.rs

@ -1,6 +1,7 @@
#![no_main]
use libfuzzer_sys::fuzz_target;
use oxrdf::graph::CanonicalizationAlgorithm;
use oxrdf::{Dataset, GraphName, Quad, Subject, Term, Triple};
use oxttl::{TriGParser, TriGSerializer};
@ -120,8 +121,8 @@ fuzz_target!(|data: &[u8]| {
} else if bnodes_count <= 4 {
let mut dataset_with_split = quads.iter().collect::<Dataset>();
let mut dataset_without_split = quads_without_split.iter().collect::<Dataset>();
dataset_with_split.canonicalize();
dataset_without_split.canonicalize();
dataset_with_split.canonicalize(CanonicalizationAlgorithm::Unstable);
dataset_without_split.canonicalize(CanonicalizationAlgorithm::Unstable);
assert_eq!(
dataset_with_split,
dataset_without_split,
@ -131,7 +132,7 @@ fuzz_target!(|data: &[u8]| {
);
if errors.is_empty() {
let mut dataset_unchecked = quads_unchecked.iter().collect::<Dataset>();
dataset_unchecked.canonicalize();
dataset_unchecked.canonicalize(CanonicalizationAlgorithm::Unstable);
assert_eq!(
dataset_with_split,
dataset_unchecked,

@ -31,7 +31,6 @@
use crate::interning::*;
use crate::*;
use std::cmp::min;
use std::collections::hash_map::DefaultHasher;
use std::collections::{BTreeSet, HashMap, HashSet};
use std::fmt;
@ -510,6 +509,7 @@ impl Dataset {
///
/// Usage example ([Dataset isomorphism](https://www.w3.org/TR/rdf11-concepts/#dfn-dataset-isomorphism)):
/// ```
/// use oxrdf::dataset::CanonicalizationAlgorithm;
/// use oxrdf::*;
///
/// let iri = NamedNodeRef::new("http://example.com")?;
@ -527,32 +527,59 @@ impl Dataset {
/// graph2.insert(QuadRef::new(&bnode2, iri, iri, &g2));
///
/// assert_ne!(graph1, graph2);
/// graph1.canonicalize();
/// graph2.canonicalize();
/// graph1.canonicalize(CanonicalizationAlgorithm::Unstable);
/// graph2.canonicalize(CanonicalizationAlgorithm::Unstable);
/// assert_eq!(graph1, graph2);
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
///
/// Warning 1: Blank node ids depends on the current shape of the graph. Adding a new quad might change the ids of a lot of blank nodes.
/// Hence, this canonization might not be suitable for diffs.
/// <div class="warning">Blank node ids depends on the current shape of the graph. Adding a new quad might change the ids of a lot of blank nodes.
/// Hence, this canonization might not be suitable for diffs.</div>
///
/// Warning 2: The canonicalization algorithm is not stable and canonical blank node ids might change between Oxigraph version.
///
/// Warning 3: This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset.
pub fn canonicalize(&mut self) {
let bnodes = self.blank_nodes();
let quads_per_blank_node = self.quads_per_blank_nodes();
let (hash, partition) = self.hash_bnodes(
bnodes.into_iter().map(|bnode| (bnode, 0)).collect(),
&quads_per_blank_node,
);
let new_quads = self.distinguish(&hash, &partition, &quads_per_blank_node);
/// <div class="warning">This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset.</div>
pub fn canonicalize(&mut self, algorithm: CanonicalizationAlgorithm) {
let bnode_mapping = self.canonicalize_interned_blank_nodes(algorithm);
let new_quads = self.map_blank_nodes(&bnode_mapping);
self.clear();
for quad in new_quads {
self.insert_encoded(quad);
}
}
/// Returns a map between the current dataset blank node and the canonicalized blank node
/// to create a canonical dataset.
///
/// See also [`canonicalize`](Self::canonicalize).
pub fn canonicalize_blank_nodes(
&self,
algorithm: CanonicalizationAlgorithm,
) -> HashMap<BlankNodeRef<'_>, BlankNode> {
self.canonicalize_interned_blank_nodes(algorithm)
.into_iter()
.map(|(from, to)| (from.decode_from(&self.interner), to))
.collect()
}
fn canonicalize_interned_blank_nodes(
&self,
algorithm: CanonicalizationAlgorithm,
) -> HashMap<InternedBlankNode, BlankNode> {
match algorithm {
CanonicalizationAlgorithm::Unstable => {
let bnodes = self.blank_nodes();
let quads_per_blank_node = self.quads_per_blank_nodes();
let (hash, partition) = self.hash_bnodes(
bnodes.into_iter().map(|bnode| (bnode, 0)).collect(),
&quads_per_blank_node,
);
self.distinguish(hash, &partition, &quads_per_blank_node)
.into_iter()
.map(|(from, to)| (from, BlankNode::new_from_unique_id(to.into())))
.collect()
}
}
}
fn blank_nodes(&self) -> HashSet<InternedBlankNode> {
let mut bnodes = HashSet::new();
for (g, s, _, o) in &self.gspo {
@ -781,16 +808,11 @@ impl Dataset {
}
fn distinguish(
&mut self,
hash: &HashMap<InternedBlankNode, u64>,
&self,
hash: HashMap<InternedBlankNode, u64>,
partition: &[(u64, Vec<InternedBlankNode>)],
quads_per_blank_node: &QuadsPerBlankNode,
) -> Vec<(
InternedSubject,
InternedNamedNode,
InternedTerm,
InternedGraphName,
)> {
) -> HashMap<InternedBlankNode, u64> {
let b_prime = partition.iter().map(|(_, b)| b).find(|b| b.len() > 1);
if let Some(b_prime) = b_prime {
b_prime
@ -800,19 +822,29 @@ impl Dataset {
hash_prime.insert(*b, Self::hash_tuple((hash_prime[b], 22)));
let (hash_prime_prime, partition_prime) =
self.hash_bnodes(hash_prime, quads_per_blank_node);
self.distinguish(&hash_prime_prime, &partition_prime, quads_per_blank_node)
self.distinguish(hash_prime_prime, &partition_prime, quads_per_blank_node)
})
.reduce(|a, b| {
let mut a_hashes = a.values().collect::<Vec<_>>();
a_hashes.sort();
let mut b_hashes = a.values().collect::<Vec<_>>();
b_hashes.sort();
if a_hashes <= b_hashes {
a
} else {
b
}
})
.reduce(min)
.unwrap_or_default()
} else {
self.label(hash)
hash
}
}
#[allow(clippy::needless_collect)]
fn label(
fn map_blank_nodes(
&mut self,
hashes: &HashMap<InternedBlankNode, u64>,
bnode_mapping: &HashMap<InternedBlankNode, BlankNode>,
) -> Vec<(
InternedSubject,
InternedNamedNode,
@ -820,19 +852,22 @@ impl Dataset {
InternedGraphName,
)> {
let old_quads: Vec<_> = self.spog.iter().cloned().collect();
let mut quads: Vec<_> = old_quads
old_quads
.into_iter()
.map(|(s, p, o, g)| {
(
match s {
InternedSubject::NamedNode(_) => s,
InternedSubject::BlankNode(bnode) => {
InternedSubject::BlankNode(self.map_bnode(bnode, hashes))
InternedSubject::BlankNode(InternedBlankNode::encoded_into(
bnode_mapping[&bnode].as_ref(),
&mut self.interner,
))
}
#[cfg(feature = "rdf-star")]
InternedSubject::Triple(triple) => {
InternedSubject::Triple(Box::new(InternedTriple::encoded_into(
self.label_triple(&triple, hashes).as_ref(),
self.map_triple_blank_nodes(&triple, bnode_mapping).as_ref(),
&mut self.interner,
)))
}
@ -841,12 +876,15 @@ impl Dataset {
match o {
InternedTerm::NamedNode(_) | InternedTerm::Literal(_) => o,
InternedTerm::BlankNode(bnode) => {
InternedTerm::BlankNode(self.map_bnode(bnode, hashes))
InternedTerm::BlankNode(InternedBlankNode::encoded_into(
bnode_mapping[&bnode].as_ref(),
&mut self.interner,
))
}
#[cfg(feature = "rdf-star")]
InternedTerm::Triple(triple) => {
InternedTerm::Triple(Box::new(InternedTriple::encoded_into(
self.label_triple(&triple, hashes).as_ref(),
self.map_triple_blank_nodes(&triple, bnode_mapping).as_ref(),
&mut self.interner,
)))
}
@ -854,58 +892,41 @@ impl Dataset {
match g {
InternedGraphName::NamedNode(_) | InternedGraphName::DefaultGraph => g,
InternedGraphName::BlankNode(bnode) => {
InternedGraphName::BlankNode(self.map_bnode(bnode, hashes))
InternedGraphName::BlankNode(InternedBlankNode::encoded_into(
bnode_mapping[&bnode].as_ref(),
&mut self.interner,
))
}
},
)
})
.collect();
quads.sort_unstable();
quads
.collect()
}
#[cfg(feature = "rdf-star")]
fn label_triple(
fn map_triple_blank_nodes(
&mut self,
triple: &InternedTriple,
hashes: &HashMap<InternedBlankNode, u64>,
bnode_mapping: &HashMap<InternedBlankNode, BlankNode>,
) -> Triple {
Triple {
subject: if let InternedSubject::BlankNode(bnode) = &triple.subject {
Self::gen_bnode(*bnode, hashes).into()
bnode_mapping[bnode].clone().into()
} else if let InternedSubject::Triple(t) = &triple.subject {
self.label_triple(t, hashes).into()
self.map_triple_blank_nodes(t, bnode_mapping).into()
} else {
triple.subject.decode_from(&self.interner).into_owned()
},
predicate: triple.predicate.decode_from(&self.interner).into_owned(),
object: if let InternedTerm::BlankNode(bnode) = &triple.object {
Self::gen_bnode(*bnode, hashes).into()
bnode_mapping[bnode].clone().into()
} else if let InternedTerm::Triple(t) = &triple.object {
self.label_triple(t, hashes).into()
self.map_triple_blank_nodes(t, bnode_mapping).into()
} else {
triple.object.decode_from(&self.interner).into_owned()
},
}
}
fn map_bnode(
&mut self,
old_bnode: InternedBlankNode,
hashes: &HashMap<InternedBlankNode, u64>,
) -> InternedBlankNode {
InternedBlankNode::encoded_into(
Self::gen_bnode(old_bnode, hashes).as_ref(),
&mut self.interner,
)
}
fn gen_bnode(
old_bnode: InternedBlankNode,
hashes: &HashMap<InternedBlankNode, u64>,
) -> BlankNode {
BlankNode::new_from_unique_id(hashes[&old_bnode].into())
}
}
impl PartialEq for Dataset {
@ -1569,6 +1590,19 @@ type QuadsPerBlankNode = HashMap<
)>,
>;
/// An algorithm used to canonicalize graph and datasets.
///
/// See [`Graph::canonicalize`] and [`Dataset::canonicalize`].
#[derive(Default, Debug, Clone, Copy, Eq, PartialEq, Hash)]
#[non_exhaustive]
pub enum CanonicalizationAlgorithm {
/// The algorithm preferred by OxRDF.
///
/// <div class="warning">The canonicalization algorithm is not stable and canonical blank node ids might change between Oxigraph version.</div>
#[default]
Unstable,
}
#[cfg(test)]
mod tests {
use super::*;
@ -1588,7 +1622,7 @@ mod tests {
BlankNode::default().as_ref(),
GraphNameRef::DefaultGraph,
));
dataset.canonicalize();
dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
let mut dataset2 = Dataset::new();
dataset2.insert(QuadRef::new(
BlankNode::default().as_ref(),
@ -1602,7 +1636,7 @@ mod tests {
BlankNode::default().as_ref(),
GraphNameRef::DefaultGraph,
));
dataset2.canonicalize();
dataset2.canonicalize(CanonicalizationAlgorithm::Unstable);
assert_eq!(dataset, dataset2);
}
}

@ -25,6 +25,7 @@
//!
//! See also [`Dataset`] if you want to get support of multiple RDF graphs at the same time.
pub use crate::dataset::CanonicalizationAlgorithm;
use crate::dataset::*;
use crate::*;
use std::fmt;
@ -188,6 +189,7 @@ impl Graph {
///
/// Usage example ([Graph isomorphism](https://www.w3.org/TR/rdf11-concepts/#dfn-graph-isomorphism)):
/// ```
/// use oxrdf::graph::CanonicalizationAlgorithm;
/// use oxrdf::*;
///
/// let iri = NamedNodeRef::new("http://example.com")?;
@ -203,20 +205,18 @@ impl Graph {
/// graph2.insert(TripleRef::new(&bnode2, iri, iri));
///
/// assert_ne!(graph1, graph2);
/// graph1.canonicalize();
/// graph2.canonicalize();
/// graph1.canonicalize(CanonicalizationAlgorithm::Unstable);
/// graph2.canonicalize(CanonicalizationAlgorithm::Unstable);
/// assert_eq!(graph1, graph2);
/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
/// ```
///
/// Warning 1: Blank node ids depends on the current shape of the graph. Adding a new triple might change the ids of a lot of blank nodes.
/// Hence, this canonization might not be suitable for diffs.
/// <div class="warning">Blank node ids depends on the current shape of the graph. Adding a new quad might change the ids of a lot of blank nodes.
/// Hence, this canonization might not be suitable for diffs.</div>
///
/// Warning 2: The canonicalization algorithm is not stable and canonical blank node Ids might change between Oxigraph version.
///
/// Warning 3: This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input graph.
pub fn canonicalize(&mut self) {
self.dataset.canonicalize()
/// <div class="warning">This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset.</div>
pub fn canonicalize(&mut self, algorithm: CanonicalizationAlgorithm) {
self.dataset.canonicalize(algorithm)
}
}

@ -4,6 +4,7 @@ use crate::manifest::Test;
use crate::report::{dataset_diff, format_diff};
use anyhow::{bail, ensure, Context, Result};
use oxigraph::io::RdfFormat;
use oxigraph::model::graph::CanonicalizationAlgorithm;
use oxigraph::model::{BlankNode, Dataset, Quad};
use oxttl::n3::{N3Quad, N3Term};
@ -138,11 +139,11 @@ fn evaluate_eval_test(test: &Test, format: RdfFormat, ignore_errors: bool) -> Re
let action = test.action.as_deref().context("No action found")?;
let mut actual_dataset = load_dataset(action, format, ignore_errors)
.with_context(|| format!("Parse error on file {action}"))?;
actual_dataset.canonicalize();
actual_dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
let results = test.result.as_ref().context("No tests result found")?;
let mut expected_dataset = load_dataset(results, guess_rdf_format(results)?, false)
.with_context(|| format!("Parse error on file {results}"))?;
expected_dataset.canonicalize();
expected_dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
ensure!(
expected_dataset == actual_dataset,
"The two files are not isomorphic. Diff:\n{}",
@ -156,12 +157,12 @@ fn evaluate_n3_eval_test(test: &Test, ignore_errors: bool) -> Result<()> {
let mut actual_dataset = n3_to_dataset(
load_n3(action, ignore_errors).with_context(|| format!("Parse error on file {action}"))?,
);
actual_dataset.canonicalize();
actual_dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
let results = test.result.as_ref().context("No tests result found")?;
let mut expected_dataset = n3_to_dataset(
load_n3(results, false).with_context(|| format!("Parse error on file {results}"))?,
);
expected_dataset.canonicalize();
expected_dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
ensure!(
expected_dataset == actual_dataset,
"The two files are not isomorphic. Diff:\n{}",

@ -5,6 +5,7 @@ use crate::report::{dataset_diff, format_diff};
use crate::vocab::*;
use anyhow::{bail, ensure, Context, Error, Result};
use oxigraph::io::RdfParser;
use oxigraph::model::dataset::CanonicalizationAlgorithm;
use oxigraph::model::vocab::*;
use oxigraph::model::*;
use oxigraph::sparql::results::QueryResultsFormat;
@ -235,9 +236,9 @@ fn evaluate_update_evaluation_test(test: &Test) -> Result<()> {
store.update(update).context("Failure to execute update")?;
let mut store_dataset: Dataset = store.iter().collect::<Result<_, _>>()?;
store_dataset.canonicalize();
store_dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
let mut result_store_dataset: Dataset = result_store.iter().collect::<Result<_, _>>()?;
result_store_dataset.canonicalize();
result_store_dataset.canonicalize(CanonicalizationAlgorithm::Unstable);
ensure!(
store_dataset == result_store_dataset,
"Not isomorphic result dataset.\nDiff:\n{}\nParsed update:\n{}\n",
@ -533,7 +534,7 @@ impl StaticQueryResults {
})
}
} else {
graph.canonicalize();
graph.canonicalize(CanonicalizationAlgorithm::Unstable);
Ok(Self::Graph(graph))
}
}

Loading…
Cancel
Save