From 807cf0d436167b78979548a7adfcb26dd923a291 Mon Sep 17 00:00:00 2001 From: Tpt Date: Sun, 16 Jul 2023 21:34:22 +0200 Subject: [PATCH] Isomorphism: make sure to also take quoted triples into account and fixes interning stability issue --- lib/oxrdf/src/dataset.rs | 323 +++++++++++++++++++++++++------------ lib/oxrdf/src/interning.rs | 43 +++-- 2 files changed, 251 insertions(+), 115 deletions(-) diff --git a/lib/oxrdf/src/dataset.rs b/lib/oxrdf/src/dataset.rs index 95ecef75..dbf82c86 100644 --- a/lib/oxrdf/src/dataset.rs +++ b/lib/oxrdf/src/dataset.rs @@ -294,6 +294,18 @@ impl Dataset { .map(|(o, s, p, g)| (s, p, o, g)) } + pub fn quads_for_graph_name<'a, 'b>( + &'a self, + graph_name: impl Into>, + ) -> impl Iterator> + 'a { + let graph_name = self + .encoded_graph_name(graph_name) + .unwrap_or_else(InternedGraphName::impossible); + + self.interned_quads_for_graph_name(&graph_name) + .map(move |q| self.decode_spog(q)) + } + fn interned_quads_for_graph_name( &self, graph_name: &InternedGraphName, @@ -526,9 +538,12 @@ impl Dataset { /// Warning 3: This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset. pub fn canonicalize(&mut self) { let bnodes = self.blank_nodes(); - let (hash, partition) = - self.hash_bnodes(bnodes.into_iter().map(|bnode| (bnode, 0)).collect()); - let new_quads = self.distinguish(&hash, &partition); + let quads_per_blank_node = self.quads_per_blank_nodes(); + let (hash, partition) = self.hash_bnodes( + bnodes.into_iter().map(|bnode| (bnode, 0)).collect(), + &quads_per_blank_node, + ); + let new_quads = self.distinguish(&hash, &partition, &quads_per_blank_node); self.clear(); for quad in new_quads { self.insert_encoded(quad); @@ -573,107 +588,168 @@ impl Dataset { } } + fn quads_per_blank_nodes(&self) -> QuadsPerBlankNode { + let mut map: HashMap<_, Vec<_>> = HashMap::new(); + for quad in &self.spog { + if let InternedSubject::BlankNode(bnode) = &quad.0 { + map.entry(*bnode).or_default().push(quad.clone()); + } + #[cfg(feature = "rdf-star")] + if let InternedSubject::Triple(t) = &quad.0 { + Self::add_quad_with_quoted_triple_to_quad_per_blank_nodes_map(quad, t, &mut map); + } + if let InternedTerm::BlankNode(bnode) = &quad.2 { + map.entry(*bnode).or_default().push(quad.clone()); + } + #[cfg(feature = "rdf-star")] + if let InternedTerm::Triple(t) = &quad.2 { + Self::add_quad_with_quoted_triple_to_quad_per_blank_nodes_map(quad, t, &mut map); + } + if let InternedGraphName::BlankNode(bnode) = &quad.3 { + map.entry(*bnode).or_default().push(quad.clone()); + } + } + map + } + + #[cfg(feature = "rdf-star")] + fn add_quad_with_quoted_triple_to_quad_per_blank_nodes_map( + quad: &( + InternedSubject, + InternedNamedNode, + InternedTerm, + InternedGraphName, + ), + triple: &InternedTriple, + map: &mut QuadsPerBlankNode, + ) { + if let InternedSubject::BlankNode(bnode) = &triple.subject { + map.entry(*bnode).or_default().push(quad.clone()); + } + if let InternedSubject::Triple(t) = &triple.subject { + Self::add_quad_with_quoted_triple_to_quad_per_blank_nodes_map(quad, t, map); + } + if let InternedTerm::BlankNode(bnode) = &triple.object { + map.entry(*bnode).or_default().push(quad.clone()); + } + if let InternedTerm::Triple(t) = &triple.object { + Self::add_quad_with_quoted_triple_to_quad_per_blank_nodes_map(quad, t, map); + } + } + fn hash_bnodes( &self, mut hashes: HashMap, + quads_per_blank_node: &QuadsPerBlankNode, ) -> ( HashMap, Vec<(u64, Vec)>, ) { let mut to_hash = Vec::new(); - let mut partition: HashMap> = HashMap::new(); - let mut partition_len = 0; - loop { - //TODO: improve termination - let mut new_hashes = HashMap::new(); - for (bnode, old_hash) in &hashes { - for (_, p, o, g) in - self.interned_quads_for_subject(&InternedSubject::BlankNode(*bnode)) - { - to_hash.push(( - self.hash_named_node(*p), - self.hash_term(o, &hashes), - self.hash_graph_name(g, &hashes), - 0, - )); - } - for (s, p, _, g) in self.interned_quads_for_object(&InternedTerm::BlankNode(*bnode)) - { - to_hash.push(( - self.hash_subject(s, &hashes), - self.hash_named_node(*p), - self.hash_graph_name(g, &hashes), - 1, - )); - } - for (s, p, o, _) in - self.interned_quads_for_graph_name(&InternedGraphName::BlankNode(*bnode)) - { + let mut to_do = hashes.keys().copied().collect::>(); + let mut partition = HashMap::<_, Vec<_>>::with_capacity(hashes.len()); + let mut partition_count = to_do.len(); + while !to_do.is_empty() { + partition.clear(); + let mut new_hashes = hashes.clone(); + let mut new_todo = Vec::with_capacity(to_do.len()); + for bnode in to_do { + for (s, p, o, g) in &quads_per_blank_node[&bnode] { to_hash.push(( - self.hash_subject(s, &hashes), + self.hash_subject(s, bnode, &hashes), self.hash_named_node(*p), - self.hash_term(o, &hashes), - 2, + self.hash_term(o, bnode, &hashes), + self.hash_graph_name(g, bnode, &hashes), )); } to_hash.sort_unstable(); - let hash = Self::hash_tuple((old_hash, &to_hash)); + let hash = Self::hash_tuple((&to_hash,)); to_hash.clear(); - new_hashes.insert(*bnode, hash); - partition.entry(hash).or_default().push(*bnode); - } - if partition.len() == partition_len { - let mut partition: Vec<_> = partition.into_iter().collect(); - partition.sort_by(|(h1, b1), (h2, b2)| (b1.len(), h1).cmp(&(b2.len(), h2))); - return (hashes, partition); + if hash != hashes[&bnode] { + new_hashes.insert(bnode, hash); + new_todo.push(bnode); + } + partition.entry(hash).or_default().push(bnode); } hashes = new_hashes; - partition_len = partition.len(); - partition.clear(); + to_do = new_todo; + if partition_count == partition.len() { + break; // no improvement + } + partition_count = partition.len(); } + let mut partition: Vec<_> = partition.into_iter().collect(); + partition.sort_unstable_by(|(h1, b1), (h2, b2)| (b1.len(), h1).cmp(&(b2.len(), h2))); + (hashes, partition) } fn hash_named_node(&self, node: InternedNamedNode) -> u64 { Self::hash_tuple(node.decode_from(&self.interner)) } + fn hash_blank_node( + node: InternedBlankNode, + current_blank_node: InternedBlankNode, + bnodes_hash: &HashMap, + ) -> u64 { + if node == current_blank_node { + u64::MAX + } else { + bnodes_hash[&node] + } + } + fn hash_subject( &self, node: &InternedSubject, + current_blank_node: InternedBlankNode, bnodes_hash: &HashMap, ) -> u64 { - #[cfg(feature = "rdf-star")] - if let InternedSubject::Triple(triple) = node { - return self.hash_triple(triple, bnodes_hash); - } - if let InternedSubject::BlankNode(bnode) = node { - bnodes_hash[bnode] - } else { - Self::hash_tuple(node.decode_from(&self.interner)) + match node { + InternedSubject::NamedNode(node) => Self::hash_tuple(node.decode_from(&self.interner)), + InternedSubject::BlankNode(bnode) => { + Self::hash_blank_node(*bnode, current_blank_node, bnodes_hash) + } + #[cfg(feature = "rdf-star")] + InternedSubject::Triple(triple) => { + self.hash_triple(triple, current_blank_node, bnodes_hash) + } } } - fn hash_term(&self, term: &InternedTerm, bnodes_hash: &HashMap) -> u64 { - #[cfg(feature = "rdf-star")] - if let InternedTerm::Triple(triple) = term { - return self.hash_triple(triple, bnodes_hash); - } - if let InternedTerm::BlankNode(bnode) = term { - bnodes_hash[bnode] - } else { - Self::hash_tuple(term.decode_from(&self.interner)) + fn hash_term( + &self, + term: &InternedTerm, + current_blank_node: InternedBlankNode, + bnodes_hash: &HashMap, + ) -> u64 { + match term { + InternedTerm::NamedNode(node) => Self::hash_tuple(node.decode_from(&self.interner)), + InternedTerm::BlankNode(bnode) => { + Self::hash_blank_node(*bnode, current_blank_node, bnodes_hash) + } + InternedTerm::Literal(literal) => Self::hash_tuple(literal.decode_from(&self.interner)), + #[cfg(feature = "rdf-star")] + InternedTerm::Triple(triple) => { + self.hash_triple(triple, current_blank_node, bnodes_hash) + } } } fn hash_graph_name( &self, graph_name: &InternedGraphName, + current_blank_node: InternedBlankNode, bnodes_hash: &HashMap, ) -> u64 { - if let InternedGraphName::BlankNode(bnode) = graph_name { - bnodes_hash[bnode] - } else { - Self::hash_tuple(graph_name.decode_from(&self.interner)) + match graph_name { + InternedGraphName::NamedNode(node) => { + Self::hash_tuple(node.decode_from(&self.interner)) + } + InternedGraphName::BlankNode(bnode) => { + Self::hash_blank_node(*bnode, current_blank_node, bnodes_hash) + } + InternedGraphName::DefaultGraph => 0, } } @@ -681,12 +757,13 @@ impl Dataset { fn hash_triple( &self, triple: &InternedTriple, + current_blank_node: InternedBlankNode, bnodes_hash: &HashMap, ) -> u64 { Self::hash_tuple(( - self.hash_subject(&triple.subject, bnodes_hash), + self.hash_subject(&triple.subject, current_blank_node, bnodes_hash), self.hash_named_node(triple.predicate), - self.hash_term(&triple.object, bnodes_hash), + self.hash_term(&triple.object, current_blank_node, bnodes_hash), )) } @@ -700,6 +777,7 @@ impl Dataset { &mut self, hash: &HashMap, partition: &[(u64, Vec)], + quads_per_blank_node: &QuadsPerBlankNode, ) -> Vec<( InternedSubject, InternedNamedNode, @@ -713,8 +791,9 @@ impl Dataset { .map(|b| { let mut hash_prime = hash.clone(); hash_prime.insert(*b, Self::hash_tuple((hash_prime[b], 22))); - let (hash_prime_prime, partition_prime) = self.hash_bnodes(hash_prime); - self.distinguish(&hash_prime_prime, &partition_prime) + let (hash_prime_prime, partition_prime) = + self.hash_bnodes(hash_prime, quads_per_blank_node); + self.distinguish(&hash_prime_prime, &partition_prime, quads_per_blank_node) }) .reduce(min) .unwrap_or_default() @@ -738,54 +817,43 @@ impl Dataset { .into_iter() .map(|(s, p, o, g)| { ( - if let InternedSubject::BlankNode(bnode) = s { - InternedSubject::BlankNode(self.map_bnode(bnode, hashes)) - } else { - #[cfg(feature = "rdf-star")] - { - if let InternedSubject::Triple(triple) = s { - InternedSubject::Triple(Box::new(InternedTriple::encoded_into( - self.label_triple(&triple, hashes).as_ref(), - &mut self.interner, - ))) - } else { - s - } + match s { + InternedSubject::NamedNode(_) => s, + InternedSubject::BlankNode(bnode) => { + InternedSubject::BlankNode(self.map_bnode(bnode, hashes)) } - #[cfg(not(feature = "rdf-star"))] - { - s + #[cfg(feature = "rdf-star")] + InternedSubject::Triple(triple) => { + InternedSubject::Triple(Box::new(InternedTriple::encoded_into( + self.label_triple(&triple, hashes).as_ref(), + &mut self.interner, + ))) } }, p, - if let InternedTerm::BlankNode(bnode) = o { - InternedTerm::BlankNode(self.map_bnode(bnode, hashes)) - } else { - #[cfg(feature = "rdf-star")] - { - if let InternedTerm::Triple(triple) = o { - InternedTerm::Triple(Box::new(InternedTriple::encoded_into( - self.label_triple(&triple, hashes).as_ref(), - &mut self.interner, - ))) - } else { - o - } + match o { + InternedTerm::NamedNode(_) | InternedTerm::Literal(_) => o, + InternedTerm::BlankNode(bnode) => { + InternedTerm::BlankNode(self.map_bnode(bnode, hashes)) } - #[cfg(not(feature = "rdf-star"))] - { - o + #[cfg(feature = "rdf-star")] + InternedTerm::Triple(triple) => { + InternedTerm::Triple(Box::new(InternedTriple::encoded_into( + self.label_triple(&triple, hashes).as_ref(), + &mut self.interner, + ))) } }, - if let InternedGraphName::BlankNode(bnode) = g { - InternedGraphName::BlankNode(self.map_bnode(bnode, hashes)) - } else { - g + match g { + InternedGraphName::NamedNode(_) | InternedGraphName::DefaultGraph => g, + InternedGraphName::BlankNode(bnode) => { + InternedGraphName::BlankNode(self.map_bnode(bnode, hashes)) + } }, ) }) .collect(); - quads.sort(); + quads.sort_unstable(); quads } @@ -1483,3 +1551,46 @@ impl<'a> Iterator for GraphViewIter<'a> { .map(|(_, s, p, o)| self.dataset.decode_spo((s, p, o))) } } + +type QuadsPerBlankNode = HashMap< + InternedBlankNode, + Vec<( + InternedSubject, + InternedNamedNode, + InternedTerm, + InternedGraphName, + )>, +>; + +#[test] +fn test_canon() { + let mut dataset = Dataset::new(); + dataset.insert(QuadRef::new( + BlankNode::default().as_ref(), + NamedNodeRef::new_unchecked("http://ex"), + BlankNode::default().as_ref(), + GraphNameRef::DefaultGraph, + )); + dataset.insert(QuadRef::new( + BlankNode::default().as_ref(), + NamedNodeRef::new_unchecked("http://ex"), + BlankNode::default().as_ref(), + GraphNameRef::DefaultGraph, + )); + dataset.canonicalize(); + let mut dataset2 = Dataset::new(); + dataset2.insert(QuadRef::new( + BlankNode::default().as_ref(), + NamedNodeRef::new_unchecked("http://ex"), + BlankNode::default().as_ref(), + GraphNameRef::DefaultGraph, + )); + dataset2.insert(QuadRef::new( + BlankNode::default().as_ref(), + NamedNodeRef::new_unchecked("http://ex"), + BlankNode::default().as_ref(), + GraphNameRef::DefaultGraph, + )); + dataset2.canonicalize(); + assert_eq!(dataset, dataset2); +} diff --git a/lib/oxrdf/src/interning.rs b/lib/oxrdf/src/interning.rs index 4b7b8705..3414d51a 100644 --- a/lib/oxrdf/src/interning.rs +++ b/lib/oxrdf/src/interning.rs @@ -8,6 +8,7 @@ use std::hash::{BuildHasher, Hasher}; pub struct Interner { hasher: RandomState, string_for_hash: HashMap, + string_for_blank_node_id: HashMap, #[cfg(feature = "rdf-star")] triples: HashMap, } @@ -119,29 +120,53 @@ impl InternedNamedNode { } #[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Copy, Hash)] -pub struct InternedBlankNode { - id: Key, +pub enum InternedBlankNode { + Number { id: u128 }, + Other { id: Key }, } impl InternedBlankNode { pub fn encoded_into(blank_node: BlankNodeRef<'_>, interner: &mut Interner) -> Self { - Self { - id: interner.get_or_intern(blank_node.as_str()), + if let Some(id) = blank_node.unique_id() { + interner + .string_for_blank_node_id + .entry(id) + .or_insert_with(|| blank_node.as_str().into()); + Self::Number { id } + } else { + Self::Other { + id: interner.get_or_intern(blank_node.as_str()), + } } } pub fn encoded_from(blank_node: BlankNodeRef<'_>, interner: &Interner) -> Option { - Some(Self { - id: interner.get(blank_node.as_str())?, - }) + if let Some(id) = blank_node.unique_id() { + interner + .string_for_blank_node_id + .contains_key(&id) + .then_some(Self::Number { id }) + } else { + Some(Self::Other { + id: interner.get(blank_node.as_str())?, + }) + } } pub fn decode_from(self, interner: &Interner) -> BlankNodeRef { - BlankNodeRef::new_unchecked(interner.resolve(self.id)) + BlankNodeRef::new_unchecked(match self { + Self::Number { id } => &interner.string_for_blank_node_id[&id], + Self::Other { id } => interner.resolve(id), + }) } pub fn next(self) -> Self { - Self { id: self.id.next() } + match self { + Self::Number { id } => Self::Number { + id: id.saturating_add(1), + }, + Self::Other { id } => Self::Other { id: id.next() }, + } } }