From bbf184f7aea4d60ec01a33d680d8a3dbb6567170 Mon Sep 17 00:00:00 2001 From: Tpt Date: Thu, 6 Jul 2023 17:56:05 +0200 Subject: [PATCH] Isomorphism: makes sure that new hashes depend on the old ones Allows the "distinguish" step to work --- fuzz/fuzz_targets/trig.rs | 11 +++++++- lib/oxrdf/src/dataset.rs | 56 +++++++++++++++++++++------------------ 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/fuzz/fuzz_targets/trig.rs b/fuzz/fuzz_targets/trig.rs index a96ca86c..6a930a97 100644 --- a/fuzz/fuzz_targets/trig.rs +++ b/fuzz/fuzz_targets/trig.rs @@ -74,7 +74,16 @@ fuzz_target!(|data: &[u8]| { .filter(|c| *c != 0xFF) .collect::>() .as_slice()]); - if quads.iter().map(count_quad_blank_nodes).sum::() < 2 { + let bnodes_count = quads.iter().map(count_quad_blank_nodes).sum::(); + if bnodes_count == 0 { + assert_eq!( + quads, + quads_without_split, + "With split:\n{}\nWithout split:\n{}", + String::from_utf8_lossy(&serialize_quads(&quads)), + String::from_utf8_lossy(&serialize_quads(&quads_without_split)) + ); + } else if bnodes_count <= 4 { let mut dataset_with_split = quads.iter().collect::(); let mut dataset_without_split = quads_without_split.iter().collect::(); dataset_with_split.canonicalize(); diff --git a/lib/oxrdf/src/dataset.rs b/lib/oxrdf/src/dataset.rs index dbf82c86..4b7736eb 100644 --- a/lib/oxrdf/src/dataset.rs +++ b/lib/oxrdf/src/dataset.rs @@ -646,37 +646,41 @@ impl Dataset { Vec<(u64, Vec)>, ) { let mut to_hash = Vec::new(); - let mut to_do = hashes.keys().copied().collect::>(); + let mut to_do = hashes + .keys() + .map(|bnode| (*bnode, true)) + .collect::>(); let mut partition = HashMap::<_, Vec<_>>::with_capacity(hashes.len()); - let mut partition_count = to_do.len(); - while !to_do.is_empty() { + let mut old_partition_count = usize::MAX; + while old_partition_count != partition.len() { + old_partition_count = partition.len(); partition.clear(); let mut new_hashes = hashes.clone(); - let mut new_todo = 
Vec::with_capacity(to_do.len()); - for bnode in to_do { - for (s, p, o, g) in &quads_per_blank_node[&bnode] { - to_hash.push(( - self.hash_subject(s, bnode, &hashes), - self.hash_named_node(*p), - self.hash_term(o, bnode, &hashes), - self.hash_graph_name(g, bnode, &hashes), - )); - } - to_hash.sort_unstable(); - let hash = Self::hash_tuple((&to_hash,)); - to_hash.clear(); - if hash != hashes[&bnode] { - new_hashes.insert(bnode, hash); - new_todo.push(bnode); - } - partition.entry(hash).or_default().push(bnode); + for bnode in hashes.keys() { + let hash = if to_do.contains_key(bnode) { + for (s, p, o, g) in &quads_per_blank_node[bnode] { + to_hash.push(( + self.hash_subject(s, *bnode, &hashes), + self.hash_named_node(*p), + self.hash_term(o, *bnode, &hashes), + self.hash_graph_name(g, *bnode, &hashes), + )); + } + to_hash.sort_unstable(); + let hash = Self::hash_tuple((&to_hash, hashes[bnode])); + to_hash.clear(); + if hash == hashes[bnode] { + to_do.insert(*bnode, false); + } else { + new_hashes.insert(*bnode, hash); + } + hash + } else { + hashes[bnode] + }; + partition.entry(hash).or_default().push(*bnode); } hashes = new_hashes; - to_do = new_todo; - if partition_count == partition.len() { - break; // no improvement - } - partition_count = partition.len(); } let mut partition: Vec<_> = partition.into_iter().collect(); partition.sort_unstable_by(|(h1, b1), (h2, b2)| (b1.len(), h1).cmp(&(b2.len(), h2)));