Isomorphism: makes sure that new hashes depend on the old ones

This allows the "distinguish" step to work
pull/603/head
Tpt 1 year ago
parent 788450932a
commit bbf184f7ae
  1. 11
      fuzz/fuzz_targets/trig.rs
  2. 42
      lib/oxrdf/src/dataset.rs

@ -74,7 +74,16 @@ fuzz_target!(|data: &[u8]| {
.filter(|c| *c != 0xFF) .filter(|c| *c != 0xFF)
.collect::<Vec<_>>() .collect::<Vec<_>>()
.as_slice()]); .as_slice()]);
if quads.iter().map(count_quad_blank_nodes).sum::<usize>() < 2 { let bnodes_count = quads.iter().map(count_quad_blank_nodes).sum::<usize>();
if bnodes_count == 0 {
assert_eq!(
quads,
quads_without_split,
"With split:\n{}\nWithout split:\n{}",
String::from_utf8_lossy(&serialize_quads(&quads)),
String::from_utf8_lossy(&serialize_quads(&quads_without_split))
);
} else if bnodes_count <= 4 {
let mut dataset_with_split = quads.iter().collect::<Dataset>(); let mut dataset_with_split = quads.iter().collect::<Dataset>();
let mut dataset_without_split = quads_without_split.iter().collect::<Dataset>(); let mut dataset_without_split = quads_without_split.iter().collect::<Dataset>();
dataset_with_split.canonicalize(); dataset_with_split.canonicalize();

@ -646,37 +646,41 @@ impl Dataset {
Vec<(u64, Vec<InternedBlankNode>)>, Vec<(u64, Vec<InternedBlankNode>)>,
) { ) {
let mut to_hash = Vec::new(); let mut to_hash = Vec::new();
let mut to_do = hashes.keys().copied().collect::<Vec<_>>(); let mut to_do = hashes
.keys()
.map(|bnode| (*bnode, true))
.collect::<HashMap<_, _>>();
let mut partition = HashMap::<_, Vec<_>>::with_capacity(hashes.len()); let mut partition = HashMap::<_, Vec<_>>::with_capacity(hashes.len());
let mut partition_count = to_do.len(); let mut old_partition_count = usize::MAX;
while !to_do.is_empty() { while old_partition_count != partition.len() {
old_partition_count = partition.len();
partition.clear(); partition.clear();
let mut new_hashes = hashes.clone(); let mut new_hashes = hashes.clone();
let mut new_todo = Vec::with_capacity(to_do.len()); for bnode in hashes.keys() {
for bnode in to_do { let hash = if to_do.contains_key(bnode) {
for (s, p, o, g) in &quads_per_blank_node[&bnode] { for (s, p, o, g) in &quads_per_blank_node[bnode] {
to_hash.push(( to_hash.push((
self.hash_subject(s, bnode, &hashes), self.hash_subject(s, *bnode, &hashes),
self.hash_named_node(*p), self.hash_named_node(*p),
self.hash_term(o, bnode, &hashes), self.hash_term(o, *bnode, &hashes),
self.hash_graph_name(g, bnode, &hashes), self.hash_graph_name(g, *bnode, &hashes),
)); ));
} }
to_hash.sort_unstable(); to_hash.sort_unstable();
let hash = Self::hash_tuple((&to_hash,)); let hash = Self::hash_tuple((&to_hash, hashes[bnode]));
to_hash.clear(); to_hash.clear();
if hash != hashes[&bnode] { if hash == hashes[bnode] {
new_hashes.insert(bnode, hash); to_do.insert(*bnode, false);
new_todo.push(bnode); } else {
new_hashes.insert(*bnode, hash);
} }
partition.entry(hash).or_default().push(bnode); hash
} else {
hashes[bnode]
};
partition.entry(hash).or_default().push(*bnode);
} }
hashes = new_hashes; hashes = new_hashes;
to_do = new_todo;
if partition_count == partition.len() {
break; // no improvement
}
partition_count = partition.len();
} }
let mut partition: Vec<_> = partition.into_iter().collect(); let mut partition: Vec<_> = partition.into_iter().collect();
partition.sort_unstable_by(|(h1, b1), (h2, b2)| (b1.len(), h1).cmp(&(b2.len(), h2))); partition.sort_unstable_by(|(h1, b1), (h2, b2)| (b1.len(), h1).cmp(&(b2.len(), h2)));

Loading…
Cancel
Save