Isomorphism: make sure to also take quoted triples into account and fix an interning stability issue

pull/600/head
Tpt 1 year ago committed by Thomas Tanon
parent 5fee36e587
commit 807cf0d436
  1. 323
      lib/oxrdf/src/dataset.rs
  2. 43
      lib/oxrdf/src/interning.rs

@ -294,6 +294,18 @@ impl Dataset {
.map(|(o, s, p, g)| (s, p, o, g)) .map(|(o, s, p, g)| (s, p, o, g))
} }
/// Returns an iterator over the quads of this dataset that are stored in `graph_name`.
///
/// The graph name is first translated to its interned form. When that lookup yields
/// nothing, `InternedGraphName::impossible()` is used in its place (presumably a
/// sentinel that matches no stored quad, so the iterator is empty; confirm against
/// `InternedGraphName`).
pub fn quads_for_graph_name<'a, 'b>(
    &'a self,
    graph_name: impl Into<GraphNameRef<'b>>,
) -> impl Iterator<Item = QuadRef<'a>> + 'a {
    let interned_graph_name = match self.encoded_graph_name(graph_name) {
        Some(name) => name,
        None => InternedGraphName::impossible(),
    };
    // Interned quads are decoded back into borrowed `QuadRef`s lazily, one per item.
    self.interned_quads_for_graph_name(&interned_graph_name)
        .map(move |interned_quad| self.decode_spog(interned_quad))
}
fn interned_quads_for_graph_name( fn interned_quads_for_graph_name(
&self, &self,
graph_name: &InternedGraphName, graph_name: &InternedGraphName,
@ -526,9 +538,12 @@ impl Dataset {
/// Warning 3: This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset. /// Warning 3: This implementation worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset.
pub fn canonicalize(&mut self) { pub fn canonicalize(&mut self) {
let bnodes = self.blank_nodes(); let bnodes = self.blank_nodes();
let (hash, partition) = let quads_per_blank_node = self.quads_per_blank_nodes();
self.hash_bnodes(bnodes.into_iter().map(|bnode| (bnode, 0)).collect()); let (hash, partition) = self.hash_bnodes(
let new_quads = self.distinguish(&hash, &partition); bnodes.into_iter().map(|bnode| (bnode, 0)).collect(),
&quads_per_blank_node,
);
let new_quads = self.distinguish(&hash, &partition, &quads_per_blank_node);
self.clear(); self.clear();
for quad in new_quads { for quad in new_quads {
self.insert_encoded(quad); self.insert_encoded(quad);
@ -573,107 +588,168 @@ impl Dataset {
} }
} }
/// Builds an index from each blank node to the quads of `self.spog` that mention it.
///
/// A quad is recorded for a blank node found in its subject, object or graph name
/// position and, when the `rdf-star` feature is enabled, for blank nodes nested inside
/// quoted triples used as subject or object. The quad is pushed once per occurrence,
/// so a quad mentioning the same blank node in several positions appears several
/// times in that node's list.
fn quads_per_blank_nodes(&self) -> QuadsPerBlankNode {
    let mut quads_by_bnode: HashMap<_, Vec<_>> = HashMap::new();
    for quad in &self.spog {
        // Subject position.
        match &quad.0 {
            InternedSubject::BlankNode(bnode) => {
                quads_by_bnode.entry(*bnode).or_default().push(quad.clone());
            }
            #[cfg(feature = "rdf-star")]
            InternedSubject::Triple(triple) => {
                Self::add_quad_with_quoted_triple_to_quad_per_blank_nodes_map(
                    quad,
                    triple,
                    &mut quads_by_bnode,
                );
            }
            InternedSubject::NamedNode(_) => {}
        }
        // Object position.
        match &quad.2 {
            InternedTerm::BlankNode(bnode) => {
                quads_by_bnode.entry(*bnode).or_default().push(quad.clone());
            }
            #[cfg(feature = "rdf-star")]
            InternedTerm::Triple(triple) => {
                Self::add_quad_with_quoted_triple_to_quad_per_blank_nodes_map(
                    quad,
                    triple,
                    &mut quads_by_bnode,
                );
            }
            InternedTerm::NamedNode(_) | InternedTerm::Literal(_) => {}
        }
        // Graph name position (quoted triples cannot appear here).
        match &quad.3 {
            InternedGraphName::BlankNode(bnode) => {
                quads_by_bnode.entry(*bnode).or_default().push(quad.clone());
            }
            InternedGraphName::NamedNode(_) | InternedGraphName::DefaultGraph => {}
        }
    }
    quads_by_bnode
}
/// Registers `quad` in `map` under every blank node found inside the quoted `triple`.
///
/// The subject and the object of `triple` are inspected in that order. A blank node
/// gets `quad` appended to its list; a nested quoted triple is walked recursively.
/// The quad recorded is always the outer `quad`, never the quoted triple itself.
#[cfg(feature = "rdf-star")]
fn add_quad_with_quoted_triple_to_quad_per_blank_nodes_map(
    quad: &(
        InternedSubject,
        InternedNamedNode,
        InternedTerm,
        InternedGraphName,
    ),
    triple: &InternedTriple,
    map: &mut QuadsPerBlankNode,
) {
    match &triple.subject {
        InternedSubject::BlankNode(bnode) => {
            map.entry(*bnode).or_default().push(quad.clone());
        }
        InternedSubject::Triple(nested) => {
            Self::add_quad_with_quoted_triple_to_quad_per_blank_nodes_map(quad, nested, map);
        }
        InternedSubject::NamedNode(_) => {}
    }
    match &triple.object {
        InternedTerm::BlankNode(bnode) => {
            map.entry(*bnode).or_default().push(quad.clone());
        }
        InternedTerm::Triple(nested) => {
            Self::add_quad_with_quoted_triple_to_quad_per_blank_nodes_map(quad, nested, map);
        }
        InternedTerm::NamedNode(_) | InternedTerm::Literal(_) => {}
    }
}
fn hash_bnodes( fn hash_bnodes(
&self, &self,
mut hashes: HashMap<InternedBlankNode, u64>, mut hashes: HashMap<InternedBlankNode, u64>,
quads_per_blank_node: &QuadsPerBlankNode,
) -> ( ) -> (
HashMap<InternedBlankNode, u64>, HashMap<InternedBlankNode, u64>,
Vec<(u64, Vec<InternedBlankNode>)>, Vec<(u64, Vec<InternedBlankNode>)>,
) { ) {
let mut to_hash = Vec::new(); let mut to_hash = Vec::new();
let mut partition: HashMap<u64, Vec<InternedBlankNode>> = HashMap::new(); let mut to_do = hashes.keys().copied().collect::<Vec<_>>();
let mut partition_len = 0; let mut partition = HashMap::<_, Vec<_>>::with_capacity(hashes.len());
loop { let mut partition_count = to_do.len();
//TODO: improve termination while !to_do.is_empty() {
let mut new_hashes = HashMap::new(); partition.clear();
for (bnode, old_hash) in &hashes { let mut new_hashes = hashes.clone();
for (_, p, o, g) in let mut new_todo = Vec::with_capacity(to_do.len());
self.interned_quads_for_subject(&InternedSubject::BlankNode(*bnode)) for bnode in to_do {
{ for (s, p, o, g) in &quads_per_blank_node[&bnode] {
to_hash.push((
self.hash_named_node(*p),
self.hash_term(o, &hashes),
self.hash_graph_name(g, &hashes),
0,
));
}
for (s, p, _, g) in self.interned_quads_for_object(&InternedTerm::BlankNode(*bnode))
{
to_hash.push((
self.hash_subject(s, &hashes),
self.hash_named_node(*p),
self.hash_graph_name(g, &hashes),
1,
));
}
for (s, p, o, _) in
self.interned_quads_for_graph_name(&InternedGraphName::BlankNode(*bnode))
{
to_hash.push(( to_hash.push((
self.hash_subject(s, &hashes), self.hash_subject(s, bnode, &hashes),
self.hash_named_node(*p), self.hash_named_node(*p),
self.hash_term(o, &hashes), self.hash_term(o, bnode, &hashes),
2, self.hash_graph_name(g, bnode, &hashes),
)); ));
} }
to_hash.sort_unstable(); to_hash.sort_unstable();
let hash = Self::hash_tuple((old_hash, &to_hash)); let hash = Self::hash_tuple((&to_hash,));
to_hash.clear(); to_hash.clear();
new_hashes.insert(*bnode, hash); if hash != hashes[&bnode] {
partition.entry(hash).or_default().push(*bnode); new_hashes.insert(bnode, hash);
} new_todo.push(bnode);
if partition.len() == partition_len { }
let mut partition: Vec<_> = partition.into_iter().collect(); partition.entry(hash).or_default().push(bnode);
partition.sort_by(|(h1, b1), (h2, b2)| (b1.len(), h1).cmp(&(b2.len(), h2)));
return (hashes, partition);
} }
hashes = new_hashes; hashes = new_hashes;
partition_len = partition.len(); to_do = new_todo;
partition.clear(); if partition_count == partition.len() {
break; // no improvement
}
partition_count = partition.len();
} }
let mut partition: Vec<_> = partition.into_iter().collect();
partition.sort_unstable_by(|(h1, b1), (h2, b2)| (b1.len(), h1).cmp(&(b2.len(), h2)));
(hashes, partition)
} }
fn hash_named_node(&self, node: InternedNamedNode) -> u64 { fn hash_named_node(&self, node: InternedNamedNode) -> u64 {
Self::hash_tuple(node.decode_from(&self.interner)) Self::hash_tuple(node.decode_from(&self.interner))
} }
/// Returns the hash to use for `node` while hashing the quads of `current_blank_node`.
///
/// The blank node currently being hashed is represented by the fixed marker
/// `u64::MAX` instead of its stored hash; any other blank node yields its entry in
/// `bnodes_hash`.
///
/// # Panics
///
/// Panics if `node` differs from `current_blank_node` and has no entry in `bnodes_hash`.
fn hash_blank_node(
    node: InternedBlankNode,
    current_blank_node: InternedBlankNode,
    bnodes_hash: &HashMap<InternedBlankNode, u64>,
) -> u64 {
    if node != current_blank_node {
        return bnodes_hash[&node];
    }
    u64::MAX
}
fn hash_subject( fn hash_subject(
&self, &self,
node: &InternedSubject, node: &InternedSubject,
current_blank_node: InternedBlankNode,
bnodes_hash: &HashMap<InternedBlankNode, u64>, bnodes_hash: &HashMap<InternedBlankNode, u64>,
) -> u64 { ) -> u64 {
#[cfg(feature = "rdf-star")] match node {
if let InternedSubject::Triple(triple) = node { InternedSubject::NamedNode(node) => Self::hash_tuple(node.decode_from(&self.interner)),
return self.hash_triple(triple, bnodes_hash); InternedSubject::BlankNode(bnode) => {
} Self::hash_blank_node(*bnode, current_blank_node, bnodes_hash)
if let InternedSubject::BlankNode(bnode) = node { }
bnodes_hash[bnode] #[cfg(feature = "rdf-star")]
} else { InternedSubject::Triple(triple) => {
Self::hash_tuple(node.decode_from(&self.interner)) self.hash_triple(triple, current_blank_node, bnodes_hash)
}
} }
} }
fn hash_term(&self, term: &InternedTerm, bnodes_hash: &HashMap<InternedBlankNode, u64>) -> u64 { fn hash_term(
#[cfg(feature = "rdf-star")] &self,
if let InternedTerm::Triple(triple) = term { term: &InternedTerm,
return self.hash_triple(triple, bnodes_hash); current_blank_node: InternedBlankNode,
} bnodes_hash: &HashMap<InternedBlankNode, u64>,
if let InternedTerm::BlankNode(bnode) = term { ) -> u64 {
bnodes_hash[bnode] match term {
} else { InternedTerm::NamedNode(node) => Self::hash_tuple(node.decode_from(&self.interner)),
Self::hash_tuple(term.decode_from(&self.interner)) InternedTerm::BlankNode(bnode) => {
Self::hash_blank_node(*bnode, current_blank_node, bnodes_hash)
}
InternedTerm::Literal(literal) => Self::hash_tuple(literal.decode_from(&self.interner)),
#[cfg(feature = "rdf-star")]
InternedTerm::Triple(triple) => {
self.hash_triple(triple, current_blank_node, bnodes_hash)
}
} }
} }
fn hash_graph_name( fn hash_graph_name(
&self, &self,
graph_name: &InternedGraphName, graph_name: &InternedGraphName,
current_blank_node: InternedBlankNode,
bnodes_hash: &HashMap<InternedBlankNode, u64>, bnodes_hash: &HashMap<InternedBlankNode, u64>,
) -> u64 { ) -> u64 {
if let InternedGraphName::BlankNode(bnode) = graph_name { match graph_name {
bnodes_hash[bnode] InternedGraphName::NamedNode(node) => {
} else { Self::hash_tuple(node.decode_from(&self.interner))
Self::hash_tuple(graph_name.decode_from(&self.interner)) }
InternedGraphName::BlankNode(bnode) => {
Self::hash_blank_node(*bnode, current_blank_node, bnodes_hash)
}
InternedGraphName::DefaultGraph => 0,
} }
} }
@ -681,12 +757,13 @@ impl Dataset {
fn hash_triple( fn hash_triple(
&self, &self,
triple: &InternedTriple, triple: &InternedTriple,
current_blank_node: InternedBlankNode,
bnodes_hash: &HashMap<InternedBlankNode, u64>, bnodes_hash: &HashMap<InternedBlankNode, u64>,
) -> u64 { ) -> u64 {
Self::hash_tuple(( Self::hash_tuple((
self.hash_subject(&triple.subject, bnodes_hash), self.hash_subject(&triple.subject, current_blank_node, bnodes_hash),
self.hash_named_node(triple.predicate), self.hash_named_node(triple.predicate),
self.hash_term(&triple.object, bnodes_hash), self.hash_term(&triple.object, current_blank_node, bnodes_hash),
)) ))
} }
@ -700,6 +777,7 @@ impl Dataset {
&mut self, &mut self,
hash: &HashMap<InternedBlankNode, u64>, hash: &HashMap<InternedBlankNode, u64>,
partition: &[(u64, Vec<InternedBlankNode>)], partition: &[(u64, Vec<InternedBlankNode>)],
quads_per_blank_node: &QuadsPerBlankNode,
) -> Vec<( ) -> Vec<(
InternedSubject, InternedSubject,
InternedNamedNode, InternedNamedNode,
@ -713,8 +791,9 @@ impl Dataset {
.map(|b| { .map(|b| {
let mut hash_prime = hash.clone(); let mut hash_prime = hash.clone();
hash_prime.insert(*b, Self::hash_tuple((hash_prime[b], 22))); hash_prime.insert(*b, Self::hash_tuple((hash_prime[b], 22)));
let (hash_prime_prime, partition_prime) = self.hash_bnodes(hash_prime); let (hash_prime_prime, partition_prime) =
self.distinguish(&hash_prime_prime, &partition_prime) self.hash_bnodes(hash_prime, quads_per_blank_node);
self.distinguish(&hash_prime_prime, &partition_prime, quads_per_blank_node)
}) })
.reduce(min) .reduce(min)
.unwrap_or_default() .unwrap_or_default()
@ -738,54 +817,43 @@ impl Dataset {
.into_iter() .into_iter()
.map(|(s, p, o, g)| { .map(|(s, p, o, g)| {
( (
if let InternedSubject::BlankNode(bnode) = s { match s {
InternedSubject::BlankNode(self.map_bnode(bnode, hashes)) InternedSubject::NamedNode(_) => s,
} else { InternedSubject::BlankNode(bnode) => {
#[cfg(feature = "rdf-star")] InternedSubject::BlankNode(self.map_bnode(bnode, hashes))
{
if let InternedSubject::Triple(triple) = s {
InternedSubject::Triple(Box::new(InternedTriple::encoded_into(
self.label_triple(&triple, hashes).as_ref(),
&mut self.interner,
)))
} else {
s
}
} }
#[cfg(not(feature = "rdf-star"))] #[cfg(feature = "rdf-star")]
{ InternedSubject::Triple(triple) => {
s InternedSubject::Triple(Box::new(InternedTriple::encoded_into(
self.label_triple(&triple, hashes).as_ref(),
&mut self.interner,
)))
} }
}, },
p, p,
if let InternedTerm::BlankNode(bnode) = o { match o {
InternedTerm::BlankNode(self.map_bnode(bnode, hashes)) InternedTerm::NamedNode(_) | InternedTerm::Literal(_) => o,
} else { InternedTerm::BlankNode(bnode) => {
#[cfg(feature = "rdf-star")] InternedTerm::BlankNode(self.map_bnode(bnode, hashes))
{
if let InternedTerm::Triple(triple) = o {
InternedTerm::Triple(Box::new(InternedTriple::encoded_into(
self.label_triple(&triple, hashes).as_ref(),
&mut self.interner,
)))
} else {
o
}
} }
#[cfg(not(feature = "rdf-star"))] #[cfg(feature = "rdf-star")]
{ InternedTerm::Triple(triple) => {
o InternedTerm::Triple(Box::new(InternedTriple::encoded_into(
self.label_triple(&triple, hashes).as_ref(),
&mut self.interner,
)))
} }
}, },
if let InternedGraphName::BlankNode(bnode) = g { match g {
InternedGraphName::BlankNode(self.map_bnode(bnode, hashes)) InternedGraphName::NamedNode(_) | InternedGraphName::DefaultGraph => g,
} else { InternedGraphName::BlankNode(bnode) => {
g InternedGraphName::BlankNode(self.map_bnode(bnode, hashes))
}
}, },
) )
}) })
.collect(); .collect();
quads.sort(); quads.sort_unstable();
quads quads
} }
@ -1483,3 +1551,46 @@ impl<'a> Iterator for GraphViewIter<'a> {
.map(|(_, s, p, o)| self.dataset.decode_spo((s, p, o))) .map(|(_, s, p, o)| self.dataset.decode_spo((s, p, o)))
} }
} }
/// Index from an interned blank node to the interned quads
/// `(subject, predicate, object, graph name)` associated with it.
type QuadsPerBlankNode = HashMap<
InternedBlankNode,
Vec<(
InternedSubject,
InternedNamedNode,
InternedTerm,
InternedGraphName,
)>,
>;
/// Two datasets with the same shape, each built from its own `BlankNode::default()`
/// values, must compare equal once both have been canonicalized.
#[test]
fn test_canon() {
    /// Builds a dataset of two `_:x <http://ex> _:y` quads in the default graph, every
    /// blank node coming from a separate `BlankNode::default()` call (presumably a
    /// fresh identifier each time), and canonicalizes it.
    fn canonical_dataset() -> Dataset {
        let mut dataset = Dataset::new();
        for _ in 0..2 {
            dataset.insert(QuadRef::new(
                BlankNode::default().as_ref(),
                NamedNodeRef::new_unchecked("http://ex"),
                BlankNode::default().as_ref(),
                GraphNameRef::DefaultGraph,
            ));
        }
        dataset.canonicalize();
        dataset
    }

    assert_eq!(canonical_dataset(), canonical_dataset());
}

@ -8,6 +8,7 @@ use std::hash::{BuildHasher, Hasher};
pub struct Interner { pub struct Interner {
hasher: RandomState, hasher: RandomState,
string_for_hash: HashMap<u64, String, IdentityHasherBuilder>, string_for_hash: HashMap<u64, String, IdentityHasherBuilder>,
string_for_blank_node_id: HashMap<u128, String>,
#[cfg(feature = "rdf-star")] #[cfg(feature = "rdf-star")]
triples: HashMap<InternedTriple, Triple>, triples: HashMap<InternedTriple, Triple>,
} }
@ -119,29 +120,53 @@ impl InternedNamedNode {
} }
#[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Copy, Hash)] #[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Copy, Hash)]
pub struct InternedBlankNode { pub enum InternedBlankNode {
id: Key, Number { id: u128 },
Other { id: Key },
} }
impl InternedBlankNode { impl InternedBlankNode {
pub fn encoded_into(blank_node: BlankNodeRef<'_>, interner: &mut Interner) -> Self { pub fn encoded_into(blank_node: BlankNodeRef<'_>, interner: &mut Interner) -> Self {
Self { if let Some(id) = blank_node.unique_id() {
id: interner.get_or_intern(blank_node.as_str()), interner
.string_for_blank_node_id
.entry(id)
.or_insert_with(|| blank_node.as_str().into());
Self::Number { id }
} else {
Self::Other {
id: interner.get_or_intern(blank_node.as_str()),
}
} }
} }
pub fn encoded_from(blank_node: BlankNodeRef<'_>, interner: &Interner) -> Option<Self> { pub fn encoded_from(blank_node: BlankNodeRef<'_>, interner: &Interner) -> Option<Self> {
Some(Self { if let Some(id) = blank_node.unique_id() {
id: interner.get(blank_node.as_str())?, interner
}) .string_for_blank_node_id
.contains_key(&id)
.then_some(Self::Number { id })
} else {
Some(Self::Other {
id: interner.get(blank_node.as_str())?,
})
}
} }
pub fn decode_from(self, interner: &Interner) -> BlankNodeRef { pub fn decode_from(self, interner: &Interner) -> BlankNodeRef {
BlankNodeRef::new_unchecked(interner.resolve(self.id)) BlankNodeRef::new_unchecked(match self {
Self::Number { id } => &interner.string_for_blank_node_id[&id],
Self::Other { id } => interner.resolve(id),
})
} }
pub fn next(self) -> Self { pub fn next(self) -> Self {
Self { id: self.id.next() } match self {
Self::Number { id } => Self::Number {
id: id.saturating_add(1),
},
Self::Other { id } => Self::Other { id: id.next() },
}
} }
} }

Loading…
Cancel
Save