diff --git a/lib/Cargo.toml b/lib/Cargo.toml index 795b7a49..48f340f7 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -35,7 +35,7 @@ hex = "0.4" nom = "5" peg = "0.6" siphasher = "0.3" -lasso = "0.3" +lasso = {version="0.3", features=["multi-threaded"]} [target.'cfg(target_arch = "wasm32")'.dependencies] js-sys = "0.3" diff --git a/lib/src/store/memory.rs b/lib/src/store/memory.rs index 066fa1d1..20eaf402 100644 --- a/lib/src/store/memory.rs +++ b/lib/src/store/memory.rs @@ -5,20 +5,19 @@ use crate::io::{DatasetFormat, DatasetParser, GraphFormat, GraphParser}; use crate::model::*; use crate::sparql::{EvaluationError, Query, QueryOptions, QueryResult, SimplePreparedQuery}; use crate::store::numeric_encoder::{ - write_term, Decoder, ReadEncoder, StrContainer, StrHash, StrLookup, WithStoreError, - WriteEncoder, WRITTEN_TERM_MAX_SIZE, + Decoder, ReadEncoder, StrContainer, StrId, StrLookup, WithStoreError, WriteEncoder, }; use crate::store::{ dump_dataset, dump_graph, get_encoded_quad_pattern, load_dataset, load_graph, ReadableEncodedStore, WritableEncodedStore, }; +use lasso::{LargeSpur, ThreadedRodeo}; use std::collections::hash_map::DefaultHasher; use std::collections::{HashMap, HashSet}; use std::convert::{Infallible, TryInto}; -use std::hash::{BuildHasherDefault, Hash, Hasher}; +use std::hash::{Hash, Hasher}; use std::io::{BufRead, Write}; use std::iter::FromIterator; -use std::mem::size_of; use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::vec::IntoIter; use std::{fmt, io}; @@ -53,14 +52,13 @@ use std::{fmt, io}; #[derive(Clone)] pub struct MemoryStore { indexes: Arc>, + strings: Arc>, } -type TrivialHashMap = HashMap>; -type TrivialHashSet = HashSet>; -type TripleMap = TrivialHashMap>>; -type QuadMap = TrivialHashMap>; -type EncodedTerm = crate::store::numeric_encoder::EncodedTerm; -type EncodedQuad = crate::store::numeric_encoder::EncodedQuad; +type TripleMap = HashMap>>; +type QuadMap = HashMap>; +type EncodedTerm = crate::store::numeric_encoder::EncodedTerm; +type EncodedQuad = crate::store::numeric_encoder::EncodedQuad; #[derive(Default)] struct MemoryStoreIndexes { @@ -70,7 +68,6 @@ struct MemoryStoreIndexes { gspo: QuadMap, gpos: QuadMap, gosp: QuadMap, - id2str: HashMap, } impl Default for MemoryStore { @@ -84,6 +81,7 @@ impl MemoryStore { pub fn new() -> Self { Self { indexes: Arc::new(RwLock::default()), + strings: Arc::new(ThreadedRodeo::new()), } } @@ -247,15 +245,16 @@ impl MemoryStore { let mut transaction = MemoryTransaction { ops: Vec::new() }; f(&mut transaction)?; + let mut this = self; let mut indexes = self.indexes_mut(); for op in transaction.ops { match op { TransactionOp::Insert(quad) => { - let quad = indexes.encode_quad(&quad).unwrap_infallible(); + let quad = this.encode_quad(&quad).unwrap_infallible(); indexes.insert_encoded(&quad).unwrap_infallible() } TransactionOp::Delete(quad) => { - let quad = indexes.encode_quad(&quad).unwrap_infallible(); + let quad = this.encode_quad(&quad).unwrap_infallible(); indexes.remove_encoded(&quad).unwrap_infallible() } } @@ -335,16 +334,16 @@ impl MemoryStore { /// Adds a quad to this store. #[allow(clippy::needless_pass_by_value)] pub fn insert(&self, quad: Quad) { - let mut indexes = self.indexes_mut(); - let quad = indexes.encode_quad(&quad).unwrap_infallible(); - indexes.insert_encoded(&quad).unwrap_infallible(); + let mut this = self; + let quad = this.encode_quad(&quad).unwrap_infallible(); + this.insert_encoded(&quad).unwrap_infallible(); } /// Removes a quad from this store. pub fn remove(&self, quad: &Quad) { - let mut indexes = self.indexes_mut(); - if let Some(quad) = indexes.get_encoded_quad(quad).unwrap_infallible() { - indexes.remove_encoded(&quad).unwrap_infallible(); + if let Some(quad) = self.get_encoded_quad(quad).unwrap_infallible() { + let mut this = self; + this.remove_encoded(&quad).unwrap_infallible(); } } @@ -708,22 +707,23 @@ impl MemoryStore { impl WithStoreError for MemoryStore { type Error = Infallible; - type StrId = StrHash; + type StrId = LargeSpur; } impl StrLookup for MemoryStore { - fn get_str(&self, id: StrHash) -> Result, Infallible> { - self.indexes().get_str(id) + fn get_str(&self, id: LargeSpur) -> Result, Infallible> { + //TODO: avoid copy by adding a lifetime limit to get_str + Ok(self.strings.try_resolve(&id).map(|e| e.to_owned())) } - fn get_str_id(&self, value: &str) -> Result, Infallible> { - self.indexes().get_str_id(value) + fn get_str_id(&self, value: &str) -> Result, Infallible> { + Ok(self.strings.get(value)) } } impl<'a> StrContainer for &'a MemoryStore { - fn insert_str(&mut self, value: &str) -> Result { - self.indexes_mut().insert_str(value) + fn insert_str(&mut self, value: &str) -> Result { + Ok(self.strings.get_or_intern(value)) } } @@ -757,33 +757,8 @@ impl<'a> WritableEncodedStore for &'a MemoryStore { impl WithStoreError for MemoryStoreIndexes { type Error = Infallible; - type StrId = StrHash; -} - -impl StrLookup for MemoryStoreIndexes { - fn get_str(&self, id: StrHash) -> Result, Infallible> { - //TODO: avoid copy by adding a lifetime limit to get_str - Ok(self.id2str.get(&id).cloned()) - } - - fn get_str_id(&self, value: &str) -> Result, Infallible> { - let id = StrHash::new(value); - Ok(if self.id2str.contains_key(&id) { - Some(id) - } else { - None - }) - } -} - -impl StrContainer for MemoryStoreIndexes { - fn insert_str(&mut self, value: &str) -> Result { - let key = StrHash::new(value); - self.id2str.entry(key).or_insert_with(|| value.to_owned()); - Ok(key) - } + type StrId = LargeSpur; } - impl WritableEncodedStore for MemoryStoreIndexes { fn insert_encoded(&mut self, quad: &EncodedQuad) -> Result<(), Infallible> { insert_into_quad_map( @@ -913,14 +888,12 @@ fn remove_from_quad_map(map1: &mut QuadMap, e1: &T, e2: &T, e3: } } -fn option_set_flatten<'a, T: Clone>( - i: Option<&'a TrivialHashSet>, -) -> impl Iterator + 'a { +fn option_set_flatten<'a, T: Clone>(i: Option<&'a HashSet>) -> impl Iterator + 'a { i.into_iter().flat_map(|s| s.iter().cloned()) } fn option_pair_map_flatten<'a, T: Copy>( - i: Option<&'a TrivialHashMap>>, + i: Option<&'a HashMap>>, ) -> impl Iterator + 'a { i.into_iter().flat_map(|kv| { kv.iter().flat_map(|(k, vs)| { @@ -1127,9 +1100,11 @@ impl Iterator for EncodedQuadsIter { } } +impl StrId for LargeSpur {} + // Isomorphism implementation -fn iso_canonicalize(g: &MemoryStore) -> Vec> { +fn iso_canonicalize(g: &MemoryStore) -> Vec { let bnodes = bnodes(g); let (hash, partition) = hash_bnodes(g, bnodes.into_iter().map(|bnode| (bnode, 0)).collect()); distinguish(g, &hash, &partition) @@ -1137,9 +1112,9 @@ fn iso_canonicalize(g: &MemoryStore) -> Vec> { fn distinguish( g: &MemoryStore, - hash: &TrivialHashMap, + hash: &HashMap, partition: &[(u64, Vec)], -) -> Vec> { +) -> Vec { let b_prime = partition .iter() .find_map(|(_, b)| if b.len() > 1 { Some(b) } else { None }); @@ -1171,41 +1146,36 @@ fn distinguish( fn hash_bnodes( g: &MemoryStore, - mut hashes: TrivialHashMap, -) -> ( - TrivialHashMap, - Vec<(u64, Vec)>, -) { + mut hashes: HashMap, +) -> (HashMap, Vec<(u64, Vec)>) { let mut to_hash = Vec::new(); - let mut partition: TrivialHashMap> = - TrivialHashMap::with_hasher(BuildHasherDefault::::default()); + let mut partition: HashMap> = HashMap::new(); let mut partition_len = 0; loop { //TODO: improve termination - let mut new_hashes = - TrivialHashMap::with_hasher(BuildHasherDefault::::default()); + let mut new_hashes = HashMap::new(); for (bnode, old_hash) in &hashes { for q in g.encoded_quads_for_subject(*bnode) { to_hash.push(( - hash_term(q.predicate, &hashes), - hash_term(q.object, &hashes), - hash_term(q.graph_name, &hashes), + hash_term(q.predicate, &hashes, g), + hash_term(q.object, &hashes, g), + hash_term(q.graph_name, &hashes, g), 0, )); } for q in g.encoded_quads_for_object(*bnode) { to_hash.push(( - hash_term(q.subject, &hashes), - hash_term(q.predicate, &hashes), - hash_term(q.graph_name, &hashes), + hash_term(q.subject, &hashes, g), + hash_term(q.predicate, &hashes, g), + hash_term(q.graph_name, &hashes, g), 1, )); } for q in g.encoded_quads_for_graph(*bnode) { to_hash.push(( - hash_term(q.subject, &hashes), - hash_term(q.predicate, &hashes), - hash_term(q.object, &hashes), + hash_term(q.subject, &hashes, g), + hash_term(q.predicate, &hashes, g), + hash_term(q.object, &hashes, g), 2, )); } @@ -1226,8 +1196,8 @@ fn hash_bnodes( } } -fn bnodes(g: &MemoryStore) -> TrivialHashSet { - let mut bnodes = TrivialHashSet::with_hasher(BuildHasherDefault::::default()); +fn bnodes(g: &MemoryStore) -> HashSet { + let mut bnodes = HashSet::new(); for q in g.encoded_quads() { if q.subject.is_blank_node() { bnodes.insert(q.subject); @@ -1242,25 +1212,27 @@ fn bnodes(g: &MemoryStore) -> TrivialHashSet { bnodes } -fn label(g: &MemoryStore, hashes: &TrivialHashMap) -> Vec> { +fn label(g: &MemoryStore, hashes: &HashMap) -> Vec { //TODO: better representation? let mut data: Vec<_> = g .encoded_quads() .into_iter() .map(|q| { - let mut buffer = Vec::with_capacity(WRITTEN_TERM_MAX_SIZE * 4); - write_term(&mut buffer, map_term(q.subject, hashes)); - write_term(&mut buffer, map_term(q.predicate, hashes)); - write_term(&mut buffer, map_term(q.object, hashes)); - write_term(&mut buffer, map_term(q.graph_name, hashes)); - buffer + g.decode_quad(&EncodedQuad { + subject: map_term(q.subject, hashes), + predicate: map_term(q.predicate, hashes), + object: map_term(q.object, hashes), + graph_name: map_term(q.graph_name, hashes), + }) + .unwrap() + .to_string() }) .collect(); data.sort(); data } -fn map_term(term: EncodedTerm, bnodes_hash: &TrivialHashMap) -> EncodedTerm { +fn map_term(term: EncodedTerm, bnodes_hash: &HashMap) -> EncodedTerm { if term.is_blank_node() { EncodedTerm::InlineBlankNode { id: (*bnodes_hash.get(&term).unwrap()).into(), @@ -1270,11 +1242,13 @@ fn map_term(term: EncodedTerm, bnodes_hash: &TrivialHashMap) - } } -fn hash_term(term: EncodedTerm, bnodes_hash: &TrivialHashMap) -> u64 { +fn hash_term(term: EncodedTerm, bnodes_hash: &HashMap, g: &MemoryStore) -> u64 { if term.is_blank_node() { *bnodes_hash.get(&term).unwrap() - } else { + } else if let Ok(term) = g.decode_term(term) { hash_tuple(term) + } else { + 0 } } @@ -1283,77 +1257,3 @@ fn hash_tuple(v: impl Hash) -> u64 { v.hash(&mut hasher); hasher.finish() } - -#[derive(Default)] -struct TrivialHasher { - value: u64, -} - -#[allow( - arithmetic_overflow, - clippy::cast_sign_loss, - clippy::cast_possible_truncation -)] -impl Hasher for TrivialHasher { - fn finish(&self) -> u64 { - self.value - } - - fn write(&mut self, bytes: &[u8]) { - for chunk in bytes.chunks(size_of::()) { - let mut val = [0; size_of::()]; - val[0..chunk.len()].copy_from_slice(chunk); - self.write_u64(u64::from_le_bytes(val)); - } - } - - fn write_u8(&mut self, i: u8) { - self.write_u64(i.into()); - } - - fn write_u16(&mut self, i: u16) { - self.write_u64(i.into()); - } - - fn write_u32(&mut self, i: u32) { - self.write_u64(i.into()); - } - - fn write_u64(&mut self, i: u64) { - self.value ^= i; - } - - fn write_u128(&mut self, i: u128) { - self.write_u64(i as u64); - self.write_u64((i >> 64) as u64); - } - - fn write_usize(&mut self, i: usize) { - self.write_u64(i as u64); - self.write_u64((i >> 64) as u64); - } - - fn write_i8(&mut self, i: i8) { - self.write_u8(i as u8); - } - - fn write_i16(&mut self, i: i16) { - self.write_u16(i as u16); - } - - fn write_i32(&mut self, i: i32) { - self.write_u32(i as u32); - } - - fn write_i64(&mut self, i: i64) { - self.write_u64(i as u64); - } - - fn write_i128(&mut self, i: i128) { - self.write_u128(i as u128); - } - - fn write_isize(&mut self, i: isize) { - self.write_usize(i as usize); - } -} diff --git a/lib/src/store/mod.rs b/lib/src/store/mod.rs index 7daf5b87..6f342249 100644 --- a/lib/src/store/mod.rs +++ b/lib/src/store/mod.rs @@ -45,13 +45,13 @@ pub(crate) trait ReadableEncodedStore: StrLookup { ) -> Self::QuadsIter; } -pub(crate) trait WritableEncodedStore: StrContainer { +pub(crate) trait WritableEncodedStore: WithStoreError { fn insert_encoded(&mut self, quad: &EncodedQuad) -> Result<(), Self::Error>; fn remove_encoded(&mut self, quad: &EncodedQuad) -> Result<(), Self::Error>; } -fn load_graph( +fn load_graph( store: &mut S, reader: impl BufRead, format: GraphFormat, @@ -72,7 +72,7 @@ fn load_graph( } } -fn load_from_triple_parser( +fn load_from_triple_parser( store: &mut S, parser: Result, to_graph_name: &GraphName, @@ -137,7 +137,7 @@ fn map_xml_err(e: RdfXmlError) -> io::Error { io::Error::new(io::ErrorKind::Other, e) // TODO: drop } -fn load_dataset( +fn load_dataset( store: &mut S, reader: impl BufRead, format: DatasetFormat, @@ -150,7 +150,7 @@ fn load_dataset( } } -fn load_from_quad_parser( +fn load_from_quad_parser( store: &mut S, parser: Result, ) -> Result<(), StoreOrParseError>