diff --git a/Cargo.lock b/Cargo.lock index 9186c790..af81a92c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,17 +8,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "ahash" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" -dependencies = [ - "getrandom", - "once_cell", - "version_check", -] - [[package]] name = "aho-corasick" version = "0.7.20" @@ -597,15 +586,6 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" -[[package]] -name = "hashbrown" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" -dependencies = [ - "ahash", -] - [[package]] name = "hashbrown" version = "0.12.3" @@ -689,7 +669,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1885e79c1fc4b10f0e172c475f458b7f7b93061064d98c3293e98c5ba0c8b399" dependencies = [ "autocfg", - "hashbrown 0.12.3", + "hashbrown", ] [[package]] @@ -778,15 +758,6 @@ dependencies = [ "winapi-build", ] -[[package]] -name = "lasso" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aeb7b21a526375c5ca55f1a6dfd4e1fad9fa4edd750f530252a718a44b2608f0" -dependencies = [ - "hashbrown 0.11.2", -] - [[package]] name = "lazy_static" version = "1.4.0" @@ -1053,7 +1024,6 @@ checksum = "bb175ec8981211357b7b379869c2f8d555881c55ea62311428ec0de46d89bd5c" name = "oxrdf" version = "0.1.5-dev" dependencies = [ - "lasso", "oxilangtag", "oxiri", "oxsdatatypes", diff --git a/lib/oxrdf/Cargo.toml b/lib/oxrdf/Cargo.toml index 6fa8cc0d..7c9042da 100644 --- a/lib/oxrdf/Cargo.toml +++ b/lib/oxrdf/Cargo.toml @@ 
-21,9 +21,7 @@ rdf-star = [] rand = "0.8" oxilangtag = "0.1" oxiri = "0.2" - oxsdatatypes = { version = "0.1.1", path="../oxsdatatypes", optional = true } -lasso = { version = "0.6", features = ["inline-more"] } [package.metadata.docs.rs] all-features = true diff --git a/lib/oxrdf/src/interning.rs b/lib/oxrdf/src/interning.rs index bb47e26c..3198f422 100644 --- a/lib/oxrdf/src/interning.rs +++ b/lib/oxrdf/src/interning.rs @@ -1,108 +1,175 @@ -//! Interning of RDF elements using Rodeo +//! Interning of RDF elements use crate::*; -use lasso::{Key, Rodeo, Spur}; -#[cfg(feature = "rdf-star")] -use std::collections::HashMap; +use std::collections::hash_map::{Entry, HashMap, RandomState}; +use std::hash::{BuildHasher, Hasher}; #[derive(Debug, Default)] pub struct Interner { - strings: Rodeo, + hasher: RandomState, + string_for_hash: HashMap, #[cfg(feature = "rdf-star")] triples: HashMap, } +impl Interner { + fn get_or_intern(&mut self, value: &str) -> Key { /* Returns the key of `value`, inserting it if absent. Slots are probed linearly starting at hash(value); u64::MAX is never used as a slot (hash() remaps it to 0 and probing wraps from u64::MAX - 1 to 0) because Key::impossible() is Key(u64::MAX). */ + let mut hash = self.hash(value); + loop { + match self.string_for_hash.entry(hash) { + Entry::Vacant(e) => { + e.insert(value.into()); + return Key(hash); + } + Entry::Occupied(e) => { /* Slot taken: an equal string means it is already interned; otherwise advance `hash` and let the outer loop look up the next slot (an inner loop here would keep comparing against this same entry forever). */ + if e.get() == value { + return Key(hash); + } else if hash == u64::MAX - 1 { + hash = 0; + } else { + hash += 1; + } + } + } + } + } + + fn get(&self, value: &str) -> Option { + let mut hash = self.hash(value); + loop { + let v = self.string_for_hash.get(&hash)?; + if v == value { + return Some(Key(hash)); + } else if hash == u64::MAX - 1 { + hash = 0; + } else { + hash += 1; + } + } + } + + fn hash(&self, value: &str) -> u64 { + let mut hasher = self.hasher.build_hasher(); + hasher.write(value.as_bytes()); + let hash = hasher.finish(); + if hash == u64::MAX { + 0 + } else { + hash + } + } + + fn resolve(&self, key: &Key) -> &str { + self.string_for_hash + .get(&key.0) + .expect("Interned key not found") + } +} + +#[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Copy, Hash)] +pub struct Key(u64); + +impl Key { + fn first() -> Self { + Self(0) + } + + 
fn next(self) -> Self { + Self(self.0.saturating_add(1)) + } + + fn impossible() -> Self { + Key(u64::MAX) + } +} + #[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Copy, Hash)] pub struct InternedNamedNode { - id: Spur, + id: Key, } impl InternedNamedNode { pub fn encoded_into(named_node: NamedNodeRef<'_>, interner: &mut Interner) -> Self { Self { - id: interner.strings.get_or_intern(named_node.as_str()), + id: interner.get_or_intern(named_node.as_str()), } } pub fn encoded_from(named_node: NamedNodeRef<'_>, interner: &Interner) -> Option { Some(Self { - id: interner.strings.get(named_node.as_str())?, + id: interner.get(named_node.as_str())?, }) } pub fn decode_from<'a>(&self, interner: &'a Interner) -> NamedNodeRef<'a> { - NamedNodeRef::new_unchecked(interner.strings.resolve(&self.id)) + NamedNodeRef::new_unchecked(interner.resolve(&self.id)) } pub fn first() -> Self { - Self { id: fist_spur() } + Self { id: Key::first() } } pub fn next(self) -> Self { - Self { - id: next_spur(self.id), - } + Self { id: self.id.next() } } pub fn impossible() -> Self { Self { - id: impossible_spur(), + id: Key::impossible(), } } } #[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Copy, Hash)] pub struct InternedBlankNode { - id: Spur, + id: Key, } impl InternedBlankNode { pub fn encoded_into(blank_node: BlankNodeRef<'_>, interner: &mut Interner) -> Self { Self { - id: interner.strings.get_or_intern(blank_node.as_str()), + id: interner.get_or_intern(blank_node.as_str()), } } pub fn encoded_from(blank_node: BlankNodeRef<'_>, interner: &Interner) -> Option { Some(Self { - id: interner.strings.get(blank_node.as_str())?, + id: interner.get(blank_node.as_str())?, }) } pub fn decode_from<'a>(&self, interner: &'a Interner) -> BlankNodeRef<'a> { - BlankNodeRef::new_unchecked(interner.strings.resolve(&self.id)) + BlankNodeRef::new_unchecked(interner.resolve(&self.id)) } pub fn next(self) -> Self { - Self { - id: next_spur(self.id), - } + Self { id: self.id.next() } } } 
#[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Copy, Hash)] pub enum InternedLiteral { String { - value_id: Spur, + value_id: Key, }, LanguageTaggedString { - value_id: Spur, - language_id: Spur, + value_id: Key, + language_id: Key, }, TypedLiteral { - value_id: Spur, + value_id: Key, datatype: InternedNamedNode, }, } impl InternedLiteral { pub fn encoded_into(literal: LiteralRef<'_>, interner: &mut Interner) -> Self { - let value_id = interner.strings.get_or_intern(literal.value()); + let value_id = interner.get_or_intern(literal.value()); if literal.is_plain() { if let Some(language) = literal.language() { Self::LanguageTaggedString { value_id, - language_id: interner.strings.get_or_intern(language), + language_id: interner.get_or_intern(language), } } else { Self::String { value_id } @@ -116,12 +183,12 @@ impl InternedLiteral { } pub fn encoded_from(literal: LiteralRef<'_>, interner: &Interner) -> Option { - let value_id = interner.strings.get(literal.value())?; + let value_id = interner.get(literal.value())?; Some(if literal.is_plain() { if let Some(language) = literal.language() { Self::LanguageTaggedString { value_id, - language_id: interner.strings.get(language)?, + language_id: interner.get(language)?, } } else { Self::String { value_id } @@ -137,17 +204,17 @@ impl InternedLiteral { pub fn decode_from<'a>(&self, interner: &'a Interner) -> LiteralRef<'a> { match self { InternedLiteral::String { value_id } => { - LiteralRef::new_simple_literal(interner.strings.resolve(value_id)) + LiteralRef::new_simple_literal(interner.resolve(value_id)) } InternedLiteral::LanguageTaggedString { value_id, language_id, } => LiteralRef::new_language_tagged_literal_unchecked( - interner.strings.resolve(value_id), - interner.strings.resolve(language_id), + interner.resolve(value_id), + interner.resolve(language_id), ), InternedLiteral::TypedLiteral { value_id, datatype } => LiteralRef::new_typed_literal( - interner.strings.resolve(value_id), + interner.resolve(value_id), 
datatype.decode_from(interner), ), } @@ -156,14 +223,14 @@ impl InternedLiteral { pub fn next(&self) -> Self { match self { Self::String { value_id } => Self::String { - value_id: next_spur(*value_id), + value_id: value_id.next(), }, Self::LanguageTaggedString { value_id, language_id, } => Self::LanguageTaggedString { value_id: *value_id, - language_id: next_spur(*language_id), + language_id: language_id.next(), }, Self::TypedLiteral { value_id, datatype } => Self::TypedLiteral { value_id: *value_id, @@ -414,14 +481,32 @@ impl InternedTriple { } } -fn fist_spur() -> Spur { - Spur::try_from_usize(0).unwrap() +#[derive(Default)] +struct IdentityHasherBuilder {} + +impl BuildHasher for IdentityHasherBuilder { + type Hasher = IdentityHasher; + + fn build_hasher(&self) -> IdentityHasher { + IdentityHasher::default() + } } -fn next_spur(value: Spur) -> Spur { - Spur::try_from_usize(value.into_usize() + 1).unwrap() +#[derive(Default)] +struct IdentityHasher { + value: u64, } -fn impossible_spur() -> Spur { - Spur::try_from_usize((u32::MAX - 10).try_into().unwrap()).unwrap() +impl Hasher for IdentityHasher { + fn finish(&self) -> u64 { + self.value + } + + fn write(&mut self, _bytes: &[u8]) { + unimplemented!() + } + + fn write_u64(&mut self, i: u64) { + self.value = i + } }