diff --git a/lib/src/sparql/eval.rs b/lib/src/sparql/eval.rs index cb9b998e..2b506fd0 100644 --- a/lib/src/sparql/eval.rs +++ b/lib/src/sparql/eval.rs @@ -31,7 +31,6 @@ use std::iter::{empty, once}; use std::ops::Deref; use std::str; use std::sync::Mutex; -use std::u64; use uuid::Uuid; const REGEX_SIZE_LIMIT: usize = 1_000_000; @@ -40,7 +39,7 @@ type EncodedTuplesIterator<'a> = Box> + pub struct SimpleEvaluator { dataset: DatasetView, - bnodes_map: Mutex>, + bnodes_map: Mutex>, base_iri: Option>, now: DateTime, } @@ -855,17 +854,12 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { _ => None, }?; let iri = self.dataset.get_str(iri_id).ok()??; - Some(if let Some(base_iri) = &self.base_iri { - EncodedTerm::NamedNode { - iri_id: self - .dataset - .insert_str(&base_iri.resolve(&iri).ok()?.into_inner()) - .ok()?, - } + if let Some(base_iri) = &self.base_iri { + self.build_named_node(&base_iri.resolve(&iri).ok()?.into_inner()) } else { Iri::parse(iri).ok()?; - EncodedTerm::NamedNode { iri_id } - }) + Some(EncodedTerm::NamedNode { iri_id }) + } } PlanExpression::BNode(id) => match id { Some(id) => { @@ -1044,12 +1038,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { } } } - Some(EncodedTerm::StringLiteral { - value_id: self - .dataset - .insert_str(str::from_utf8(&result).ok()?) - .ok()?, - }) + self.build_string_literal(str::from_utf8(&result).ok()?) } PlanExpression::StrEnds(arg1, arg2) => { let (arg1, arg2, _) = self.to_argument_compatible_strings( @@ -1166,11 +1155,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { write!(&mut result, "{}S", seconds).ok()?; } Some(EncodedTerm::TypedLiteral { - value_id: self.dataset.insert_str(&result).ok()?, + value_id: self.build_string_id(&result)?, datatype_id: self - .dataset - .insert_str("http://www.w3.org/2001/XMLSchema#dayTimeDuration") - .ok()?, + .build_string_id("http://www.w3.org/2001/XMLSchema#dayTimeDuration")?, }) } PlanExpression::Tz(e) => { @@ -1185,9 +1172,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { Some(if let Some(timezone) = timezone { EncodedTerm::StringLiteral { value_id: if timezone.local_minus_utc() == 0 { - self.dataset.insert_str("Z").ok()? + self.build_string_id("Z")? } else { - self.dataset.insert_str(&timezone.to_string()).ok()? + self.build_string_id(&timezone.to_string())? }, } } else { @@ -1195,26 +1182,16 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { }) } PlanExpression::Now => Some(self.now.into()), - PlanExpression::UUID => Some(EncodedTerm::NamedNode { - iri_id: self - .dataset - .insert_str( - Uuid::new_v4() - .to_urn() - .encode_lower(&mut Uuid::encode_buffer()), - ) - .ok()?, - }), - PlanExpression::StrUUID => Some(EncodedTerm::StringLiteral { - value_id: self - .dataset - .insert_str( - Uuid::new_v4() - .to_hyphenated() - .encode_lower(&mut Uuid::encode_buffer()), - ) - .ok()?, - }), + PlanExpression::UUID => self.build_named_node( + Uuid::new_v4() + .to_urn() + .encode_lower(&mut Uuid::encode_buffer()), + ), + PlanExpression::StrUUID => self.build_string_literal( + Uuid::new_v4() + .to_hyphenated() + .encode_lower(&mut Uuid::encode_buffer()), + ), PlanExpression::MD5(arg) => self.hash::(arg, tuple), PlanExpression::SHA1(arg) => self.hash::(arg, tuple), PlanExpression::SHA256(arg) => self.hash::(arg, tuple), @@ -1253,7 +1230,6 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { None }?; self.dataset - .encoder() .encode_rio_literal(rio::Literal::Typed { value: &value, datatype: rio::NamedNode { iri: &datatype }, @@ -1296,10 +1272,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { } PlanExpression::BooleanCast(e) => match self.eval_expression(e, tuple)? { EncodedTerm::BooleanLiteral(value) => Some(value.into()), - EncodedTerm::StringLiteral { value_id } => self - .dataset - .encoder() - .encode_boolean_str(&*self.dataset.get_str(value_id).ok()??), + EncodedTerm::StringLiteral { value_id } => { + parse_boolean_str(&*self.dataset.get_str(value_id).ok()??) + } _ => None, }, PlanExpression::DoubleCast(e) => match self.eval_expression(e, tuple)? { @@ -1310,10 +1285,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { EncodedTerm::BooleanLiteral(value) => { Some(if value { 1. as f64 } else { 0. }.into()) } - EncodedTerm::StringLiteral { value_id } => self - .dataset - .encoder() - .encode_double_str(&*self.dataset.get_str(value_id).ok()??), + EncodedTerm::StringLiteral { value_id } => { + parse_double_str(&*self.dataset.get_str(value_id).ok()??) + } _ => None, }, PlanExpression::FloatCast(e) => match self.eval_expression(e, tuple)? { @@ -1324,10 +1298,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { EncodedTerm::BooleanLiteral(value) => { Some(if value { 1. as f32 } else { 0. }.into()) } - EncodedTerm::StringLiteral { value_id } => self - .dataset - .encoder() - .encode_float_str(&*self.dataset.get_str(value_id).ok()??), + EncodedTerm::StringLiteral { value_id } => { + parse_float_str(&*self.dataset.get_str(value_id).ok()??) + } _ => None, }, PlanExpression::IntegerCast(e) => match self.eval_expression(e, tuple)? { @@ -1336,10 +1309,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { EncodedTerm::IntegerLiteral(value) => Some(value.to_i128()?.into()), EncodedTerm::DecimalLiteral(value) => Some(value.to_i128()?.into()), EncodedTerm::BooleanLiteral(value) => Some(if value { 1 } else { 0 }.into()), - EncodedTerm::StringLiteral { value_id } => self - .dataset - .encoder() - .encode_integer_str(&*self.dataset.get_str(value_id).ok()??), + EncodedTerm::StringLiteral { value_id } => { + parse_integer_str(&*self.dataset.get_str(value_id).ok()??) + } _ => None, }, PlanExpression::DecimalCast(e) => match self.eval_expression(e, tuple)? { @@ -1355,10 +1327,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { } .into(), ), - EncodedTerm::StringLiteral { value_id } => self - .dataset - .encoder() - .encode_decimal_str(&*self.dataset.get_str(value_id).ok()??), + EncodedTerm::StringLiteral { value_id } => { + parse_decimal_str(&*self.dataset.get_str(value_id).ok()??) + } _ => None, }, PlanExpression::DateCast(e) => match self.eval_expression(e, tuple)? { @@ -1366,29 +1337,26 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { EncodedTerm::NaiveDateLiteral(value) => Some(value.into()), EncodedTerm::DateTimeLiteral(value) => Some(value.date().into()), EncodedTerm::NaiveDateTimeLiteral(value) => Some(value.date().into()), - EncodedTerm::StringLiteral { value_id } => self - .dataset - .encoder() - .encode_date_str(&*self.dataset.get_str(value_id).ok()??), + EncodedTerm::StringLiteral { value_id } => { + parse_date_str(&*self.dataset.get_str(value_id).ok()??) + } _ => None, }, PlanExpression::TimeCast(e) => match self.eval_expression(e, tuple)? { EncodedTerm::NaiveTimeLiteral(value) => Some(value.into()), EncodedTerm::DateTimeLiteral(value) => Some(value.time().into()), EncodedTerm::NaiveDateTimeLiteral(value) => Some(value.time().into()), - EncodedTerm::StringLiteral { value_id } => self - .dataset - .encoder() - .encode_time_str(&*self.dataset.get_str(value_id).ok()??), + EncodedTerm::StringLiteral { value_id } => { + parse_time_str(&*self.dataset.get_str(value_id).ok()??) + } _ => None, }, PlanExpression::DateTimeCast(e) => match self.eval_expression(e, tuple)? { EncodedTerm::DateTimeLiteral(value) => Some(value.into()), EncodedTerm::NaiveDateTimeLiteral(value) => Some(value.into()), - EncodedTerm::StringLiteral { value_id } => self - .dataset - .encoder() - .encode_date_time_str(&*self.dataset.get_str(value_id).ok()??), + EncodedTerm::StringLiteral { value_id } => { + parse_date_time_str(&*self.dataset.get_str(value_id).ok()??) + } _ => None, }, PlanExpression::StringCast(e) => Some(EncodedTerm::StringLiteral { @@ -1409,7 +1377,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { } } - fn to_string_id(&self, term: EncodedTerm) -> Option { + fn to_string_id(&self, term: EncodedTerm) -> Option { match term { EncodedTerm::DefaultGraph => None, EncodedTerm::NamedNode { iri_id } => Some(iri_id), @@ -1417,32 +1385,25 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { EncodedTerm::StringLiteral { value_id } | EncodedTerm::LangStringLiteral { value_id, .. } | EncodedTerm::TypedLiteral { value_id, .. } => Some(value_id), - EncodedTerm::BooleanLiteral(value) => self - .dataset - .insert_str(if value { "true" } else { "false" }) - .ok(), - EncodedTerm::FloatLiteral(value) => self.dataset.insert_str(&value.to_string()).ok(), - EncodedTerm::DoubleLiteral(value) => self.dataset.insert_str(&value.to_string()).ok(), - EncodedTerm::IntegerLiteral(value) => self.dataset.insert_str(&value.to_string()).ok(), - EncodedTerm::DecimalLiteral(value) => self.dataset.insert_str(&value.to_string()).ok(), - EncodedTerm::DateLiteral(value) => self.dataset.insert_str(&value.to_string()).ok(), - EncodedTerm::NaiveDateLiteral(value) => { - self.dataset.insert_str(&value.to_string()).ok() - } - EncodedTerm::NaiveTimeLiteral(value) => { - self.dataset.insert_str(&value.to_string()).ok() - } - EncodedTerm::DateTimeLiteral(value) => self.dataset.insert_str(&value.to_string()).ok(), - EncodedTerm::NaiveDateTimeLiteral(value) => { - self.dataset.insert_str(&value.to_string()).ok() - } + EncodedTerm::BooleanLiteral(value) => { + self.build_string_id(if value { "true" } else { "false" }) + } + EncodedTerm::FloatLiteral(value) => self.build_string_id(&value.to_string()), + EncodedTerm::DoubleLiteral(value) => self.build_string_id(&value.to_string()), + EncodedTerm::IntegerLiteral(value) => self.build_string_id(&value.to_string()), + EncodedTerm::DecimalLiteral(value) => self.build_string_id(&value.to_string()), + EncodedTerm::DateLiteral(value) => self.build_string_id(&value.to_string()), + EncodedTerm::NaiveDateLiteral(value) => self.build_string_id(&value.to_string()), + EncodedTerm::NaiveTimeLiteral(value) => self.build_string_id(&value.to_string()), + EncodedTerm::DateTimeLiteral(value) => self.build_string_id(&value.to_string()), + EncodedTerm::NaiveDateTimeLiteral(value) => self.build_string_id(&value.to_string()), } } fn to_simple_string( &self, term: EncodedTerm, - ) -> Option< as StringStore>::StringType> { + ) -> Option< as StrLookup>::StrType> { if let EncodedTerm::StringLiteral { value_id } = term { self.dataset.get_str(value_id).ok()? } else { @@ -1450,7 +1411,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { } } - fn to_simple_string_id(&self, term: EncodedTerm) -> Option { + fn to_simple_string_id(&self, term: EncodedTerm) -> Option { if let EncodedTerm::StringLiteral { value_id } = term { Some(value_id) } else { @@ -1458,7 +1419,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { } } - fn to_string(&self, term: EncodedTerm) -> Option< as StringStore>::StringType> { + fn to_string(&self, term: EncodedTerm) -> Option< as StrLookup>::StrType> { match term { EncodedTerm::StringLiteral { value_id } | EncodedTerm::LangStringLiteral { value_id, .. } => { @@ -1471,7 +1432,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { fn to_string_and_language( &self, term: EncodedTerm, - ) -> Option<( as StringStore>::StringType, Option)> { + ) -> Option<( as StrLookup>::StrType, Option)> { match term { EncodedTerm::StringLiteral { value_id } => { Some((self.dataset.get_str(value_id).ok()??, None)) @@ -1484,27 +1445,47 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { } } - fn build_plain_literal(&self, value: &str, language: Option) -> Option { - Some(if let Some(language_id) = language { - EncodedTerm::LangStringLiteral { - value_id: self.dataset.insert_str(value).ok()?, - language_id, - } - } else { - EncodedTerm::StringLiteral { - value_id: self.dataset.insert_str(value).ok()?, - } + fn build_named_node(&self, iri: &str) -> Option { + Some(EncodedTerm::NamedNode { + iri_id: self.build_string_id(iri)?, }) } + fn build_string_literal(&self, value: &str) -> Option { + Some(EncodedTerm::StringLiteral { + value_id: self.build_string_id(value)?, + }) + } + + fn build_lang_string_literal(&self, value: &str, language_id: u128) -> Option { + Some(EncodedTerm::LangStringLiteral { + value_id: self.build_string_id(value)?, + language_id, + }) + } + + fn build_plain_literal(&self, value: &str, language: Option) -> Option { + if let Some(language_id) = language { + self.build_lang_string_literal(value, language_id) + } else { + self.build_string_literal(value) + } + } + + fn build_string_id(&self, value: &str) -> Option { + let value_id = get_str_id(value); + self.dataset.insert_str(value_id, value).ok()?; + Some(value_id) + } + fn to_argument_compatible_strings( &self, arg1: EncodedTerm, arg2: EncodedTerm, ) -> Option<( - as StringStore>::StringType, - as StringStore>::StringType, - Option, + as StrLookup>::StrType, + as StrLookup>::StrType, + Option, )> { let (value1, language1) = self.to_string_and_language(arg1)?; let (value2, language2) = self.to_string_and_language(arg2)?; @@ -1569,11 +1550,10 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { BindingsIterator::new( variables, Box::new(iter.map(move |values| { - let encoder = eval.dataset.encoder(); let mut result = vec![None; tuple_size]; for (i, value) in values?.into_iter().enumerate() { if let Some(term) = value { - result[i] = Some(encoder.decode_term(term)?) + result[i] = Some(eval.dataset.decode_term(term)?) } } Ok(result) @@ -1803,7 +1783,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { } } - fn compare_str_ids(&self, a: u64, b: u64) -> Option { + fn compare_str_ids(&self, a: u128, b: u128) -> Option { Some( self.dataset .get_str(a) @@ -1819,9 +1799,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator { ) -> Option { let input = self.to_simple_string(self.eval_expression(arg, tuple)?)?; let hash = hex::encode(H::new().chain(&input as &str).result()); - Some(EncodedTerm::StringLiteral { - value_id: self.dataset.insert_str(&hash).ok()?, - }) + self.build_string_literal(&hash) } } @@ -2195,15 +2173,18 @@ impl<'a, S: StoreConnection> Iterator for ConstructIterator<'a, S> { Ok(tuple) => tuple, Err(error) => return Some(Err(error)), }; - let encoder = self.eval.dataset.encoder(); for template in self.template { if let (Some(subject), Some(predicate), Some(object)) = ( get_triple_template_value(&template.subject, &tuple, &mut self.bnodes), get_triple_template_value(&template.predicate, &tuple, &mut self.bnodes), get_triple_template_value(&template.object, &tuple, &mut self.bnodes), ) { - self.buffered_results - .push(decode_triple(&encoder, subject, predicate, object)); + self.buffered_results.push(decode_triple( + &self.eval.dataset, + subject, + predicate, + object, + )); } } self.bnodes.clear(); //We do not reuse old bnodes @@ -2229,16 +2210,16 @@ fn get_triple_template_value( } } -fn decode_triple( - encoder: &Encoder, +fn decode_triple( + decoder: impl Decoder, subject: EncodedTerm, predicate: EncodedTerm, object: EncodedTerm, ) -> Result { Ok(Triple::new( - encoder.decode_named_or_blank_node(subject)?, - encoder.decode_named_node(predicate)?, - encoder.decode_term(object)?, + decoder.decode_named_or_blank_node(subject)?, + decoder.decode_named_node(predicate)?, + decoder.decode_term(object)?, )) } @@ -2258,7 +2239,6 @@ impl<'a, S: StoreConnection> Iterator for DescribeIterator<'a, S> { Ok(quad) => self .eval .dataset - .encoder() .decode_quad(&quad) .map(|q| q.into_triple()), Err(error) => Err(error), @@ -2607,7 +2587,7 @@ impl Accumulator for SampleAccumulator { struct GroupConcatAccumulator<'a, S: StoreConnection + 'a> { eval: &'a SimpleEvaluator, concat: Option, - language: Option>, + language: Option>, separator: &'a str, } diff --git a/lib/src/sparql/mod.rs b/lib/src/sparql/mod.rs index a48a4244..46bd1c13 100644 --- a/lib/src/sparql/mod.rs +++ b/lib/src/sparql/mod.rs @@ -64,7 +64,7 @@ impl SimplePreparedQuery { dataset: _, base_iri, } => { - let (plan, variables) = PlanBuilder::build(dataset.encoder(), &algebra)?; + let (plan, variables) = PlanBuilder::build(&dataset, &algebra)?; SimplePreparedQueryOptions::Select { plan, variables, @@ -76,7 +76,7 @@ impl SimplePreparedQuery { dataset: _, base_iri, } => { - let (plan, _) = PlanBuilder::build(dataset.encoder(), &algebra)?; + let (plan, _) = PlanBuilder::build(&dataset, &algebra)?; SimplePreparedQueryOptions::Ask { plan, evaluator: SimpleEvaluator::new(dataset, base_iri), @@ -88,14 +88,10 @@ impl SimplePreparedQuery { dataset: _, base_iri, } => { - let (plan, variables) = PlanBuilder::build(dataset.encoder(), &algebra)?; + let (plan, variables) = PlanBuilder::build(&dataset, &algebra)?; SimplePreparedQueryOptions::Construct { plan, - construct: PlanBuilder::build_graph_template( - dataset.encoder(), - &construct, - variables, - )?, + construct: PlanBuilder::build_graph_template(&dataset, &construct, variables)?, evaluator: SimpleEvaluator::new(dataset, base_iri), } } @@ -104,7 +100,7 @@ impl SimplePreparedQuery { dataset: _, base_iri, } => { - let (plan, _) = PlanBuilder::build(dataset.encoder(), &algebra)?; + let (plan, _) = PlanBuilder::build(&dataset, &algebra)?; SimplePreparedQueryOptions::Describe { plan, evaluator: SimpleEvaluator::new(dataset, base_iri), diff --git a/lib/src/sparql/plan.rs b/lib/src/sparql/plan.rs index a8bdde19..20e4599d 100644 --- a/lib/src/sparql/plan.rs +++ b/lib/src/sparql/plan.rs @@ -1,11 +1,10 @@ use crate::sparql::eval::StringOrStoreString; use crate::store::numeric_encoder::{ - EncodedQuad, EncodedTerm, Encoder, MemoryStringStore, StringStore, + EncodedQuad, EncodedTerm, MemoryStrStore, StrContainer, StrLookup, }; use crate::store::StoreConnection; use crate::Result; use std::collections::BTreeSet; -use std::u64; pub type EncodedTuple = Vec>; @@ -461,14 +460,14 @@ pub enum TripleTemplateValue { pub struct DatasetView { store: S, - extra: MemoryStringStore, + extra: MemoryStrStore, } impl DatasetView { pub fn new(store: S) -> Self { Self { store, - extra: MemoryStringStore::default(), + extra: MemoryStrStore::default(), } } @@ -482,38 +481,28 @@ impl DatasetView { self.store .quads_for_pattern(subject, predicate, object, graph_name) } - - pub fn encoder(&self) -> Encoder<&Self> { - Encoder::new(&self) - } } -impl StringStore for DatasetView { - type StringType = StringOrStoreString; +impl StrLookup for DatasetView { + type StrType = StringOrStoreString; - fn get_str(&self, id: u64) -> Result>> { - Ok(if let Some(value) = self.store.get_str(id)? { - Some(StringOrStoreString::Store(value)) - } else if let Some(value) = self.extra.get_str(u64::MAX - id)? { + fn get_str(&self, id: u128) -> Result>> { + Ok(if let Some(value) = self.extra.get_str(id)? { Some(StringOrStoreString::String(value)) + } else if let Some(value) = self.store.get_str(id)? { + Some(StringOrStoreString::Store(value)) } else { None }) } +} - fn get_str_id(&self, value: &str) -> Result> { - Ok(if let Some(id) = self.store.get_str_id(value)? { - Some(id) - } else { - self.extra.get_str_id(value)?.map(|id| u64::MAX - id) - }) - } - - fn insert_str(&self, value: &str) -> Result { - Ok(if let Some(id) = self.store.get_str_id(value)? { - id +impl StrContainer for DatasetView { + fn insert_str(&self, key: u128, value: &str) -> Result<()> { + if self.store.get_str(key)?.is_none() { + self.extra.insert_str(key, value) } else { - u64::MAX - self.extra.insert_str(value)? - }) + Ok(()) + } } } diff --git a/lib/src/sparql/plan_builder.rs b/lib/src/sparql/plan_builder.rs index dbab2a57..bacc0c40 100644 --- a/lib/src/sparql/plan_builder.rs +++ b/lib/src/sparql/plan_builder.rs @@ -4,17 +4,17 @@ use crate::sparql::algebra::*; use crate::sparql::model::*; use crate::sparql::plan::PlanPropertyPath; use crate::sparql::plan::*; -use crate::store::numeric_encoder::{Encoder, StringStore, ENCODED_DEFAULT_GRAPH}; +use crate::store::numeric_encoder::{Encoder, ENCODED_DEFAULT_GRAPH}; use crate::Result; use failure::format_err; use std::collections::HashSet; -pub struct PlanBuilder { - encoder: Encoder, +pub struct PlanBuilder { + encoder: E, } -impl PlanBuilder { - pub fn build(encoder: Encoder, pattern: &GraphPattern) -> Result<(PlanNode, Vec)> { +impl PlanBuilder { + pub fn build(encoder: E, pattern: &GraphPattern) -> Result<(PlanNode, Vec)> { let mut variables = Vec::default(); let plan = PlanBuilder { encoder }.build_for_graph_pattern( pattern, @@ -25,7 +25,7 @@ impl PlanBuilder { } pub fn build_graph_template( - encoder: Encoder, + encoder: E, template: &[TriplePattern], mut variables: Vec, ) -> Result> { diff --git a/lib/src/store/memory.rs b/lib/src/store/memory.rs index bbd0a43c..6cceba26 100644 --- a/lib/src/store/memory.rs +++ b/lib/src/store/memory.rs @@ -45,7 +45,7 @@ type QuadMap = BTreeMap>; #[derive(Default)] pub struct MemoryStore { - string_store: MemoryStringStore, + str_store: MemoryStrStore, quad_indexes: RwLock, } @@ -75,19 +75,17 @@ impl<'a> Store for &'a MemoryStore { } } -impl StringStore for MemoryStore { - type StringType = String; +impl StrLookup for MemoryStore { + type StrType = String; - fn get_str(&self, id: u64) -> Result> { - self.string_store.get_str(id) - } - - fn get_str_id(&self, value: &str) -> Result> { - self.string_store.get_str_id(value) + fn get_str(&self, id: u128) -> Result> { + self.str_store.get_str(id) } +} - fn insert_str(&self, value: &str) -> Result { - self.string_store.insert_str(value) +impl StrContainer for MemoryStore { + fn insert_str(&self, key: u128, value: &str) -> Result<()> { + self.str_store.insert_str(key, value) } } diff --git a/lib/src/store/mod.rs b/lib/src/store/mod.rs index bd40e460..900940f9 100644 --- a/lib/src/store/mod.rs +++ b/lib/src/store/mod.rs @@ -18,7 +18,7 @@ use rio_turtle::{NQuadsParser, NTriplesParser, TriGParser, TurtleParser}; use rio_xml::RdfXmlParser; use std::collections::HashMap; use std::io::BufRead; -use std::iter::{once, Iterator}; +use std::iter::Iterator; /// Defines the `Store` traits that is used to have efficient binary storage pub trait Store { @@ -28,10 +28,13 @@ pub trait Store { } /// A connection to a `Store` -pub trait StoreConnection: StringStore + Sized + Clone { +pub trait StoreConnection: StrContainer + StrLookup + Sized + Clone { fn contains(&self, quad: &EncodedQuad) -> Result; + fn insert(&mut self, quad: &EncodedQuad) -> Result<()>; + fn remove(&mut self, quad: &EncodedQuad) -> Result<()>; + fn quads_for_pattern<'a>( &'a self, subject: Option, @@ -39,9 +42,6 @@ pub trait StoreConnection: StringStore + Sized + Clone { object: Option, graph_name: Option, ) -> Box> + 'a>; - fn encoder(&self) -> Encoder<&Self> { - Encoder::new(&self) - } } /// A `RepositoryConnection` from a `StoreConnection` @@ -73,48 +73,14 @@ impl RepositoryConnection for StoreRepositoryConnection { where Self: 'a, { - let encoder = self.inner.encoder(); - let subject = if let Some(subject) = subject { - match encoder.encode_named_or_blank_node(subject) { - Ok(subject) => Some(subject), - Err(error) => return Box::new(once(Err(error))), - } - } else { - None - }; - let predicate = if let Some(predicate) = predicate { - match encoder.encode_named_node(predicate) { - Ok(predicate) => Some(predicate), - Err(error) => return Box::new(once(Err(error))), - } - } else { - None - }; - let object = if let Some(object) = object { - match encoder.encode_term(object) { - Ok(object) => Some(object), - Err(error) => return Box::new(once(Err(error))), - } - } else { - None - }; - let graph_name = if let Some(graph_name) = graph_name { - Some(if let Some(graph_name) = graph_name { - match encoder.encode_named_or_blank_node(graph_name) { - Ok(graph_name) => graph_name, - Err(error) => return Box::new(once(Err(error))), - } - } else { - EncodedTerm::DefaultGraph - }) - } else { - None - }; - + let subject = subject.map(|s| s.into()); + let predicate = predicate.map(|p| p.into()); + let object = object.map(|o| o.into()); + let graph_name = graph_name.map(|g| g.map_or(ENCODED_DEFAULT_GRAPH, |g| g.into())); Box::new( self.inner .quads_for_pattern(subject, predicate, object, graph_name) - .map(move |quad| self.inner.encoder().decode_quad(&quad?)), + .map(move |quad| self.inner.decode_quad(&quad?)), ) } @@ -153,16 +119,15 @@ impl RepositoryConnection for StoreRepositoryConnection { } fn contains(&self, quad: &Quad) -> Result { - self.inner - .contains(&self.inner.encoder().encode_quad(quad)?) + self.inner.contains(&quad.into()) } fn insert(&mut self, quad: &Quad) -> Result<()> { - self.inner.insert(&self.inner.encoder().encode_quad(quad)?) + self.inner.insert(&self.inner.encode_quad(quad)?) } fn remove(&mut self, quad: &Quad) -> Result<()> { - self.inner.remove(&self.inner.encoder().encode_quad(quad)?) + self.inner.remove(&self.inner.encode_quad(quad)?) } } @@ -177,19 +142,16 @@ impl StoreRepositoryConnection { { let mut bnode_map = HashMap::default(); let graph_name = if let Some(graph_name) = to_graph_name { - self.inner - .encoder() - .encode_named_or_blank_node(graph_name)? + self.inner.encode_named_or_blank_node(graph_name)? } else { EncodedTerm::DefaultGraph }; parser.parse_all(&mut move |t| { - self.inner - .insert(&self.inner.encoder().encode_rio_triple_in_graph( - t, - graph_name, - &mut bnode_map, - )?) + self.inner.insert(&self.inner.encode_rio_triple_in_graph( + t, + graph_name, + &mut bnode_map, + )?) })?; Ok(()) } @@ -201,7 +163,7 @@ impl StoreRepositoryConnection { let mut bnode_map = HashMap::default(); parser.parse_all(&mut move |q| { self.inner - .insert(&self.inner.encoder().encode_rio_quad(q, &mut bnode_map)?) + .insert(&self.inner.encode_rio_quad(q, &mut bnode_map)?) })?; Ok(()) } diff --git a/lib/src/store/numeric_encoder.rs b/lib/src/store/numeric_encoder.rs index d81718e7..f9383533 100644 --- a/lib/src/store/numeric_encoder.rs +++ b/lib/src/store/numeric_encoder.rs @@ -8,11 +8,13 @@ use chrono::prelude::*; use failure::format_err; use failure::Backtrace; use failure::Fail; +use md5::digest::Digest; +use md5::Md5; use ordered_float::OrderedFloat; use rand::random; use rio_api::model as rio; use rust_decimal::Decimal; -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use std::io::Read; use std::io::Write; use std::mem::size_of; @@ -21,110 +23,22 @@ use std::str; use std::sync::PoisonError; use std::sync::RwLock; -const EMPTY_STRING_ID: u64 = 0; -const RDF_LANG_STRING_ID: u64 = 1; -const XSD_STRING_ID: u64 = 2; -const XSD_BOOLEAN_ID: u64 = 3; -const XSD_FLOAT_ID: u64 = 4; -const XSD_DOUBLE_ID: u64 = 5; -const XSD_INTEGER_ID: u64 = 6; -const XSD_DECIMAL_ID: u64 = 7; -const XSD_DATE_TIME_ID: u64 = 8; -const XSD_DATE_ID: u64 = 9; -const XSD_TIME_ID: u64 = 10; - -pub trait StringStore { - type StringType: Deref + ToString + Into; - - fn get_str(&self, id: u64) -> Result>; - fn get_str_id(&self, value: &str) -> Result>; - fn insert_str(&self, value: &str) -> Result; - - /// Should be called when the bytes store is created - fn set_first_strings(&self) -> Result<()> { - if EMPTY_STRING_ID == self.insert_str("")? - && RDF_LANG_STRING_ID == self.insert_str(rdf::LANG_STRING.as_str())? - && XSD_STRING_ID == self.insert_str(xsd::STRING.as_str())? - && XSD_BOOLEAN_ID == self.insert_str(xsd::BOOLEAN.as_str())? - && XSD_FLOAT_ID == self.insert_str(xsd::FLOAT.as_str())? - && XSD_DOUBLE_ID == self.insert_str(xsd::DOUBLE.as_str())? - && XSD_INTEGER_ID == self.insert_str(xsd::INTEGER.as_str())? - && XSD_DECIMAL_ID == self.insert_str(xsd::DECIMAL.as_str())? - && XSD_DATE_TIME_ID == self.insert_str(xsd::DATE_TIME.as_str())? - && XSD_DATE_ID == self.insert_str(xsd::DATE.as_str())? - && XSD_TIME_ID == self.insert_str(xsd::TIME.as_str())? - { - Ok(()) - } else { - Err(format_err!( - "Failed to properly setup the basic string ids in the dictionnary" - )) - } - } -} - -impl<'a, S: StringStore> StringStore for &'a S { - type StringType = S::StringType; - - fn get_str(&self, id: u64) -> Result> { - (*self).get_str(id) - } - - fn get_str_id(&self, value: &str) -> Result> { - (*self).get_str_id(value) - } - - fn insert_str(&self, value: &str) -> Result { - (*self).insert_str(value) - } -} - -pub struct MemoryStringStore { - id2str: RwLock>, - str2id: RwLock>, -} - -impl Default for MemoryStringStore { - fn default() -> Self { - let new = Self { - id2str: RwLock::default(), - str2id: RwLock::default(), - }; - new.set_first_strings().unwrap(); - new - } -} - -impl StringStore for MemoryStringStore { - type StringType = String; - - fn get_str(&self, id: u64) -> Result> { - //TODO: avoid copy by adding a lifetime limit to get_str - let id2str = self.id2str.read().map_err(MutexPoisonError::from)?; - Ok(if id2str.len() as u64 <= id { - None - } else { - Some(id2str[id as usize].to_owned()) - }) - } - - fn get_str_id(&self, value: &str) -> Result> { - let str2id = self.str2id.read().map_err(MutexPoisonError::from)?; - Ok(str2id.get(value).cloned()) - } - - fn insert_str(&self, value: &str) -> Result { - let mut id2str = self.id2str.write().map_err(MutexPoisonError::from)?; - let mut str2id = self.str2id.write().map_err(MutexPoisonError::from)?; - Ok(if let Some(id) = str2id.get(value) { - *id - } else { - let id = id2str.len() as u64; - id2str.push(value.to_string()); - str2id.insert(value.to_string(), id); - id - }) - } +const EMPTY_STRING_ID: u128 = 167830467844043968176572005485231480276; +const RDF_LANG_STRING_ID: u128 = 32982328051974780078994098831023510434; +const XSD_STRING_ID: u128 = 13800943641695357404848007879689460046; +const XSD_BOOLEAN_ID: u128 = 95660596900939122510990529520735927827; +const XSD_FLOAT_ID: u128 = 31528676610345933421445910151629221319; +const XSD_DOUBLE_ID: u128 = 55169043483206236595575765215713332225; +const XSD_INTEGER_ID: u128 = 264492531517574030670228763493245709866; +const XSD_DECIMAL_ID: u128 = 80624473126247401518595349505346497075; +const XSD_DATE_TIME_ID: u128 = 257903479904871420659358808477547675664; +const XSD_DATE_ID: u128 = 269408747350206033502011401422135526584; +const XSD_TIME_ID: u128 = 163434887606038564205926318428306098363; + +pub fn get_str_id(value: &str) -> u128 { + let mut id = [0 as u8; 16]; + id.copy_from_slice(&Md5::new().chain(value).result()); + u128::from_le_bytes(id) } const TYPE_DEFAULT_GRAPH_ID: u8 = 0; @@ -183,11 +97,11 @@ pub const ENCODED_XSD_DATE_TIME_NAMED_NODE: EncodedTerm = EncodedTerm::NamedNode #[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Copy, Hash)] pub enum EncodedTerm { DefaultGraph, - NamedNode { iri_id: u64 }, + NamedNode { iri_id: u128 }, BlankNode { id: u128 }, - StringLiteral { value_id: u64 }, - LangStringLiteral { value_id: u64, language_id: u64 }, - TypedLiteral { value_id: u64, datatype_id: u64 }, + StringLiteral { value_id: u128 }, + LangStringLiteral { value_id: u128, language_id: u128 }, + TypedLiteral { value_id: u128, datatype_id: u128 }, BooleanLiteral(bool), FloatLiteral(OrderedFloat), DoubleLiteral(OrderedFloat), @@ -363,12 +277,110 @@ impl From for EncodedTerm { } } -impl From for EncodedTerm { - fn from(node: BlankNode) -> Self { +impl From<&NamedNode> for EncodedTerm { + fn from(node: &NamedNode) -> Self { + rio::NamedNode::from(node).into() + } +} + +impl<'a> From> for EncodedTerm { + fn from(node: rio::NamedNode<'a>) -> Self { + EncodedTerm::NamedNode { + iri_id: get_str_id(node.iri), + } + } +} + +impl From<&BlankNode> for EncodedTerm { + fn from(node: &BlankNode) -> Self { EncodedTerm::BlankNode { id: node.id() } } } +impl From<&Literal> for EncodedTerm { + fn from(literal: &Literal) -> Self { + rio::Literal::from(literal).into() + } +} + +impl<'a> From> for EncodedTerm { + fn from(literal: rio::Literal<'a>) -> Self { + match literal { + rio::Literal::Simple { value } => EncodedTerm::StringLiteral { + value_id: get_str_id(value), + }, + rio::Literal::LanguageTaggedString { value, language } => { + EncodedTerm::LangStringLiteral { + value_id: get_str_id(value), + language_id: if language.bytes().all(|b| b.is_ascii_lowercase()) { + get_str_id(language) + } else { + get_str_id(&language.to_ascii_lowercase()) + }, + } + } + rio::Literal::Typed { value, datatype } => { + match match datatype.iri { + "http://www.w3.org/2001/XMLSchema#boolean" => parse_boolean_str(value), + "http://www.w3.org/2001/XMLSchema#string" => Some(EncodedTerm::StringLiteral { + value_id: get_str_id(value), + }), + "http://www.w3.org/2001/XMLSchema#float" => parse_float_str(value), + "http://www.w3.org/2001/XMLSchema#double" => parse_double_str(value), + "http://www.w3.org/2001/XMLSchema#integer" + | "http://www.w3.org/2001/XMLSchema#byte" + | "http://www.w3.org/2001/XMLSchema#short" + | "http://www.w3.org/2001/XMLSchema#int" + | "http://www.w3.org/2001/XMLSchema#long" + | "http://www.w3.org/2001/XMLSchema#unsignedByte" + | "http://www.w3.org/2001/XMLSchema#unsignedShort" + | "http://www.w3.org/2001/XMLSchema#unsignedInt" + | "http://www.w3.org/2001/XMLSchema#unsignedLong" + | "http://www.w3.org/2001/XMLSchema#positiveInteger" + | "http://www.w3.org/2001/XMLSchema#negativeInteger" + | "http://www.w3.org/2001/XMLSchema#nonPositiveInteger" + | "http://www.w3.org/2001/XMLSchema#nonNegativeInteger" => { + parse_integer_str(value) + } + "http://www.w3.org/2001/XMLSchema#decimal" => parse_decimal_str(value), + "http://www.w3.org/2001/XMLSchema#date" => parse_date_str(value), + "http://www.w3.org/2001/XMLSchema#time" => parse_time_str(value), + "http://www.w3.org/2001/XMLSchema#dateTime" + | "http://www.w3.org/2001/XMLSchema#dateTimeStamp" => { + parse_date_time_str(value) + } + _ => None, + } { + Some(v) => v, + None => EncodedTerm::TypedLiteral { + value_id: get_str_id(value), + datatype_id: get_str_id(datatype.iri), + }, + } + } + } + } +} + +impl From<&NamedOrBlankNode> for EncodedTerm { + fn from(node: &NamedOrBlankNode) -> Self { + match node { + NamedOrBlankNode::NamedNode(node) => node.into(), + NamedOrBlankNode::BlankNode(node) => node.into(), + } + } +} + +impl From<&Term> for EncodedTerm { + fn from(node: &Term) -> Self { + match node { + Term::NamedNode(node) => node.into(), + Term::BlankNode(node) => node.into(), + Term::Literal(literal) => literal.into(), + } + } +} + #[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Hash)] pub struct EncodedQuad { pub subject: EncodedTerm, @@ -393,6 +405,20 @@ impl EncodedQuad { } } +impl From<&Quad> for EncodedQuad { + fn from(quad: &Quad) -> Self { + Self { + subject: quad.subject().into(), + predicate: quad.predicate().into(), + object: quad.object().into(), + graph_name: quad + .graph_name() + .as_ref() + .map_or(ENCODED_DEFAULT_GRAPH, |g| g.into()), + } + } +} + pub trait TermReader { fn read_term(&mut self) -> Result; fn read_spog_quad(&mut self) -> Result; @@ -408,21 +434,21 @@ impl TermReader for R { match self.read_u8()? { TYPE_DEFAULT_GRAPH_ID => Ok(EncodedTerm::DefaultGraph), TYPE_NAMED_NODE_ID => Ok(EncodedTerm::NamedNode { - iri_id: self.read_u64::()?, + iri_id: self.read_u128::()?, }), TYPE_BLANK_NODE_ID => Ok(EncodedTerm::BlankNode { id: self.read_u128::()?, }), TYPE_LANG_STRING_LITERAL_ID => Ok(EncodedTerm::LangStringLiteral { - language_id: self.read_u64::()?, - value_id: self.read_u64::()?, + language_id: self.read_u128::()?, + value_id: self.read_u128::()?, }), TYPE_TYPED_LITERAL_ID => Ok(EncodedTerm::TypedLiteral { - datatype_id: self.read_u64::()?, - value_id: self.read_u64::()?, + datatype_id: self.read_u128::()?, + value_id: self.read_u128::()?, }), TYPE_STRING_LITERAL => Ok(EncodedTerm::StringLiteral { - value_id: self.read_u64::()?, + value_id: self.read_u128::()?, }), TYPE_BOOLEAN_LITERAL_TRUE => Ok(EncodedTerm::BooleanLiteral(true)), TYPE_BOOLEAN_LITERAL_FALSE => Ok(EncodedTerm::BooleanLiteral(false)), @@ -556,7 +582,7 @@ impl TermReader for R { } } -pub const WRITTEN_TERM_MAX_SIZE: usize = size_of::() + 2 * size_of::(); +pub const WRITTEN_TERM_MAX_SIZE: usize = size_of::() + 2 * size_of::(); pub trait TermWriter { fn write_term(&mut self, term: EncodedTerm) -> Result<()>; @@ -573,24 +599,24 @@ impl TermWriter for W { self.write_u8(term.type_id())?; match term { EncodedTerm::DefaultGraph => {} - EncodedTerm::NamedNode { iri_id } => self.write_u64::(iri_id)?, + EncodedTerm::NamedNode { iri_id } => self.write_u128::(iri_id)?, EncodedTerm::BlankNode { id } => self.write_u128::(id)?, EncodedTerm::StringLiteral { value_id } => { - self.write_u64::(value_id)?; + self.write_u128::(value_id)?; } EncodedTerm::LangStringLiteral { value_id, language_id, } => { - self.write_u64::(language_id)?; - self.write_u64::(value_id)?; + self.write_u128::(language_id)?; + self.write_u128::(value_id)?; } EncodedTerm::TypedLiteral { value_id, datatype_id, } => { - self.write_u64::(datatype_id)?; - self.write_u64::(value_id)?; + self.write_u128::(datatype_id)?; + self.write_u128::(value_id)?; } EncodedTerm::BooleanLiteral(_) => {} EncodedTerm::FloatLiteral(value) => self.write_f32::(*value)?, @@ -670,37 +696,160 @@ impl TermWriter for W { } } -pub struct Encoder { - string_store: S, +pub trait StrLookup { + type StrType: Deref + ToString + Into; + + fn get_str(&self, id: u128) -> Result>; +} + +pub trait StrContainer { + fn insert_str(&self, key: u128, value: &str) -> Result<()>; + + /// Should be called when the bytes store is created + fn set_first_strings(&self) -> Result<()> { + self.insert_str(EMPTY_STRING_ID, "")?; + self.insert_str(RDF_LANG_STRING_ID, rdf::LANG_STRING.as_str())?; + self.insert_str(XSD_STRING_ID, xsd::STRING.as_str())?; + self.insert_str(XSD_BOOLEAN_ID, xsd::BOOLEAN.as_str())?; + self.insert_str(XSD_FLOAT_ID, xsd::FLOAT.as_str())?; + self.insert_str(XSD_DOUBLE_ID, xsd::DOUBLE.as_str())?; + self.insert_str(XSD_INTEGER_ID, xsd::INTEGER.as_str())?; + self.insert_str(XSD_DECIMAL_ID, xsd::DECIMAL.as_str())?; + self.insert_str(XSD_DATE_TIME_ID, xsd::DATE_TIME.as_str())?; + self.insert_str(XSD_DATE_ID, xsd::DATE.as_str())?; + self.insert_str(XSD_TIME_ID, xsd::TIME.as_str())?; + Ok(()) + } +} + +impl<'a, S: StrLookup + 'a> StrLookup for &'a S { + type StrType = S::StrType; + + fn get_str(&self, id: u128) -> Result> { + (*self).get_str(id) + } +} + +impl<'a, S: StrContainer + 'a> StrContainer for &'a S { + fn insert_str(&self, key: u128, value: &str) -> Result<()> { + (*self).insert_str(key, value) + } +} + +pub struct MemoryStrStore { + id2str: RwLock>, +} + +impl Default for MemoryStrStore { + fn default() -> Self { + let new = Self { + id2str: RwLock::default(), + }; + new.set_first_strings().unwrap(); + new + } +} + +impl StrLookup for MemoryStrStore { + type StrType = String; + + fn get_str(&self, id: u128) -> Result> { + //TODO: avoid copy by adding a lifetime limit to get_str + Ok(self + .id2str + .read() + .map_err(MutexPoisonError::from)? + .get(&id) + .cloned()) + } } -impl Encoder { - pub fn new(string_store: S) -> Self { - Self { string_store } +impl StrContainer for MemoryStrStore { + fn insert_str(&self, key: u128, value: &str) -> Result<()> { + let mut id2str = self.id2str.write().map_err(MutexPoisonError::from)?; + if !id2str.contains_key(&key) { + id2str.insert(key, value.to_owned()); + } + Ok(()) } +} + +pub trait Encoder { + fn encode_named_node(&self, named_node: &NamedNode) -> Result; + + fn encode_blank_node(&self, blank_node: &BlankNode) -> Result; + + fn encode_literal(&self, literal: &Literal) -> Result; + + fn encode_named_or_blank_node(&self, term: &NamedOrBlankNode) -> Result; - pub fn encode_named_node(&self, named_node: &NamedNode) -> Result { + fn encode_term(&self, term: &Term) -> Result; + + fn encode_quad(&self, quad: &Quad) -> Result; + + fn encode_triple_in_graph( + &self, + triple: &Triple, + graph_name: EncodedTerm, + ) -> Result; + + fn encode_rio_named_node(&self, named_node: rio::NamedNode) -> Result; + + fn encode_rio_blank_node( + &self, + blank_node: rio::BlankNode, + bnodes_map: &mut HashMap, + ) -> Result; + + fn encode_rio_literal(&self, literal: rio::Literal) -> Result; + + fn encode_rio_named_or_blank_node( + &self, + term: rio::NamedOrBlankNode, + bnodes_map: &mut HashMap, + ) -> Result; + + fn encode_rio_term( + &self, + term: rio::Term, + bnodes_map: &mut HashMap, + ) -> Result; + + fn encode_rio_quad( + &self, + quad: rio::Quad, + bnodes_map: &mut HashMap, + ) -> Result; + + fn encode_rio_triple_in_graph( + &self, + triple: rio::Triple, + graph_name: EncodedTerm, + bnodes_map: &mut HashMap, + ) -> Result; +} + +impl Encoder for S { + fn encode_named_node(&self, named_node: &NamedNode) -> Result { self.encode_rio_named_node(named_node.into()) } - pub fn encode_blank_node(&self, blank_node: &BlankNode) -> Result { - Ok(EncodedTerm::BlankNode { - id: blank_node.id(), - }) + fn encode_blank_node(&self, blank_node: &BlankNode) -> Result { + Ok(blank_node.into()) } - pub fn encode_literal(&self, literal: &Literal) -> Result { + fn encode_literal(&self, literal: &Literal) -> Result { self.encode_rio_literal(literal.into()) } - pub fn encode_named_or_blank_node(&self, term: &NamedOrBlankNode) -> Result { + fn encode_named_or_blank_node(&self, term: &NamedOrBlankNode) -> Result { match term { NamedOrBlankNode::NamedNode(named_node) => self.encode_named_node(named_node), NamedOrBlankNode::BlankNode(blank_node) => self.encode_blank_node(blank_node), } } - pub fn encode_term(&self, term: &Term) -> Result { + fn encode_term(&self, term: &Term) -> Result { match term { Term::NamedNode(named_node) => self.encode_named_node(named_node), Term::BlankNode(blank_node) => self.encode_blank_node(blank_node), @@ -708,7 +857,7 @@ impl Encoder { } } - pub fn encode_quad(&self, quad: &Quad) -> Result { + fn encode_quad(&self, quad: &Quad) -> Result { Ok(EncodedQuad { subject: self.encode_named_or_blank_node(quad.subject())?, predicate: self.encode_named_node(quad.predicate())?, @@ -720,7 +869,7 @@ impl Encoder { }) } - pub fn encode_triple_in_graph( + fn encode_triple_in_graph( &self, triple: &Triple, graph_name: EncodedTerm, @@ -733,13 +882,13 @@ impl Encoder { }) } - pub fn encode_rio_named_node(&self, named_node: rio::NamedNode) -> Result { - Ok(EncodedTerm::NamedNode { - iri_id: self.string_store.insert_str(named_node.iri)?, - }) + fn encode_rio_named_node(&self, named_node: rio::NamedNode) -> Result { + let iri_id = get_str_id(named_node.iri); + self.insert_str(iri_id, named_node.iri)?; + Ok(EncodedTerm::NamedNode { iri_id }) } - pub fn encode_rio_blank_node( + fn encode_rio_blank_node( &self, blank_node: rio::BlankNode, bnodes_map: &mut HashMap, @@ -753,29 +902,43 @@ impl Encoder { }) } - pub fn encode_rio_literal(&self, literal: rio::Literal) -> Result { + fn encode_rio_literal(&self, literal: rio::Literal) -> Result { Ok(match literal { - rio::Literal::Simple { value } => EncodedTerm::StringLiteral { - value_id: self.string_store.insert_str(value)?, - }, + rio::Literal::Simple { value } => { + let value_id = get_str_id(value); + self.insert_str(value_id, value)?; + EncodedTerm::StringLiteral { value_id } + } rio::Literal::LanguageTaggedString { value, language } => { + let value_id = get_str_id(value); + self.insert_str(value_id, value)?; + + let language_id = if language.bytes().all(|b| b.is_ascii_lowercase()) { + let language_id = get_str_id(language); + self.insert_str(language_id, language)?; + language_id + } else { + let language = language.to_ascii_lowercase(); + let language_id = get_str_id(&language); + self.insert_str(language_id, &language)?; + language_id + }; + EncodedTerm::LangStringLiteral { - value_id: self.string_store.insert_str(value)?, - language_id: if language.bytes().all(|b| b.is_ascii_lowercase()) { - self.string_store.insert_str(language) - } else { - self.string_store.insert_str(&language.to_ascii_lowercase()) - }?, + value_id, + language_id, } } rio::Literal::Typed { value, datatype } => { match match datatype.iri { - "http://www.w3.org/2001/XMLSchema#boolean" => self.encode_boolean_str(value), - "http://www.w3.org/2001/XMLSchema#string" => Some(EncodedTerm::StringLiteral { - value_id: self.string_store.insert_str(value)?, - }), - "http://www.w3.org/2001/XMLSchema#float" => self.encode_float_str(value), - "http://www.w3.org/2001/XMLSchema#double" => self.encode_double_str(value), + "http://www.w3.org/2001/XMLSchema#boolean" => parse_boolean_str(value), + "http://www.w3.org/2001/XMLSchema#string" => { + let value_id = get_str_id(value); + self.insert_str(value_id, value)?; + Some(EncodedTerm::StringLiteral { value_id }) + } + "http://www.w3.org/2001/XMLSchema#float" => parse_float_str(value), + "http://www.w3.org/2001/XMLSchema#double" => parse_double_str(value), "http://www.w3.org/2001/XMLSchema#integer" | "http://www.w3.org/2001/XMLSchema#byte" | "http://www.w3.org/2001/XMLSchema#short" @@ -789,28 +952,34 @@ impl Encoder { | "http://www.w3.org/2001/XMLSchema#negativeInteger" | "http://www.w3.org/2001/XMLSchema#nonPositiveInteger" | "http://www.w3.org/2001/XMLSchema#nonNegativeInteger" => { - self.encode_integer_str(value) + parse_integer_str(value) } - "http://www.w3.org/2001/XMLSchema#decimal" => self.encode_decimal_str(value), - "http://www.w3.org/2001/XMLSchema#date" => self.encode_date_str(value), - "http://www.w3.org/2001/XMLSchema#time" => self.encode_time_str(value), + "http://www.w3.org/2001/XMLSchema#decimal" => parse_decimal_str(value), + "http://www.w3.org/2001/XMLSchema#date" => parse_date_str(value), + "http://www.w3.org/2001/XMLSchema#time" => parse_time_str(value), "http://www.w3.org/2001/XMLSchema#dateTime" | "http://www.w3.org/2001/XMLSchema#dateTimeStamp" => { - self.encode_date_time_str(value) + parse_date_time_str(value) } _ => None, } { Some(v) => v, - None => EncodedTerm::TypedLiteral { - value_id: self.string_store.insert_str(value)?, - datatype_id: self.string_store.insert_str(datatype.iri)?, - }, + None => { + let value_id = get_str_id(value); + self.insert_str(value_id, value)?; + let datatype_id = get_str_id(datatype.iri); + self.insert_str(datatype_id, datatype.iri)?; + EncodedTerm::TypedLiteral { + value_id, + datatype_id, + } + } } } }) } - pub fn encode_rio_named_or_blank_node( + fn encode_rio_named_or_blank_node( &self, term: rio::NamedOrBlankNode, bnodes_map: &mut HashMap, @@ -823,7 +992,7 @@ impl Encoder { } } - pub fn encode_rio_term( + fn encode_rio_term( &self, term: rio::Term, bnodes_map: &mut HashMap, @@ -835,7 +1004,7 @@ impl Encoder { } } - pub fn encode_rio_quad( + fn encode_rio_quad( &self, quad: rio::Quad, bnodes_map: &mut HashMap, @@ -851,7 +1020,7 @@ impl Encoder { }) } - pub fn encode_rio_triple_in_graph( + fn encode_rio_triple_in_graph( &self, triple: rio::Triple, graph_name: EncodedTerm, @@ -864,124 +1033,78 @@ impl Encoder { graph_name, }) } +} - pub fn encode_boolean_str(&self, value: &str) -> Option { - match value { - "true" | "1" => Some(EncodedTerm::BooleanLiteral(true)), - "false" | "0" => Some(EncodedTerm::BooleanLiteral(false)), - _ => None, - } - } - - pub fn encode_float_str(&self, value: &str) -> Option { - value - .parse() - .map(|value| EncodedTerm::FloatLiteral(OrderedFloat(value))) - .ok() - } - - pub fn encode_double_str(&self, value: &str) -> Option { - value - .parse() - .map(|value| EncodedTerm::DoubleLiteral(OrderedFloat(value))) - .ok() +pub fn parse_boolean_str(value: &str) -> Option { + match value { + "true" | "1" => Some(EncodedTerm::BooleanLiteral(true)), + "false" | "0" => Some(EncodedTerm::BooleanLiteral(false)), + _ => None, } +} - pub fn encode_integer_str(&self, value: &str) -> Option { - value.parse().map(EncodedTerm::IntegerLiteral).ok() - } +pub fn parse_float_str(value: &str) -> Option { + value + .parse() + .map(|value| EncodedTerm::FloatLiteral(OrderedFloat(value))) + .ok() +} - pub fn encode_decimal_str(&self, value: &str) -> Option { - value.parse().map(EncodedTerm::DecimalLiteral).ok() - } +pub fn parse_double_str(value: &str) -> Option { + value + .parse() + .map(|value| EncodedTerm::DoubleLiteral(OrderedFloat(value))) + .ok() +} - pub fn encode_date_str(&self, value: &str) -> Option { - let mut parsed = Parsed::new(); - match parse(&mut parsed, &value, StrftimeItems::new("%Y-%m-%d%:z")).and_then(|_| { - Ok(Date::from_utc( - parsed.to_naive_date()?, - parsed.to_fixed_offset()?, - )) - }) { - Ok(value) => Some(EncodedTerm::DateLiteral(value)), - Err(_) => match NaiveDate::parse_from_str(&value, "%Y-%m-%dZ") { - Ok(value) => Some(EncodedTerm::DateLiteral(Date::from_utc( - value, - FixedOffset::east(0), - ))), - Err(_) => NaiveDate::parse_from_str(&value, "%Y-%m-%d") - .map(EncodedTerm::NaiveDateLiteral) - .ok(), - }, - } - } +pub fn parse_integer_str(value: &str) -> Option { + value.parse().map(EncodedTerm::IntegerLiteral).ok() +} - pub fn encode_time_str(&self, value: &str) -> Option { - NaiveTime::parse_from_str(&value, "%H:%M:%S") - .map(EncodedTerm::NaiveTimeLiteral) - .ok() - } +pub fn parse_decimal_str(value: &str) -> Option { + value.parse().map(EncodedTerm::DecimalLiteral).ok() +} - pub fn encode_date_time_str(&self, value: &str) -> Option { - match DateTime::parse_from_rfc3339(&value) { - Ok(value) => Some(EncodedTerm::DateTimeLiteral(value)), - Err(_) => NaiveDateTime::parse_from_str(&value, "%Y-%m-%dT%H:%M:%S") - .map(EncodedTerm::NaiveDateTimeLiteral) +pub fn parse_date_str(value: &str) -> Option { + let mut parsed = Parsed::new(); + match parse(&mut parsed, &value, StrftimeItems::new("%Y-%m-%d%:z")).and_then(|_| { + Ok(Date::from_utc( + parsed.to_naive_date()?, + parsed.to_fixed_offset()?, + )) + }) { + Ok(value) => Some(EncodedTerm::DateLiteral(value)), + Err(_) => match NaiveDate::parse_from_str(&value, "%Y-%m-%dZ") { + Ok(value) => Some(EncodedTerm::DateLiteral(Date::from_utc( + value, + FixedOffset::east(0), + ))), + Err(_) => NaiveDate::parse_from_str(&value, "%Y-%m-%d") + .map(EncodedTerm::NaiveDateLiteral) .ok(), - } + }, } +} - pub fn decode_term(&self, encoded: EncodedTerm) -> Result { - match encoded { - EncodedTerm::DefaultGraph => { - Err(format_err!("The default graph tag is not a valid term")) - } - EncodedTerm::NamedNode { iri_id } => { - Ok(NamedNode::new_from_string(self.get_str(iri_id)?).into()) - } - EncodedTerm::BlankNode { id } => Ok(BlankNode::new_from_unique_id(id).into()), - EncodedTerm::StringLiteral { value_id } => { - Ok(Literal::new_simple_literal(self.get_str(value_id)?).into()) - } - EncodedTerm::LangStringLiteral { - value_id, - language_id, - } => Ok(Literal::new_language_tagged_literal( - self.get_str(value_id)?, - self.get_str(language_id)?, - ) - .into()), - EncodedTerm::TypedLiteral { - value_id, - datatype_id, - } => Ok(Literal::new_typed_literal( - self.get_str(value_id)?, - NamedNode::new_from_string(self.get_str(datatype_id)?), - ) - .into()), - EncodedTerm::BooleanLiteral(value) => Ok(Literal::from(value).into()), - EncodedTerm::FloatLiteral(value) => Ok(Literal::from(*value).into()), - EncodedTerm::DoubleLiteral(value) => Ok(Literal::from(*value).into()), - EncodedTerm::IntegerLiteral(value) => Ok(Literal::from(value).into()), - EncodedTerm::DecimalLiteral(value) => Ok(Literal::from(value).into()), - EncodedTerm::DateLiteral(value) => Ok(Literal::from(value).into()), - EncodedTerm::NaiveDateLiteral(value) => Ok(Literal::from(value).into()), - EncodedTerm::NaiveTimeLiteral(value) => Ok(Literal::from(value).into()), - EncodedTerm::DateTimeLiteral(value) => Ok(Literal::from(value).into()), - EncodedTerm::NaiveDateTimeLiteral(value) => Ok(Literal::from(value).into()), - } - } +pub fn parse_time_str(value: &str) -> Option { + NaiveTime::parse_from_str(&value, "%H:%M:%S") + .map(EncodedTerm::NaiveTimeLiteral) + .ok() +} - fn get_str(&self, id: u64) -> Result { - self.string_store.get_str(id)?.ok_or_else(|| { - format_err!( - "Not able to find the string with id {} in the string store", - id - ) - }) +pub fn parse_date_time_str(value: &str) -> Option { + match DateTime::parse_from_rfc3339(&value) { + Ok(value) => Some(EncodedTerm::DateTimeLiteral(value)), + Err(_) => NaiveDateTime::parse_from_str(&value, "%Y-%m-%dT%H:%M:%S") + .map(EncodedTerm::NaiveDateTimeLiteral) + .ok(), } +} + +pub trait Decoder { + fn decode_term(&self, encoded: EncodedTerm) -> Result; - pub fn decode_named_or_blank_node(&self, encoded: EncodedTerm) -> Result { + fn decode_named_or_blank_node(&self, encoded: EncodedTerm) -> Result { match self.decode_term(encoded)? { Term::NamedNode(named_node) => Ok(named_node.into()), Term::BlankNode(blank_node) => Ok(blank_node.into()), @@ -991,7 +1114,7 @@ impl Encoder { } } - pub fn decode_named_node(&self, encoded: EncodedTerm) -> Result { + fn decode_named_node(&self, encoded: EncodedTerm) -> Result { match self.decode_term(encoded)? { Term::NamedNode(named_node) => Ok(named_node), Term::BlankNode(_) => Err(format_err!( @@ -1003,7 +1126,7 @@ impl Encoder { } } - pub fn decode_triple(&self, encoded: &EncodedQuad) -> Result { + fn decode_triple(&self, encoded: &EncodedQuad) -> Result { Ok(Triple::new( self.decode_named_or_blank_node(encoded.subject)?, self.decode_named_node(encoded.predicate)?, @@ -1011,7 +1134,7 @@ impl Encoder { )) } - pub fn decode_quad(&self, encoded: &EncodedQuad) -> Result { + fn decode_quad(&self, encoded: &EncodedQuad) -> Result { Ok(Quad::new( self.decode_named_or_blank_node(encoded.subject)?, self.decode_named_node(encoded.predicate)?, @@ -1024,14 +1147,58 @@ impl Encoder { } } -impl Default for Encoder { - fn default() -> Self { - Self { - string_store: S::default(), +impl Decoder for S { + fn decode_term(&self, encoded: EncodedTerm) -> Result { + match encoded { + EncodedTerm::DefaultGraph => { + Err(format_err!("The default graph tag is not a valid term")) + } + EncodedTerm::NamedNode { iri_id } => { + Ok(NamedNode::new_from_string(get_required_str(self, iri_id)?).into()) + } + EncodedTerm::BlankNode { id } => Ok(BlankNode::new_from_unique_id(id).into()), + EncodedTerm::StringLiteral { value_id } => { + Ok(Literal::new_simple_literal(get_required_str(self, value_id)?).into()) + } + EncodedTerm::LangStringLiteral { + value_id, + language_id, + } => Ok(Literal::new_language_tagged_literal( + get_required_str(self, value_id)?, + get_required_str(self, language_id)?, + ) + .into()), + EncodedTerm::TypedLiteral { + value_id, + datatype_id, + } => Ok(Literal::new_typed_literal( + get_required_str(self, value_id)?, + NamedNode::new_from_string(get_required_str(self, datatype_id)?), + ) + .into()), + EncodedTerm::BooleanLiteral(value) => Ok(Literal::from(value).into()), + EncodedTerm::FloatLiteral(value) => Ok(Literal::from(*value).into()), + EncodedTerm::DoubleLiteral(value) => Ok(Literal::from(*value).into()), + EncodedTerm::IntegerLiteral(value) => Ok(Literal::from(value).into()), + EncodedTerm::DecimalLiteral(value) => Ok(Literal::from(value).into()), + EncodedTerm::DateLiteral(value) => Ok(Literal::from(value).into()), + EncodedTerm::NaiveDateLiteral(value) => Ok(Literal::from(value).into()), + EncodedTerm::NaiveTimeLiteral(value) => Ok(Literal::from(value).into()), + EncodedTerm::DateTimeLiteral(value) => Ok(Literal::from(value).into()), + EncodedTerm::NaiveDateTimeLiteral(value) => Ok(Literal::from(value).into()), } } } +fn get_required_str(lookup: S, id: u128) -> Result { + lookup.get_str(id)?.ok_or_else(|| { + format_err!( + "Not able to find the string with id {} in the string store", + id + ) + }) +} + #[derive(Debug, Fail)] #[fail(display = "Mutex Mutex was poisoned")] pub struct MutexPoisonError { @@ -1048,7 +1215,7 @@ impl From> for MutexPoisonError { #[test] fn test_encoding() { - let encoder: Encoder = Encoder::default(); + let store = MemoryStrStore::default(); let terms: Vec = vec![ NamedNode::new_from_string("http://foo.com").into(), NamedNode::new_from_string("http://bar.com").into(), @@ -1063,14 +1230,8 @@ fn test_encoding() { Literal::new_language_tagged_literal("foo", "FR").into(), ]; for term in terms { - let encoded = encoder.encode_term(&term).unwrap(); - assert_eq!(term, encoder.decode_term(encoded).unwrap()) + let encoded = store.encode_term(&term).unwrap(); + assert_eq!(term, store.decode_term(encoded).unwrap()); + assert_eq!(encoded, EncodedTerm::from(&term)); } } - -#[test] -fn test_encoded_term_size() { - use std::mem::size_of; - - assert_eq!(size_of::(), 24); -} diff --git a/lib/src/store/rocksdb.rs b/lib/src/store/rocksdb.rs index deb5ab21..e7804fd0 100644 --- a/lib/src/store/rocksdb.rs +++ b/lib/src/store/rocksdb.rs @@ -1,8 +1,6 @@ use crate::store::numeric_encoder::*; use crate::store::{Store, StoreConnection, StoreRepositoryConnection}; use crate::{Repository, Result}; -use byteorder::ByteOrder; -use byteorder::LittleEndian; use failure::format_err; use rocksdb::ColumnFamily; use rocksdb::DBCompactionStyle; @@ -16,7 +14,6 @@ use std::iter::{empty, once}; use std::ops::Deref; use std::path::Path; use std::str; -use std::sync::Mutex; /// `Repository` implementation based on the [RocksDB](https://rocksdb.org/) key-value store /// @@ -55,7 +52,6 @@ pub struct RocksDbRepository { pub type RocksDbRepositoryConnection<'a> = StoreRepositoryConnection>; const ID2STR_CF: &str = "id2str"; -const STR2ID_CF: &str = "id2str"; const SPOG_CF: &str = "spog"; const POSG_CF: &str = "posg"; const OSPG_CF: &str = "ospg"; @@ -67,13 +63,12 @@ const EMPTY_BUF: [u8; 0] = [0 as u8; 0]; //TODO: indexes for the default graph and indexes for the named graphs (no more Optional and space saving) -const COLUMN_FAMILIES: [&str; 8] = [ - ID2STR_CF, STR2ID_CF, SPOG_CF, POSG_CF, OSPG_CF, GSPO_CF, GPOS_CF, GOSP_CF, +const COLUMN_FAMILIES: [&str; 7] = [ + ID2STR_CF, SPOG_CF, POSG_CF, OSPG_CF, GSPO_CF, GPOS_CF, GOSP_CF, ]; struct RocksDbStore { db: DB, - str_id_counter: Mutex, } #[derive(Clone)] @@ -81,7 +76,6 @@ pub struct RocksDbStoreConnection<'a> { store: &'a RocksDbStore, buffer: Vec, id2str_cf: ColumnFamily<'a>, - str2id_cf: ColumnFamily<'a>, spog_cf: ColumnFamily<'a>, posg_cf: ColumnFamily<'a>, ospg_cf: ColumnFamily<'a>, @@ -115,7 +109,6 @@ impl RocksDbStore { let new = Self { db: DB::open_cf(&options, path, &COLUMN_FAMILIES)?, - str_id_counter: Mutex::new(RocksDBCounter::new("bsc")), }; (&new).connection()?.set_first_strings()?; Ok(new) @@ -130,7 +123,6 @@ impl<'a> Store for &'a RocksDbStore { store: self, buffer: Vec::default(), id2str_cf: get_cf(&self.db, ID2STR_CF)?, - str2id_cf: get_cf(&self.db, STR2ID_CF)?, spog_cf: get_cf(&self.db, SPOG_CF)?, posg_cf: get_cf(&self.db, POSG_CF)?, ospg_cf: get_cf(&self.db, OSPG_CF)?, @@ -141,42 +133,24 @@ impl<'a> Store for &'a RocksDbStore { } } -impl StringStore for RocksDbStoreConnection<'_> { - type StringType = RocksString; +impl StrLookup for RocksDbStoreConnection<'_> { + type StrType = RocksString; - fn get_str(&self, id: u64) -> Result> { + fn get_str(&self, id: u128) -> Result> { Ok(self .store .db - .get_cf(self.id2str_cf, &to_bytes(id))? + .get_cf(self.id2str_cf, &id.to_le_bytes())? .map(|v| RocksString { vec: v })) } +} - fn get_str_id(&self, value: &str) -> Result> { - Ok(self - .store +impl StrContainer for RocksDbStoreConnection<'_> { + fn insert_str(&self, key: u128, value: &str) -> Result<()> { + self.store .db - .get_cf(self.str2id_cf, value.as_bytes())? - .map(|id| LittleEndian::read_u64(&id))) - } - - fn insert_str(&self, value: &str) -> Result { - Ok(if let Some(id) = self.get_str_id(value)? { - id - } else { - let id = self - .store - .str_id_counter - .lock() - .map_err(MutexPoisonError::from)? - .get_and_increment(&self.store.db)? as u64; - let id_bytes = to_bytes(id); - let mut batch = WriteBatch::default(); - batch.put_cf(self.id2str_cf, &id_bytes, value)?; - batch.put_cf(self.str2id_cf, value, &id_bytes)?; - self.store.db.write(batch)?; - id - }) + .put_cf(self.id2str_cf, &key.to_le_bytes(), value)?; + Ok(()) } } @@ -530,24 +504,6 @@ fn wrap_error<'a, E: 'a, I: Iterator> + 'a>( } } -struct RocksDBCounter { - name: &'static str, -} - -impl RocksDBCounter { - fn new(name: &'static str) -> Self { - Self { name } - } - - fn get_and_increment(&self, db: &DB) -> Result { - let value = db - .get(self.name.as_bytes())? - .map_or(0, |b| LittleEndian::read_u64(&b)); - db.put(self.name.as_bytes(), &to_bytes(value + 1))?; - Ok(value) - } -} - struct EncodedQuadPattern { subject: Option, predicate: Option, @@ -740,12 +696,6 @@ impl>> Iterator for FilteringEncodedQuads } } -fn to_bytes(int: u64) -> [u8; 8] { - let mut buf = [0 as u8; 8]; - LittleEndian::write_u64(&mut buf, int); - buf -} - pub struct RocksString { vec: DBVector, }