Use hashing for key storage

Simplifies code related to string storage

Avoids to insert strings when doing lookups on Repository
pull/10/head
Tpt 5 years ago
parent 5220d0e98c
commit df8e265f0f
  1. 238
      lib/src/sparql/eval.rs
  2. 14
      lib/src/sparql/mod.rs
  3. 43
      lib/src/sparql/plan.rs
  4. 12
      lib/src/sparql/plan_builder.rs
  5. 20
      lib/src/store/memory.rs
  6. 78
      lib/src/store/mod.rs
  7. 751
      lib/src/store/numeric_encoder.rs
  8. 74
      lib/src/store/rocksdb.rs

@ -31,7 +31,6 @@ use std::iter::{empty, once};
use std::ops::Deref; use std::ops::Deref;
use std::str; use std::str;
use std::sync::Mutex; use std::sync::Mutex;
use std::u64;
use uuid::Uuid; use uuid::Uuid;
const REGEX_SIZE_LIMIT: usize = 1_000_000; const REGEX_SIZE_LIMIT: usize = 1_000_000;
@ -40,7 +39,7 @@ type EncodedTuplesIterator<'a> = Box<dyn Iterator<Item = Result<EncodedTuple>> +
pub struct SimpleEvaluator<S: StoreConnection> { pub struct SimpleEvaluator<S: StoreConnection> {
dataset: DatasetView<S>, dataset: DatasetView<S>,
bnodes_map: Mutex<BTreeMap<u64, u128>>, bnodes_map: Mutex<BTreeMap<u128, u128>>,
base_iri: Option<Iri<String>>, base_iri: Option<Iri<String>>,
now: DateTime<FixedOffset>, now: DateTime<FixedOffset>,
} }
@ -855,17 +854,12 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
_ => None, _ => None,
}?; }?;
let iri = self.dataset.get_str(iri_id).ok()??; let iri = self.dataset.get_str(iri_id).ok()??;
Some(if let Some(base_iri) = &self.base_iri { if let Some(base_iri) = &self.base_iri {
EncodedTerm::NamedNode { self.build_named_node(&base_iri.resolve(&iri).ok()?.into_inner())
iri_id: self
.dataset
.insert_str(&base_iri.resolve(&iri).ok()?.into_inner())
.ok()?,
}
} else { } else {
Iri::parse(iri).ok()?; Iri::parse(iri).ok()?;
EncodedTerm::NamedNode { iri_id } Some(EncodedTerm::NamedNode { iri_id })
}) }
} }
PlanExpression::BNode(id) => match id { PlanExpression::BNode(id) => match id {
Some(id) => { Some(id) => {
@ -1044,12 +1038,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
} }
} }
} }
Some(EncodedTerm::StringLiteral { self.build_string_literal(str::from_utf8(&result).ok()?)
value_id: self
.dataset
.insert_str(str::from_utf8(&result).ok()?)
.ok()?,
})
} }
PlanExpression::StrEnds(arg1, arg2) => { PlanExpression::StrEnds(arg1, arg2) => {
let (arg1, arg2, _) = self.to_argument_compatible_strings( let (arg1, arg2, _) = self.to_argument_compatible_strings(
@ -1166,11 +1155,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
write!(&mut result, "{}S", seconds).ok()?; write!(&mut result, "{}S", seconds).ok()?;
} }
Some(EncodedTerm::TypedLiteral { Some(EncodedTerm::TypedLiteral {
value_id: self.dataset.insert_str(&result).ok()?, value_id: self.build_string_id(&result)?,
datatype_id: self datatype_id: self
.dataset .build_string_id("http://www.w3.org/2001/XMLSchema#dayTimeDuration")?,
.insert_str("http://www.w3.org/2001/XMLSchema#dayTimeDuration")
.ok()?,
}) })
} }
PlanExpression::Tz(e) => { PlanExpression::Tz(e) => {
@ -1185,9 +1172,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
Some(if let Some(timezone) = timezone { Some(if let Some(timezone) = timezone {
EncodedTerm::StringLiteral { EncodedTerm::StringLiteral {
value_id: if timezone.local_minus_utc() == 0 { value_id: if timezone.local_minus_utc() == 0 {
self.dataset.insert_str("Z").ok()? self.build_string_id("Z")?
} else { } else {
self.dataset.insert_str(&timezone.to_string()).ok()? self.build_string_id(&timezone.to_string())?
}, },
} }
} else { } else {
@ -1195,26 +1182,16 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
}) })
} }
PlanExpression::Now => Some(self.now.into()), PlanExpression::Now => Some(self.now.into()),
PlanExpression::UUID => Some(EncodedTerm::NamedNode { PlanExpression::UUID => self.build_named_node(
iri_id: self Uuid::new_v4()
.dataset .to_urn()
.insert_str( .encode_lower(&mut Uuid::encode_buffer()),
Uuid::new_v4() ),
.to_urn() PlanExpression::StrUUID => self.build_string_literal(
.encode_lower(&mut Uuid::encode_buffer()), Uuid::new_v4()
) .to_hyphenated()
.ok()?, .encode_lower(&mut Uuid::encode_buffer()),
}), ),
PlanExpression::StrUUID => Some(EncodedTerm::StringLiteral {
value_id: self
.dataset
.insert_str(
Uuid::new_v4()
.to_hyphenated()
.encode_lower(&mut Uuid::encode_buffer()),
)
.ok()?,
}),
PlanExpression::MD5(arg) => self.hash::<Md5>(arg, tuple), PlanExpression::MD5(arg) => self.hash::<Md5>(arg, tuple),
PlanExpression::SHA1(arg) => self.hash::<Sha1>(arg, tuple), PlanExpression::SHA1(arg) => self.hash::<Sha1>(arg, tuple),
PlanExpression::SHA256(arg) => self.hash::<Sha256>(arg, tuple), PlanExpression::SHA256(arg) => self.hash::<Sha256>(arg, tuple),
@ -1253,7 +1230,6 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
None None
}?; }?;
self.dataset self.dataset
.encoder()
.encode_rio_literal(rio::Literal::Typed { .encode_rio_literal(rio::Literal::Typed {
value: &value, value: &value,
datatype: rio::NamedNode { iri: &datatype }, datatype: rio::NamedNode { iri: &datatype },
@ -1296,10 +1272,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
} }
PlanExpression::BooleanCast(e) => match self.eval_expression(e, tuple)? { PlanExpression::BooleanCast(e) => match self.eval_expression(e, tuple)? {
EncodedTerm::BooleanLiteral(value) => Some(value.into()), EncodedTerm::BooleanLiteral(value) => Some(value.into()),
EncodedTerm::StringLiteral { value_id } => self EncodedTerm::StringLiteral { value_id } => {
.dataset parse_boolean_str(&*self.dataset.get_str(value_id).ok()??)
.encoder() }
.encode_boolean_str(&*self.dataset.get_str(value_id).ok()??),
_ => None, _ => None,
}, },
PlanExpression::DoubleCast(e) => match self.eval_expression(e, tuple)? { PlanExpression::DoubleCast(e) => match self.eval_expression(e, tuple)? {
@ -1310,10 +1285,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
EncodedTerm::BooleanLiteral(value) => { EncodedTerm::BooleanLiteral(value) => {
Some(if value { 1. as f64 } else { 0. }.into()) Some(if value { 1. as f64 } else { 0. }.into())
} }
EncodedTerm::StringLiteral { value_id } => self EncodedTerm::StringLiteral { value_id } => {
.dataset parse_double_str(&*self.dataset.get_str(value_id).ok()??)
.encoder() }
.encode_double_str(&*self.dataset.get_str(value_id).ok()??),
_ => None, _ => None,
}, },
PlanExpression::FloatCast(e) => match self.eval_expression(e, tuple)? { PlanExpression::FloatCast(e) => match self.eval_expression(e, tuple)? {
@ -1324,10 +1298,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
EncodedTerm::BooleanLiteral(value) => { EncodedTerm::BooleanLiteral(value) => {
Some(if value { 1. as f32 } else { 0. }.into()) Some(if value { 1. as f32 } else { 0. }.into())
} }
EncodedTerm::StringLiteral { value_id } => self EncodedTerm::StringLiteral { value_id } => {
.dataset parse_float_str(&*self.dataset.get_str(value_id).ok()??)
.encoder() }
.encode_float_str(&*self.dataset.get_str(value_id).ok()??),
_ => None, _ => None,
}, },
PlanExpression::IntegerCast(e) => match self.eval_expression(e, tuple)? { PlanExpression::IntegerCast(e) => match self.eval_expression(e, tuple)? {
@ -1336,10 +1309,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
EncodedTerm::IntegerLiteral(value) => Some(value.to_i128()?.into()), EncodedTerm::IntegerLiteral(value) => Some(value.to_i128()?.into()),
EncodedTerm::DecimalLiteral(value) => Some(value.to_i128()?.into()), EncodedTerm::DecimalLiteral(value) => Some(value.to_i128()?.into()),
EncodedTerm::BooleanLiteral(value) => Some(if value { 1 } else { 0 }.into()), EncodedTerm::BooleanLiteral(value) => Some(if value { 1 } else { 0 }.into()),
EncodedTerm::StringLiteral { value_id } => self EncodedTerm::StringLiteral { value_id } => {
.dataset parse_integer_str(&*self.dataset.get_str(value_id).ok()??)
.encoder() }
.encode_integer_str(&*self.dataset.get_str(value_id).ok()??),
_ => None, _ => None,
}, },
PlanExpression::DecimalCast(e) => match self.eval_expression(e, tuple)? { PlanExpression::DecimalCast(e) => match self.eval_expression(e, tuple)? {
@ -1355,10 +1327,9 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
} }
.into(), .into(),
), ),
EncodedTerm::StringLiteral { value_id } => self EncodedTerm::StringLiteral { value_id } => {
.dataset parse_decimal_str(&*self.dataset.get_str(value_id).ok()??)
.encoder() }
.encode_decimal_str(&*self.dataset.get_str(value_id).ok()??),
_ => None, _ => None,
}, },
PlanExpression::DateCast(e) => match self.eval_expression(e, tuple)? { PlanExpression::DateCast(e) => match self.eval_expression(e, tuple)? {
@ -1366,29 +1337,26 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
EncodedTerm::NaiveDateLiteral(value) => Some(value.into()), EncodedTerm::NaiveDateLiteral(value) => Some(value.into()),
EncodedTerm::DateTimeLiteral(value) => Some(value.date().into()), EncodedTerm::DateTimeLiteral(value) => Some(value.date().into()),
EncodedTerm::NaiveDateTimeLiteral(value) => Some(value.date().into()), EncodedTerm::NaiveDateTimeLiteral(value) => Some(value.date().into()),
EncodedTerm::StringLiteral { value_id } => self EncodedTerm::StringLiteral { value_id } => {
.dataset parse_date_str(&*self.dataset.get_str(value_id).ok()??)
.encoder() }
.encode_date_str(&*self.dataset.get_str(value_id).ok()??),
_ => None, _ => None,
}, },
PlanExpression::TimeCast(e) => match self.eval_expression(e, tuple)? { PlanExpression::TimeCast(e) => match self.eval_expression(e, tuple)? {
EncodedTerm::NaiveTimeLiteral(value) => Some(value.into()), EncodedTerm::NaiveTimeLiteral(value) => Some(value.into()),
EncodedTerm::DateTimeLiteral(value) => Some(value.time().into()), EncodedTerm::DateTimeLiteral(value) => Some(value.time().into()),
EncodedTerm::NaiveDateTimeLiteral(value) => Some(value.time().into()), EncodedTerm::NaiveDateTimeLiteral(value) => Some(value.time().into()),
EncodedTerm::StringLiteral { value_id } => self EncodedTerm::StringLiteral { value_id } => {
.dataset parse_time_str(&*self.dataset.get_str(value_id).ok()??)
.encoder() }
.encode_time_str(&*self.dataset.get_str(value_id).ok()??),
_ => None, _ => None,
}, },
PlanExpression::DateTimeCast(e) => match self.eval_expression(e, tuple)? { PlanExpression::DateTimeCast(e) => match self.eval_expression(e, tuple)? {
EncodedTerm::DateTimeLiteral(value) => Some(value.into()), EncodedTerm::DateTimeLiteral(value) => Some(value.into()),
EncodedTerm::NaiveDateTimeLiteral(value) => Some(value.into()), EncodedTerm::NaiveDateTimeLiteral(value) => Some(value.into()),
EncodedTerm::StringLiteral { value_id } => self EncodedTerm::StringLiteral { value_id } => {
.dataset parse_date_time_str(&*self.dataset.get_str(value_id).ok()??)
.encoder() }
.encode_date_time_str(&*self.dataset.get_str(value_id).ok()??),
_ => None, _ => None,
}, },
PlanExpression::StringCast(e) => Some(EncodedTerm::StringLiteral { PlanExpression::StringCast(e) => Some(EncodedTerm::StringLiteral {
@ -1409,7 +1377,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
} }
} }
fn to_string_id(&self, term: EncodedTerm) -> Option<u64> { fn to_string_id(&self, term: EncodedTerm) -> Option<u128> {
match term { match term {
EncodedTerm::DefaultGraph => None, EncodedTerm::DefaultGraph => None,
EncodedTerm::NamedNode { iri_id } => Some(iri_id), EncodedTerm::NamedNode { iri_id } => Some(iri_id),
@ -1417,32 +1385,25 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
EncodedTerm::StringLiteral { value_id } EncodedTerm::StringLiteral { value_id }
| EncodedTerm::LangStringLiteral { value_id, .. } | EncodedTerm::LangStringLiteral { value_id, .. }
| EncodedTerm::TypedLiteral { value_id, .. } => Some(value_id), | EncodedTerm::TypedLiteral { value_id, .. } => Some(value_id),
EncodedTerm::BooleanLiteral(value) => self EncodedTerm::BooleanLiteral(value) => {
.dataset self.build_string_id(if value { "true" } else { "false" })
.insert_str(if value { "true" } else { "false" }) }
.ok(), EncodedTerm::FloatLiteral(value) => self.build_string_id(&value.to_string()),
EncodedTerm::FloatLiteral(value) => self.dataset.insert_str(&value.to_string()).ok(), EncodedTerm::DoubleLiteral(value) => self.build_string_id(&value.to_string()),
EncodedTerm::DoubleLiteral(value) => self.dataset.insert_str(&value.to_string()).ok(), EncodedTerm::IntegerLiteral(value) => self.build_string_id(&value.to_string()),
EncodedTerm::IntegerLiteral(value) => self.dataset.insert_str(&value.to_string()).ok(), EncodedTerm::DecimalLiteral(value) => self.build_string_id(&value.to_string()),
EncodedTerm::DecimalLiteral(value) => self.dataset.insert_str(&value.to_string()).ok(), EncodedTerm::DateLiteral(value) => self.build_string_id(&value.to_string()),
EncodedTerm::DateLiteral(value) => self.dataset.insert_str(&value.to_string()).ok(), EncodedTerm::NaiveDateLiteral(value) => self.build_string_id(&value.to_string()),
EncodedTerm::NaiveDateLiteral(value) => { EncodedTerm::NaiveTimeLiteral(value) => self.build_string_id(&value.to_string()),
self.dataset.insert_str(&value.to_string()).ok() EncodedTerm::DateTimeLiteral(value) => self.build_string_id(&value.to_string()),
} EncodedTerm::NaiveDateTimeLiteral(value) => self.build_string_id(&value.to_string()),
EncodedTerm::NaiveTimeLiteral(value) => {
self.dataset.insert_str(&value.to_string()).ok()
}
EncodedTerm::DateTimeLiteral(value) => self.dataset.insert_str(&value.to_string()).ok(),
EncodedTerm::NaiveDateTimeLiteral(value) => {
self.dataset.insert_str(&value.to_string()).ok()
}
} }
} }
fn to_simple_string( fn to_simple_string(
&self, &self,
term: EncodedTerm, term: EncodedTerm,
) -> Option<<DatasetView<S> as StringStore>::StringType> { ) -> Option<<DatasetView<S> as StrLookup>::StrType> {
if let EncodedTerm::StringLiteral { value_id } = term { if let EncodedTerm::StringLiteral { value_id } = term {
self.dataset.get_str(value_id).ok()? self.dataset.get_str(value_id).ok()?
} else { } else {
@ -1450,7 +1411,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
} }
} }
fn to_simple_string_id(&self, term: EncodedTerm) -> Option<u64> { fn to_simple_string_id(&self, term: EncodedTerm) -> Option<u128> {
if let EncodedTerm::StringLiteral { value_id } = term { if let EncodedTerm::StringLiteral { value_id } = term {
Some(value_id) Some(value_id)
} else { } else {
@ -1458,7 +1419,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
} }
} }
fn to_string(&self, term: EncodedTerm) -> Option<<DatasetView<S> as StringStore>::StringType> { fn to_string(&self, term: EncodedTerm) -> Option<<DatasetView<S> as StrLookup>::StrType> {
match term { match term {
EncodedTerm::StringLiteral { value_id } EncodedTerm::StringLiteral { value_id }
| EncodedTerm::LangStringLiteral { value_id, .. } => { | EncodedTerm::LangStringLiteral { value_id, .. } => {
@ -1471,7 +1432,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
fn to_string_and_language( fn to_string_and_language(
&self, &self,
term: EncodedTerm, term: EncodedTerm,
) -> Option<(<DatasetView<S> as StringStore>::StringType, Option<u64>)> { ) -> Option<(<DatasetView<S> as StrLookup>::StrType, Option<u128>)> {
match term { match term {
EncodedTerm::StringLiteral { value_id } => { EncodedTerm::StringLiteral { value_id } => {
Some((self.dataset.get_str(value_id).ok()??, None)) Some((self.dataset.get_str(value_id).ok()??, None))
@ -1484,27 +1445,47 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
} }
} }
fn build_plain_literal(&self, value: &str, language: Option<u64>) -> Option<EncodedTerm> { fn build_named_node(&self, iri: &str) -> Option<EncodedTerm> {
Some(if let Some(language_id) = language { Some(EncodedTerm::NamedNode {
EncodedTerm::LangStringLiteral { iri_id: self.build_string_id(iri)?,
value_id: self.dataset.insert_str(value).ok()?,
language_id,
}
} else {
EncodedTerm::StringLiteral {
value_id: self.dataset.insert_str(value).ok()?,
}
}) })
} }
fn build_string_literal(&self, value: &str) -> Option<EncodedTerm> {
Some(EncodedTerm::StringLiteral {
value_id: self.build_string_id(value)?,
})
}
fn build_lang_string_literal(&self, value: &str, language_id: u128) -> Option<EncodedTerm> {
Some(EncodedTerm::LangStringLiteral {
value_id: self.build_string_id(value)?,
language_id,
})
}
fn build_plain_literal(&self, value: &str, language: Option<u128>) -> Option<EncodedTerm> {
if let Some(language_id) = language {
self.build_lang_string_literal(value, language_id)
} else {
self.build_string_literal(value)
}
}
fn build_string_id(&self, value: &str) -> Option<u128> {
let value_id = get_str_id(value);
self.dataset.insert_str(value_id, value).ok()?;
Some(value_id)
}
fn to_argument_compatible_strings( fn to_argument_compatible_strings(
&self, &self,
arg1: EncodedTerm, arg1: EncodedTerm,
arg2: EncodedTerm, arg2: EncodedTerm,
) -> Option<( ) -> Option<(
<DatasetView<S> as StringStore>::StringType, <DatasetView<S> as StrLookup>::StrType,
<DatasetView<S> as StringStore>::StringType, <DatasetView<S> as StrLookup>::StrType,
Option<u64>, Option<u128>,
)> { )> {
let (value1, language1) = self.to_string_and_language(arg1)?; let (value1, language1) = self.to_string_and_language(arg1)?;
let (value2, language2) = self.to_string_and_language(arg2)?; let (value2, language2) = self.to_string_and_language(arg2)?;
@ -1569,11 +1550,10 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
BindingsIterator::new( BindingsIterator::new(
variables, variables,
Box::new(iter.map(move |values| { Box::new(iter.map(move |values| {
let encoder = eval.dataset.encoder();
let mut result = vec![None; tuple_size]; let mut result = vec![None; tuple_size];
for (i, value) in values?.into_iter().enumerate() { for (i, value) in values?.into_iter().enumerate() {
if let Some(term) = value { if let Some(term) = value {
result[i] = Some(encoder.decode_term(term)?) result[i] = Some(eval.dataset.decode_term(term)?)
} }
} }
Ok(result) Ok(result)
@ -1803,7 +1783,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
} }
} }
fn compare_str_ids(&self, a: u64, b: u64) -> Option<Ordering> { fn compare_str_ids(&self, a: u128, b: u128) -> Option<Ordering> {
Some( Some(
self.dataset self.dataset
.get_str(a) .get_str(a)
@ -1819,9 +1799,7 @@ impl<'a, S: StoreConnection + 'a> SimpleEvaluator<S> {
) -> Option<EncodedTerm> { ) -> Option<EncodedTerm> {
let input = self.to_simple_string(self.eval_expression(arg, tuple)?)?; let input = self.to_simple_string(self.eval_expression(arg, tuple)?)?;
let hash = hex::encode(H::new().chain(&input as &str).result()); let hash = hex::encode(H::new().chain(&input as &str).result());
Some(EncodedTerm::StringLiteral { self.build_string_literal(&hash)
value_id: self.dataset.insert_str(&hash).ok()?,
})
} }
} }
@ -2195,15 +2173,18 @@ impl<'a, S: StoreConnection> Iterator for ConstructIterator<'a, S> {
Ok(tuple) => tuple, Ok(tuple) => tuple,
Err(error) => return Some(Err(error)), Err(error) => return Some(Err(error)),
}; };
let encoder = self.eval.dataset.encoder();
for template in self.template { for template in self.template {
if let (Some(subject), Some(predicate), Some(object)) = ( if let (Some(subject), Some(predicate), Some(object)) = (
get_triple_template_value(&template.subject, &tuple, &mut self.bnodes), get_triple_template_value(&template.subject, &tuple, &mut self.bnodes),
get_triple_template_value(&template.predicate, &tuple, &mut self.bnodes), get_triple_template_value(&template.predicate, &tuple, &mut self.bnodes),
get_triple_template_value(&template.object, &tuple, &mut self.bnodes), get_triple_template_value(&template.object, &tuple, &mut self.bnodes),
) { ) {
self.buffered_results self.buffered_results.push(decode_triple(
.push(decode_triple(&encoder, subject, predicate, object)); &self.eval.dataset,
subject,
predicate,
object,
));
} }
} }
self.bnodes.clear(); //We do not reuse old bnodes self.bnodes.clear(); //We do not reuse old bnodes
@ -2229,16 +2210,16 @@ fn get_triple_template_value(
} }
} }
fn decode_triple<S: StringStore>( fn decode_triple(
encoder: &Encoder<S>, decoder: impl Decoder,
subject: EncodedTerm, subject: EncodedTerm,
predicate: EncodedTerm, predicate: EncodedTerm,
object: EncodedTerm, object: EncodedTerm,
) -> Result<Triple> { ) -> Result<Triple> {
Ok(Triple::new( Ok(Triple::new(
encoder.decode_named_or_blank_node(subject)?, decoder.decode_named_or_blank_node(subject)?,
encoder.decode_named_node(predicate)?, decoder.decode_named_node(predicate)?,
encoder.decode_term(object)?, decoder.decode_term(object)?,
)) ))
} }
@ -2258,7 +2239,6 @@ impl<'a, S: StoreConnection> Iterator for DescribeIterator<'a, S> {
Ok(quad) => self Ok(quad) => self
.eval .eval
.dataset .dataset
.encoder()
.decode_quad(&quad) .decode_quad(&quad)
.map(|q| q.into_triple()), .map(|q| q.into_triple()),
Err(error) => Err(error), Err(error) => Err(error),
@ -2607,7 +2587,7 @@ impl Accumulator for SampleAccumulator {
struct GroupConcatAccumulator<'a, S: StoreConnection + 'a> { struct GroupConcatAccumulator<'a, S: StoreConnection + 'a> {
eval: &'a SimpleEvaluator<S>, eval: &'a SimpleEvaluator<S>,
concat: Option<String>, concat: Option<String>,
language: Option<Option<u64>>, language: Option<Option<u128>>,
separator: &'a str, separator: &'a str,
} }

@ -64,7 +64,7 @@ impl<S: StoreConnection> SimplePreparedQuery<S> {
dataset: _, dataset: _,
base_iri, base_iri,
} => { } => {
let (plan, variables) = PlanBuilder::build(dataset.encoder(), &algebra)?; let (plan, variables) = PlanBuilder::build(&dataset, &algebra)?;
SimplePreparedQueryOptions::Select { SimplePreparedQueryOptions::Select {
plan, plan,
variables, variables,
@ -76,7 +76,7 @@ impl<S: StoreConnection> SimplePreparedQuery<S> {
dataset: _, dataset: _,
base_iri, base_iri,
} => { } => {
let (plan, _) = PlanBuilder::build(dataset.encoder(), &algebra)?; let (plan, _) = PlanBuilder::build(&dataset, &algebra)?;
SimplePreparedQueryOptions::Ask { SimplePreparedQueryOptions::Ask {
plan, plan,
evaluator: SimpleEvaluator::new(dataset, base_iri), evaluator: SimpleEvaluator::new(dataset, base_iri),
@ -88,14 +88,10 @@ impl<S: StoreConnection> SimplePreparedQuery<S> {
dataset: _, dataset: _,
base_iri, base_iri,
} => { } => {
let (plan, variables) = PlanBuilder::build(dataset.encoder(), &algebra)?; let (plan, variables) = PlanBuilder::build(&dataset, &algebra)?;
SimplePreparedQueryOptions::Construct { SimplePreparedQueryOptions::Construct {
plan, plan,
construct: PlanBuilder::build_graph_template( construct: PlanBuilder::build_graph_template(&dataset, &construct, variables)?,
dataset.encoder(),
&construct,
variables,
)?,
evaluator: SimpleEvaluator::new(dataset, base_iri), evaluator: SimpleEvaluator::new(dataset, base_iri),
} }
} }
@ -104,7 +100,7 @@ impl<S: StoreConnection> SimplePreparedQuery<S> {
dataset: _, dataset: _,
base_iri, base_iri,
} => { } => {
let (plan, _) = PlanBuilder::build(dataset.encoder(), &algebra)?; let (plan, _) = PlanBuilder::build(&dataset, &algebra)?;
SimplePreparedQueryOptions::Describe { SimplePreparedQueryOptions::Describe {
plan, plan,
evaluator: SimpleEvaluator::new(dataset, base_iri), evaluator: SimpleEvaluator::new(dataset, base_iri),

@ -1,11 +1,10 @@
use crate::sparql::eval::StringOrStoreString; use crate::sparql::eval::StringOrStoreString;
use crate::store::numeric_encoder::{ use crate::store::numeric_encoder::{
EncodedQuad, EncodedTerm, Encoder, MemoryStringStore, StringStore, EncodedQuad, EncodedTerm, MemoryStrStore, StrContainer, StrLookup,
}; };
use crate::store::StoreConnection; use crate::store::StoreConnection;
use crate::Result; use crate::Result;
use std::collections::BTreeSet; use std::collections::BTreeSet;
use std::u64;
pub type EncodedTuple = Vec<Option<EncodedTerm>>; pub type EncodedTuple = Vec<Option<EncodedTerm>>;
@ -461,14 +460,14 @@ pub enum TripleTemplateValue {
pub struct DatasetView<S: StoreConnection> { pub struct DatasetView<S: StoreConnection> {
store: S, store: S,
extra: MemoryStringStore, extra: MemoryStrStore,
} }
impl<S: StoreConnection> DatasetView<S> { impl<S: StoreConnection> DatasetView<S> {
pub fn new(store: S) -> Self { pub fn new(store: S) -> Self {
Self { Self {
store, store,
extra: MemoryStringStore::default(), extra: MemoryStrStore::default(),
} }
} }
@ -482,38 +481,28 @@ impl<S: StoreConnection> DatasetView<S> {
self.store self.store
.quads_for_pattern(subject, predicate, object, graph_name) .quads_for_pattern(subject, predicate, object, graph_name)
} }
pub fn encoder(&self) -> Encoder<&Self> {
Encoder::new(&self)
}
} }
impl<S: StoreConnection> StringStore for DatasetView<S> { impl<S: StoreConnection> StrLookup for DatasetView<S> {
type StringType = StringOrStoreString<S::StringType>; type StrType = StringOrStoreString<S::StrType>;
fn get_str(&self, id: u64) -> Result<Option<StringOrStoreString<S::StringType>>> { fn get_str(&self, id: u128) -> Result<Option<StringOrStoreString<S::StrType>>> {
Ok(if let Some(value) = self.store.get_str(id)? { Ok(if let Some(value) = self.extra.get_str(id)? {
Some(StringOrStoreString::Store(value))
} else if let Some(value) = self.extra.get_str(u64::MAX - id)? {
Some(StringOrStoreString::String(value)) Some(StringOrStoreString::String(value))
} else if let Some(value) = self.store.get_str(id)? {
Some(StringOrStoreString::Store(value))
} else { } else {
None None
}) })
} }
}
fn get_str_id(&self, value: &str) -> Result<Option<u64>> { impl<S: StoreConnection> StrContainer for DatasetView<S> {
Ok(if let Some(id) = self.store.get_str_id(value)? { fn insert_str(&self, key: u128, value: &str) -> Result<()> {
Some(id) if self.store.get_str(key)?.is_none() {
} else { self.extra.insert_str(key, value)
self.extra.get_str_id(value)?.map(|id| u64::MAX - id)
})
}
fn insert_str(&self, value: &str) -> Result<u64> {
Ok(if let Some(id) = self.store.get_str_id(value)? {
id
} else { } else {
u64::MAX - self.extra.insert_str(value)? Ok(())
}) }
} }
} }

@ -4,17 +4,17 @@ use crate::sparql::algebra::*;
use crate::sparql::model::*; use crate::sparql::model::*;
use crate::sparql::plan::PlanPropertyPath; use crate::sparql::plan::PlanPropertyPath;
use crate::sparql::plan::*; use crate::sparql::plan::*;
use crate::store::numeric_encoder::{Encoder, StringStore, ENCODED_DEFAULT_GRAPH}; use crate::store::numeric_encoder::{Encoder, ENCODED_DEFAULT_GRAPH};
use crate::Result; use crate::Result;
use failure::format_err; use failure::format_err;
use std::collections::HashSet; use std::collections::HashSet;
pub struct PlanBuilder<S: StringStore> { pub struct PlanBuilder<E: Encoder> {
encoder: Encoder<S>, encoder: E,
} }
impl<S: StringStore> PlanBuilder<S> { impl<E: Encoder> PlanBuilder<E> {
pub fn build(encoder: Encoder<S>, pattern: &GraphPattern) -> Result<(PlanNode, Vec<Variable>)> { pub fn build(encoder: E, pattern: &GraphPattern) -> Result<(PlanNode, Vec<Variable>)> {
let mut variables = Vec::default(); let mut variables = Vec::default();
let plan = PlanBuilder { encoder }.build_for_graph_pattern( let plan = PlanBuilder { encoder }.build_for_graph_pattern(
pattern, pattern,
@ -25,7 +25,7 @@ impl<S: StringStore> PlanBuilder<S> {
} }
pub fn build_graph_template( pub fn build_graph_template(
encoder: Encoder<S>, encoder: E,
template: &[TriplePattern], template: &[TriplePattern],
mut variables: Vec<Variable>, mut variables: Vec<Variable>,
) -> Result<Vec<TripleTemplate>> { ) -> Result<Vec<TripleTemplate>> {

@ -45,7 +45,7 @@ type QuadMap<T> = BTreeMap<T, TripleMap<T>>;
#[derive(Default)] #[derive(Default)]
pub struct MemoryStore { pub struct MemoryStore {
string_store: MemoryStringStore, str_store: MemoryStrStore,
quad_indexes: RwLock<MemoryStoreIndexes>, quad_indexes: RwLock<MemoryStoreIndexes>,
} }
@ -75,19 +75,17 @@ impl<'a> Store for &'a MemoryStore {
} }
} }
impl StringStore for MemoryStore { impl StrLookup for MemoryStore {
type StringType = String; type StrType = String;
fn get_str(&self, id: u64) -> Result<Option<String>> { fn get_str(&self, id: u128) -> Result<Option<String>> {
self.string_store.get_str(id) self.str_store.get_str(id)
}
fn get_str_id(&self, value: &str) -> Result<Option<u64>> {
self.string_store.get_str_id(value)
} }
}
fn insert_str(&self, value: &str) -> Result<u64> { impl StrContainer for MemoryStore {
self.string_store.insert_str(value) fn insert_str(&self, key: u128, value: &str) -> Result<()> {
self.str_store.insert_str(key, value)
} }
} }

@ -18,7 +18,7 @@ use rio_turtle::{NQuadsParser, NTriplesParser, TriGParser, TurtleParser};
use rio_xml::RdfXmlParser; use rio_xml::RdfXmlParser;
use std::collections::HashMap; use std::collections::HashMap;
use std::io::BufRead; use std::io::BufRead;
use std::iter::{once, Iterator}; use std::iter::Iterator;
/// Defines the `Store` traits that is used to have efficient binary storage /// Defines the `Store` traits that is used to have efficient binary storage
pub trait Store { pub trait Store {
@ -28,10 +28,13 @@ pub trait Store {
} }
/// A connection to a `Store` /// A connection to a `Store`
pub trait StoreConnection: StringStore + Sized + Clone { pub trait StoreConnection: StrContainer + StrLookup + Sized + Clone {
fn contains(&self, quad: &EncodedQuad) -> Result<bool>; fn contains(&self, quad: &EncodedQuad) -> Result<bool>;
fn insert(&mut self, quad: &EncodedQuad) -> Result<()>; fn insert(&mut self, quad: &EncodedQuad) -> Result<()>;
fn remove(&mut self, quad: &EncodedQuad) -> Result<()>; fn remove(&mut self, quad: &EncodedQuad) -> Result<()>;
fn quads_for_pattern<'a>( fn quads_for_pattern<'a>(
&'a self, &'a self,
subject: Option<EncodedTerm>, subject: Option<EncodedTerm>,
@ -39,9 +42,6 @@ pub trait StoreConnection: StringStore + Sized + Clone {
object: Option<EncodedTerm>, object: Option<EncodedTerm>,
graph_name: Option<EncodedTerm>, graph_name: Option<EncodedTerm>,
) -> Box<dyn Iterator<Item = Result<EncodedQuad>> + 'a>; ) -> Box<dyn Iterator<Item = Result<EncodedQuad>> + 'a>;
fn encoder(&self) -> Encoder<&Self> {
Encoder::new(&self)
}
} }
/// A `RepositoryConnection` from a `StoreConnection` /// A `RepositoryConnection` from a `StoreConnection`
@ -73,48 +73,14 @@ impl<S: StoreConnection> RepositoryConnection for StoreRepositoryConnection<S> {
where where
Self: 'a, Self: 'a,
{ {
let encoder = self.inner.encoder(); let subject = subject.map(|s| s.into());
let subject = if let Some(subject) = subject { let predicate = predicate.map(|p| p.into());
match encoder.encode_named_or_blank_node(subject) { let object = object.map(|o| o.into());
Ok(subject) => Some(subject), let graph_name = graph_name.map(|g| g.map_or(ENCODED_DEFAULT_GRAPH, |g| g.into()));
Err(error) => return Box::new(once(Err(error))),
}
} else {
None
};
let predicate = if let Some(predicate) = predicate {
match encoder.encode_named_node(predicate) {
Ok(predicate) => Some(predicate),
Err(error) => return Box::new(once(Err(error))),
}
} else {
None
};
let object = if let Some(object) = object {
match encoder.encode_term(object) {
Ok(object) => Some(object),
Err(error) => return Box::new(once(Err(error))),
}
} else {
None
};
let graph_name = if let Some(graph_name) = graph_name {
Some(if let Some(graph_name) = graph_name {
match encoder.encode_named_or_blank_node(graph_name) {
Ok(graph_name) => graph_name,
Err(error) => return Box::new(once(Err(error))),
}
} else {
EncodedTerm::DefaultGraph
})
} else {
None
};
Box::new( Box::new(
self.inner self.inner
.quads_for_pattern(subject, predicate, object, graph_name) .quads_for_pattern(subject, predicate, object, graph_name)
.map(move |quad| self.inner.encoder().decode_quad(&quad?)), .map(move |quad| self.inner.decode_quad(&quad?)),
) )
} }
@ -153,16 +119,15 @@ impl<S: StoreConnection> RepositoryConnection for StoreRepositoryConnection<S> {
} }
fn contains(&self, quad: &Quad) -> Result<bool> { fn contains(&self, quad: &Quad) -> Result<bool> {
self.inner self.inner.contains(&quad.into())
.contains(&self.inner.encoder().encode_quad(quad)?)
} }
fn insert(&mut self, quad: &Quad) -> Result<()> { fn insert(&mut self, quad: &Quad) -> Result<()> {
self.inner.insert(&self.inner.encoder().encode_quad(quad)?) self.inner.insert(&self.inner.encode_quad(quad)?)
} }
fn remove(&mut self, quad: &Quad) -> Result<()> { fn remove(&mut self, quad: &Quad) -> Result<()> {
self.inner.remove(&self.inner.encoder().encode_quad(quad)?) self.inner.remove(&self.inner.encode_quad(quad)?)
} }
} }
@ -177,19 +142,16 @@ impl<S: StoreConnection> StoreRepositoryConnection<S> {
{ {
let mut bnode_map = HashMap::default(); let mut bnode_map = HashMap::default();
let graph_name = if let Some(graph_name) = to_graph_name { let graph_name = if let Some(graph_name) = to_graph_name {
self.inner self.inner.encode_named_or_blank_node(graph_name)?
.encoder()
.encode_named_or_blank_node(graph_name)?
} else { } else {
EncodedTerm::DefaultGraph EncodedTerm::DefaultGraph
}; };
parser.parse_all(&mut move |t| { parser.parse_all(&mut move |t| {
self.inner self.inner.insert(&self.inner.encode_rio_triple_in_graph(
.insert(&self.inner.encoder().encode_rio_triple_in_graph( t,
t, graph_name,
graph_name, &mut bnode_map,
&mut bnode_map, )?)
)?)
})?; })?;
Ok(()) Ok(())
} }
@ -201,7 +163,7 @@ impl<S: StoreConnection> StoreRepositoryConnection<S> {
let mut bnode_map = HashMap::default(); let mut bnode_map = HashMap::default();
parser.parse_all(&mut move |q| { parser.parse_all(&mut move |q| {
self.inner self.inner
.insert(&self.inner.encoder().encode_rio_quad(q, &mut bnode_map)?) .insert(&self.inner.encode_rio_quad(q, &mut bnode_map)?)
})?; })?;
Ok(()) Ok(())
} }

@ -8,11 +8,13 @@ use chrono::prelude::*;
use failure::format_err; use failure::format_err;
use failure::Backtrace; use failure::Backtrace;
use failure::Fail; use failure::Fail;
use md5::digest::Digest;
use md5::Md5;
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use rand::random; use rand::random;
use rio_api::model as rio; use rio_api::model as rio;
use rust_decimal::Decimal; use rust_decimal::Decimal;
use std::collections::{BTreeMap, HashMap}; use std::collections::HashMap;
use std::io::Read; use std::io::Read;
use std::io::Write; use std::io::Write;
use std::mem::size_of; use std::mem::size_of;
@ -21,110 +23,22 @@ use std::str;
use std::sync::PoisonError; use std::sync::PoisonError;
use std::sync::RwLock; use std::sync::RwLock;
const EMPTY_STRING_ID: u64 = 0; const EMPTY_STRING_ID: u128 = 167830467844043968176572005485231480276;
const RDF_LANG_STRING_ID: u64 = 1; const RDF_LANG_STRING_ID: u128 = 32982328051974780078994098831023510434;
const XSD_STRING_ID: u64 = 2; const XSD_STRING_ID: u128 = 13800943641695357404848007879689460046;
const XSD_BOOLEAN_ID: u64 = 3; const XSD_BOOLEAN_ID: u128 = 95660596900939122510990529520735927827;
const XSD_FLOAT_ID: u64 = 4; const XSD_FLOAT_ID: u128 = 31528676610345933421445910151629221319;
const XSD_DOUBLE_ID: u64 = 5; const XSD_DOUBLE_ID: u128 = 55169043483206236595575765215713332225;
const XSD_INTEGER_ID: u64 = 6; const XSD_INTEGER_ID: u128 = 264492531517574030670228763493245709866;
const XSD_DECIMAL_ID: u64 = 7; const XSD_DECIMAL_ID: u128 = 80624473126247401518595349505346497075;
const XSD_DATE_TIME_ID: u64 = 8; const XSD_DATE_TIME_ID: u128 = 257903479904871420659358808477547675664;
const XSD_DATE_ID: u64 = 9; const XSD_DATE_ID: u128 = 269408747350206033502011401422135526584;
const XSD_TIME_ID: u64 = 10; const XSD_TIME_ID: u128 = 163434887606038564205926318428306098363;
pub trait StringStore { pub fn get_str_id(value: &str) -> u128 {
type StringType: Deref<Target = str> + ToString + Into<String>; let mut id = [0 as u8; 16];
id.copy_from_slice(&Md5::new().chain(value).result());
fn get_str(&self, id: u64) -> Result<Option<Self::StringType>>; u128::from_le_bytes(id)
fn get_str_id(&self, value: &str) -> Result<Option<u64>>;
fn insert_str(&self, value: &str) -> Result<u64>;
/// Should be called when the bytes store is created
fn set_first_strings(&self) -> Result<()> {
if EMPTY_STRING_ID == self.insert_str("")?
&& RDF_LANG_STRING_ID == self.insert_str(rdf::LANG_STRING.as_str())?
&& XSD_STRING_ID == self.insert_str(xsd::STRING.as_str())?
&& XSD_BOOLEAN_ID == self.insert_str(xsd::BOOLEAN.as_str())?
&& XSD_FLOAT_ID == self.insert_str(xsd::FLOAT.as_str())?
&& XSD_DOUBLE_ID == self.insert_str(xsd::DOUBLE.as_str())?
&& XSD_INTEGER_ID == self.insert_str(xsd::INTEGER.as_str())?
&& XSD_DECIMAL_ID == self.insert_str(xsd::DECIMAL.as_str())?
&& XSD_DATE_TIME_ID == self.insert_str(xsd::DATE_TIME.as_str())?
&& XSD_DATE_ID == self.insert_str(xsd::DATE.as_str())?
&& XSD_TIME_ID == self.insert_str(xsd::TIME.as_str())?
{
Ok(())
} else {
Err(format_err!(
"Failed to properly setup the basic string ids in the dictionnary"
))
}
}
}
impl<'a, S: StringStore> StringStore for &'a S {
type StringType = S::StringType;
fn get_str(&self, id: u64) -> Result<Option<S::StringType>> {
(*self).get_str(id)
}
fn get_str_id(&self, value: &str) -> Result<Option<u64>> {
(*self).get_str_id(value)
}
fn insert_str(&self, value: &str) -> Result<u64> {
(*self).insert_str(value)
}
}
pub struct MemoryStringStore {
id2str: RwLock<Vec<String>>,
str2id: RwLock<BTreeMap<String, u64>>,
}
impl Default for MemoryStringStore {
fn default() -> Self {
let new = Self {
id2str: RwLock::default(),
str2id: RwLock::default(),
};
new.set_first_strings().unwrap();
new
}
}
impl StringStore for MemoryStringStore {
type StringType = String;
fn get_str(&self, id: u64) -> Result<Option<String>> {
//TODO: avoid copy by adding a lifetime limit to get_str
let id2str = self.id2str.read().map_err(MutexPoisonError::from)?;
Ok(if id2str.len() as u64 <= id {
None
} else {
Some(id2str[id as usize].to_owned())
})
}
fn get_str_id(&self, value: &str) -> Result<Option<u64>> {
let str2id = self.str2id.read().map_err(MutexPoisonError::from)?;
Ok(str2id.get(value).cloned())
}
fn insert_str(&self, value: &str) -> Result<u64> {
let mut id2str = self.id2str.write().map_err(MutexPoisonError::from)?;
let mut str2id = self.str2id.write().map_err(MutexPoisonError::from)?;
Ok(if let Some(id) = str2id.get(value) {
*id
} else {
let id = id2str.len() as u64;
id2str.push(value.to_string());
str2id.insert(value.to_string(), id);
id
})
}
} }
const TYPE_DEFAULT_GRAPH_ID: u8 = 0; const TYPE_DEFAULT_GRAPH_ID: u8 = 0;
@ -183,11 +97,11 @@ pub const ENCODED_XSD_DATE_TIME_NAMED_NODE: EncodedTerm = EncodedTerm::NamedNode
#[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Copy, Hash)] #[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Copy, Hash)]
pub enum EncodedTerm { pub enum EncodedTerm {
DefaultGraph, DefaultGraph,
NamedNode { iri_id: u64 }, NamedNode { iri_id: u128 },
BlankNode { id: u128 }, BlankNode { id: u128 },
StringLiteral { value_id: u64 }, StringLiteral { value_id: u128 },
LangStringLiteral { value_id: u64, language_id: u64 }, LangStringLiteral { value_id: u128, language_id: u128 },
TypedLiteral { value_id: u64, datatype_id: u64 }, TypedLiteral { value_id: u128, datatype_id: u128 },
BooleanLiteral(bool), BooleanLiteral(bool),
FloatLiteral(OrderedFloat<f32>), FloatLiteral(OrderedFloat<f32>),
DoubleLiteral(OrderedFloat<f64>), DoubleLiteral(OrderedFloat<f64>),
@ -363,12 +277,110 @@ impl From<NaiveDateTime> for EncodedTerm {
} }
} }
impl From<BlankNode> for EncodedTerm { impl From<&NamedNode> for EncodedTerm {
fn from(node: BlankNode) -> Self { fn from(node: &NamedNode) -> Self {
rio::NamedNode::from(node).into()
}
}
impl<'a> From<rio::NamedNode<'a>> for EncodedTerm {
fn from(node: rio::NamedNode<'a>) -> Self {
EncodedTerm::NamedNode {
iri_id: get_str_id(node.iri),
}
}
}
impl From<&BlankNode> for EncodedTerm {
fn from(node: &BlankNode) -> Self {
EncodedTerm::BlankNode { id: node.id() } EncodedTerm::BlankNode { id: node.id() }
} }
} }
impl From<&Literal> for EncodedTerm {
fn from(literal: &Literal) -> Self {
rio::Literal::from(literal).into()
}
}
impl<'a> From<rio::Literal<'a>> for EncodedTerm {
fn from(literal: rio::Literal<'a>) -> Self {
match literal {
rio::Literal::Simple { value } => EncodedTerm::StringLiteral {
value_id: get_str_id(value),
},
rio::Literal::LanguageTaggedString { value, language } => {
EncodedTerm::LangStringLiteral {
value_id: get_str_id(value),
language_id: if language.bytes().all(|b| b.is_ascii_lowercase()) {
get_str_id(language)
} else {
get_str_id(&language.to_ascii_lowercase())
},
}
}
rio::Literal::Typed { value, datatype } => {
match match datatype.iri {
"http://www.w3.org/2001/XMLSchema#boolean" => parse_boolean_str(value),
"http://www.w3.org/2001/XMLSchema#string" => Some(EncodedTerm::StringLiteral {
value_id: get_str_id(value),
}),
"http://www.w3.org/2001/XMLSchema#float" => parse_float_str(value),
"http://www.w3.org/2001/XMLSchema#double" => parse_double_str(value),
"http://www.w3.org/2001/XMLSchema#integer"
| "http://www.w3.org/2001/XMLSchema#byte"
| "http://www.w3.org/2001/XMLSchema#short"
| "http://www.w3.org/2001/XMLSchema#int"
| "http://www.w3.org/2001/XMLSchema#long"
| "http://www.w3.org/2001/XMLSchema#unsignedByte"
| "http://www.w3.org/2001/XMLSchema#unsignedShort"
| "http://www.w3.org/2001/XMLSchema#unsignedInt"
| "http://www.w3.org/2001/XMLSchema#unsignedLong"
| "http://www.w3.org/2001/XMLSchema#positiveInteger"
| "http://www.w3.org/2001/XMLSchema#negativeInteger"
| "http://www.w3.org/2001/XMLSchema#nonPositiveInteger"
| "http://www.w3.org/2001/XMLSchema#nonNegativeInteger" => {
parse_integer_str(value)
}
"http://www.w3.org/2001/XMLSchema#decimal" => parse_decimal_str(value),
"http://www.w3.org/2001/XMLSchema#date" => parse_date_str(value),
"http://www.w3.org/2001/XMLSchema#time" => parse_time_str(value),
"http://www.w3.org/2001/XMLSchema#dateTime"
| "http://www.w3.org/2001/XMLSchema#dateTimeStamp" => {
parse_date_time_str(value)
}
_ => None,
} {
Some(v) => v,
None => EncodedTerm::TypedLiteral {
value_id: get_str_id(value),
datatype_id: get_str_id(datatype.iri),
},
}
}
}
}
}
impl From<&NamedOrBlankNode> for EncodedTerm {
fn from(node: &NamedOrBlankNode) -> Self {
match node {
NamedOrBlankNode::NamedNode(node) => node.into(),
NamedOrBlankNode::BlankNode(node) => node.into(),
}
}
}
impl From<&Term> for EncodedTerm {
fn from(node: &Term) -> Self {
match node {
Term::NamedNode(node) => node.into(),
Term::BlankNode(node) => node.into(),
Term::Literal(literal) => literal.into(),
}
}
}
#[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Hash)] #[derive(Eq, PartialEq, Ord, PartialOrd, Debug, Clone, Hash)]
pub struct EncodedQuad { pub struct EncodedQuad {
pub subject: EncodedTerm, pub subject: EncodedTerm,
@ -393,6 +405,20 @@ impl EncodedQuad {
} }
} }
impl From<&Quad> for EncodedQuad {
fn from(quad: &Quad) -> Self {
Self {
subject: quad.subject().into(),
predicate: quad.predicate().into(),
object: quad.object().into(),
graph_name: quad
.graph_name()
.as_ref()
.map_or(ENCODED_DEFAULT_GRAPH, |g| g.into()),
}
}
}
pub trait TermReader { pub trait TermReader {
fn read_term(&mut self) -> Result<EncodedTerm>; fn read_term(&mut self) -> Result<EncodedTerm>;
fn read_spog_quad(&mut self) -> Result<EncodedQuad>; fn read_spog_quad(&mut self) -> Result<EncodedQuad>;
@ -408,21 +434,21 @@ impl<R: Read> TermReader for R {
match self.read_u8()? { match self.read_u8()? {
TYPE_DEFAULT_GRAPH_ID => Ok(EncodedTerm::DefaultGraph), TYPE_DEFAULT_GRAPH_ID => Ok(EncodedTerm::DefaultGraph),
TYPE_NAMED_NODE_ID => Ok(EncodedTerm::NamedNode { TYPE_NAMED_NODE_ID => Ok(EncodedTerm::NamedNode {
iri_id: self.read_u64::<LittleEndian>()?, iri_id: self.read_u128::<LittleEndian>()?,
}), }),
TYPE_BLANK_NODE_ID => Ok(EncodedTerm::BlankNode { TYPE_BLANK_NODE_ID => Ok(EncodedTerm::BlankNode {
id: self.read_u128::<LittleEndian>()?, id: self.read_u128::<LittleEndian>()?,
}), }),
TYPE_LANG_STRING_LITERAL_ID => Ok(EncodedTerm::LangStringLiteral { TYPE_LANG_STRING_LITERAL_ID => Ok(EncodedTerm::LangStringLiteral {
language_id: self.read_u64::<LittleEndian>()?, language_id: self.read_u128::<LittleEndian>()?,
value_id: self.read_u64::<LittleEndian>()?, value_id: self.read_u128::<LittleEndian>()?,
}), }),
TYPE_TYPED_LITERAL_ID => Ok(EncodedTerm::TypedLiteral { TYPE_TYPED_LITERAL_ID => Ok(EncodedTerm::TypedLiteral {
datatype_id: self.read_u64::<LittleEndian>()?, datatype_id: self.read_u128::<LittleEndian>()?,
value_id: self.read_u64::<LittleEndian>()?, value_id: self.read_u128::<LittleEndian>()?,
}), }),
TYPE_STRING_LITERAL => Ok(EncodedTerm::StringLiteral { TYPE_STRING_LITERAL => Ok(EncodedTerm::StringLiteral {
value_id: self.read_u64::<LittleEndian>()?, value_id: self.read_u128::<LittleEndian>()?,
}), }),
TYPE_BOOLEAN_LITERAL_TRUE => Ok(EncodedTerm::BooleanLiteral(true)), TYPE_BOOLEAN_LITERAL_TRUE => Ok(EncodedTerm::BooleanLiteral(true)),
TYPE_BOOLEAN_LITERAL_FALSE => Ok(EncodedTerm::BooleanLiteral(false)), TYPE_BOOLEAN_LITERAL_FALSE => Ok(EncodedTerm::BooleanLiteral(false)),
@ -556,7 +582,7 @@ impl<R: Read> TermReader for R {
} }
} }
pub const WRITTEN_TERM_MAX_SIZE: usize = size_of::<u8>() + 2 * size_of::<u64>(); pub const WRITTEN_TERM_MAX_SIZE: usize = size_of::<u8>() + 2 * size_of::<u128>();
pub trait TermWriter { pub trait TermWriter {
fn write_term(&mut self, term: EncodedTerm) -> Result<()>; fn write_term(&mut self, term: EncodedTerm) -> Result<()>;
@ -573,24 +599,24 @@ impl<W: Write> TermWriter for W {
self.write_u8(term.type_id())?; self.write_u8(term.type_id())?;
match term { match term {
EncodedTerm::DefaultGraph => {} EncodedTerm::DefaultGraph => {}
EncodedTerm::NamedNode { iri_id } => self.write_u64::<LittleEndian>(iri_id)?, EncodedTerm::NamedNode { iri_id } => self.write_u128::<LittleEndian>(iri_id)?,
EncodedTerm::BlankNode { id } => self.write_u128::<LittleEndian>(id)?, EncodedTerm::BlankNode { id } => self.write_u128::<LittleEndian>(id)?,
EncodedTerm::StringLiteral { value_id } => { EncodedTerm::StringLiteral { value_id } => {
self.write_u64::<LittleEndian>(value_id)?; self.write_u128::<LittleEndian>(value_id)?;
} }
EncodedTerm::LangStringLiteral { EncodedTerm::LangStringLiteral {
value_id, value_id,
language_id, language_id,
} => { } => {
self.write_u64::<LittleEndian>(language_id)?; self.write_u128::<LittleEndian>(language_id)?;
self.write_u64::<LittleEndian>(value_id)?; self.write_u128::<LittleEndian>(value_id)?;
} }
EncodedTerm::TypedLiteral { EncodedTerm::TypedLiteral {
value_id, value_id,
datatype_id, datatype_id,
} => { } => {
self.write_u64::<LittleEndian>(datatype_id)?; self.write_u128::<LittleEndian>(datatype_id)?;
self.write_u64::<LittleEndian>(value_id)?; self.write_u128::<LittleEndian>(value_id)?;
} }
EncodedTerm::BooleanLiteral(_) => {} EncodedTerm::BooleanLiteral(_) => {}
EncodedTerm::FloatLiteral(value) => self.write_f32::<LittleEndian>(*value)?, EncodedTerm::FloatLiteral(value) => self.write_f32::<LittleEndian>(*value)?,
@ -670,37 +696,160 @@ impl<W: Write> TermWriter for W {
} }
} }
pub struct Encoder<S: StringStore> { pub trait StrLookup {
string_store: S, type StrType: Deref<Target = str> + ToString + Into<String>;
fn get_str(&self, id: u128) -> Result<Option<Self::StrType>>;
}
pub trait StrContainer {
fn insert_str(&self, key: u128, value: &str) -> Result<()>;
/// Should be called when the bytes store is created
fn set_first_strings(&self) -> Result<()> {
self.insert_str(EMPTY_STRING_ID, "")?;
self.insert_str(RDF_LANG_STRING_ID, rdf::LANG_STRING.as_str())?;
self.insert_str(XSD_STRING_ID, xsd::STRING.as_str())?;
self.insert_str(XSD_BOOLEAN_ID, xsd::BOOLEAN.as_str())?;
self.insert_str(XSD_FLOAT_ID, xsd::FLOAT.as_str())?;
self.insert_str(XSD_DOUBLE_ID, xsd::DOUBLE.as_str())?;
self.insert_str(XSD_INTEGER_ID, xsd::INTEGER.as_str())?;
self.insert_str(XSD_DECIMAL_ID, xsd::DECIMAL.as_str())?;
self.insert_str(XSD_DATE_TIME_ID, xsd::DATE_TIME.as_str())?;
self.insert_str(XSD_DATE_ID, xsd::DATE.as_str())?;
self.insert_str(XSD_TIME_ID, xsd::TIME.as_str())?;
Ok(())
}
}
impl<'a, S: StrLookup + 'a> StrLookup for &'a S {
type StrType = S::StrType;
fn get_str(&self, id: u128) -> Result<Option<S::StrType>> {
(*self).get_str(id)
}
}
impl<'a, S: StrContainer + 'a> StrContainer for &'a S {
fn insert_str(&self, key: u128, value: &str) -> Result<()> {
(*self).insert_str(key, value)
}
}
pub struct MemoryStrStore {
id2str: RwLock<HashMap<u128, String>>,
}
impl Default for MemoryStrStore {
fn default() -> Self {
let new = Self {
id2str: RwLock::default(),
};
new.set_first_strings().unwrap();
new
}
}
impl StrLookup for MemoryStrStore {
type StrType = String;
fn get_str(&self, id: u128) -> Result<Option<String>> {
//TODO: avoid copy by adding a lifetime limit to get_str
Ok(self
.id2str
.read()
.map_err(MutexPoisonError::from)?
.get(&id)
.cloned())
}
} }
impl<S: StringStore> Encoder<S> { impl StrContainer for MemoryStrStore {
pub fn new(string_store: S) -> Self { fn insert_str(&self, key: u128, value: &str) -> Result<()> {
Self { string_store } let mut id2str = self.id2str.write().map_err(MutexPoisonError::from)?;
if !id2str.contains_key(&key) {
id2str.insert(key, value.to_owned());
}
Ok(())
} }
}
pub trait Encoder {
fn encode_named_node(&self, named_node: &NamedNode) -> Result<EncodedTerm>;
fn encode_blank_node(&self, blank_node: &BlankNode) -> Result<EncodedTerm>;
fn encode_literal(&self, literal: &Literal) -> Result<EncodedTerm>;
fn encode_named_or_blank_node(&self, term: &NamedOrBlankNode) -> Result<EncodedTerm>;
pub fn encode_named_node(&self, named_node: &NamedNode) -> Result<EncodedTerm> { fn encode_term(&self, term: &Term) -> Result<EncodedTerm>;
fn encode_quad(&self, quad: &Quad) -> Result<EncodedQuad>;
fn encode_triple_in_graph(
&self,
triple: &Triple,
graph_name: EncodedTerm,
) -> Result<EncodedQuad>;
fn encode_rio_named_node(&self, named_node: rio::NamedNode) -> Result<EncodedTerm>;
fn encode_rio_blank_node(
&self,
blank_node: rio::BlankNode,
bnodes_map: &mut HashMap<String, u128>,
) -> Result<EncodedTerm>;
fn encode_rio_literal(&self, literal: rio::Literal) -> Result<EncodedTerm>;
fn encode_rio_named_or_blank_node(
&self,
term: rio::NamedOrBlankNode,
bnodes_map: &mut HashMap<String, u128>,
) -> Result<EncodedTerm>;
fn encode_rio_term(
&self,
term: rio::Term,
bnodes_map: &mut HashMap<String, u128>,
) -> Result<EncodedTerm>;
fn encode_rio_quad(
&self,
quad: rio::Quad,
bnodes_map: &mut HashMap<String, u128>,
) -> Result<EncodedQuad>;
fn encode_rio_triple_in_graph(
&self,
triple: rio::Triple,
graph_name: EncodedTerm,
bnodes_map: &mut HashMap<String, u128>,
) -> Result<EncodedQuad>;
}
impl<S: StrContainer> Encoder for S {
fn encode_named_node(&self, named_node: &NamedNode) -> Result<EncodedTerm> {
self.encode_rio_named_node(named_node.into()) self.encode_rio_named_node(named_node.into())
} }
pub fn encode_blank_node(&self, blank_node: &BlankNode) -> Result<EncodedTerm> { fn encode_blank_node(&self, blank_node: &BlankNode) -> Result<EncodedTerm> {
Ok(EncodedTerm::BlankNode { Ok(blank_node.into())
id: blank_node.id(),
})
} }
pub fn encode_literal(&self, literal: &Literal) -> Result<EncodedTerm> { fn encode_literal(&self, literal: &Literal) -> Result<EncodedTerm> {
self.encode_rio_literal(literal.into()) self.encode_rio_literal(literal.into())
} }
pub fn encode_named_or_blank_node(&self, term: &NamedOrBlankNode) -> Result<EncodedTerm> { fn encode_named_or_blank_node(&self, term: &NamedOrBlankNode) -> Result<EncodedTerm> {
match term { match term {
NamedOrBlankNode::NamedNode(named_node) => self.encode_named_node(named_node), NamedOrBlankNode::NamedNode(named_node) => self.encode_named_node(named_node),
NamedOrBlankNode::BlankNode(blank_node) => self.encode_blank_node(blank_node), NamedOrBlankNode::BlankNode(blank_node) => self.encode_blank_node(blank_node),
} }
} }
pub fn encode_term(&self, term: &Term) -> Result<EncodedTerm> { fn encode_term(&self, term: &Term) -> Result<EncodedTerm> {
match term { match term {
Term::NamedNode(named_node) => self.encode_named_node(named_node), Term::NamedNode(named_node) => self.encode_named_node(named_node),
Term::BlankNode(blank_node) => self.encode_blank_node(blank_node), Term::BlankNode(blank_node) => self.encode_blank_node(blank_node),
@ -708,7 +857,7 @@ impl<S: StringStore> Encoder<S> {
} }
} }
pub fn encode_quad(&self, quad: &Quad) -> Result<EncodedQuad> { fn encode_quad(&self, quad: &Quad) -> Result<EncodedQuad> {
Ok(EncodedQuad { Ok(EncodedQuad {
subject: self.encode_named_or_blank_node(quad.subject())?, subject: self.encode_named_or_blank_node(quad.subject())?,
predicate: self.encode_named_node(quad.predicate())?, predicate: self.encode_named_node(quad.predicate())?,
@ -720,7 +869,7 @@ impl<S: StringStore> Encoder<S> {
}) })
} }
pub fn encode_triple_in_graph( fn encode_triple_in_graph(
&self, &self,
triple: &Triple, triple: &Triple,
graph_name: EncodedTerm, graph_name: EncodedTerm,
@ -733,13 +882,13 @@ impl<S: StringStore> Encoder<S> {
}) })
} }
pub fn encode_rio_named_node(&self, named_node: rio::NamedNode) -> Result<EncodedTerm> { fn encode_rio_named_node(&self, named_node: rio::NamedNode) -> Result<EncodedTerm> {
Ok(EncodedTerm::NamedNode { let iri_id = get_str_id(named_node.iri);
iri_id: self.string_store.insert_str(named_node.iri)?, self.insert_str(iri_id, named_node.iri)?;
}) Ok(EncodedTerm::NamedNode { iri_id })
} }
pub fn encode_rio_blank_node( fn encode_rio_blank_node(
&self, &self,
blank_node: rio::BlankNode, blank_node: rio::BlankNode,
bnodes_map: &mut HashMap<String, u128>, bnodes_map: &mut HashMap<String, u128>,
@ -753,29 +902,43 @@ impl<S: StringStore> Encoder<S> {
}) })
} }
pub fn encode_rio_literal(&self, literal: rio::Literal) -> Result<EncodedTerm> { fn encode_rio_literal(&self, literal: rio::Literal) -> Result<EncodedTerm> {
Ok(match literal { Ok(match literal {
rio::Literal::Simple { value } => EncodedTerm::StringLiteral { rio::Literal::Simple { value } => {
value_id: self.string_store.insert_str(value)?, let value_id = get_str_id(value);
}, self.insert_str(value_id, value)?;
EncodedTerm::StringLiteral { value_id }
}
rio::Literal::LanguageTaggedString { value, language } => { rio::Literal::LanguageTaggedString { value, language } => {
let value_id = get_str_id(value);
self.insert_str(value_id, value)?;
let language_id = if language.bytes().all(|b| b.is_ascii_lowercase()) {
let language_id = get_str_id(language);
self.insert_str(language_id, language)?;
language_id
} else {
let language = language.to_ascii_lowercase();
let language_id = get_str_id(&language);
self.insert_str(language_id, &language)?;
language_id
};
EncodedTerm::LangStringLiteral { EncodedTerm::LangStringLiteral {
value_id: self.string_store.insert_str(value)?, value_id,
language_id: if language.bytes().all(|b| b.is_ascii_lowercase()) { language_id,
self.string_store.insert_str(language)
} else {
self.string_store.insert_str(&language.to_ascii_lowercase())
}?,
} }
} }
rio::Literal::Typed { value, datatype } => { rio::Literal::Typed { value, datatype } => {
match match datatype.iri { match match datatype.iri {
"http://www.w3.org/2001/XMLSchema#boolean" => self.encode_boolean_str(value), "http://www.w3.org/2001/XMLSchema#boolean" => parse_boolean_str(value),
"http://www.w3.org/2001/XMLSchema#string" => Some(EncodedTerm::StringLiteral { "http://www.w3.org/2001/XMLSchema#string" => {
value_id: self.string_store.insert_str(value)?, let value_id = get_str_id(value);
}), self.insert_str(value_id, value)?;
"http://www.w3.org/2001/XMLSchema#float" => self.encode_float_str(value), Some(EncodedTerm::StringLiteral { value_id })
"http://www.w3.org/2001/XMLSchema#double" => self.encode_double_str(value), }
"http://www.w3.org/2001/XMLSchema#float" => parse_float_str(value),
"http://www.w3.org/2001/XMLSchema#double" => parse_double_str(value),
"http://www.w3.org/2001/XMLSchema#integer" "http://www.w3.org/2001/XMLSchema#integer"
| "http://www.w3.org/2001/XMLSchema#byte" | "http://www.w3.org/2001/XMLSchema#byte"
| "http://www.w3.org/2001/XMLSchema#short" | "http://www.w3.org/2001/XMLSchema#short"
@ -789,28 +952,34 @@ impl<S: StringStore> Encoder<S> {
| "http://www.w3.org/2001/XMLSchema#negativeInteger" | "http://www.w3.org/2001/XMLSchema#negativeInteger"
| "http://www.w3.org/2001/XMLSchema#nonPositiveInteger" | "http://www.w3.org/2001/XMLSchema#nonPositiveInteger"
| "http://www.w3.org/2001/XMLSchema#nonNegativeInteger" => { | "http://www.w3.org/2001/XMLSchema#nonNegativeInteger" => {
self.encode_integer_str(value) parse_integer_str(value)
} }
"http://www.w3.org/2001/XMLSchema#decimal" => self.encode_decimal_str(value), "http://www.w3.org/2001/XMLSchema#decimal" => parse_decimal_str(value),
"http://www.w3.org/2001/XMLSchema#date" => self.encode_date_str(value), "http://www.w3.org/2001/XMLSchema#date" => parse_date_str(value),
"http://www.w3.org/2001/XMLSchema#time" => self.encode_time_str(value), "http://www.w3.org/2001/XMLSchema#time" => parse_time_str(value),
"http://www.w3.org/2001/XMLSchema#dateTime" "http://www.w3.org/2001/XMLSchema#dateTime"
| "http://www.w3.org/2001/XMLSchema#dateTimeStamp" => { | "http://www.w3.org/2001/XMLSchema#dateTimeStamp" => {
self.encode_date_time_str(value) parse_date_time_str(value)
} }
_ => None, _ => None,
} { } {
Some(v) => v, Some(v) => v,
None => EncodedTerm::TypedLiteral { None => {
value_id: self.string_store.insert_str(value)?, let value_id = get_str_id(value);
datatype_id: self.string_store.insert_str(datatype.iri)?, self.insert_str(value_id, value)?;
}, let datatype_id = get_str_id(datatype.iri);
self.insert_str(datatype_id, datatype.iri)?;
EncodedTerm::TypedLiteral {
value_id,
datatype_id,
}
}
} }
} }
}) })
} }
pub fn encode_rio_named_or_blank_node( fn encode_rio_named_or_blank_node(
&self, &self,
term: rio::NamedOrBlankNode, term: rio::NamedOrBlankNode,
bnodes_map: &mut HashMap<String, u128>, bnodes_map: &mut HashMap<String, u128>,
@ -823,7 +992,7 @@ impl<S: StringStore> Encoder<S> {
} }
} }
pub fn encode_rio_term( fn encode_rio_term(
&self, &self,
term: rio::Term, term: rio::Term,
bnodes_map: &mut HashMap<String, u128>, bnodes_map: &mut HashMap<String, u128>,
@ -835,7 +1004,7 @@ impl<S: StringStore> Encoder<S> {
} }
} }
pub fn encode_rio_quad( fn encode_rio_quad(
&self, &self,
quad: rio::Quad, quad: rio::Quad,
bnodes_map: &mut HashMap<String, u128>, bnodes_map: &mut HashMap<String, u128>,
@ -851,7 +1020,7 @@ impl<S: StringStore> Encoder<S> {
}) })
} }
pub fn encode_rio_triple_in_graph( fn encode_rio_triple_in_graph(
&self, &self,
triple: rio::Triple, triple: rio::Triple,
graph_name: EncodedTerm, graph_name: EncodedTerm,
@ -864,124 +1033,78 @@ impl<S: StringStore> Encoder<S> {
graph_name, graph_name,
}) })
} }
}
pub fn encode_boolean_str(&self, value: &str) -> Option<EncodedTerm> { pub fn parse_boolean_str(value: &str) -> Option<EncodedTerm> {
match value { match value {
"true" | "1" => Some(EncodedTerm::BooleanLiteral(true)), "true" | "1" => Some(EncodedTerm::BooleanLiteral(true)),
"false" | "0" => Some(EncodedTerm::BooleanLiteral(false)), "false" | "0" => Some(EncodedTerm::BooleanLiteral(false)),
_ => None, _ => None,
}
}
pub fn encode_float_str(&self, value: &str) -> Option<EncodedTerm> {
value
.parse()
.map(|value| EncodedTerm::FloatLiteral(OrderedFloat(value)))
.ok()
}
pub fn encode_double_str(&self, value: &str) -> Option<EncodedTerm> {
value
.parse()
.map(|value| EncodedTerm::DoubleLiteral(OrderedFloat(value)))
.ok()
} }
}
pub fn encode_integer_str(&self, value: &str) -> Option<EncodedTerm> { pub fn parse_float_str(value: &str) -> Option<EncodedTerm> {
value.parse().map(EncodedTerm::IntegerLiteral).ok() value
} .parse()
.map(|value| EncodedTerm::FloatLiteral(OrderedFloat(value)))
.ok()
}
pub fn encode_decimal_str(&self, value: &str) -> Option<EncodedTerm> { pub fn parse_double_str(value: &str) -> Option<EncodedTerm> {
value.parse().map(EncodedTerm::DecimalLiteral).ok() value
} .parse()
.map(|value| EncodedTerm::DoubleLiteral(OrderedFloat(value)))
.ok()
}
pub fn encode_date_str(&self, value: &str) -> Option<EncodedTerm> { pub fn parse_integer_str(value: &str) -> Option<EncodedTerm> {
let mut parsed = Parsed::new(); value.parse().map(EncodedTerm::IntegerLiteral).ok()
match parse(&mut parsed, &value, StrftimeItems::new("%Y-%m-%d%:z")).and_then(|_| { }
Ok(Date::from_utc(
parsed.to_naive_date()?,
parsed.to_fixed_offset()?,
))
}) {
Ok(value) => Some(EncodedTerm::DateLiteral(value)),
Err(_) => match NaiveDate::parse_from_str(&value, "%Y-%m-%dZ") {
Ok(value) => Some(EncodedTerm::DateLiteral(Date::from_utc(
value,
FixedOffset::east(0),
))),
Err(_) => NaiveDate::parse_from_str(&value, "%Y-%m-%d")
.map(EncodedTerm::NaiveDateLiteral)
.ok(),
},
}
}
pub fn encode_time_str(&self, value: &str) -> Option<EncodedTerm> { pub fn parse_decimal_str(value: &str) -> Option<EncodedTerm> {
NaiveTime::parse_from_str(&value, "%H:%M:%S") value.parse().map(EncodedTerm::DecimalLiteral).ok()
.map(EncodedTerm::NaiveTimeLiteral) }
.ok()
}
pub fn encode_date_time_str(&self, value: &str) -> Option<EncodedTerm> { pub fn parse_date_str(value: &str) -> Option<EncodedTerm> {
match DateTime::parse_from_rfc3339(&value) { let mut parsed = Parsed::new();
Ok(value) => Some(EncodedTerm::DateTimeLiteral(value)), match parse(&mut parsed, &value, StrftimeItems::new("%Y-%m-%d%:z")).and_then(|_| {
Err(_) => NaiveDateTime::parse_from_str(&value, "%Y-%m-%dT%H:%M:%S") Ok(Date::from_utc(
.map(EncodedTerm::NaiveDateTimeLiteral) parsed.to_naive_date()?,
parsed.to_fixed_offset()?,
))
}) {
Ok(value) => Some(EncodedTerm::DateLiteral(value)),
Err(_) => match NaiveDate::parse_from_str(&value, "%Y-%m-%dZ") {
Ok(value) => Some(EncodedTerm::DateLiteral(Date::from_utc(
value,
FixedOffset::east(0),
))),
Err(_) => NaiveDate::parse_from_str(&value, "%Y-%m-%d")
.map(EncodedTerm::NaiveDateLiteral)
.ok(), .ok(),
} },
} }
}
pub fn decode_term(&self, encoded: EncodedTerm) -> Result<Term> { pub fn parse_time_str(value: &str) -> Option<EncodedTerm> {
match encoded { NaiveTime::parse_from_str(&value, "%H:%M:%S")
EncodedTerm::DefaultGraph => { .map(EncodedTerm::NaiveTimeLiteral)
Err(format_err!("The default graph tag is not a valid term")) .ok()
} }
EncodedTerm::NamedNode { iri_id } => {
Ok(NamedNode::new_from_string(self.get_str(iri_id)?).into())
}
EncodedTerm::BlankNode { id } => Ok(BlankNode::new_from_unique_id(id).into()),
EncodedTerm::StringLiteral { value_id } => {
Ok(Literal::new_simple_literal(self.get_str(value_id)?).into())
}
EncodedTerm::LangStringLiteral {
value_id,
language_id,
} => Ok(Literal::new_language_tagged_literal(
self.get_str(value_id)?,
self.get_str(language_id)?,
)
.into()),
EncodedTerm::TypedLiteral {
value_id,
datatype_id,
} => Ok(Literal::new_typed_literal(
self.get_str(value_id)?,
NamedNode::new_from_string(self.get_str(datatype_id)?),
)
.into()),
EncodedTerm::BooleanLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::FloatLiteral(value) => Ok(Literal::from(*value).into()),
EncodedTerm::DoubleLiteral(value) => Ok(Literal::from(*value).into()),
EncodedTerm::IntegerLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::DecimalLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::DateLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::NaiveDateLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::NaiveTimeLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::DateTimeLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::NaiveDateTimeLiteral(value) => Ok(Literal::from(value).into()),
}
}
fn get_str(&self, id: u64) -> Result<S::StringType> { pub fn parse_date_time_str(value: &str) -> Option<EncodedTerm> {
self.string_store.get_str(id)?.ok_or_else(|| { match DateTime::parse_from_rfc3339(&value) {
format_err!( Ok(value) => Some(EncodedTerm::DateTimeLiteral(value)),
"Not able to find the string with id {} in the string store", Err(_) => NaiveDateTime::parse_from_str(&value, "%Y-%m-%dT%H:%M:%S")
id .map(EncodedTerm::NaiveDateTimeLiteral)
) .ok(),
})
} }
}
pub trait Decoder {
fn decode_term(&self, encoded: EncodedTerm) -> Result<Term>;
pub fn decode_named_or_blank_node(&self, encoded: EncodedTerm) -> Result<NamedOrBlankNode> { fn decode_named_or_blank_node(&self, encoded: EncodedTerm) -> Result<NamedOrBlankNode> {
match self.decode_term(encoded)? { match self.decode_term(encoded)? {
Term::NamedNode(named_node) => Ok(named_node.into()), Term::NamedNode(named_node) => Ok(named_node.into()),
Term::BlankNode(blank_node) => Ok(blank_node.into()), Term::BlankNode(blank_node) => Ok(blank_node.into()),
@ -991,7 +1114,7 @@ impl<S: StringStore> Encoder<S> {
} }
} }
pub fn decode_named_node(&self, encoded: EncodedTerm) -> Result<NamedNode> { fn decode_named_node(&self, encoded: EncodedTerm) -> Result<NamedNode> {
match self.decode_term(encoded)? { match self.decode_term(encoded)? {
Term::NamedNode(named_node) => Ok(named_node), Term::NamedNode(named_node) => Ok(named_node),
Term::BlankNode(_) => Err(format_err!( Term::BlankNode(_) => Err(format_err!(
@ -1003,7 +1126,7 @@ impl<S: StringStore> Encoder<S> {
} }
} }
pub fn decode_triple(&self, encoded: &EncodedQuad) -> Result<Triple> { fn decode_triple(&self, encoded: &EncodedQuad) -> Result<Triple> {
Ok(Triple::new( Ok(Triple::new(
self.decode_named_or_blank_node(encoded.subject)?, self.decode_named_or_blank_node(encoded.subject)?,
self.decode_named_node(encoded.predicate)?, self.decode_named_node(encoded.predicate)?,
@ -1011,7 +1134,7 @@ impl<S: StringStore> Encoder<S> {
)) ))
} }
pub fn decode_quad(&self, encoded: &EncodedQuad) -> Result<Quad> { fn decode_quad(&self, encoded: &EncodedQuad) -> Result<Quad> {
Ok(Quad::new( Ok(Quad::new(
self.decode_named_or_blank_node(encoded.subject)?, self.decode_named_or_blank_node(encoded.subject)?,
self.decode_named_node(encoded.predicate)?, self.decode_named_node(encoded.predicate)?,
@ -1024,14 +1147,58 @@ impl<S: StringStore> Encoder<S> {
} }
} }
impl<S: StringStore + Default> Default for Encoder<S> { impl<S: StrLookup> Decoder for S {
fn default() -> Self { fn decode_term(&self, encoded: EncodedTerm) -> Result<Term> {
Self { match encoded {
string_store: S::default(), EncodedTerm::DefaultGraph => {
Err(format_err!("The default graph tag is not a valid term"))
}
EncodedTerm::NamedNode { iri_id } => {
Ok(NamedNode::new_from_string(get_required_str(self, iri_id)?).into())
}
EncodedTerm::BlankNode { id } => Ok(BlankNode::new_from_unique_id(id).into()),
EncodedTerm::StringLiteral { value_id } => {
Ok(Literal::new_simple_literal(get_required_str(self, value_id)?).into())
}
EncodedTerm::LangStringLiteral {
value_id,
language_id,
} => Ok(Literal::new_language_tagged_literal(
get_required_str(self, value_id)?,
get_required_str(self, language_id)?,
)
.into()),
EncodedTerm::TypedLiteral {
value_id,
datatype_id,
} => Ok(Literal::new_typed_literal(
get_required_str(self, value_id)?,
NamedNode::new_from_string(get_required_str(self, datatype_id)?),
)
.into()),
EncodedTerm::BooleanLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::FloatLiteral(value) => Ok(Literal::from(*value).into()),
EncodedTerm::DoubleLiteral(value) => Ok(Literal::from(*value).into()),
EncodedTerm::IntegerLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::DecimalLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::DateLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::NaiveDateLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::NaiveTimeLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::DateTimeLiteral(value) => Ok(Literal::from(value).into()),
EncodedTerm::NaiveDateTimeLiteral(value) => Ok(Literal::from(value).into()),
} }
} }
} }
fn get_required_str<S: StrLookup>(lookup: S, id: u128) -> Result<S::StrType> {
lookup.get_str(id)?.ok_or_else(|| {
format_err!(
"Not able to find the string with id {} in the string store",
id
)
})
}
#[derive(Debug, Fail)] #[derive(Debug, Fail)]
#[fail(display = "Mutex Mutex was poisoned")] #[fail(display = "Mutex Mutex was poisoned")]
pub struct MutexPoisonError { pub struct MutexPoisonError {
@ -1048,7 +1215,7 @@ impl<T> From<PoisonError<T>> for MutexPoisonError {
#[test] #[test]
fn test_encoding() { fn test_encoding() {
let encoder: Encoder<MemoryStringStore> = Encoder::default(); let store = MemoryStrStore::default();
let terms: Vec<Term> = vec![ let terms: Vec<Term> = vec![
NamedNode::new_from_string("http://foo.com").into(), NamedNode::new_from_string("http://foo.com").into(),
NamedNode::new_from_string("http://bar.com").into(), NamedNode::new_from_string("http://bar.com").into(),
@ -1063,14 +1230,8 @@ fn test_encoding() {
Literal::new_language_tagged_literal("foo", "FR").into(), Literal::new_language_tagged_literal("foo", "FR").into(),
]; ];
for term in terms { for term in terms {
let encoded = encoder.encode_term(&term).unwrap(); let encoded = store.encode_term(&term).unwrap();
assert_eq!(term, encoder.decode_term(encoded).unwrap()) assert_eq!(term, store.decode_term(encoded).unwrap());
assert_eq!(encoded, EncodedTerm::from(&term));
} }
} }
#[test]
fn test_encoded_term_size() {
use std::mem::size_of;
assert_eq!(size_of::<EncodedTerm>(), 24);
}

@ -1,8 +1,6 @@
use crate::store::numeric_encoder::*; use crate::store::numeric_encoder::*;
use crate::store::{Store, StoreConnection, StoreRepositoryConnection}; use crate::store::{Store, StoreConnection, StoreRepositoryConnection};
use crate::{Repository, Result}; use crate::{Repository, Result};
use byteorder::ByteOrder;
use byteorder::LittleEndian;
use failure::format_err; use failure::format_err;
use rocksdb::ColumnFamily; use rocksdb::ColumnFamily;
use rocksdb::DBCompactionStyle; use rocksdb::DBCompactionStyle;
@ -16,7 +14,6 @@ use std::iter::{empty, once};
use std::ops::Deref; use std::ops::Deref;
use std::path::Path; use std::path::Path;
use std::str; use std::str;
use std::sync::Mutex;
/// `Repository` implementation based on the [RocksDB](https://rocksdb.org/) key-value store /// `Repository` implementation based on the [RocksDB](https://rocksdb.org/) key-value store
/// ///
@ -55,7 +52,6 @@ pub struct RocksDbRepository {
pub type RocksDbRepositoryConnection<'a> = StoreRepositoryConnection<RocksDbStoreConnection<'a>>; pub type RocksDbRepositoryConnection<'a> = StoreRepositoryConnection<RocksDbStoreConnection<'a>>;
const ID2STR_CF: &str = "id2str"; const ID2STR_CF: &str = "id2str";
const STR2ID_CF: &str = "id2str";
const SPOG_CF: &str = "spog"; const SPOG_CF: &str = "spog";
const POSG_CF: &str = "posg"; const POSG_CF: &str = "posg";
const OSPG_CF: &str = "ospg"; const OSPG_CF: &str = "ospg";
@ -67,13 +63,12 @@ const EMPTY_BUF: [u8; 0] = [0 as u8; 0];
//TODO: indexes for the default graph and indexes for the named graphs (no more Optional and space saving) //TODO: indexes for the default graph and indexes for the named graphs (no more Optional and space saving)
const COLUMN_FAMILIES: [&str; 8] = [ const COLUMN_FAMILIES: [&str; 7] = [
ID2STR_CF, STR2ID_CF, SPOG_CF, POSG_CF, OSPG_CF, GSPO_CF, GPOS_CF, GOSP_CF, ID2STR_CF, SPOG_CF, POSG_CF, OSPG_CF, GSPO_CF, GPOS_CF, GOSP_CF,
]; ];
struct RocksDbStore { struct RocksDbStore {
db: DB, db: DB,
str_id_counter: Mutex<RocksDBCounter>,
} }
#[derive(Clone)] #[derive(Clone)]
@ -81,7 +76,6 @@ pub struct RocksDbStoreConnection<'a> {
store: &'a RocksDbStore, store: &'a RocksDbStore,
buffer: Vec<u8>, buffer: Vec<u8>,
id2str_cf: ColumnFamily<'a>, id2str_cf: ColumnFamily<'a>,
str2id_cf: ColumnFamily<'a>,
spog_cf: ColumnFamily<'a>, spog_cf: ColumnFamily<'a>,
posg_cf: ColumnFamily<'a>, posg_cf: ColumnFamily<'a>,
ospg_cf: ColumnFamily<'a>, ospg_cf: ColumnFamily<'a>,
@ -115,7 +109,6 @@ impl RocksDbStore {
let new = Self { let new = Self {
db: DB::open_cf(&options, path, &COLUMN_FAMILIES)?, db: DB::open_cf(&options, path, &COLUMN_FAMILIES)?,
str_id_counter: Mutex::new(RocksDBCounter::new("bsc")),
}; };
(&new).connection()?.set_first_strings()?; (&new).connection()?.set_first_strings()?;
Ok(new) Ok(new)
@ -130,7 +123,6 @@ impl<'a> Store for &'a RocksDbStore {
store: self, store: self,
buffer: Vec::default(), buffer: Vec::default(),
id2str_cf: get_cf(&self.db, ID2STR_CF)?, id2str_cf: get_cf(&self.db, ID2STR_CF)?,
str2id_cf: get_cf(&self.db, STR2ID_CF)?,
spog_cf: get_cf(&self.db, SPOG_CF)?, spog_cf: get_cf(&self.db, SPOG_CF)?,
posg_cf: get_cf(&self.db, POSG_CF)?, posg_cf: get_cf(&self.db, POSG_CF)?,
ospg_cf: get_cf(&self.db, OSPG_CF)?, ospg_cf: get_cf(&self.db, OSPG_CF)?,
@ -141,42 +133,24 @@ impl<'a> Store for &'a RocksDbStore {
} }
} }
impl StringStore for RocksDbStoreConnection<'_> { impl StrLookup for RocksDbStoreConnection<'_> {
type StringType = RocksString; type StrType = RocksString;
fn get_str(&self, id: u64) -> Result<Option<RocksString>> { fn get_str(&self, id: u128) -> Result<Option<RocksString>> {
Ok(self Ok(self
.store .store
.db .db
.get_cf(self.id2str_cf, &to_bytes(id))? .get_cf(self.id2str_cf, &id.to_le_bytes())?
.map(|v| RocksString { vec: v })) .map(|v| RocksString { vec: v }))
} }
}
fn get_str_id(&self, value: &str) -> Result<Option<u64>> { impl StrContainer for RocksDbStoreConnection<'_> {
Ok(self fn insert_str(&self, key: u128, value: &str) -> Result<()> {
.store self.store
.db .db
.get_cf(self.str2id_cf, value.as_bytes())? .put_cf(self.id2str_cf, &key.to_le_bytes(), value)?;
.map(|id| LittleEndian::read_u64(&id))) Ok(())
}
fn insert_str(&self, value: &str) -> Result<u64> {
Ok(if let Some(id) = self.get_str_id(value)? {
id
} else {
let id = self
.store
.str_id_counter
.lock()
.map_err(MutexPoisonError::from)?
.get_and_increment(&self.store.db)? as u64;
let id_bytes = to_bytes(id);
let mut batch = WriteBatch::default();
batch.put_cf(self.id2str_cf, &id_bytes, value)?;
batch.put_cf(self.str2id_cf, value, &id_bytes)?;
self.store.db.write(batch)?;
id
})
} }
} }
@ -530,24 +504,6 @@ fn wrap_error<'a, E: 'a, I: Iterator<Item = Result<E>> + 'a>(
} }
} }
struct RocksDBCounter {
name: &'static str,
}
impl RocksDBCounter {
fn new(name: &'static str) -> Self {
Self { name }
}
fn get_and_increment(&self, db: &DB) -> Result<u64> {
let value = db
.get(self.name.as_bytes())?
.map_or(0, |b| LittleEndian::read_u64(&b));
db.put(self.name.as_bytes(), &to_bytes(value + 1))?;
Ok(value)
}
}
struct EncodedQuadPattern { struct EncodedQuadPattern {
subject: Option<EncodedTerm>, subject: Option<EncodedTerm>,
predicate: Option<EncodedTerm>, predicate: Option<EncodedTerm>,
@ -740,12 +696,6 @@ impl<I: Iterator<Item = Result<EncodedQuad>>> Iterator for FilteringEncodedQuads
} }
} }
fn to_bytes(int: u64) -> [u8; 8] {
let mut buf = [0 as u8; 8];
LittleEndian::write_u64(&mut buf, int);
buf
}
pub struct RocksString { pub struct RocksString {
vec: DBVector, vec: DBVector,
} }

Loading…
Cancel
Save