From f7cc600054e1883764e3b78def48cf639ddbb193 Mon Sep 17 00:00:00 2001 From: Tpt Date: Sat, 19 Mar 2022 12:05:20 +0100 Subject: [PATCH] Introduces a TermEncoder struct Will allow configuring term encoding in the future --- lib/oxrdf/src/triple.rs | 34 +++++ lib/src/sparql/dataset.rs | 21 ++- lib/src/sparql/update.rs | 11 +- lib/src/storage/binary_encoder.rs | 5 +- lib/src/storage/mod.rs | 28 ++-- lib/src/storage/numeric_encoder.rs | 211 +++++++++++++---------------- lib/src/store.rs | 40 +++--- 7 files changed, 191 insertions(+), 159 deletions(-) diff --git a/lib/oxrdf/src/triple.rs b/lib/oxrdf/src/triple.rs index d91c287c..34327cc6 100644 --- a/lib/oxrdf/src/triple.rs +++ b/lib/oxrdf/src/triple.rs @@ -921,6 +921,23 @@ impl From> for GraphName { } } +impl From for GraphName { + #[inline] + fn from(node: NamedOrBlankNode) -> Self { + match node { + NamedOrBlankNode::NamedNode(node) => node.into(), + NamedOrBlankNode::BlankNode(node) => node.into(), + } + } +} + +impl From> for GraphName { + #[inline] + fn from(node: NamedOrBlankNodeRef<'_>) -> Self { + node.into_owned().into() + } +} + /// A possible borrowed graph name. /// It is the union of [IRIs](https://www.w3.org/TR/rdf11-concepts/#dfn-iri), [blank nodes](https://www.w3.org/TR/rdf11-concepts/#dfn-blank-node), and the [default graph name](https://www.w3.org/TR/rdf11-concepts/#dfn-default-graph). #[derive(Eq, PartialEq, Debug, Clone, Copy, Hash)] @@ -995,6 +1012,23 @@ impl<'a> From<&'a BlankNode> for GraphNameRef<'a> { } } +impl<'a> From> for GraphNameRef<'a> { + #[inline] + fn from(node: NamedOrBlankNodeRef<'a>) -> Self { + match node { + NamedOrBlankNodeRef::NamedNode(node) => node.into(), + NamedOrBlankNodeRef::BlankNode(node) => node.into(), + } + } +} + +impl<'a> From<&'a NamedOrBlankNode> for GraphNameRef<'a> { + #[inline] + fn from(node: &'a NamedOrBlankNode) -> Self { + node.as_ref().into() + } +} + impl<'a> From<&'a GraphName> for GraphNameRef<'a> { #[inline] fn from(node: &'a GraphName) -> Self { diff --git a/lib/src/sparql/dataset.rs b/lib/src/sparql/dataset.rs index 9ea0f76a..9d799964 100644 --- a/lib/src/sparql/dataset.rs +++ b/lib/src/sparql/dataset.rs @@ -18,13 +18,20 @@ pub struct DatasetView { impl DatasetView { pub fn new(reader: StorageReader, dataset: &QueryDataset) -> Self { + let encoder = reader.term_encoder(); let dataset = EncodedDatasetSpec { - default: dataset - .default_graph_graphs() - .map(|graphs| graphs.iter().map(|g| g.as_ref().into()).collect::>()), - named: dataset - .available_named_graphs() - .map(|graphs| graphs.iter().map(|g| g.as_ref().into()).collect::>()), + default: dataset.default_graph_graphs().map(|graphs| { + graphs + .iter() + .map(|g| encoder.encode_graph_name(g)) + .collect::>() + }), + named: dataset.available_named_graphs().map(|graphs| { + graphs + .iter() + .map(|g| encoder.encode_term(g)) + .collect::>() + }), }; Self { reader, @@ -152,7 +159,7 @@ impl DatasetView { pub fn encode_term<'a>(&self, term: impl Into>) -> EncodedTerm { let term = term.into(); - let encoded = term.into(); + let encoded = self.reader.term_encoder().encode_term(term); insert_term(term, &encoded, &mut |key, value| { self.insert_str(key, value); Ok(()) diff --git a/lib/src/sparql/update.rs b/lib/src/sparql/update.rs index 1c2f53ed..0cdbcb1a 100644 --- a/lib/src/sparql/update.rs +++ b/lib/src/sparql/update.rs @@ -191,11 +191,12 @@ impl<'a, 'b: 'a> SimpleUpdateEvaluator<'a, 'b> { fn eval_clear(&mut self, graph: &GraphTarget, silent: bool) -> Result<(), EvaluationError> { match graph { GraphTarget::NamedNode(graph_name) => { - if self - .transaction - .reader() - .contains_named_graph(&graph_name.as_ref().into())? - { + if self.transaction.reader().contains_named_graph( + &self + .transaction + .term_encoder() + .encode_graph_name(graph_name), + )? { Ok(self.transaction.clear_graph(graph_name.into())?) } else if silent { Ok(()) diff --git a/lib/src/storage/binary_encoder.rs b/lib/src/storage/binary_encoder.rs index babd3547..32205a8d 100644 --- a/lib/src/storage/binary_encoder.rs +++ b/lib/src/storage/binary_encoder.rs @@ -744,9 +744,10 @@ mod tests { ]; let decoder = TermDecoder::new(&store); for term in terms { - let encoded = term.as_ref().into(); + let encoder = TermEncoder::new(); + let encoded = encoder.encode_term(&term); + assert_eq!(encoded, encoder.encode_term(&term)); store.insert_term(term.as_ref(), &encoded); - assert_eq!(encoded, term.as_ref().into()); assert_eq!(term, decoder.decode_term(&encoded).unwrap()); let mut buffer = Vec::new(); diff --git a/lib/src/storage/mod.rs b/lib/src/storage/mod.rs index b6859381..391caad6 100644 --- a/lib/src/storage/mod.rs +++ b/lib/src/storage/mod.rs @@ -10,7 +10,7 @@ use crate::storage::binary_encoder::{ }; pub use crate::storage::error::{CorruptionError, LoaderError, SerializerError, StorageError}; use crate::storage::numeric_encoder::{ - insert_term, EncodedQuad, EncodedTerm, StrHash, StrLookup, TermDecoder, + insert_term, EncodedQuad, EncodedTerm, StrHash, StrLookup, TermDecoder, TermEncoder, }; use backend::{ColumnFamily, ColumnFamilyDefinition, Db, Iter}; use std::cmp::{max, min}; @@ -294,6 +294,11 @@ pub struct StorageReader { } impl StorageReader { + #[allow(clippy::unused_self)] + pub fn term_encoder(&self) -> TermEncoder { + TermEncoder::new() + } + pub fn term_decoder(&self) -> TermDecoder { TermDecoder::new(self) } @@ -852,8 +857,13 @@ impl<'a> StorageWriter<'a> { } } + #[allow(clippy::unused_self)] + pub fn term_encoder(&self) -> TermEncoder { + TermEncoder::new() + } + pub fn insert(&mut self, quad: QuadRef<'_>) -> Result { - let encoded = quad.into(); + let encoded = self.term_encoder().encode_quad(quad); self.buffer.clear(); let result = if quad.graph_name.is_default_graph() { write_spo_quad(&mut self.buffer, &encoded); @@ -941,7 +951,7 @@ impl<'a> StorageWriter<'a> { &mut self, graph_name: NamedOrBlankNodeRef<'_>, ) -> Result { - let encoded_graph_name = graph_name.into(); + let encoded_graph_name = self.term_encoder().encode_graph_name(graph_name); self.buffer.clear(); write_term(&mut self.buffer, &encoded_graph_name); @@ -1005,7 +1015,7 @@ impl<'a> StorageWriter<'a> { } pub fn remove(&mut self, quad: QuadRef<'_>) -> Result { - self.remove_encoded(&quad.into()) + self.remove_encoded(&self.term_encoder().encode_quad(quad)) } fn remove_encoded(&mut self, quad: &EncodedQuad) -> Result { @@ -1081,14 +1091,15 @@ impl<'a> StorageWriter<'a> { self.remove_encoded(&quad?)?; } } else { + let graph_name = self.term_encoder().encode_graph_name(graph_name); self.buffer.clear(); - write_term(&mut self.buffer, &graph_name.into()); + write_term(&mut self.buffer, &graph_name); if self .transaction .contains_key_for_update(&self.storage.graphs_cf, &self.buffer)? { // The condition is useful to lock the graph itself and ensure no quad is inserted at the same time - for quad in self.reader().quads_for_graph(&graph_name.into()) { + for quad in self.reader().quads_for_graph(&graph_name) { self.remove_encoded(&quad?)?; } } @@ -1114,7 +1125,7 @@ impl<'a> StorageWriter<'a> { &mut self, graph_name: NamedOrBlankNodeRef<'_>, ) -> Result { - self.remove_encoded_named_graph(&graph_name.into()) + self.remove_encoded_named_graph(&self.term_encoder().encode_graph_name(graph_name)) } fn remove_encoded_named_graph( @@ -1327,8 +1338,9 @@ impl FileBulkLoader { } fn encode(&mut self, quads: impl IntoIterator) -> Result<(), StorageError> { + let encoder = TermEncoder::new(); for quad in quads { - let encoded = EncodedQuad::from(quad.as_ref()); + let encoded = encoder.encode_quad(&quad); if quad.graph_name.is_default_graph() { if self.triples.insert(encoded.clone()) { self.insert_term(quad.subject.as_ref().into(), &encoded.subject)?; diff --git a/lib/src/storage/numeric_encoder.rs b/lib/src/storage/numeric_encoder.rs index d3f92729..99f97fb4 100644 --- a/lib/src/storage/numeric_encoder.rs +++ b/lib/src/storage/numeric_encoder.rs @@ -429,33 +429,88 @@ impl From for EncodedTerm { } } -impl From> for EncodedTerm { - fn from(named_node: NamedNodeRef<'_>) -> Self { - Self::NamedNode { - iri_id: StrHash::new(named_node.as_str()), +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub struct EncodedTriple { + pub subject: EncodedTerm, + pub predicate: EncodedTerm, + pub object: EncodedTerm, +} + +impl EncodedTriple { + pub fn new(subject: EncodedTerm, predicate: EncodedTerm, object: EncodedTerm) -> Self { + Self { + subject, + predicate, + object, } } } -impl From> for EncodedTerm { - fn from(blank_node: BlankNodeRef<'_>) -> Self { +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub struct EncodedQuad { + pub subject: EncodedTerm, + pub predicate: EncodedTerm, + pub object: EncodedTerm, + pub graph_name: EncodedTerm, +} + +impl EncodedQuad { + pub fn new( + subject: EncodedTerm, + predicate: EncodedTerm, + object: EncodedTerm, + graph_name: EncodedTerm, + ) -> Self { + Self { + subject, + predicate, + object, + graph_name, + } + } +} + +pub struct TermEncoder {} + +impl TermEncoder { + pub fn new() -> Self { + Self {} + } + + pub fn encode_term<'a>(&self, term: impl Into>) -> EncodedTerm { + match term.into() { + TermRef::NamedNode(named_node) => self.encode_named_node(named_node), + TermRef::BlankNode(blank_node) => self.encode_blank_node(blank_node), + TermRef::Literal(literal) => self.encode_literal(literal), + TermRef::Triple(triple) => self.encode_triple(triple.as_ref()).into(), + } + } + + #[allow(clippy::unused_self)] + fn encode_named_node(&self, named_node: NamedNodeRef<'_>) -> EncodedTerm { + EncodedTerm::NamedNode { + iri_id: StrHash::new(named_node.as_str()), + } + } + + #[allow(clippy::unused_self)] + fn encode_blank_node(&self, blank_node: BlankNodeRef<'_>) -> EncodedTerm { if let Some(id) = blank_node.unique_id() { - Self::NumericalBlankNode { id } + EncodedTerm::NumericalBlankNode { id } } else { let id = blank_node.as_str(); if let Ok(id) = id.try_into() { - Self::SmallBlankNode(id) + EncodedTerm::SmallBlankNode(id) } else { - Self::BigBlankNode { + EncodedTerm::BigBlankNode { id_id: StrHash::new(id), } } } } -} -impl From> for EncodedTerm { - fn from(literal: LiteralRef<'_>) -> Self { + #[allow(clippy::unused_self)] + fn encode_literal(&self, literal: LiteralRef<'_>) -> EncodedTerm { let value = literal.value(); let datatype = literal.datatype().as_str(); let native_encoding = match datatype { @@ -463,20 +518,20 @@ impl From> for EncodedTerm { literal.language().map(|language| { if let Ok(value) = SmallString::try_from(value) { if let Ok(language) = SmallString::try_from(language) { - Self::SmallSmallLangStringLiteral { value, language } + EncodedTerm::SmallSmallLangStringLiteral { value, language } } else { - Self::SmallBigLangStringLiteral { + EncodedTerm::SmallBigLangStringLiteral { value, language_id: StrHash::new(language), } } } else if let Ok(language) = SmallString::try_from(language) { - Self::BigSmallLangStringLiteral { + EncodedTerm::BigSmallLangStringLiteral { value_id: StrHash::new(value), language, } } else { - Self::BigBigLangStringLiteral { + EncodedTerm::BigBigLangStringLiteral { value_id: StrHash::new(value), language_id: StrHash::new(language), } @@ -487,9 +542,9 @@ impl From> for EncodedTerm { "http://www.w3.org/2001/XMLSchema#string" => { let value = value; Some(if let Ok(value) = SmallString::try_from(value) { - Self::SmallStringLiteral(value) + EncodedTerm::SmallStringLiteral(value) } else { - Self::BigStringLiteral { + EncodedTerm::BigStringLiteral { value_id: StrHash::new(value), } }) @@ -532,12 +587,12 @@ impl From> for EncodedTerm { Some(term) => term, None => { if let Ok(value) = SmallString::try_from(value) { - Self::SmallTypedLiteral { + EncodedTerm::SmallTypedLiteral { value, datatype_id: StrHash::new(datatype), } } else { - Self::BigTypedLiteral { + EncodedTerm::BigTypedLiteral { value_id: StrHash::new(value), datatype_id: StrHash::new(datatype), } @@ -545,112 +600,30 @@ impl From> for EncodedTerm { } } } -} - -impl From> for EncodedTerm { - fn from(term: NamedOrBlankNodeRef<'_>) -> Self { - match term { - NamedOrBlankNodeRef::NamedNode(named_node) => named_node.into(), - NamedOrBlankNodeRef::BlankNode(blank_node) => blank_node.into(), - } - } -} - -impl From> for EncodedTerm { - fn from(term: SubjectRef<'_>) -> Self { - match term { - SubjectRef::NamedNode(named_node) => named_node.into(), - SubjectRef::BlankNode(blank_node) => blank_node.into(), - SubjectRef::Triple(triple) => triple.as_ref().into(), - } - } -} - -impl From> for EncodedTerm { - fn from(term: TermRef<'_>) -> Self { - match term { - TermRef::NamedNode(named_node) => named_node.into(), - TermRef::BlankNode(blank_node) => blank_node.into(), - TermRef::Literal(literal) => literal.into(), - TermRef::Triple(triple) => triple.as_ref().into(), - } - } -} -impl From> for EncodedTerm { - fn from(name: GraphNameRef<'_>) -> Self { - match name { - GraphNameRef::NamedNode(named_node) => named_node.into(), - GraphNameRef::BlankNode(blank_node) => blank_node.into(), - GraphNameRef::DefaultGraph => Self::DefaultGraph, + pub fn encode_graph_name<'a>(&self, name: impl Into>) -> EncodedTerm { + match name.into() { + GraphNameRef::NamedNode(named_node) => self.encode_named_node(named_node), + GraphNameRef::BlankNode(blank_node) => self.encode_blank_node(blank_node), + GraphNameRef::DefaultGraph => EncodedTerm::DefaultGraph, } } -} - -impl From> for EncodedTerm { - fn from(triple: TripleRef<'_>) -> Self { - Self::Triple(Rc::new(triple.into())) - } -} - -#[derive(Eq, PartialEq, Debug, Clone, Hash)] -pub struct EncodedTriple { - pub subject: EncodedTerm, - pub predicate: EncodedTerm, - pub object: EncodedTerm, -} -impl EncodedTriple { - pub fn new(subject: EncodedTerm, predicate: EncodedTerm, object: EncodedTerm) -> Self { - Self { - subject, - predicate, - object, + fn encode_triple(&self, triple: TripleRef<'_>) -> EncodedTriple { + EncodedTriple { + subject: self.encode_term(triple.subject), + predicate: self.encode_term(triple.predicate), + object: self.encode_term(triple.object), } } -} -impl From> for EncodedTriple { - fn from(triple: TripleRef<'_>) -> Self { - Self { - subject: triple.subject.into(), - predicate: triple.predicate.into(), - object: triple.object.into(), - } - } -} - -#[derive(Eq, PartialEq, Debug, Clone, Hash)] -pub struct EncodedQuad { - pub subject: EncodedTerm, - pub predicate: EncodedTerm, - pub object: EncodedTerm, - pub graph_name: EncodedTerm, -} - -impl EncodedQuad { - pub fn new( - subject: EncodedTerm, - predicate: EncodedTerm, - object: EncodedTerm, - graph_name: EncodedTerm, - ) -> Self { - Self { - subject, - predicate, - object, - graph_name, - } - } -} - -impl From> for EncodedQuad { - fn from(quad: QuadRef<'_>) -> Self { - Self { - subject: quad.subject.into(), - predicate: quad.predicate.into(), - object: quad.object.into(), - graph_name: quad.graph_name.into(), + pub fn encode_quad<'a>(&self, quad: impl Into>) -> EncodedQuad { + let quad = quad.into(); + EncodedQuad { + subject: self.encode_term(quad.subject), + predicate: self.encode_term(quad.predicate), + object: self.encode_term(quad.object), + graph_name: self.encode_graph_name(quad.graph_name), } } } diff --git a/lib/src/store.rs b/lib/src/store.rs index eb57e097..10cf8722 100644 --- a/lib/src/store.rs +++ b/lib/src/store.rs @@ -32,7 +32,6 @@ use crate::sparql::{ evaluate_query, evaluate_update, EvaluationError, Query, QueryOptions, QueryResults, Update, UpdateOptions, }; -use crate::storage::numeric_encoder::{EncodedQuad, EncodedTerm}; #[cfg(not(target_arch = "wasm32"))] use crate::storage::StorageBulkLoader; use crate::storage::{ @@ -184,12 +183,13 @@ impl Store { graph_name: Option>, ) -> QuadIter { let reader = self.storage.snapshot(); + let encoder = reader.term_encoder(); QuadIter { iter: reader.quads_for_pattern( - subject.map(EncodedTerm::from).as_ref(), - predicate.map(EncodedTerm::from).as_ref(), - object.map(EncodedTerm::from).as_ref(), - graph_name.map(EncodedTerm::from).as_ref(), + subject.map(|t| encoder.encode_term(t)).as_ref(), + predicate.map(|t| encoder.encode_term(t)).as_ref(), + object.map(|t| encoder.encode_term(t)).as_ref(), + graph_name.map(|t| encoder.encode_graph_name(t)).as_ref(), ), reader, } @@ -236,8 +236,9 @@ impl Store { /// # Result::<_, Box>::Ok(()) /// ``` pub fn contains<'a>(&self, quad: impl Into>) -> Result { - let quad = EncodedQuad::from(quad.into()); - self.storage.snapshot().contains(&quad) + let reader = self.storage.snapshot(); + let quad = reader.term_encoder().encode_quad(quad); + reader.contains(&quad) } /// Returns the number of quads in the store. @@ -611,8 +612,9 @@ impl Store { &self, graph_name: impl Into>, ) -> Result { - let graph_name = EncodedTerm::from(graph_name.into()); - self.storage.snapshot().contains_named_graph(&graph_name) + let reader = self.storage.snapshot(); + let graph_name = reader.term_encoder().encode_graph_name(graph_name.into()); + reader.contains_named_graph(&graph_name) } /// Inserts a graph into this store. @@ -899,12 +901,13 @@ impl<'a> Transaction<'a> { graph_name: Option>, ) -> QuadIter { let reader = self.writer.reader(); + let encoder = reader.term_encoder(); QuadIter { iter: reader.quads_for_pattern( - subject.map(EncodedTerm::from).as_ref(), - predicate.map(EncodedTerm::from).as_ref(), - object.map(EncodedTerm::from).as_ref(), - graph_name.map(EncodedTerm::from).as_ref(), + subject.map(|t| encoder.encode_term(t)).as_ref(), + predicate.map(|t| encoder.encode_term(t)).as_ref(), + object.map(|t| encoder.encode_term(t)).as_ref(), + graph_name.map(|t| encoder.encode_graph_name(t)).as_ref(), ), reader, } @@ -917,8 +920,9 @@ impl<'a> Transaction<'a> { /// Checks if this store contains a given quad. pub fn contains<'b>(&self, quad: impl Into>) -> Result { - let quad = EncodedQuad::from(quad.into()); - self.writer.reader().contains(&quad) + let reader = self.writer.reader(); + let quad = reader.term_encoder().encode_quad(quad); + reader.contains(&quad) } /// Returns the number of quads in the store. @@ -1125,9 +1129,9 @@ impl<'a> Transaction<'a> { &self, graph_name: impl Into>, ) -> Result { - self.writer - .reader() - .contains_named_graph(&EncodedTerm::from(graph_name.into())) + let reader = self.writer.reader(); + let graph_name = reader.term_encoder().encode_graph_name(graph_name.into()); + reader.contains_named_graph(&graph_name) } /// Inserts a graph into this store.