commit c20417c18c1209d5f2c61c998bd0c81d1e017016 Author: Niko PLP Date: Fri May 17 02:34:33 2024 +0300 Squashed 'oxigraph/' content from commit c5e23fb0 git-subtree-dir: oxigraph git-subtree-split: c5e23fb0dd1a6b6cc36a4d021276df46f55bc6e4 diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..52caf88 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,59 @@ +[package] +name = "oxigraph" +version.workspace = true +authors.workspace = true +license.workspace = true +readme = "README.md" +keywords = ["RDF", "SPARQL", "graph-database", "database"] +categories = ["database-implementations"] +repository = "https://github.com/oxigraph/oxigraph/tree/main/lib/oxigraph" +homepage = "https://oxigraph.org/" +documentation = "https://docs.rs/oxigraph" +description = """ +a SPARQL database and RDF toolkit +""" +edition.workspace = true +rust-version.workspace = true + +[features] +js = ["getrandom/js", "oxsdatatypes/js", "js-sys"] + + +[dependencies] +digest.workspace = true +hex.workspace = true +json-event-parser.workspace = true +md-5.workspace = true +oxilangtag.workspace = true +oxiri.workspace = true +oxrdf = { workspace = true, features = ["rdf-star", "oxsdatatypes"] } +oxrdfio = { workspace = true, features = ["rdf-star"] } +oxsdatatypes.workspace = true +rand.workspace = true +regex.workspace = true +sha1.workspace = true +sha2.workspace = true +siphasher.workspace = true +sparesults = { workspace = true, features = ["rdf-star"] } +spargebra = { workspace = true, features = ["rdf-star", "sep-0002", "sep-0006"] } +sparopt = { workspace = true, features = ["rdf-star", "sep-0002", "sep-0006"] } +thiserror.workspace = true + +[target.'cfg(not(target_family = "wasm"))'.dependencies] +libc = "0.2" +rocksdb.workspace = true + +[target.'cfg(all(target_family = "wasm", target_os = "unknown"))'.dependencies] +getrandom.workspace = true +js-sys = { workspace = true, optional = true } + +[target.'cfg(not(target_family = "wasm"))'.dev-dependencies] +codspeed-criterion-compat.workspace = true +zstd.workspace = true + +[lints] +workspace = true + +[package.metadata.docs.rs] +rustdoc-args = ["--cfg", "docsrs"] + diff --git a/README.md b/README.md new file mode 100644 index 0000000..293d7cd --- /dev/null +++ b/README.md @@ -0,0 +1,82 @@ +Oxigraph +======== + +[![Latest Version](https://img.shields.io/crates/v/oxigraph.svg)](https://crates.io/crates/oxigraph) +[![Released API docs](https://docs.rs/oxigraph/badge.svg)](https://docs.rs/oxigraph) +[![Crates.io downloads](https://img.shields.io/crates/d/oxigraph)](https://crates.io/crates/oxigraph) +[![actions status](https://github.com/oxigraph/oxigraph/workflows/build/badge.svg)](https://github.com/oxigraph/oxigraph/actions) +[![Gitter](https://badges.gitter.im/oxigraph/community.svg)](https://gitter.im/oxigraph/community) + +Oxigraph is a graph database library implementing the [SPARQL](https://www.w3.org/TR/sparql11-overview/) standard. + +Its goal is to provide a compliant, safe and fast on-disk graph database. +It also provides a set of utility functions for reading, writing, and processing RDF files. + +Oxigraph is in heavy development and SPARQL query evaluation has not been optimized yet. + +Oxigraph also provides [a CLI tool](https://crates.io/crates/oxigraph-cli) and [a Python library](https://pyoxigraph.readthedocs.io/) based on this library. 
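+
+The reading and writing utilities mentioned above live in the [`oxigraph::io`](crate::io) module. A minimal sketch of streaming an N-Triples document (the IRIs are illustrative):
+```rust
+use oxigraph::io::{RdfFormat, RdfParser};
+
+let file = b"<http://example.com/s> <http://example.com/p> <http://example.com/o> .";
+for quad in RdfParser::from_format(RdfFormat::NTriples).parse_read(file.as_ref()) {
+    println!("{}", quad.unwrap());
+}
+```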
+
+
+Oxigraph implements the following specifications:
+* [SPARQL 1.1 Query](https://www.w3.org/TR/sparql11-query/), [SPARQL 1.1 Update](https://www.w3.org/TR/sparql11-update/), and [SPARQL 1.1 Federated Query](https://www.w3.org/TR/sparql11-federated-query/).
+* [Turtle](https://www.w3.org/TR/turtle/), [TriG](https://www.w3.org/TR/trig/), [N-Triples](https://www.w3.org/TR/n-triples/), [N-Quads](https://www.w3.org/TR/n-quads/), and [RDF/XML](https://www.w3.org/TR/rdf-syntax-grammar/) RDF serialization formats for both data ingestion and retrieval.
+* [SPARQL Query Results XML Format](https://www.w3.org/TR/rdf-sparql-XMLres/), [SPARQL 1.1 Query Results JSON Format](https://www.w3.org/TR/sparql11-results-json/), and [SPARQL 1.1 Query Results CSV and TSV Formats](https://www.w3.org/TR/sparql11-results-csv-tsv/).
+
+A preliminary benchmark [is provided](../bench/README.md). Oxigraph's internal design [is described on the wiki](https://github.com/oxigraph/oxigraph/wiki/Architecture).
+
+The main entry point of Oxigraph is the [`Store`](store::Store) struct:
+```rust
+use oxigraph::store::Store;
+use oxigraph::model::*;
+use oxigraph::sparql::QueryResults;
+
+let store = Store::new().unwrap();
+
+// insertion
+let ex = NamedNode::new("http://example.com").unwrap();
+let quad = Quad::new(ex.clone(), ex.clone(), ex.clone(), GraphName::DefaultGraph);
+store.insert(&quad).unwrap();
+
+// quad filter
+let results = store.quads_for_pattern(Some(ex.as_ref().into()), None, None, None).collect::<Result<Vec<Quad>, _>>().unwrap();
+assert_eq!(vec![quad], results);
+
+// SPARQL query
+if let QueryResults::Solutions(mut solutions) = store.query("SELECT ?s WHERE { ?s ?p ?o }").unwrap() {
+    assert_eq!(solutions.next().unwrap().unwrap().get("s"), Some(&ex.into()));
+}
+```
+
+Oxigraph is based on these crates, which can also be used on their own:
+* [`oxrdf`](https://crates.io/crates/oxrdf), data structures encoding RDF basic concepts (the [`oxigraph::model`](crate::model) module).
+* [`oxrdfio`](https://crates.io/crates/oxrdfio), a unified parser and serializer API for RDF formats (the [`oxigraph::io`](crate::io) module). It itself relies on:
+  * [`oxttl`](https://crates.io/crates/oxttl), N-Triples, N-Quads, Turtle, TriG, and N3 parsing and serialization.
+  * [`oxrdfxml`](https://crates.io/crates/oxrdfxml), RDF/XML parsing and serialization.
+* [`spargebra`](https://crates.io/crates/spargebra), a SPARQL parser.
+* [`sparesults`](https://crates.io/crates/sparesults), parsers and serializers for SPARQL result formats (the [`oxigraph::sparql::results`](crate::sparql::results) module).
+* [`sparopt`](https://crates.io/crates/sparopt), a SPARQL optimizer.
+* [`oxsdatatypes`](https://crates.io/crates/oxsdatatypes), an implementation of some XML Schema datatypes.
+
+To build the library locally, clone the repository together with its submodules using `git clone --recursive https://github.com/oxigraph/oxigraph.git`, or run `git submodule update --init` in an already cloned repository.
+
+It is possible to disable the RocksDB storage backend and only use the in-memory fallback by disabling the `rocksdb` default feature:
+```toml
+oxigraph = { version = "*", default-features = false }
+```
+This is the default behavior when compiling Oxigraph to WASM.
+
+## License
+
+This project is licensed under either of
+
+* Apache License, Version 2.0, ([LICENSE-APACHE](../LICENSE-APACHE) or
+  `<http://www.apache.org/licenses/LICENSE-2.0>`)
+* MIT license ([LICENSE-MIT](../LICENSE-MIT) or
+  `<http://opensource.org/licenses/MIT>`)
+
+at your option.
+ + +### Contribution + +Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in Oxigraph by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. diff --git a/src/io/format.rs b/src/io/format.rs new file mode 100644 index 0000000..08b61d8 --- /dev/null +++ b/src/io/format.rs @@ -0,0 +1,301 @@ +#![allow(deprecated)] + +use oxrdfio::{RdfFormat, RdfParser, RdfSerializer}; + +/// [RDF graph](https://www.w3.org/TR/rdf11-concepts/#dfn-rdf-graph) serialization formats. +/// +/// This enumeration is non exhaustive. New formats like JSON-LD will be added in the future. +#[derive(Eq, PartialEq, Debug, Clone, Copy, Hash)] +#[non_exhaustive] +#[deprecated(note = "use RdfFormat instead", since = "0.4.0")] +pub enum GraphFormat { + /// [N-Triples](https://www.w3.org/TR/n-triples/) + NTriples, + /// [Turtle](https://www.w3.org/TR/turtle/) + Turtle, + /// [RDF/XML](https://www.w3.org/TR/rdf-syntax-grammar/) + RdfXml, +} + +impl GraphFormat { + /// The format canonical IRI according to the [Unique URIs for file formats registry](https://www.w3.org/ns/formats/). + /// + /// ``` + /// use oxigraph::io::GraphFormat; + /// + /// assert_eq!( + /// GraphFormat::NTriples.iri(), + /// "http://www.w3.org/ns/formats/N-Triples" + /// ) + /// ``` + #[inline] + pub fn iri(self) -> &'static str { + match self { + Self::NTriples => "http://www.w3.org/ns/formats/N-Triples", + Self::Turtle => "http://www.w3.org/ns/formats/Turtle", + Self::RdfXml => "http://www.w3.org/ns/formats/RDF_XML", + } + } + + /// The format [IANA media type](https://tools.ietf.org/html/rfc2046). + /// + /// ``` + /// use oxigraph::io::GraphFormat; + /// + /// assert_eq!(GraphFormat::NTriples.media_type(), "application/n-triples") + /// ``` + #[inline] + pub fn media_type(self) -> &'static str { + match self { + Self::NTriples => "application/n-triples", + Self::Turtle => "text/turtle", + Self::RdfXml => "application/rdf+xml", + } + } + + /// The format [IANA-registered](https://tools.ietf.org/html/rfc2046) file extension. + /// + /// ``` + /// use oxigraph::io::GraphFormat; + /// + /// assert_eq!(GraphFormat::NTriples.file_extension(), "nt") + /// ``` + #[inline] + pub fn file_extension(self) -> &'static str { + match self { + Self::NTriples => "nt", + Self::Turtle => "ttl", + Self::RdfXml => "rdf", + } + } + + /// Looks for a known format from a media type. + /// + /// It supports some media type aliases. + /// For example, "application/xml" is going to return `GraphFormat::RdfXml` even if it is not its canonical media type. + /// + /// Example: + /// ``` + /// use oxigraph::io::GraphFormat; + /// + /// assert_eq!( + /// GraphFormat::from_media_type("text/turtle; charset=utf-8"), + /// Some(GraphFormat::Turtle) + /// ) + /// ``` + #[inline] + pub fn from_media_type(media_type: &str) -> Option { + match media_type.split(';').next()?.trim() { + "application/n-triples" | "text/plain" => Some(Self::NTriples), + "text/turtle" | "application/turtle" | "application/x-turtle" => Some(Self::Turtle), + "application/rdf+xml" | "application/xml" | "text/xml" => Some(Self::RdfXml), + _ => None, + } + } + + /// Looks for a known format from an extension. + /// + /// It supports some aliases. 
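+    /// For example, "txt" is mapped to N-Triples and "xml" to RDF/XML here.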
+ /// + /// Example: + /// ``` + /// use oxigraph::io::GraphFormat; + /// + /// assert_eq!( + /// GraphFormat::from_extension("nt"), + /// Some(GraphFormat::NTriples) + /// ) + /// ``` + #[inline] + pub fn from_extension(extension: &str) -> Option { + match extension { + "nt" | "txt" => Some(Self::NTriples), + "ttl" => Some(Self::Turtle), + "rdf" | "xml" => Some(Self::RdfXml), + _ => None, + } + } +} + +impl From for RdfFormat { + #[inline] + fn from(format: GraphFormat) -> Self { + match format { + GraphFormat::NTriples => Self::NTriples, + GraphFormat::Turtle => Self::Turtle, + GraphFormat::RdfXml => Self::RdfXml, + } + } +} + +impl From for RdfParser { + #[inline] + fn from(format: GraphFormat) -> Self { + RdfFormat::from(format).into() + } +} + +impl From for RdfSerializer { + #[inline] + fn from(format: GraphFormat) -> Self { + RdfFormat::from(format).into() + } +} + +/// [RDF dataset](https://www.w3.org/TR/rdf11-concepts/#dfn-rdf-dataset) serialization formats. +/// +/// This enumeration is non exhaustive. New formats like JSON-LD will be added in the future. +#[derive(Eq, PartialEq, Debug, Clone, Copy, Hash)] +#[non_exhaustive] +#[deprecated(note = "use RdfFormat instead", since = "0.4.0")] +pub enum DatasetFormat { + /// [N-Quads](https://www.w3.org/TR/n-quads/) + NQuads, + /// [TriG](https://www.w3.org/TR/trig/) + TriG, +} + +impl DatasetFormat { + /// The format canonical IRI according to the [Unique URIs for file formats registry](https://www.w3.org/ns/formats/). + /// + /// ``` + /// use oxigraph::io::DatasetFormat; + /// + /// assert_eq!( + /// DatasetFormat::NQuads.iri(), + /// "http://www.w3.org/ns/formats/N-Quads" + /// ) + /// ``` + #[inline] + pub fn iri(self) -> &'static str { + match self { + Self::NQuads => "http://www.w3.org/ns/formats/N-Quads", + Self::TriG => "http://www.w3.org/ns/formats/TriG", + } + } + + /// The format [IANA media type](https://tools.ietf.org/html/rfc2046). + /// + /// ``` + /// use oxigraph::io::DatasetFormat; + /// + /// assert_eq!(DatasetFormat::NQuads.media_type(), "application/n-quads") + /// ``` + #[inline] + pub fn media_type(self) -> &'static str { + match self { + Self::NQuads => "application/n-quads", + Self::TriG => "application/trig", + } + } + + /// The format [IANA-registered](https://tools.ietf.org/html/rfc2046) file extension. + /// + /// ``` + /// use oxigraph::io::DatasetFormat; + /// + /// assert_eq!(DatasetFormat::NQuads.file_extension(), "nq") + /// ``` + #[inline] + pub fn file_extension(self) -> &'static str { + match self { + Self::NQuads => "nq", + Self::TriG => "trig", + } + } + + /// Looks for a known format from a media type. + /// + /// It supports some media type aliases. + /// + /// Example: + /// ``` + /// use oxigraph::io::DatasetFormat; + /// + /// assert_eq!( + /// DatasetFormat::from_media_type("application/n-quads; charset=utf-8"), + /// Some(DatasetFormat::NQuads) + /// ) + /// ``` + #[inline] + pub fn from_media_type(media_type: &str) -> Option { + match media_type.split(';').next()?.trim() { + "application/n-quads" | "text/x-nquads" | "text/nquads" => Some(Self::NQuads), + "application/trig" | "application/x-trig" => Some(Self::TriG), + _ => None, + } + } + + /// Looks for a known format from an extension. + /// + /// It supports some aliases. 
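+    /// For example, "txt" is mapped to N-Quads here.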
+ /// + /// Example: + /// ``` + /// use oxigraph::io::DatasetFormat; + /// + /// assert_eq!( + /// DatasetFormat::from_extension("nq"), + /// Some(DatasetFormat::NQuads) + /// ) + /// ``` + #[inline] + pub fn from_extension(extension: &str) -> Option { + match extension { + "nq" | "txt" => Some(Self::NQuads), + "trig" => Some(Self::TriG), + _ => None, + } + } +} + +impl From for RdfFormat { + #[inline] + fn from(format: DatasetFormat) -> Self { + match format { + DatasetFormat::NQuads => Self::NQuads, + DatasetFormat::TriG => Self::TriG, + } + } +} + +impl From for RdfParser { + #[inline] + fn from(format: DatasetFormat) -> Self { + RdfFormat::from(format).into() + } +} + +impl From for RdfSerializer { + #[inline] + fn from(format: DatasetFormat) -> Self { + RdfFormat::from(format).into() + } +} + +impl TryFrom for GraphFormat { + type Error = (); + + /// Attempts to find a graph format that is a subset of this [`DatasetFormat`]. + #[inline] + fn try_from(value: DatasetFormat) -> Result { + match value { + DatasetFormat::NQuads => Ok(Self::NTriples), + DatasetFormat::TriG => Ok(Self::Turtle), + } + } +} + +impl TryFrom for DatasetFormat { + type Error = (); + + /// Attempts to find a dataset format that is a superset of this [`GraphFormat`]. + #[inline] + fn try_from(value: GraphFormat) -> Result { + match value { + GraphFormat::NTriples => Ok(Self::NQuads), + GraphFormat::Turtle => Ok(Self::TriG), + GraphFormat::RdfXml => Err(()), + } + } +} diff --git a/src/io/mod.rs b/src/io/mod.rs new file mode 100644 index 0000000..1b15bc8 --- /dev/null +++ b/src/io/mod.rs @@ -0,0 +1,39 @@ +//! Utilities to read and write RDF graphs and datasets using [OxRDF I/O](https://crates.io/crates/oxrdfio). +//! +//! The entry points of this module are the two [`RdfParser`] and [`RdfSerializer`] structs. +//! +//! Usage example converting a Turtle file to a N-Triples file: +//! ``` +//! use oxigraph::io::{RdfFormat, RdfParser, RdfSerializer}; +//! +//! let turtle_file = b"@base . +//! @prefix schema: . +//! a schema:Person ; +//! schema:name \"Foo\" . +//! a schema:Person ; +//! schema:name \"Bar\" ."; +//! +//! let ntriples_file = b" . +//! \"Foo\" . +//! . +//! \"Bar\" . +//! "; +//! +//! let mut writer = RdfSerializer::from_format(RdfFormat::NTriples).serialize_to_write(Vec::new()); +//! for quad in RdfParser::from_format(RdfFormat::Turtle).parse_read(turtle_file.as_ref()) { +//! writer.write_quad(&quad.unwrap()).unwrap(); +//! } +//! assert_eq!(writer.finish().unwrap(), ntriples_file); +//! ``` + +mod format; +pub mod read; +pub mod write; + +#[allow(deprecated)] +pub use self::format::{DatasetFormat, GraphFormat}; +#[allow(deprecated)] +pub use self::read::{DatasetParser, GraphParser}; +#[allow(deprecated)] +pub use self::write::{DatasetSerializer, GraphSerializer}; +pub use oxrdfio::*; diff --git a/src/io/read.rs b/src/io/read.rs new file mode 100644 index 0000000..6d01f6f --- /dev/null +++ b/src/io/read.rs @@ -0,0 +1,199 @@ +#![allow(deprecated)] + +//! Utilities to read RDF graphs and datasets. + +use crate::io::{DatasetFormat, GraphFormat}; +use crate::model::*; +use oxrdfio::{FromReadQuadReader, RdfParseError, RdfParser}; +use std::io::Read; + +/// Parsers for RDF graph serialization formats. 
+///
+/// It currently supports the following formats:
+/// * [N-Triples](https://www.w3.org/TR/n-triples/) ([`GraphFormat::NTriples`])
+/// * [Turtle](https://www.w3.org/TR/turtle/) ([`GraphFormat::Turtle`])
+/// * [RDF/XML](https://www.w3.org/TR/rdf-syntax-grammar/) ([`GraphFormat::RdfXml`])
+///
+/// ```
+/// use oxigraph::io::{GraphFormat, GraphParser};
+///
+/// let file = "<http://example.com/s> <http://example.com/p> <http://example.com/o> .";
+///
+/// let parser = GraphParser::from_format(GraphFormat::NTriples);
+/// let triples = parser
+///     .read_triples(file.as_bytes())
+///     .collect::<Result<Vec<_>, _>>()?;
+///
+/// assert_eq!(triples.len(), 1);
+/// assert_eq!(triples[0].subject.to_string(), "<http://example.com/s>");
+/// # std::io::Result::Ok(())
+/// ```
+#[deprecated(note = "use RdfParser instead", since = "0.4.0")]
+pub struct GraphParser {
+    inner: RdfParser,
+}
+
+impl GraphParser {
+    /// Builds a parser for the given format.
+    #[inline]
+    pub fn from_format(format: GraphFormat) -> Self {
+        Self {
+            inner: RdfParser::from_format(format.into())
+                .without_named_graphs()
+                .rename_blank_nodes(),
+        }
+    }
+
+    /// Provides an IRI that could be used to resolve the file relative IRIs.
+    ///
+    /// ```
+    /// use oxigraph::io::{GraphFormat, GraphParser};
+    ///
+    /// let file = "</s> </p> </o> .";
+    ///
+    /// let parser =
+    ///     GraphParser::from_format(GraphFormat::Turtle).with_base_iri("http://example.com")?;
+    /// let triples = parser
+    ///     .read_triples(file.as_bytes())
+    ///     .collect::<Result<Vec<_>, _>>()?;
+    ///
+    /// assert_eq!(triples.len(), 1);
+    /// assert_eq!(triples[0].subject.to_string(), "<http://example.com/s>");
+    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
+    /// ```
+    #[inline]
+    pub fn with_base_iri(self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
+        Ok(Self {
+            inner: self.inner.with_base_iri(base_iri)?,
+        })
+    }
+
+    /// Executes the parsing itself on a [`Read`] implementation and returns an iterator of triples.
+    pub fn read_triples<R: Read>(self, reader: R) -> TripleReader<R> {
+        TripleReader {
+            parser: self.inner.parse_read(reader),
+        }
+    }
+}
+
+/// An iterator yielding read triples.
+/// Could be built using a [`GraphParser`].
+///
+/// ```
+/// use oxigraph::io::{GraphFormat, GraphParser};
+///
+/// let file = "<http://example.com/s> <http://example.com/p> <http://example.com/o> .";
+///
+/// let parser = GraphParser::from_format(GraphFormat::NTriples);
+/// let triples = parser
+///     .read_triples(file.as_bytes())
+///     .collect::<Result<Vec<_>, _>>()?;
+///
+/// assert_eq!(triples.len(), 1);
+/// assert_eq!(triples[0].subject.to_string(), "<http://example.com/s>");
+/// # std::io::Result::Ok(())
+/// ```
+#[must_use]
+pub struct TripleReader<R: Read> {
+    parser: FromReadQuadReader<R>,
+}
+
+impl<R: Read> Iterator for TripleReader<R> {
+    type Item = Result<Triple, RdfParseError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        Some(self.parser.next()?.map(Into::into).map_err(Into::into))
+    }
+}
+
+/// A parser for RDF dataset serialization formats.
+///
+/// It currently supports the following formats:
+/// * [N-Quads](https://www.w3.org/TR/n-quads/) ([`DatasetFormat::NQuads`])
+/// * [TriG](https://www.w3.org/TR/trig/) ([`DatasetFormat::TriG`])
+///
+/// ```
+/// use oxigraph::io::{DatasetFormat, DatasetParser};
+///
+/// let file = "<http://example.com/s> <http://example.com/p> <http://example.com/o> <http://example.com/g> .";
+///
+/// let parser = DatasetParser::from_format(DatasetFormat::NQuads);
+/// let quads = parser.read_quads(file.as_bytes()).collect::<Result<Vec<_>,_>>()?;
+///
+/// assert_eq!(quads.len(), 1);
+/// assert_eq!(quads[0].subject.to_string(), "<http://example.com/s>");
+/// # std::io::Result::Ok(())
+/// ```
+#[deprecated(note = "use RdfParser instead", since = "0.4.0")]
+pub struct DatasetParser {
+    inner: RdfParser,
+}
+
+impl DatasetParser {
+    /// Builds a parser for the given format.
+    #[inline]
+    pub fn from_format(format: DatasetFormat) -> Self {
+        Self {
+            inner: RdfParser::from_format(format.into()).rename_blank_nodes(),
+        }
+    }
+
+    /// Provides an IRI that could be used to resolve the file relative IRIs.
+    ///
+    /// ```
+    /// use oxigraph::io::{DatasetFormat, DatasetParser};
+    ///
+    /// let file = "<http://example.com/g> { </s> </p> </o> }";
+    ///
+    /// let parser =
+    ///     DatasetParser::from_format(DatasetFormat::TriG).with_base_iri("http://example.com")?;
+    /// let triples = parser
+    ///     .read_quads(file.as_bytes())
+    ///     .collect::<Result<Vec<_>, _>>()?;
+    ///
+    /// assert_eq!(triples.len(), 1);
+    /// assert_eq!(triples[0].subject.to_string(), "<http://example.com/s>");
+    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
+    /// ```
+    #[inline]
+    pub fn with_base_iri(self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
+        Ok(Self {
+            inner: self.inner.with_base_iri(base_iri)?,
+        })
+    }
+
+    /// Executes the parsing itself on a [`Read`] implementation and returns an iterator of quads.
+    pub fn read_quads<R: Read>(self, reader: R) -> QuadReader<R> {
+        QuadReader {
+            parser: self.inner.parse_read(reader),
+        }
+    }
+}
+
+/// An iterator yielding read quads.
+/// Could be built using a [`DatasetParser`].
+///
+/// ```
+/// use oxigraph::io::{DatasetFormat, DatasetParser};
+///
+/// let file = "<http://example.com/s> <http://example.com/p> <http://example.com/o> <http://example.com/g> .";
+///
+/// let parser = DatasetParser::from_format(DatasetFormat::NQuads);
+/// let quads = parser.read_quads(file.as_bytes()).collect::<Result<Vec<_>,_>>()?;
+///
+/// assert_eq!(quads.len(), 1);
+/// assert_eq!(quads[0].subject.to_string(), "<http://example.com/s>");
+/// # std::io::Result::Ok(())
+/// ```
+#[must_use]
+pub struct QuadReader<R: Read> {
+    parser: FromReadQuadReader<R>,
+}
+
+impl<R: Read> Iterator for QuadReader<R> {
+    type Item = Result<Quad, RdfParseError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        Some(self.parser.next()?.map_err(Into::into))
+    }
+}
diff --git a/src/io/write.rs b/src/io/write.rs
new file mode 100644
index 0000000..7f27cd9
--- /dev/null
+++ b/src/io/write.rs
@@ -0,0 +1,185 @@
+#![allow(deprecated)]
+
+//! Utilities to write RDF graphs and datasets.
+
+use crate::io::{DatasetFormat, GraphFormat};
+use crate::model::*;
+use oxrdfio::{RdfSerializer, ToWriteQuadWriter};
+use std::io::{self, Write};
+
+/// A serializer for RDF graph serialization formats.
+///
+/// It currently supports the following formats:
+/// * [N-Triples](https://www.w3.org/TR/n-triples/) ([`GraphFormat::NTriples`])
+/// * [Turtle](https://www.w3.org/TR/turtle/) ([`GraphFormat::Turtle`])
+/// * [RDF/XML](https://www.w3.org/TR/rdf-syntax-grammar/) ([`GraphFormat::RdfXml`])
+///
+/// ```
+/// use oxigraph::io::{GraphFormat, GraphSerializer};
+/// use oxigraph::model::*;
+///
+/// let mut buffer = Vec::new();
+/// let mut writer = GraphSerializer::from_format(GraphFormat::NTriples).triple_writer(&mut buffer);
+/// writer.write(&Triple {
+///     subject: NamedNode::new("http://example.com/s")?.into(),
+///     predicate: NamedNode::new("http://example.com/p")?,
+///     object: NamedNode::new("http://example.com/o")?.into(),
+/// })?;
+/// writer.finish()?;
+///
+/// assert_eq!(
+///     buffer.as_slice(),
+///     "<http://example.com/s> <http://example.com/p> <http://example.com/o> .\n".as_bytes()
+/// );
+/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
+/// ```
+#[deprecated(note = "use RdfSerializer instead", since = "0.4.0")]
+pub struct GraphSerializer {
+    inner: RdfSerializer,
+}
+
+impl GraphSerializer {
+    /// Builds a serializer for the given format
+    #[inline]
+    pub fn from_format(format: GraphFormat) -> Self {
+        Self {
+            inner: RdfSerializer::from_format(format.into()),
+        }
+    }
+
+    /// Returns a [`TripleWriter`] allowing writing triples into the given [`Write`] implementation
+    pub fn triple_writer<W: Write>(self, write: W) -> TripleWriter<W> {
+        TripleWriter {
+            writer: self.inner.serialize_to_write(write),
+        }
+    }
+}
+
+/// Allows writing triples.
+/// Could be built using a [`GraphSerializer`].
+///
+/// <div class="warning">
+///
+/// Do not forget to run the [`finish`](TripleWriter::finish()) method to properly write the last bytes of the file.</div>
+///
+/// ```
+/// use oxigraph::io::{GraphFormat, GraphSerializer};
+/// use oxigraph::model::*;
+///
+/// let mut buffer = Vec::new();
+/// let mut writer = GraphSerializer::from_format(GraphFormat::NTriples).triple_writer(&mut buffer);
+/// writer.write(&Triple {
+///     subject: NamedNode::new("http://example.com/s")?.into(),
+///     predicate: NamedNode::new("http://example.com/p")?,
+///     object: NamedNode::new("http://example.com/o")?.into(),
+/// })?;
+/// writer.finish()?;
+///
+/// assert_eq!(
+///     buffer.as_slice(),
+///     "<http://example.com/s> <http://example.com/p> <http://example.com/o> .\n".as_bytes()
+/// );
+/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
+/// ```
+#[must_use]
+pub struct TripleWriter<W: Write> {
+    writer: ToWriteQuadWriter<W>,
+}
+
+impl<W: Write> TripleWriter<W> {
+    /// Writes a triple
+    pub fn write<'a>(&mut self, triple: impl Into<TripleRef<'a>>) -> io::Result<()> {
+        self.writer.write_triple(triple)
+    }
+
+    /// Writes the last bytes of the file
+    pub fn finish(self) -> io::Result<()> {
+        self.writer.finish()?.flush()
+    }
+}
+
+/// A serializer for RDF dataset serialization formats.
+///
+/// It currently supports the following formats:
+/// * [N-Quads](https://www.w3.org/TR/n-quads/) ([`DatasetFormat::NQuads`])
+/// * [TriG](https://www.w3.org/TR/trig/) ([`DatasetFormat::TriG`])
+///
+/// ```
+/// use oxigraph::io::{DatasetFormat, DatasetSerializer};
+/// use oxigraph::model::*;
+///
+/// let mut buffer = Vec::new();
+/// let mut writer = DatasetSerializer::from_format(DatasetFormat::NQuads).quad_writer(&mut buffer);
+/// writer.write(&Quad {
+///     subject: NamedNode::new("http://example.com/s")?.into(),
+///     predicate: NamedNode::new("http://example.com/p")?,
+///     object: NamedNode::new("http://example.com/o")?.into(),
+///     graph_name: NamedNode::new("http://example.com/g")?.into(),
+/// })?;
+/// writer.finish()?;
+///
+/// assert_eq!(buffer.as_slice(), "<http://example.com/s> <http://example.com/p> <http://example.com/o> <http://example.com/g> .\n".as_bytes());
+/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
+/// ```
+#[deprecated(note = "use RdfSerializer instead", since = "0.4.0")]
+pub struct DatasetSerializer {
+    inner: RdfSerializer,
+}
+
+impl DatasetSerializer {
+    /// Builds a serializer for the given format
+    #[inline]
+    pub fn from_format(format: DatasetFormat) -> Self {
+        Self {
+            inner: RdfSerializer::from_format(format.into()),
+        }
+    }
+
+    /// Returns a [`QuadWriter`] allowing writing quads into the given [`Write`] implementation
+    pub fn quad_writer<W: Write>(self, write: W) -> QuadWriter<W> {
+        QuadWriter {
+            writer: self.inner.serialize_to_write(write),
+        }
+    }
+}
+
+/// Allows writing quads.
+/// Could be built using a [`DatasetSerializer`].
+///
+/// <div class="warning">
+///
+/// Do not forget to run the [`finish`](QuadWriter::finish()) method to properly write the last bytes of the file.</div>
+/// +/// ``` +/// use oxigraph::io::{DatasetFormat, DatasetSerializer}; +/// use oxigraph::model::*; +/// +/// let mut buffer = Vec::new(); +/// let mut writer = DatasetSerializer::from_format(DatasetFormat::NQuads).quad_writer(&mut buffer); +/// writer.write(&Quad { +/// subject: NamedNode::new("http://example.com/s")?.into(), +/// predicate: NamedNode::new("http://example.com/p")?, +/// object: NamedNode::new("http://example.com/o")?.into(), +/// graph_name: NamedNode::new("http://example.com/g")?.into(), +/// })?; +/// writer.finish()?; +/// +/// assert_eq!(buffer.as_slice(), " .\n".as_bytes()); +/// # Result::<_,Box>::Ok(()) +/// ``` +#[must_use] +pub struct QuadWriter { + writer: ToWriteQuadWriter, +} + +impl QuadWriter { + /// Writes a quad + pub fn write<'a>(&mut self, quad: impl Into>) -> io::Result<()> { + self.writer.write_quad(quad) + } + + /// Writes the last bytes of the file + pub fn finish(self) -> io::Result<()> { + self.writer.finish()?.flush() + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..b36c4d6 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,12 @@ +#![doc = include_str!("../README.md")] +#![doc(test(attr(deny(warnings))))] +#![doc(test(attr(allow(deprecated))))] +#![cfg_attr(docsrs, feature(doc_auto_cfg))] +#![doc(html_favicon_url = "https://raw.githubusercontent.com/oxigraph/oxigraph/main/logo.svg")] +#![doc(html_logo_url = "https://raw.githubusercontent.com/oxigraph/oxigraph/main/logo.svg")] + +pub mod io; +pub mod model; +pub mod sparql; +mod storage; +pub mod store; diff --git a/src/model.rs b/src/model.rs new file mode 100644 index 0000000..dbca934 --- /dev/null +++ b/src/model.rs @@ -0,0 +1,22 @@ +//! Implements data structures for [RDF 1.1 Concepts](https://www.w3.org/TR/rdf11-concepts/) using [OxRDF](https://crates.io/crates/oxrdf). +//! +//! Usage example: +//! +//! ``` +//! use oxigraph::model::*; +//! +//! let mut graph = Graph::default(); +//! +//! // insertion +//! let ex = NamedNodeRef::new("http://example.com").unwrap(); +//! let triple = TripleRef::new(ex, ex, ex); +//! graph.insert(triple); +//! +//! // simple filter +//! let results: Vec<_> = graph.triples_for_subject(ex).collect(); +//! assert_eq!(vec![triple], results); +//! ``` + +pub use oxrdf::*; + +pub use spargebra::term::GroundQuad; diff --git a/src/sparql/algebra.rs b/src/sparql/algebra.rs new file mode 100644 index 0000000..8b3f385 --- /dev/null +++ b/src/sparql/algebra.rs @@ -0,0 +1,311 @@ +//! [SPARQL 1.1 Query Algebra](https://www.w3.org/TR/sparql11-query/#sparqlQuery) +//! +//! The root type for SPARQL queries is [`Query`] and the root type for updates is [`Update`]. + +use crate::model::*; +use crate::sparql::eval::Timer; +use oxsdatatypes::DayTimeDuration; +use spargebra::GraphUpdateOperation; +use std::fmt; +use std::str::FromStr; + +/// A parsed [SPARQL query](https://www.w3.org/TR/sparql11-query/). +/// +/// ``` +/// use oxigraph::model::NamedNode; +/// use oxigraph::sparql::Query; +/// +/// let query_str = "SELECT ?s ?p ?o WHERE { ?s ?p ?o . 
}"; +/// let mut query = Query::parse(query_str, None)?; +/// +/// assert_eq!(query.to_string(), query_str); +/// +/// // We edit the query dataset specification +/// let default = vec![NamedNode::new("http://example.com")?.into()]; +/// query.dataset_mut().set_default_graph(default.clone()); +/// assert_eq!( +/// query.dataset().default_graph_graphs(), +/// Some(default.as_slice()) +/// ); +/// # Ok::<_, Box>(()) +/// ``` +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub struct Query { + pub(super) inner: spargebra::Query, + pub(super) dataset: QueryDataset, + pub(super) parsing_duration: Option, +} + +impl Query { + /// Parses a SPARQL query with an optional base IRI to resolve relative IRIs in the query. + pub fn parse( + query: &str, + base_iri: Option<&str>, + ) -> Result { + let start = Timer::now(); + let query = Self::from(spargebra::Query::parse(query, base_iri)?); + Ok(Self { + dataset: query.dataset, + inner: query.inner, + parsing_duration: start.elapsed(), + }) + } + + /// Returns [the query dataset specification](https://www.w3.org/TR/sparql11-query/#specifyingDataset) + pub fn dataset(&self) -> &QueryDataset { + &self.dataset + } + + /// Returns [the query dataset specification](https://www.w3.org/TR/sparql11-query/#specifyingDataset) + pub fn dataset_mut(&mut self) -> &mut QueryDataset { + &mut self.dataset + } +} + +impl fmt::Display for Query { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.inner.fmt(f) // TODO: override + } +} + +impl FromStr for Query { + type Err = spargebra::SparqlSyntaxError; + + fn from_str(query: &str) -> Result { + Self::parse(query, None) + } +} + +impl TryFrom<&str> for Query { + type Error = spargebra::SparqlSyntaxError; + + fn try_from(query: &str) -> Result { + Self::from_str(query) + } +} + +impl TryFrom<&String> for Query { + type Error = spargebra::SparqlSyntaxError; + + fn try_from(query: &String) -> Result { + Self::from_str(query) + } +} + +impl From for Query { + fn from(query: spargebra::Query) -> Self { + Self { + dataset: QueryDataset::from_algebra(match &query { + spargebra::Query::Select { dataset, .. } + | spargebra::Query::Construct { dataset, .. } + | spargebra::Query::Describe { dataset, .. } + | spargebra::Query::Ask { dataset, .. } => dataset, + }), + inner: query, + parsing_duration: None, + } + } +} + +/// A parsed [SPARQL update](https://www.w3.org/TR/sparql11-update/). +/// +/// ``` +/// use oxigraph::sparql::Update; +/// +/// let update_str = "CLEAR ALL ;"; +/// let update = Update::parse(update_str, None)?; +/// +/// assert_eq!(update.to_string().trim(), update_str); +/// # Ok::<_, oxigraph::sparql::SparqlSyntaxError>(()) +/// ``` +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub struct Update { + pub(super) inner: spargebra::Update, + pub(super) using_datasets: Vec>, +} + +impl Update { + /// Parses a SPARQL update with an optional base IRI to resolve relative IRIs in the query. + pub fn parse( + update: &str, + base_iri: Option<&str>, + ) -> Result { + Ok(spargebra::Update::parse(update, base_iri)?.into()) + } + + /// Returns [the query dataset specification](https://www.w3.org/TR/sparql11-query/#specifyingDataset) in [DELETE/INSERT operations](https://www.w3.org/TR/sparql11-update/#deleteInsert). + pub fn using_datasets(&self) -> impl Iterator { + self.using_datasets.iter().filter_map(Option::as_ref) + } + + /// Returns [the query dataset specification](https://www.w3.org/TR/sparql11-query/#specifyingDataset) in [DELETE/INSERT operations](https://www.w3.org/TR/sparql11-update/#deleteInsert). 
+ pub fn using_datasets_mut(&mut self) -> impl Iterator { + self.using_datasets.iter_mut().filter_map(Option::as_mut) + } +} + +impl fmt::Display for Update { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.inner.fmt(f) + } +} + +impl FromStr for Update { + type Err = spargebra::SparqlSyntaxError; + + fn from_str(update: &str) -> Result { + Self::parse(update, None) + } +} + +impl TryFrom<&str> for Update { + type Error = spargebra::SparqlSyntaxError; + + fn try_from(update: &str) -> Result { + Self::from_str(update) + } +} + +impl TryFrom<&String> for Update { + type Error = spargebra::SparqlSyntaxError; + + fn try_from(update: &String) -> Result { + Self::from_str(update) + } +} + +impl From for Update { + fn from(update: spargebra::Update) -> Self { + Self { + using_datasets: update + .operations + .iter() + .map(|operation| { + if let GraphUpdateOperation::DeleteInsert { using, .. } = operation { + Some(QueryDataset::from_algebra(using)) + } else { + None + } + }) + .collect(), + inner: update, + } + } +} + +/// A SPARQL query [dataset specification](https://www.w3.org/TR/sparql11-query/#specifyingDataset) +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub struct QueryDataset { + default: Option>, + named: Option>, +} + +impl QueryDataset { + pub(crate) fn new() -> Self { + Self { + default: None, + named: None, + } + } + + fn from_algebra(inner: &Option) -> Self { + if let Some(inner) = inner { + Self { + default: Some(inner.default.iter().map(|g| g.clone().into()).collect()), + named: inner + .named + .as_ref() + .map(|named| named.iter().map(|g| g.clone().into()).collect()), + } + } else { + Self { + default: Some(vec![GraphName::DefaultGraph]), + named: None, + } + } + } + + /// Checks if this dataset specification is the default one + /// (i.e. the default graph is the store default graph and all the store named graphs are available) + /// + /// ``` + /// use oxigraph::sparql::Query; + /// + /// assert!(Query::parse("SELECT ?s ?p ?o WHERE { ?s ?p ?o . }", None)? + /// .dataset() + /// .is_default_dataset()); + /// assert!(!Query::parse( + /// "SELECT ?s ?p ?o FROM WHERE { ?s ?p ?o . }", + /// None + /// )? + /// .dataset() + /// .is_default_dataset()); + /// + /// # Ok::<_, Box>(()) + /// ``` + pub fn is_default_dataset(&self) -> bool { + self.default + .as_ref() + .map_or(false, |t| t == &[GraphName::DefaultGraph]) + && self.named.is_none() + } + + /// Returns the list of the store graphs that are available to the query as the default graph or `None` if the union of all graphs is used as the default graph + /// This list is by default only the store default graph + pub fn default_graph_graphs(&self) -> Option<&[GraphName]> { + self.default.as_deref() + } + + /// Sets if the default graph for the query should be the union of all the graphs in the queried store + pub fn set_default_graph_as_union(&mut self) { + self.default = None; + } + + /// Sets the list of graphs the query should consider as being part of the default graph. + /// + /// By default only the store default graph is considered. + /// ``` + /// use oxigraph::model::NamedNode; + /// use oxigraph::sparql::Query; + /// + /// let mut query = Query::parse("SELECT ?s ?p ?o WHERE { ?s ?p ?o . 
}", None)?; + /// let default = vec![NamedNode::new("http://example.com")?.into()]; + /// query.dataset_mut().set_default_graph(default.clone()); + /// assert_eq!( + /// query.dataset().default_graph_graphs(), + /// Some(default.as_slice()) + /// ); + /// + /// # Ok::<_, Box>(()) + /// ``` + pub fn set_default_graph(&mut self, graphs: Vec) { + self.default = Some(graphs) + } + + /// Returns the list of the available named graphs for the query or `None` if all graphs are available + pub fn available_named_graphs(&self) -> Option<&[NamedOrBlankNode]> { + self.named.as_deref() + } + + /// Sets the list of allowed named graphs in the query. + /// + /// ``` + /// use oxigraph::model::NamedNode; + /// use oxigraph::sparql::Query; + /// + /// let mut query = Query::parse("SELECT ?s ?p ?o WHERE { ?s ?p ?o . }", None)?; + /// let named = vec![NamedNode::new("http://example.com")?.into()]; + /// query + /// .dataset_mut() + /// .set_available_named_graphs(named.clone()); + /// assert_eq!( + /// query.dataset().available_named_graphs(), + /// Some(named.as_slice()) + /// ); + /// + /// # Ok::<_, Box>(()) + /// ``` + pub fn set_available_named_graphs(&mut self, named_graphs: Vec) { + self.named = Some(named_graphs); + } +} diff --git a/src/sparql/dataset.rs b/src/sparql/dataset.rs new file mode 100644 index 0000000..3253be1 --- /dev/null +++ b/src/sparql/dataset.rs @@ -0,0 +1,184 @@ +use crate::model::TermRef; +use crate::sparql::algebra::QueryDataset; +use crate::sparql::EvaluationError; +use crate::storage::numeric_encoder::{insert_term, EncodedQuad, EncodedTerm, StrHash, StrLookup}; +use crate::storage::{StorageError, StorageReader}; +use std::cell::RefCell; +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::iter::empty; + +pub struct DatasetView { + reader: StorageReader, + extra: RefCell>, + dataset: EncodedDatasetSpec, +} + +impl DatasetView { + pub fn new(reader: StorageReader, dataset: &QueryDataset) -> Self { + let dataset = EncodedDatasetSpec { + default: dataset + .default_graph_graphs() + .map(|graphs| graphs.iter().map(|g| g.as_ref().into()).collect::>()), + named: dataset + .available_named_graphs() + .map(|graphs| graphs.iter().map(|g| g.as_ref().into()).collect::>()), + }; + Self { + reader, + extra: RefCell::new(HashMap::default()), + dataset, + } + } + + fn store_encoded_quads_for_pattern( + &self, + subject: Option<&EncodedTerm>, + predicate: Option<&EncodedTerm>, + object: Option<&EncodedTerm>, + graph_name: Option<&EncodedTerm>, + ) -> impl Iterator> + 'static { + self.reader + .quads_for_pattern(subject, predicate, object, graph_name) + .map(|t| t.map_err(Into::into)) + } + + #[allow(clippy::needless_collect)] + pub fn encoded_quads_for_pattern( + &self, + subject: Option<&EncodedTerm>, + predicate: Option<&EncodedTerm>, + object: Option<&EncodedTerm>, + graph_name: Option<&EncodedTerm>, + ) -> Box>> { + if let Some(graph_name) = graph_name { + if graph_name.is_default_graph() { + if let Some(default_graph_graphs) = &self.dataset.default { + if default_graph_graphs.len() == 1 { + // Single graph optimization + Box::new( + self.store_encoded_quads_for_pattern( + subject, + predicate, + object, + Some(&default_graph_graphs[0]), + ) + .map(|quad| { + let quad = quad?; + Ok(EncodedQuad::new( + quad.subject, + quad.predicate, + quad.object, + EncodedTerm::DefaultGraph, + )) + }), + ) + } else { + let iters = default_graph_graphs + .iter() + .map(|graph_name| { + self.store_encoded_quads_for_pattern( + subject, + predicate, + object, + Some(graph_name), + 
) + }) + .collect::>(); + Box::new(iters.into_iter().flatten().map(|quad| { + let quad = quad?; + Ok(EncodedQuad::new( + quad.subject, + quad.predicate, + quad.object, + EncodedTerm::DefaultGraph, + )) + })) + } + } else { + Box::new( + self.store_encoded_quads_for_pattern(subject, predicate, object, None) + .map(|quad| { + let quad = quad?; + Ok(EncodedQuad::new( + quad.subject, + quad.predicate, + quad.object, + EncodedTerm::DefaultGraph, + )) + }), + ) + } + } else if self + .dataset + .named + .as_ref() + .map_or(true, |d| d.contains(graph_name)) + { + Box::new(self.store_encoded_quads_for_pattern( + subject, + predicate, + object, + Some(graph_name), + )) + } else { + Box::new(empty()) + } + } else if let Some(named_graphs) = &self.dataset.named { + let iters = named_graphs + .iter() + .map(|graph_name| { + self.store_encoded_quads_for_pattern( + subject, + predicate, + object, + Some(graph_name), + ) + }) + .collect::>(); + Box::new(iters.into_iter().flatten()) + } else { + Box::new( + self.store_encoded_quads_for_pattern(subject, predicate, object, None) + .filter(|quad| match quad { + Err(_) => true, + Ok(quad) => !quad.graph_name.is_default_graph(), + }), + ) + } + } + + pub fn encode_term<'a>(&self, term: impl Into>) -> EncodedTerm { + let term = term.into(); + let encoded = term.into(); + insert_term(term, &encoded, &mut |key, value| { + self.insert_str(key, value); + Ok(()) + }) + .unwrap(); + encoded + } + + pub fn insert_str(&self, key: &StrHash, value: &str) { + if let Entry::Vacant(e) = self.extra.borrow_mut().entry(*key) { + if !matches!(self.reader.contains_str(key), Ok(true)) { + e.insert(value.to_owned()); + } + } + } +} + +impl StrLookup for DatasetView { + fn get_str(&self, key: &StrHash) -> Result, StorageError> { + Ok(if let Some(value) = self.extra.borrow().get(key) { + Some(value.clone()) + } else { + self.reader.get_str(key)? + }) + } +} + +struct EncodedDatasetSpec { + default: Option>, + named: Option>, +} diff --git a/src/sparql/error.rs b/src/sparql/error.rs new file mode 100644 index 0000000..38731de --- /dev/null +++ b/src/sparql/error.rs @@ -0,0 +1,84 @@ +use crate::io::RdfParseError; +use crate::model::NamedNode; +use crate::sparql::results::QueryResultsParseError as ResultsParseError; +use crate::sparql::SparqlSyntaxError; +use crate::storage::StorageError; +use std::convert::Infallible; +use std::error::Error; +use std::io; + +/// A SPARQL evaluation error. +#[derive(Debug, thiserror::Error)] +#[non_exhaustive] +pub enum EvaluationError { + /// An error in SPARQL parsing. + #[error(transparent)] + Parsing(#[from] SparqlSyntaxError), + /// An error from the storage. + #[error(transparent)] + Storage(#[from] StorageError), + /// An error while parsing an external RDF file. + #[error(transparent)] + GraphParsing(#[from] RdfParseError), + /// An error while parsing an external result file (likely from a federated query). + #[error(transparent)] + ResultsParsing(#[from] ResultsParseError), + /// An error returned during results serialization. 
+    #[error(transparent)]
+    ResultsSerialization(#[from] io::Error),
+    /// Error during `SERVICE` evaluation
+    #[error("{0}")]
+    Service(#[source] Box<dyn Error + Send + Sync + 'static>),
+    /// Error when `CREATE` tries to create an already existing graph
+    #[error("The graph {0} already exists")]
+    GraphAlreadyExists(NamedNode),
+    /// Error when `DROP` or `CLEAR` tries to remove a graph that does not exist
+    #[error("The graph {0} does not exist")]
+    GraphDoesNotExist(NamedNode),
+    /// The variable storing the `SERVICE` name is unbound
+    #[error("The variable encoding the service name is unbound")]
+    UnboundService,
+    /// The given `SERVICE` is not supported
+    #[error("The service {0} is not supported")]
+    UnsupportedService(NamedNode),
+    /// The given content media type returned from an HTTP response is not supported (`SERVICE` and `LOAD`)
+    #[error("The content media type {0} is not supported")]
+    UnsupportedContentType(String),
+    /// The `SERVICE` call does not return solutions
+    #[error("The service is not returning solutions but a boolean or a graph")]
+    ServiceDoesNotReturnSolutions,
+    /// The results are not an RDF graph
+    #[error("The query results are not a RDF graph")]
+    NotAGraph,
+}
+
+impl From<Infallible> for EvaluationError {
+    #[inline]
+    fn from(error: Infallible) -> Self {
+        match error {}
+    }
+}
+
+impl From<EvaluationError> for io::Error {
+    #[inline]
+    fn from(error: EvaluationError) -> Self {
+        match error {
+            EvaluationError::Parsing(error) => Self::new(io::ErrorKind::InvalidData, error),
+            EvaluationError::GraphParsing(error) => error.into(),
+            EvaluationError::ResultsParsing(error) => error.into(),
+            EvaluationError::ResultsSerialization(error) => error,
+            EvaluationError::Storage(error) => error.into(),
+            EvaluationError::Service(error) => match error.downcast() {
+                Ok(error) => *error,
+                Err(error) => Self::new(io::ErrorKind::Other, error),
+            },
+            EvaluationError::GraphAlreadyExists(_)
+            | EvaluationError::GraphDoesNotExist(_)
+            | EvaluationError::UnboundService
+            | EvaluationError::UnsupportedService(_)
+            | EvaluationError::UnsupportedContentType(_)
+            | EvaluationError::ServiceDoesNotReturnSolutions
+            | EvaluationError::NotAGraph => Self::new(io::ErrorKind::InvalidInput, error),
+        }
+    }
+}
diff --git a/src/sparql/eval.rs b/src/sparql/eval.rs
new file mode 100644
index 0000000..5065f52
--- /dev/null
+++ b/src/sparql/eval.rs
@@ -0,0 +1,5870 @@
+use crate::model::vocab::{rdf, xsd};
+use crate::model::{BlankNode, LiteralRef, NamedNodeRef, Term, Triple};
+use crate::sparql::algebra::{Query, QueryDataset};
+use crate::sparql::dataset::DatasetView;
+use crate::sparql::error::EvaluationError;
+use crate::sparql::model::*;
+use crate::sparql::service::ServiceHandler;
+use crate::sparql::CustomFunctionRegistry;
+use crate::storage::numeric_encoder::*;
+use crate::storage::small_string::SmallString;
+use digest::Digest;
+use json_event_parser::{JsonEvent, ToWriteJsonWriter};
+use md5::Md5;
+use oxilangtag::LanguageTag;
+use oxiri::Iri;
+use oxrdf::{TermRef, Variable};
+use oxsdatatypes::*;
+use rand::random;
+use regex::{Regex, RegexBuilder};
+use sha1::Sha1;
+use sha2::{Sha256, Sha384, Sha512};
+use spargebra::algebra::{AggregateFunction, Function, PropertyPathExpression};
+use spargebra::term::{
+    GroundSubject, GroundTerm, GroundTermPattern, GroundTriple, NamedNodePattern, TermPattern,
+    TriplePattern,
+};
+use sparopt::algebra::{
+    AggregateExpression, Expression, GraphPattern, JoinAlgorithm, LeftJoinAlgorithm,
+    MinusAlgorithm, OrderExpression,
+};
+use std::cell::Cell;
+use std::cmp::Ordering;
+use
std::collections::hash_map::DefaultHasher; +use std::collections::{HashMap, HashSet}; +use std::hash::{Hash, Hasher}; +use std::iter::{empty, once}; +use std::rc::Rc; +use std::sync::Arc; +use std::{fmt, io, str}; + +const REGEX_SIZE_LIMIT: usize = 1_000_000; + +#[derive(Eq, PartialEq, Debug, Clone, Hash)] +pub struct EncodedTuple { + inner: Vec>, +} + +impl EncodedTuple { + pub fn with_capacity(capacity: usize) -> Self { + Self { + inner: Vec::with_capacity(capacity), + } + } + + pub fn capacity(&self) -> usize { + self.inner.capacity() + } + + pub fn contains(&self, index: usize) -> bool { + self.inner.get(index).map_or(false, Option::is_some) + } + + pub fn get(&self, index: usize) -> Option<&EncodedTerm> { + self.inner.get(index).unwrap_or(&None).as_ref() + } + + pub fn iter(&self) -> impl Iterator> + '_ { + self.inner.iter().cloned() + } + + pub fn set(&mut self, index: usize, value: EncodedTerm) { + if self.inner.len() <= index { + self.inner.resize(index + 1, None); + } + self.inner[index] = Some(value); + } + + pub fn combine_with(&self, other: &Self) -> Option { + if self.inner.len() < other.inner.len() { + let mut result = other.inner.clone(); + for (key, self_value) in self.inner.iter().enumerate() { + if let Some(self_value) = self_value { + match &other.inner[key] { + Some(other_value) => { + if self_value != other_value { + return None; + } + } + None => result[key] = Some(self_value.clone()), + } + } + } + Some(Self { inner: result }) + } else { + let mut result = self.inner.clone(); + for (key, other_value) in other.inner.iter().enumerate() { + if let Some(other_value) = other_value { + match &self.inner[key] { + Some(self_value) => { + if self_value != other_value { + return None; + } + } + None => result[key] = Some(other_value.clone()), + } + } + } + Some(Self { inner: result }) + } + } +} + +impl IntoIterator for EncodedTuple { + type Item = Option; + type IntoIter = std::vec::IntoIter>; + + fn into_iter(self) -> Self::IntoIter { + self.inner.into_iter() + } +} + +type EncodedTuplesIterator = Box>>; + +#[derive(Clone)] +pub struct SimpleEvaluator { + dataset: Rc, + base_iri: Option>>, + now: DateTime, + service_handler: Arc>, + custom_functions: Arc, + run_stats: bool, +} + +impl SimpleEvaluator { + pub fn new( + dataset: Rc, + base_iri: Option>>, + service_handler: Arc>, + custom_functions: Arc, + run_stats: bool, + ) -> Self { + Self { + dataset, + base_iri, + now: DateTime::now(), + service_handler, + custom_functions, + run_stats, + } + } + + pub fn evaluate_select(&self, pattern: &GraphPattern) -> (QueryResults, Rc) { + let mut variables = Vec::new(); + let (eval, stats) = self.graph_pattern_evaluator(pattern, &mut variables); + let from = EncodedTuple::with_capacity(variables.len()); + ( + QueryResults::Solutions(decode_bindings( + Rc::clone(&self.dataset), + eval(from), + Arc::from(variables), + )), + stats, + ) + } + + pub fn evaluate_ask( + &self, + pattern: &GraphPattern, + ) -> (Result, Rc) { + let mut variables = Vec::new(); + let (eval, stats) = self.graph_pattern_evaluator(pattern, &mut variables); + let from = EncodedTuple::with_capacity(variables.len()); + ( + match eval(from).next() { + Some(Ok(_)) => Ok(QueryResults::Boolean(true)), + Some(Err(error)) => Err(error), + None => Ok(QueryResults::Boolean(false)), + }, + stats, + ) + } + + pub fn evaluate_construct( + &self, + pattern: &GraphPattern, + template: &[TriplePattern], + ) -> (QueryResults, Rc) { + let mut variables = Vec::new(); + let (eval, stats) = self.graph_pattern_evaluator(pattern, &mut 
variables); + let mut bnodes = Vec::new(); + let template = template + .iter() + .map(|t| TripleTemplate { + subject: self.template_value_from_term_or_variable( + &t.subject, + &mut variables, + &mut bnodes, + ), + predicate: self + .template_value_from_named_node_or_variable(&t.predicate, &mut variables), + object: self.template_value_from_term_or_variable( + &t.object, + &mut variables, + &mut bnodes, + ), + }) + .collect(); + let from = EncodedTuple::with_capacity(variables.len()); + ( + QueryResults::Graph(QueryTripleIter { + iter: Box::new(ConstructIterator { + eval: self.clone(), + iter: eval(from), + template, + buffered_results: Vec::default(), + bnodes: Vec::default(), + }), + }), + stats, + ) + } + + pub fn evaluate_describe( + &self, + pattern: &GraphPattern, + ) -> (QueryResults, Rc) { + let mut variables = Vec::new(); + let (eval, stats) = self.graph_pattern_evaluator(pattern, &mut variables); + let from = EncodedTuple::with_capacity(variables.len()); + ( + QueryResults::Graph(QueryTripleIter { + iter: Box::new(DescribeIterator { + eval: self.clone(), + iter: eval(from), + quads: Box::new(empty()), + }), + }), + stats, + ) + } + + pub fn graph_pattern_evaluator( + &self, + pattern: &GraphPattern, + encoded_variables: &mut Vec, + ) -> ( + Rc EncodedTuplesIterator>, + Rc, + ) { + let mut stat_children = Vec::new(); + let mut evaluator = + self.build_graph_pattern_evaluator(pattern, encoded_variables, &mut stat_children); + let stats = Rc::new(EvalNodeWithStats { + label: eval_node_label(pattern), + children: stat_children, + exec_count: Cell::new(0), + exec_duration: Cell::new(self.run_stats.then(DayTimeDuration::default)), + }); + if self.run_stats { + let stats = Rc::clone(&stats); + evaluator = Rc::new(move |tuple| { + let start = Timer::now(); + let inner = evaluator(tuple); + stats.exec_duration.set( + stats + .exec_duration + .get() + .and_then(|stat| stat.checked_add(start.elapsed()?)), + ); + Box::new(StatsIterator { + inner, + stats: Rc::clone(&stats), + }) + }) + } + (evaluator, stats) + } + + fn build_graph_pattern_evaluator( + &self, + pattern: &GraphPattern, + encoded_variables: &mut Vec, + stat_children: &mut Vec>, + ) -> Rc EncodedTuplesIterator> { + match pattern { + GraphPattern::Values { + variables, + bindings, + } => { + let encoding = variables + .iter() + .map(|v| encode_variable(encoded_variables, v)) + .collect::>(); + let encoded_tuples = bindings + .iter() + .map(|row| { + let mut result = EncodedTuple::with_capacity(variables.len()); + for (key, value) in row.iter().enumerate() { + if let Some(term) = value { + result.set( + encoding[key], + match term { + GroundTerm::NamedNode(node) => self.encode_term(node), + GroundTerm::Literal(literal) => self.encode_term(literal), + GroundTerm::Triple(triple) => self.encode_triple(triple), + }, + ); + } + } + result + }) + .collect::>(); + Rc::new(move |from| { + Box::new( + encoded_tuples + .iter() + .filter_map(move |t| Some(Ok(t.combine_with(&from)?))) + .collect::>() + .into_iter(), + ) + }) + } + GraphPattern::Service { + name, + inner, + silent, + } => { + #[allow(clippy::shadow_same)] + let silent = *silent; + let service_name = + TupleSelector::from_named_node_pattern(name, encoded_variables, &self.dataset); + self.build_graph_pattern_evaluator(inner, encoded_variables, &mut Vec::new()); // We call recursively to fill "encoded_variables" + let graph_pattern = spargebra::algebra::GraphPattern::from(inner.as_ref()); + let variables = Rc::from(encoded_variables.as_slice()); + let eval = self.clone(); + 
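+                // The closure below queries the service handler once per input
+                // tuple; if the call fails and SILENT is set, the input tuple is
+                // passed through unchanged instead of surfacing the error.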
Rc::new(move |from| { + match eval.evaluate_service( + &service_name, + &graph_pattern, + Rc::clone(&variables), + &from, + ) { + Ok(result) => Box::new(result.filter_map(move |binding| { + binding + .map(|binding| binding.combine_with(&from)) + .transpose() + })), + Err(e) => { + if silent { + Box::new(once(Ok(from))) + } else { + Box::new(once(Err(e))) + } + } + } + }) + } + GraphPattern::QuadPattern { + subject, + predicate, + object, + graph_name, + } => { + let subject = TupleSelector::from_ground_term_pattern( + subject, + encoded_variables, + &self.dataset, + ); + let predicate = TupleSelector::from_named_node_pattern( + predicate, + encoded_variables, + &self.dataset, + ); + let object = TupleSelector::from_ground_term_pattern( + object, + encoded_variables, + &self.dataset, + ); + let graph_name = TupleSelector::from_graph_name_pattern( + graph_name, + encoded_variables, + &self.dataset, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |from| { + let iter = dataset.encoded_quads_for_pattern( + subject.get_pattern_value(&from).as_ref(), + predicate.get_pattern_value(&from).as_ref(), + object.get_pattern_value(&from).as_ref(), + graph_name.get_pattern_value(&from).as_ref(), + ); + let subject = subject.clone(); + let predicate = predicate.clone(); + let object = object.clone(); + let graph_name = graph_name.clone(); + Box::new(iter.filter_map(move |quad| match quad { + Ok(quad) => { + let mut new_tuple = from.clone(); + put_pattern_value(&subject, quad.subject, &mut new_tuple)?; + put_pattern_value(&predicate, quad.predicate, &mut new_tuple)?; + put_pattern_value(&object, quad.object, &mut new_tuple)?; + put_pattern_value(&graph_name, quad.graph_name, &mut new_tuple)?; + Some(Ok(new_tuple)) + } + Err(error) => Some(Err(error)), + })) + }) + } + GraphPattern::Path { + subject, + path, + object, + graph_name, + } => { + let subject = TupleSelector::from_ground_term_pattern( + subject, + encoded_variables, + &self.dataset, + ); + let path = self.encode_property_path(path); + + let object = TupleSelector::from_ground_term_pattern( + object, + encoded_variables, + &self.dataset, + ); + let graph_name = TupleSelector::from_graph_name_pattern( + graph_name, + encoded_variables, + &self.dataset, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |from| { + let input_subject = subject.get_pattern_value(&from); + let input_object = object.get_pattern_value(&from); + let input_graph_name = graph_name.get_pattern_value(&from); + let path_eval = PathEvaluator { + dataset: Rc::clone(&dataset), + }; + match (input_subject, input_object, input_graph_name) { + (Some(input_subject), Some(input_object), Some(input_graph_name)) => { + match path_eval.eval_closed_in_graph( + &path, + &input_subject, + &input_object, + &input_graph_name, + ) { + Ok(true) => Box::new(once(Ok(from))), + Ok(false) => Box::new(empty()), + Err(e) => Box::new(once(Err(e))), + } + } + (Some(input_subject), None, Some(input_graph_name)) => { + let object = object.clone(); + Box::new( + path_eval + .eval_from_in_graph(&path, &input_subject, &input_graph_name) + .filter_map(move |o| match o { + Ok(o) => { + let mut new_tuple = from.clone(); + put_pattern_value(&object, o, &mut new_tuple)?; + Some(Ok(new_tuple)) + } + Err(error) => Some(Err(error)), + }), + ) + } + (None, Some(input_object), Some(input_graph_name)) => { + let subject = subject.clone(); + Box::new( + path_eval + .eval_to_in_graph(&path, &input_object, &input_graph_name) + .filter_map(move |s| match s { + Ok(s) => { + let mut new_tuple = 
from.clone(); + put_pattern_value(&subject, s, &mut new_tuple)?; + Some(Ok(new_tuple)) + } + Err(error) => Some(Err(error)), + }), + ) + } + (None, None, Some(input_graph_name)) => { + let subject = subject.clone(); + let object = object.clone(); + Box::new( + path_eval + .eval_open_in_graph(&path, &input_graph_name) + .filter_map(move |so| match so { + Ok((s, o)) => { + let mut new_tuple = from.clone(); + put_pattern_value(&subject, s, &mut new_tuple)?; + put_pattern_value(&object, o, &mut new_tuple)?; + Some(Ok(new_tuple)) + } + Err(error) => Some(Err(error)), + }), + ) + } + (Some(input_subject), Some(input_object), None) => { + let graph_name = graph_name.clone(); + Box::new( + path_eval + .eval_closed_in_unknown_graph( + &path, + &input_subject, + &input_object, + ) + .filter_map(move |r| match r { + Ok(g) => { + let mut new_tuple = from.clone(); + put_pattern_value(&graph_name, g, &mut new_tuple)?; + Some(Ok(new_tuple)) + } + Err(error) => Some(Err(error)), + }), + ) + } + (Some(input_subject), None, None) => { + let object = object.clone(); + let graph_name = graph_name.clone(); + Box::new( + path_eval + .eval_from_in_unknown_graph(&path, &input_subject) + .filter_map(move |r| match r { + Ok((o, g)) => { + let mut new_tuple = from.clone(); + put_pattern_value(&object, o, &mut new_tuple)?; + put_pattern_value(&graph_name, g, &mut new_tuple)?; + Some(Ok(new_tuple)) + } + Err(error) => Some(Err(error)), + }), + ) + } + (None, Some(input_object), None) => { + let subject = subject.clone(); + let graph_name = graph_name.clone(); + Box::new( + path_eval + .eval_to_in_unknown_graph(&path, &input_object) + .filter_map(move |r| match r { + Ok((s, g)) => { + let mut new_tuple = from.clone(); + put_pattern_value(&subject, s, &mut new_tuple)?; + put_pattern_value(&graph_name, g, &mut new_tuple)?; + + Some(Ok(new_tuple)) + } + Err(error) => Some(Err(error)), + }), + ) + } + (None, None, None) => { + let subject = subject.clone(); + let object = object.clone(); + let graph_name = graph_name.clone(); + Box::new(path_eval.eval_open_in_unknown_graph(&path).filter_map( + move |r| match r { + Ok((s, o, g)) => { + let mut new_tuple = from.clone(); + put_pattern_value(&subject, s, &mut new_tuple)?; + put_pattern_value(&object, o, &mut new_tuple)?; + put_pattern_value(&graph_name, g, &mut new_tuple)?; + Some(Ok(new_tuple)) + } + Err(error) => Some(Err(error)), + }, + )) + } + } + }) + } + GraphPattern::Join { + left, + right, + algorithm, + } => { + let (left, left_stats) = self.graph_pattern_evaluator(left, encoded_variables); + stat_children.push(left_stats); + let (right, right_stats) = self.graph_pattern_evaluator(right, encoded_variables); + stat_children.push(right_stats); + + match algorithm { + JoinAlgorithm::HashBuildLeftProbeRight { keys } => { + let build = left; + let probe = right; + if keys.is_empty() { + // Cartesian product + Rc::new(move |from| { + let mut errors = Vec::default(); + let build_values = build(from.clone()) + .filter_map(|result| match result { + Ok(result) => Some(result), + Err(error) => { + errors.push(Err(error)); + None + } + }) + .collect::>(); + Box::new(CartesianProductJoinIterator { + probe_iter: probe(from), + built: build_values, + buffered_results: errors, + }) + }) + } else { + // Real hash join + let keys = keys + .iter() + .map(|v| encode_variable(encoded_variables, v)) + .collect::>(); + Rc::new(move |from| { + let mut errors = Vec::default(); + let mut built_values = EncodedTupleSet::new(keys.clone()); + 
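Each `(subject, object, graph)` binding combination above gets its own `PathEvaluator` entry point; the interesting work is in the recursive operators. A sketch of how a `OneOrMore` (`p+`) step can be evaluated as breadth-first reachability over an in-memory edge list (simplified; the real evaluator streams from the dataset):

```rust
use std::collections::{HashSet, VecDeque};

// All nodes reachable from `start` through one or more `p` edges.
fn one_or_more(edges: &[(u32, u32)], start: u32) -> HashSet<u32> {
    let mut seen = HashSet::new();
    let mut queue = VecDeque::from([start]);
    while let Some(node) = queue.pop_front() {
        for &(s, o) in edges {
            if s == node && seen.insert(o) {
                queue.push_back(o); // first time we reach `o`: explore from it
            }
        }
    }
    seen
}

fn main() {
    let edges = [(1, 2), (2, 3), (3, 1)];
    // Cycles terminate because `seen` prevents re-exploration.
    assert_eq!(one_or_more(&edges, 1), HashSet::from([1, 2, 3]));
}
```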
built_values.extend(build(from.clone()).filter_map(|result| { + match result { + Ok(result) => Some(result), + Err(error) => { + errors.push(Err(error)); + None + } + } + })); + Box::new(HashJoinIterator { + probe_iter: probe(from), + built: built_values, + buffered_results: errors, + }) + }) + } + } + } + } + GraphPattern::Lateral { left, right } => { + let (left, left_stats) = self.graph_pattern_evaluator(left, encoded_variables); + stat_children.push(left_stats); + + if let GraphPattern::LeftJoin { + left: nested_left, + right: nested_right, + expression, + .. + } = right.as_ref() + { + if nested_left.is_empty_singleton() { + // We are in a ForLoopLeftJoin + let right = + GraphPattern::filter(nested_right.as_ref().clone(), expression.clone()); + let (right, right_stats) = + self.graph_pattern_evaluator(&right, encoded_variables); + stat_children.push(right_stats); + return Rc::new(move |from| { + Box::new(ForLoopLeftJoinIterator { + right_evaluator: Rc::clone(&right), + left_iter: left(from), + current_right: Box::new(empty()), + }) + }); + } + } + let (right, right_stats) = self.graph_pattern_evaluator(right, encoded_variables); + stat_children.push(right_stats); + Rc::new(move |from| { + let right = Rc::clone(&right); + Box::new(left(from).flat_map(move |t| match t { + Ok(t) => right(t), + Err(e) => Box::new(once(Err(e))), + })) + }) + } + GraphPattern::Minus { + left, + right, + algorithm, + } => { + let (left, left_stats) = self.graph_pattern_evaluator(left, encoded_variables); + stat_children.push(left_stats); + let (right, right_stats) = self.graph_pattern_evaluator(right, encoded_variables); + stat_children.push(right_stats); + + match algorithm { + MinusAlgorithm::HashBuildRightProbeLeft { keys } => { + if keys.is_empty() { + Rc::new(move |from| { + let right: Vec<_> = + right(from.clone()).filter_map(Result::ok).collect(); + Box::new(left(from).filter(move |left_tuple| { + if let Ok(left_tuple) = left_tuple { + !right.iter().any(|right_tuple| { + are_compatible_and_not_disjointed( + left_tuple, + right_tuple, + ) + }) + } else { + true + } + })) + }) + } else { + let keys = keys + .iter() + .map(|v| encode_variable(encoded_variables, v)) + .collect::>(); + Rc::new(move |from| { + let mut right_values = EncodedTupleSet::new(keys.clone()); + right_values.extend(right(from.clone()).filter_map(Result::ok)); + Box::new(left(from).filter(move |left_tuple| { + if let Ok(left_tuple) = left_tuple { + !right_values.get(left_tuple).iter().any(|right_tuple| { + are_compatible_and_not_disjointed( + left_tuple, + right_tuple, + ) + }) + } else { + true + } + })) + }) + } + } + } + } + GraphPattern::LeftJoin { + left, + right, + expression, + algorithm, + } => { + let (left, left_stats) = self.graph_pattern_evaluator(left, encoded_variables); + stat_children.push(left_stats); + let (right, right_stats) = self.graph_pattern_evaluator(right, encoded_variables); + stat_children.push(right_stats); + let expression = + self.expression_evaluator(expression, encoded_variables, stat_children); + + match algorithm { + LeftJoinAlgorithm::HashBuildRightProbeLeft { keys } => { + // Real hash join + let keys = keys + .iter() + .map(|v| encode_variable(encoded_variables, v)) + .collect::>(); + Rc::new(move |from| { + let mut errors = Vec::default(); + let mut right_values = EncodedTupleSet::new(keys.clone()); + right_values.extend(right(from.clone()).filter_map( + |result| match result { + Ok(result) => Some(result), + Err(error) => { + errors.push(Err(error)); + None + } + }, + )); + 
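`are_compatible_and_not_disjointed` above encodes the SPARQL `MINUS` rule: a left solution is removed only when some right solution agrees on every shared variable and at least one variable is shared. A small sketch of that predicate (hypothetical `Solution` type over plain strings):

```rust
use std::collections::HashMap;

type Solution = HashMap<&'static str, &'static str>;

// A left solution is removed by MINUS only if some right solution agrees on
// every shared variable AND at least one variable is shared.
fn compatible_and_not_disjoint(left: &Solution, right: &Solution) -> bool {
    let mut shared = false;
    for (var, left_value) in left {
        if let Some(right_value) = right.get(var) {
            if left_value != right_value {
                return false; // shared variable with different values
            }
            shared = true;
        }
    }
    shared // no shared variable at all => disjoint => left is kept
}

fn main() {
    let left = Solution::from([("x", "1"), ("y", "2")]);
    assert!(compatible_and_not_disjoint(&left, &Solution::from([("x", "1")])));
    assert!(!compatible_and_not_disjoint(&left, &Solution::from([("x", "3")])));
    assert!(!compatible_and_not_disjoint(&left, &Solution::from([("z", "9")])));
}
```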
Box::new(HashLeftJoinIterator { + left_iter: left(from), + right: right_values, + buffered_results: errors, + expression: Rc::clone(&expression), + }) + }) + } + } + } + GraphPattern::Filter { inner, expression } => { + let (child, child_stats) = self.graph_pattern_evaluator(inner, encoded_variables); + stat_children.push(child_stats); + let expression = + self.expression_evaluator(expression, encoded_variables, stat_children); + + Rc::new(move |from| { + let expression = Rc::clone(&expression); + Box::new(child(from).filter(move |tuple| { + match tuple { + Ok(tuple) => expression(tuple) + .and_then(|term| to_bool(&term)) + .unwrap_or(false), + Err(_) => true, + } + })) + }) + } + GraphPattern::Union { inner } => { + let children = inner + .iter() + .map(|child| { + let (child, child_stats) = + self.graph_pattern_evaluator(child, encoded_variables); + stat_children.push(child_stats); + child + }) + .collect::>(); + + Rc::new(move |from| { + Box::new(UnionIterator { + plans: children.clone(), + input: from, + current_iterator: Box::new(empty()), + current_plan: 0, + }) + }) + } + GraphPattern::Extend { + inner, + variable, + expression, + } => { + let (child, child_stats) = self.graph_pattern_evaluator(inner, encoded_variables); + stat_children.push(child_stats); + + let position = encode_variable(encoded_variables, variable); + let expression = + self.expression_evaluator(expression, encoded_variables, stat_children); + Rc::new(move |from| { + let expression = Rc::clone(&expression); + Box::new(child(from).map(move |tuple| { + let mut tuple = tuple?; + if let Some(value) = expression(&tuple) { + tuple.set(position, value); + } + Ok(tuple) + })) + }) + } + GraphPattern::OrderBy { inner, expression } => { + let (child, child_stats) = self.graph_pattern_evaluator(inner, encoded_variables); + stat_children.push(child_stats); + let by = expression + .iter() + .map(|comp| match comp { + OrderExpression::Asc(expression) => ComparatorFunction::Asc( + self.expression_evaluator(expression, encoded_variables, stat_children), + ), + OrderExpression::Desc(expression) => ComparatorFunction::Desc( + self.expression_evaluator(expression, encoded_variables, stat_children), + ), + }) + .collect::>(); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |from| { + let mut errors = Vec::default(); + let mut values = child(from) + .filter_map(|result| match result { + Ok(result) => Some(result), + Err(error) => { + errors.push(Err(error)); + None + } + }) + .collect::>(); + values.sort_unstable_by(|a, b| { + for comp in &by { + match comp { + ComparatorFunction::Asc(expression) => { + match cmp_terms( + &dataset, + expression(a).as_ref(), + expression(b).as_ref(), + ) { + Ordering::Greater => return Ordering::Greater, + Ordering::Less => return Ordering::Less, + Ordering::Equal => (), + } + } + ComparatorFunction::Desc(expression) => { + match cmp_terms( + &dataset, + expression(a).as_ref(), + expression(b).as_ref(), + ) { + Ordering::Greater => return Ordering::Less, + Ordering::Less => return Ordering::Greater, + Ordering::Equal => (), + } + } + } + } + Ordering::Equal + }); + Box::new(errors.into_iter().chain(values.into_iter().map(Ok))) + }) + } + GraphPattern::Distinct { inner } => { + let (child, child_stats) = self.graph_pattern_evaluator(inner, encoded_variables); + stat_children.push(child_stats); + Rc::new(move |from| Box::new(hash_deduplicate(child(from)))) + } + GraphPattern::Reduced { inner } => { + let (child, child_stats) = self.graph_pattern_evaluator(inner, encoded_variables); + 
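The `OrderBy` arm materializes all solutions and then sorts with a comparator chain: each `Asc`/`Desc` key is tried in turn and ties fall through to the next key. The same chaining in miniature (illustrative row and key types):

```rust
use std::cmp::Ordering;

enum Comparator {
    Asc(fn(&(i32, i32)) -> i32),
    Desc(fn(&(i32, i32)) -> i32),
}

fn main() {
    // ORDER BY ?a DESC(?b), expressed over (a, b) pairs.
    let by = [Comparator::Asc(|r| r.0), Comparator::Desc(|r| r.1)];
    let mut rows = vec![(1, 1), (0, 5), (0, 9), (1, 0)];
    rows.sort_unstable_by(|a, b| {
        for comp in &by {
            let ord = match comp {
                Comparator::Asc(key) => key(a).cmp(&key(b)),
                Comparator::Desc(key) => key(b).cmp(&key(a)),
            };
            if ord != Ordering::Equal {
                return ord; // this key decides; later keys only break ties
            }
        }
        Ordering::Equal
    });
    assert_eq!(rows, vec![(0, 9), (0, 5), (1, 1), (1, 0)]);
}
```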
stat_children.push(child_stats); + Rc::new(move |from| { + Box::new(ConsecutiveDeduplication { + inner: child(from), + current: None, + }) + }) + } + GraphPattern::Slice { + inner, + start, + length, + } => { + let (mut child, child_stats) = + self.graph_pattern_evaluator(inner, encoded_variables); + stat_children.push(child_stats); + #[allow(clippy::shadow_same)] + let start = *start; + if start > 0 { + child = Rc::new(move |from| Box::new(child(from).skip(start))); + } + if let Some(length) = *length { + child = Rc::new(move |from| Box::new(child(from).take(length))); + } + child + } + GraphPattern::Project { inner, variables } => { + let mut inner_encoded_variables = variables.clone(); + let (child, child_stats) = + self.graph_pattern_evaluator(inner, &mut inner_encoded_variables); + stat_children.push(child_stats); + let mapping = variables + .iter() + .enumerate() + .map(|(new_variable, variable)| { + (new_variable, encode_variable(encoded_variables, variable)) + }) + .collect::>(); + Rc::new(move |from| { + let mapping = Rc::clone(&mapping); + let mut input_tuple = EncodedTuple::with_capacity(mapping.len()); + for (input_key, output_key) in &*mapping { + if let Some(value) = from.get(*output_key) { + input_tuple.set(*input_key, value.clone()); + } + } + Box::new(child(input_tuple).filter_map(move |tuple| { + match tuple { + Ok(tuple) => { + let mut output_tuple = from.clone(); + for (input_key, output_key) in &*mapping { + if let Some(value) = tuple.get(*input_key) { + if let Some(existing_value) = output_tuple.get(*output_key) + { + if existing_value != value { + return None; // Conflict + } + } else { + output_tuple.set(*output_key, value.clone()); + } + } + } + Some(Ok(output_tuple)) + } + Err(e) => Some(Err(e)), + } + })) + }) + } + GraphPattern::Group { + inner, + aggregates, + variables, + } => { + let (child, child_stats) = self.graph_pattern_evaluator(inner, encoded_variables); + stat_children.push(child_stats); + let key_variables = variables + .iter() + .map(|k| encode_variable(encoded_variables, k)) + .collect::>(); + let aggregate_input_expressions = aggregates + .iter() + .map(|(_, expression)| match expression { + AggregateExpression::CountSolutions { .. } => None, + AggregateExpression::FunctionCall { expr, .. } => { + Some(self.expression_evaluator(expr, encoded_variables, stat_children)) + } + }) + .collect::>(); + let accumulator_builders = aggregates + .iter() + .map(|(_, aggregate)| Self::accumulator_builder(&self.dataset, aggregate)) + .collect::>(); + let accumulator_variables = aggregates + .iter() + .map(|(variable, _)| encode_variable(encoded_variables, variable)) + .collect::>(); + Rc::new(move |from| { + let tuple_size = from.capacity(); + let key_variables = Rc::clone(&key_variables); + let mut errors = Vec::default(); + let mut accumulators_for_group = + HashMap::>, Vec>>::default(); + if key_variables.is_empty() { + // There is always a single group if there is no GROUP BY + accumulators_for_group.insert( + Vec::new(), + accumulator_builders.iter().map(|c| c()).collect::>(), + ); + } + child(from) + .filter_map(|result| match result { + Ok(result) => Some(result), + Err(error) => { + errors.push(error); + None + } + }) + .for_each(|tuple| { + // TODO avoid copy for key? 
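`REDUCED` (the `ConsecutiveDeduplication` above) only has to drop adjacent duplicates, so a one-element buffer suffices, whereas `DISTINCT` needs a full hash set. A self-contained sketch of such a consecutive-deduplication iterator:

```rust
struct ConsecutiveDedup<I: Iterator> {
    inner: I,
    current: Option<I::Item>, // the last item that was emitted
}

impl<I> Iterator for ConsecutiveDedup<I>
where
    I: Iterator,
    I::Item: PartialEq + Clone,
{
    type Item = I::Item;

    fn next(&mut self) -> Option<I::Item> {
        loop {
            let next = self.inner.next()?;
            if self.current.as_ref() != Some(&next) {
                self.current = Some(next.clone());
                return Some(next);
            }
            // equal to the previous item: skip and keep looking
        }
    }
}

fn main() {
    let input = [1, 1, 2, 2, 1];
    let out: Vec<_> = ConsecutiveDedup { inner: input.into_iter(), current: None }.collect();
    assert_eq!(out, vec![1, 2, 1]); // 1 reappears, unlike DISTINCT
}
```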
+ let key = key_variables + .iter() + .map(|v| tuple.get(*v).cloned()) + .collect(); + + let key_accumulators = + accumulators_for_group.entry(key).or_insert_with(|| { + accumulator_builders.iter().map(|c| c()).collect::>() + }); + for (accumulator, input_expression) in key_accumulators + .iter_mut() + .zip(&aggregate_input_expressions) + { + accumulator.add( + input_expression + .as_ref() + .and_then(|parameter| parameter(&tuple)), + ); + } + }); + let accumulator_variables = accumulator_variables.clone(); + Box::new( + errors + .into_iter() + .map(Err) + .chain(accumulators_for_group.into_iter().map( + move |(key, accumulators)| { + let mut result = EncodedTuple::with_capacity(tuple_size); + for (variable, value) in key_variables.iter().zip(key) { + if let Some(value) = value { + result.set(*variable, value); + } + } + for (accumulator, variable) in + accumulators.into_iter().zip(&accumulator_variables) + { + if let Some(value) = accumulator.state() { + result.set(*variable, value); + } + } + Ok(result) + }, + )), + ) + }) + } + } + } + + fn evaluate_service( + &self, + service_name: &TupleSelector, + graph_pattern: &spargebra::algebra::GraphPattern, + variables: Rc<[Variable]>, + from: &EncodedTuple, + ) -> Result { + let service_name = service_name + .get_pattern_value(from) + .ok_or(EvaluationError::UnboundService)?; + if let QueryResults::Solutions(iter) = self.service_handler.handle( + self.dataset.decode_named_node(&service_name)?, + Query { + inner: spargebra::Query::Select { + dataset: None, + pattern: graph_pattern.clone(), + #[allow(clippy::useless_asref)] + base_iri: self.base_iri.as_ref().map(|iri| iri.as_ref().clone()), + }, + dataset: QueryDataset::new(), + parsing_duration: None, + }, + )? { + Ok(encode_bindings(Rc::clone(&self.dataset), variables, iter)) + } else { + Err(EvaluationError::ServiceDoesNotReturnSolutions) + } + } + + #[allow(clippy::redundant_closure)] // False positive in 1.60 + fn accumulator_builder( + dataset: &Rc, + expression: &AggregateExpression, + ) -> Box Box> { + let mut accumulator: Box Box> = match expression { + AggregateExpression::CountSolutions { .. } => { + Box::new(|| Box::::default()) + } + AggregateExpression::FunctionCall { name, .. } => match name { + AggregateFunction::Count => Box::new(|| Box::::default()), + AggregateFunction::Sum => Box::new(|| Box::::default()), + AggregateFunction::Min => { + let dataset = Rc::clone(dataset); + Box::new(move || Box::new(MinAccumulator::new(Rc::clone(&dataset)))) + } + AggregateFunction::Max => { + let dataset = Rc::clone(dataset); + Box::new(move || Box::new(MaxAccumulator::new(Rc::clone(&dataset)))) + } + AggregateFunction::Avg => Box::new(|| Box::::default()), + AggregateFunction::Sample => Box::new(|| Box::::default()), + AggregateFunction::GroupConcat { separator } => { + let dataset = Rc::clone(dataset); + let separator = Rc::from(separator.as_deref().unwrap_or(" ")); + Box::new(move || { + Box::new(GroupConcatAccumulator::new( + Rc::clone(&dataset), + Rc::clone(&separator), + )) + }) + } + AggregateFunction::Custom(_) => Box::new(|| Box::new(FailingAccumulator)), + }, + }; + if matches!( + expression, + AggregateExpression::CountSolutions { distinct: true } + | AggregateExpression::FunctionCall { distinct: true, .. 
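Aggregation above builds one boxed accumulator per `(group key, aggregate)` pair, creating them lazily the first time a key is seen. The shape of that machinery, reduced to a single `SUM`-style accumulator (names illustrative):

```rust
use std::collections::HashMap;

trait Accumulator {
    fn add(&mut self, value: i64);
    fn state(&self) -> i64;
}

#[derive(Default)]
struct Sum(i64);

impl Accumulator for Sum {
    fn add(&mut self, value: i64) {
        self.0 += value;
    }
    fn state(&self) -> i64 {
        self.0
    }
}

fn main() {
    // One boxed accumulator per group key, created on first sight of the key,
    // mirroring `accumulators_for_group.entry(key).or_insert_with(...)`.
    let rows = [("a", 1), ("b", 2), ("a", 3)];
    let mut groups: HashMap<&str, Box<dyn Accumulator>> = HashMap::new();
    for (key, value) in rows {
        groups
            .entry(key)
            .or_insert_with(|| Box::<Sum>::default())
            .add(value);
    }
    assert_eq!(groups["a"].state(), 4);
    assert_eq!(groups["b"].state(), 2);
}
```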
} + ) { + accumulator = Box::new(move || Box::new(Deduplicate::new(accumulator()))); + } + accumulator + } + + fn expression_evaluator( + &self, + expression: &Expression, + encoded_variables: &mut Vec, + stat_children: &mut Vec>, + ) -> Rc Option> { + match expression { + Expression::NamedNode(t) => { + let t = self.encode_term(t); + Rc::new(move |_| Some(t.clone())) + } + Expression::Literal(t) => { + let t = self.encode_term(t); + Rc::new(move |_| Some(t.clone())) + } + Expression::Variable(v) => { + let v = encode_variable(encoded_variables, v); + Rc::new(move |tuple| tuple.get(v).cloned()) + } + Expression::Bound(v) => { + let v = encode_variable(encoded_variables, v); + Rc::new(move |tuple| Some(tuple.contains(v).into())) + } + Expression::Exists(plan) => { + let (eval, stats) = self.graph_pattern_evaluator(plan, encoded_variables); + stat_children.push(stats); + Rc::new(move |tuple| Some(eval(tuple.clone()).next().is_some().into())) + } + Expression::Or(inner) => { + let children = inner + .iter() + .map(|i| self.expression_evaluator(i, encoded_variables, stat_children)) + .collect::>(); + Rc::new(move |tuple| { + let mut error = false; + for child in &*children { + match child(tuple).and_then(|v| to_bool(&v)) { + Some(true) => return Some(true.into()), + Some(false) => continue, + None => error = true, + } + } + if error { + None + } else { + Some(false.into()) + } + }) + } + Expression::And(inner) => { + let children = inner + .iter() + .map(|i| self.expression_evaluator(i, encoded_variables, stat_children)) + .collect::>(); + Rc::new(move |tuple| { + let mut error = false; + for child in &*children { + match child(tuple).and_then(|v| to_bool(&v)) { + Some(true) => continue, + Some(false) => return Some(false.into()), + None => error = true, + } + } + if error { + None + } else { + Some(true.into()) + } + }) + } + Expression::Equal(a, b) => { + let a = self.expression_evaluator(a, encoded_variables, stat_children); + let b = self.expression_evaluator(b, encoded_variables, stat_children); + Rc::new(move |tuple| equals(&a(tuple)?, &b(tuple)?).map(Into::into)) + } + Expression::SameTerm(a, b) => { + let a = self.expression_evaluator(a, encoded_variables, stat_children); + let b = self.expression_evaluator(b, encoded_variables, stat_children); + Rc::new(move |tuple| Some((a(tuple)? == b(tuple)?).into())) + } + Expression::Greater(a, b) => { + let a = self.expression_evaluator(a, encoded_variables, stat_children); + let b = self.expression_evaluator(b, encoded_variables, stat_children); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + Some( + (partial_cmp(&dataset, &a(tuple)?, &b(tuple)?)? == Ordering::Greater) + .into(), + ) + }) + } + Expression::GreaterOrEqual(a, b) => { + let a = self.expression_evaluator(a, encoded_variables, stat_children); + let b = self.expression_evaluator(b, encoded_variables, stat_children); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + Some( + match partial_cmp(&dataset, &a(tuple)?, &b(tuple)?)? { + Ordering::Greater | Ordering::Equal => true, + Ordering::Less => false, + } + .into(), + ) + }) + } + Expression::Less(a, b) => { + let a = self.expression_evaluator(a, encoded_variables, stat_children); + let b = self.expression_evaluator(b, encoded_variables, stat_children); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + Some((partial_cmp(&dataset, &a(tuple)?, &b(tuple)?)? 
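The `Or`/`And` arms above implement SPARQL's three-valued logic: an evaluation error (`None`) is recoverable whenever another operand alone decides the result. The `||` half of that truth table in executable form:

```rust
// `None` models an evaluation error; `||` can recover from it, because
// `error || true` is `true`, while `error || false` stays an error.
fn sparql_or(operands: &[Option<bool>]) -> Option<bool> {
    let mut error = false;
    for v in operands {
        match v {
            Some(true) => return Some(true),
            Some(false) => {}
            None => error = true,
        }
    }
    if error {
        None
    } else {
        Some(false)
    }
}

fn main() {
    assert_eq!(sparql_or(&[None, Some(true)]), Some(true));
    assert_eq!(sparql_or(&[None, Some(false)]), None);
    assert_eq!(sparql_or(&[Some(false), Some(false)]), Some(false));
}
```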
== Ordering::Less).into()) + }) + } + Expression::LessOrEqual(a, b) => { + let a = self.expression_evaluator(a, encoded_variables, stat_children); + let b = self.expression_evaluator(b, encoded_variables, stat_children); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + Some( + match partial_cmp(&dataset, &a(tuple)?, &b(tuple)?)? { + Ordering::Less | Ordering::Equal => true, + Ordering::Greater => false, + } + .into(), + ) + }) + } + Expression::Add(a, b) => { + let a = self.expression_evaluator(a, encoded_variables, stat_children); + let b = self.expression_evaluator(b, encoded_variables, stat_children); + Rc::new( + move |tuple| match NumericBinaryOperands::new(a(tuple)?, b(tuple)?)? { + NumericBinaryOperands::Float(v1, v2) => Some((v1 + v2).into()), + NumericBinaryOperands::Double(v1, v2) => Some((v1 + v2).into()), + NumericBinaryOperands::Integer(v1, v2) => Some(v1.checked_add(v2)?.into()), + NumericBinaryOperands::Decimal(v1, v2) => Some(v1.checked_add(v2)?.into()), + NumericBinaryOperands::Duration(v1, v2) => Some(v1.checked_add(v2)?.into()), + NumericBinaryOperands::YearMonthDuration(v1, v2) => { + Some(v1.checked_add(v2)?.into()) + } + NumericBinaryOperands::DayTimeDuration(v1, v2) => { + Some(v1.checked_add(v2)?.into()) + } + NumericBinaryOperands::DateTimeDuration(v1, v2) => { + Some(v1.checked_add_duration(v2)?.into()) + } + NumericBinaryOperands::DateTimeYearMonthDuration(v1, v2) => { + Some(v1.checked_add_year_month_duration(v2)?.into()) + } + NumericBinaryOperands::DateTimeDayTimeDuration(v1, v2) => { + Some(v1.checked_add_day_time_duration(v2)?.into()) + } + NumericBinaryOperands::DateDuration(v1, v2) => { + Some(v1.checked_add_duration(v2)?.into()) + } + NumericBinaryOperands::DateYearMonthDuration(v1, v2) => { + Some(v1.checked_add_year_month_duration(v2)?.into()) + } + NumericBinaryOperands::DateDayTimeDuration(v1, v2) => { + Some(v1.checked_add_day_time_duration(v2)?.into()) + } + NumericBinaryOperands::TimeDuration(v1, v2) => { + Some(v1.checked_add_duration(v2)?.into()) + } + NumericBinaryOperands::TimeDayTimeDuration(v1, v2) => { + Some(v1.checked_add_day_time_duration(v2)?.into()) + } + NumericBinaryOperands::DateTime(_, _) + | NumericBinaryOperands::Time(_, _) + | NumericBinaryOperands::Date(_, _) => None, + }, + ) + } + Expression::Subtract(a, b) => { + let a = self.expression_evaluator(a, encoded_variables, stat_children); + let b = self.expression_evaluator(b, encoded_variables, stat_children); + Rc::new(move |tuple| { + Some(match NumericBinaryOperands::new(a(tuple)?, b(tuple)?)? 
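`Expression::Add` above first promotes both operands to a common numeric type (`NumericBinaryOperands::new`), then applies checked arithmetic so overflow becomes an expression error (`None`) rather than a panic. The same two-step shape with just integers and doubles (illustrative `Numeric` type):

```rust
#[derive(Clone, Copy, Debug)]
enum Numeric {
    Integer(i64),
    Double(f64),
}

fn add(a: Numeric, b: Numeric) -> Option<Numeric> {
    use Numeric::*;
    Some(match (a, b) {
        (Integer(x), Integer(y)) => Integer(x.checked_add(y)?), // overflow -> error
        (Double(x), Double(y)) => Double(x + y),
        (Integer(x), Double(y)) | (Double(y), Integer(x)) => Double(x as f64 + y),
    })
}

fn main() {
    assert!(matches!(
        add(Numeric::Integer(2), Numeric::Double(0.5)),
        Some(Numeric::Double(v)) if v == 2.5
    ));
    assert!(add(Numeric::Integer(i64::MAX), Numeric::Integer(1)).is_none());
}
```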
{ + NumericBinaryOperands::Float(v1, v2) => (v1 - v2).into(), + NumericBinaryOperands::Double(v1, v2) => (v1 - v2).into(), + NumericBinaryOperands::Integer(v1, v2) => v1.checked_sub(v2)?.into(), + NumericBinaryOperands::Decimal(v1, v2) => v1.checked_sub(v2)?.into(), + NumericBinaryOperands::DateTime(v1, v2) => v1.checked_sub(v2)?.into(), + NumericBinaryOperands::Date(v1, v2) => v1.checked_sub(v2)?.into(), + NumericBinaryOperands::Time(v1, v2) => v1.checked_sub(v2)?.into(), + NumericBinaryOperands::Duration(v1, v2) => v1.checked_sub(v2)?.into(), + NumericBinaryOperands::YearMonthDuration(v1, v2) => { + v1.checked_sub(v2)?.into() + } + NumericBinaryOperands::DayTimeDuration(v1, v2) => { + v1.checked_sub(v2)?.into() + } + NumericBinaryOperands::DateTimeDuration(v1, v2) => { + v1.checked_sub_duration(v2)?.into() + } + NumericBinaryOperands::DateTimeYearMonthDuration(v1, v2) => { + v1.checked_sub_year_month_duration(v2)?.into() + } + NumericBinaryOperands::DateTimeDayTimeDuration(v1, v2) => { + v1.checked_sub_day_time_duration(v2)?.into() + } + NumericBinaryOperands::DateDuration(v1, v2) => { + v1.checked_sub_duration(v2)?.into() + } + NumericBinaryOperands::DateYearMonthDuration(v1, v2) => { + v1.checked_sub_year_month_duration(v2)?.into() + } + NumericBinaryOperands::DateDayTimeDuration(v1, v2) => { + v1.checked_sub_day_time_duration(v2)?.into() + } + NumericBinaryOperands::TimeDuration(v1, v2) => { + v1.checked_sub_duration(v2)?.into() + } + NumericBinaryOperands::TimeDayTimeDuration(v1, v2) => { + v1.checked_sub_day_time_duration(v2)?.into() + } + }) + }) + } + Expression::Multiply(a, b) => { + let a = self.expression_evaluator(a, encoded_variables, stat_children); + let b = self.expression_evaluator(b, encoded_variables, stat_children); + Rc::new( + move |tuple| match NumericBinaryOperands::new(a(tuple)?, b(tuple)?)? { + NumericBinaryOperands::Float(v1, v2) => Some((v1 * v2).into()), + NumericBinaryOperands::Double(v1, v2) => Some((v1 * v2).into()), + NumericBinaryOperands::Integer(v1, v2) => Some(v1.checked_mul(v2)?.into()), + NumericBinaryOperands::Decimal(v1, v2) => Some(v1.checked_mul(v2)?.into()), + _ => None, + }, + ) + } + Expression::Divide(a, b) => { + let a = self.expression_evaluator(a, encoded_variables, stat_children); + let b = self.expression_evaluator(b, encoded_variables, stat_children); + Rc::new( + move |tuple| match NumericBinaryOperands::new(a(tuple)?, b(tuple)?)? { + NumericBinaryOperands::Float(v1, v2) => Some((v1 / v2).into()), + NumericBinaryOperands::Double(v1, v2) => Some((v1 / v2).into()), + NumericBinaryOperands::Integer(v1, v2) => { + Some(Decimal::from(v1).checked_div(v2)?.into()) + } + NumericBinaryOperands::Decimal(v1, v2) => Some(v1.checked_div(v2)?.into()), + _ => None, + }, + ) + } + Expression::UnaryPlus(e) => { + let e = self.expression_evaluator(e, encoded_variables, stat_children); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::FloatLiteral(value) => Some(value.into()), + EncodedTerm::DoubleLiteral(value) => Some(value.into()), + EncodedTerm::IntegerLiteral(value) => Some(value.into()), + EncodedTerm::DecimalLiteral(value) => Some(value.into()), + EncodedTerm::DurationLiteral(value) => Some(value.into()), + EncodedTerm::YearMonthDurationLiteral(value) => Some(value.into()), + EncodedTerm::DayTimeDurationLiteral(value) => Some(value.into()), + _ => None, + }) + } + Expression::UnaryMinus(e) => { + let e = self.expression_evaluator(e, encoded_variables, stat_children); + Rc::new(move |tuple| match e(tuple)? 
{ + EncodedTerm::FloatLiteral(value) => Some((-value).into()), + EncodedTerm::DoubleLiteral(value) => Some((-value).into()), + EncodedTerm::IntegerLiteral(value) => Some(value.checked_neg()?.into()), + EncodedTerm::DecimalLiteral(value) => Some(value.checked_neg()?.into()), + EncodedTerm::DurationLiteral(value) => Some(value.checked_neg()?.into()), + EncodedTerm::YearMonthDurationLiteral(value) => { + Some(value.checked_neg()?.into()) + } + EncodedTerm::DayTimeDurationLiteral(value) => Some(value.checked_neg()?.into()), + _ => None, + }) + } + Expression::Not(e) => { + let e = self.expression_evaluator(e, encoded_variables, stat_children); + Rc::new(move |tuple| to_bool(&e(tuple)?).map(|v| (!v).into())) + } + Expression::Coalesce(l) => { + let l: Vec<_> = l + .iter() + .map(|e| self.expression_evaluator(e, encoded_variables, stat_children)) + .collect(); + Rc::new(move |tuple| { + for e in &l { + if let Some(result) = e(tuple) { + return Some(result); + } + } + None + }) + } + Expression::If(a, b, c) => { + let a = self.expression_evaluator(a, encoded_variables, stat_children); + let b = self.expression_evaluator(b, encoded_variables, stat_children); + let c = self.expression_evaluator(c, encoded_variables, stat_children); + Rc::new(move |tuple| { + if to_bool(&a(tuple)?)? { + b(tuple) + } else { + c(tuple) + } + }) + } + Expression::FunctionCall(function, parameters) => { + match function { + Function::Str => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + Some(build_string_literal_from_id(to_string_id( + &dataset, + &e(tuple)?, + )?)) + }) + } + Function::Lang => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::SmallSmallLangStringLiteral { language, .. } + | EncodedTerm::BigSmallLangStringLiteral { language, .. } => { + Some(build_string_literal_from_id(language.into())) + } + EncodedTerm::SmallBigLangStringLiteral { language_id, .. } + | EncodedTerm::BigBigLangStringLiteral { language_id, .. 
} => { + Some(build_string_literal_from_id(language_id.into())) + } + e if e.is_literal() => Some(build_string_literal(&dataset, "")), + _ => None, + }) + } + Function::LangMatches => { + let language_tag = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let language_range = self.expression_evaluator( + ¶meters[1], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let mut language_tag = + to_simple_string(&dataset, &language_tag(tuple)?)?; + language_tag.make_ascii_lowercase(); + let mut language_range = + to_simple_string(&dataset, &language_range(tuple)?)?; + language_range.make_ascii_lowercase(); + Some( + if &*language_range == "*" { + !language_tag.is_empty() + } else { + !ZipLongest::new( + language_range.split('-'), + language_tag.split('-'), + ) + .any(|parts| match parts { + (Some(range_subtag), Some(language_subtag)) => { + range_subtag != language_subtag + } + (Some(_), None) => true, + (None, _) => false, + }) + } + .into(), + ) + }) + } + Function::Datatype => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| datatype(&dataset, &e(tuple)?)) + } + Function::Iri => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + let base_iri = self.base_iri.clone(); + Rc::new(move |tuple| { + let e = e(tuple)?; + if e.is_named_node() { + Some(e) + } else { + let iri = to_simple_string(&dataset, &e)?; + Some(build_named_node( + &dataset, + &if let Some(base_iri) = &base_iri { + base_iri.resolve(&iri) + } else { + Iri::parse(iri) + } + .ok()? + .into_inner(), + )) + } + }) + } + Function::BNode => match parameters.first() { + Some(id) => { + let id = + self.expression_evaluator(id, encoded_variables, stat_children); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + Some( + dataset.encode_term( + BlankNode::new(to_simple_string(&dataset, &id(tuple)?)?) + .ok()? + .as_ref(), + ), + ) + }) + } + None => Rc::new(|_| { + Some(EncodedTerm::NumericalBlankNode { + id: random::(), + }) + }), + }, + Function::Rand => Rc::new(|_| Some(random::().into())), + Function::Abs => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::IntegerLiteral(value) => Some(value.checked_abs()?.into()), + EncodedTerm::DecimalLiteral(value) => Some(value.checked_abs()?.into()), + EncodedTerm::FloatLiteral(value) => Some(value.abs().into()), + EncodedTerm::DoubleLiteral(value) => Some(value.abs().into()), + _ => None, + }) + } + Function::Ceil => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::IntegerLiteral(value) => Some(value.into()), + EncodedTerm::DecimalLiteral(value) => { + Some(value.checked_ceil()?.into()) + } + EncodedTerm::FloatLiteral(value) => Some(value.ceil().into()), + EncodedTerm::DoubleLiteral(value) => Some(value.ceil().into()), + _ => None, + }) + } + Function::Floor => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| match e(tuple)? 
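The `LangMatches` closure above is RFC 4647 "basic filtering": `"*"` matches any non-empty tag; otherwise the range must be a case-insensitive, subtag-wise prefix of the language tag. Equivalent standalone logic (split-based, instead of the evaluator's `ZipLongest`):

```rust
// RFC 4647 basic filtering, as used by SPARQL's LANGMATCHES.
fn lang_matches(tag: &str, range: &str) -> bool {
    let tag = tag.to_ascii_lowercase();
    let range = range.to_ascii_lowercase();
    if range == "*" {
        return !tag.is_empty();
    }
    let mut tag_parts = tag.split('-');
    range
        .split('-')
        .all(|range_part| tag_parts.next() == Some(range_part))
}

fn main() {
    assert!(lang_matches("en-GB", "en"));
    assert!(lang_matches("en", "en"));
    assert!(!lang_matches("en", "en-GB")); // range longer than the tag
    assert!(!lang_matches("fr", "en"));
    assert!(lang_matches("fr", "*"));
}
```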
{ + EncodedTerm::IntegerLiteral(value) => Some(value.into()), + EncodedTerm::DecimalLiteral(value) => { + Some(value.checked_floor()?.into()) + } + EncodedTerm::FloatLiteral(value) => Some(value.floor().into()), + EncodedTerm::DoubleLiteral(value) => Some(value.floor().into()), + _ => None, + }) + } + Function::Round => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::IntegerLiteral(value) => Some(value.into()), + EncodedTerm::DecimalLiteral(value) => { + Some(value.checked_round()?.into()) + } + EncodedTerm::FloatLiteral(value) => Some(value.round().into()), + EncodedTerm::DoubleLiteral(value) => Some(value.round().into()), + _ => None, + }) + } + Function::Concat => { + let l: Vec<_> = parameters + .iter() + .map(|e| self.expression_evaluator(e, encoded_variables, stat_children)) + .collect(); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let mut result = String::default(); + let mut language = None; + for e in &l { + let (value, e_language) = + to_string_and_language(&dataset, &e(tuple)?)?; + if let Some(lang) = language { + if lang != e_language { + language = Some(None) + } + } else { + language = Some(e_language) + } + result += &value + } + Some(build_plain_literal( + &dataset, + &result, + language.and_then(|v| v), + )) + }) + } + Function::SubStr => { + let source = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let starting_loc = self.expression_evaluator( + ¶meters[1], + encoded_variables, + stat_children, + ); + let length = parameters.get(2).map(|l| { + self.expression_evaluator(l, encoded_variables, stat_children) + }); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let (source, language) = + to_string_and_language(&dataset, &source(tuple)?)?; + + let starting_location: usize = + if let EncodedTerm::IntegerLiteral(v) = starting_loc(tuple)? { + i64::from(v).try_into().ok()? + } else { + return None; + }; + let length: Option = if let Some(length) = &length { + if let EncodedTerm::IntegerLiteral(v) = length(tuple)? { + Some(i64::from(v).try_into().ok()?) + } else { + return None; + } + } else { + None + }; + + // We want to slice on char indices, not byte indices + let mut start_iter = source + .char_indices() + .skip(starting_location.checked_sub(1)?) + .peekable(); + let result = + if let Some((start_position, _)) = start_iter.peek().copied() { + if let Some(length) = length { + let mut end_iter = start_iter.skip(length).peekable(); + if let Some((end_position, _)) = end_iter.peek() { + &source[start_position..*end_position] + } else { + &source[start_position..] + } + } else { + &source[start_position..] + } + } else { + "" + }; + Some(build_plain_literal(&dataset, result, language)) + }) + } + Function::StrLen => { + let arg = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + Some( + i64::try_from(to_string(&dataset, &arg(tuple)?)?.chars().count()) + .ok()? 
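`SubStr` above is careful to slice on character positions, not byte offsets, because SPARQL's SUBSTR is 1-based over characters and a raw byte slice could split a multi-byte code point. A compact sketch of the same `char_indices` technique (it tolerates a start of 0, unlike the stricter code above):

```rust
fn substr(source: &str, start: usize, length: Option<usize>) -> &str {
    // SPARQL positions are 1-based and count characters, not bytes.
    let mut start_iter = source
        .char_indices()
        .skip(start.saturating_sub(1))
        .peekable();
    let Some(&(from, _)) = start_iter.peek() else {
        return ""; // start is past the end of the string
    };
    if let Some(length) = length {
        if let Some(&(to, _)) = start_iter.skip(length).peekable().peek() {
            return &source[from..to]; // substring ends just before `to`
        }
        return &source[from..]; // fewer than `length` chars remain
    }
    &source[from..]
}

fn main() {
    assert_eq!(substr("héllo", 2, Some(3)), "éll"); // 'é' is 2 bytes, 1 char
    assert_eq!(substr("héllo", 4, None), "lo");
    assert_eq!(substr("héllo", 9, Some(2)), "");
}
```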
+ .into(), + ) + }) + } + Function::Replace => { + let arg = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let replacement = self.expression_evaluator( + ¶meters[2], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + if let Some(regex) = + compile_static_pattern_if_exists(¶meters[1], parameters.get(3)) + { + Rc::new(move |tuple| { + let (text, language) = + to_string_and_language(&dataset, &arg(tuple)?)?; + let replacement = to_simple_string(&dataset, &replacement(tuple)?)?; + Some(build_plain_literal( + &dataset, + ®ex.replace_all(&text, replacement.as_str()), + language, + )) + }) + } else { + let pattern = self.expression_evaluator( + ¶meters[1], + encoded_variables, + stat_children, + ); + let flags = parameters.get(3).map(|flags| { + self.expression_evaluator(flags, encoded_variables, stat_children) + }); + Rc::new(move |tuple| { + let pattern = to_simple_string(&dataset, &pattern(tuple)?)?; + let options = if let Some(flags) = &flags { + Some(to_simple_string(&dataset, &flags(tuple)?)?) + } else { + None + }; + let regex = compile_pattern(&pattern, options.as_deref())?; + let (text, language) = + to_string_and_language(&dataset, &arg(tuple)?)?; + let replacement = to_simple_string(&dataset, &replacement(tuple)?)?; + Some(build_plain_literal( + &dataset, + ®ex.replace_all(&text, replacement.as_str()), + language, + )) + }) + } + } + Function::UCase => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let (value, language) = to_string_and_language(&dataset, &e(tuple)?)?; + Some(build_plain_literal( + &dataset, + &value.to_uppercase(), + language, + )) + }) + } + Function::LCase => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let (value, language) = to_string_and_language(&dataset, &e(tuple)?)?; + Some(build_plain_literal( + &dataset, + &value.to_lowercase(), + language, + )) + }) + } + Function::StrStarts => { + let arg1 = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let arg2 = self.expression_evaluator( + ¶meters[1], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let (arg1, arg2, _) = to_argument_compatible_strings( + &dataset, + &arg1(tuple)?, + &arg2(tuple)?, + )?; + Some(arg1.starts_with(arg2.as_str()).into()) + }) + } + Function::EncodeForUri => { + let ltrl = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let ltlr = to_string(&dataset, <rl(tuple)?)?; + let mut result = Vec::with_capacity(ltlr.len()); + for c in ltlr.bytes() { + match c { + b'A'..=b'Z' + | b'a'..=b'z' + | b'0'..=b'9' + | b'-' + | b'_' + | b'.' 
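`compile_static_pattern_if_exists` above lets REPLACE (and REGEX, later in this file) compile the pattern once at plan-build time whenever pattern and flags are constants, and fall back to per-solution compilation otherwise. The trade-off in miniature, using the `regex` crate this code already depends on (`Replacer` is an illustrative name):

```rust
use regex::Regex;

// Which compilation strategy a REPLACE call ended up with.
enum Replacer {
    Static(Regex),   // constant pattern: compiled once when the plan is built
    Dynamic(String), // computed pattern: compiled for every solution
}

fn replace_all(r: &Replacer, text: &str, replacement: &str) -> Option<String> {
    Some(match r {
        Replacer::Static(regex) => regex.replace_all(text, replacement).into_owned(),
        Replacer::Dynamic(pattern) => Regex::new(pattern)
            .ok()? // a bad dynamic pattern is an expression error, not a panic
            .replace_all(text, replacement)
            .into_owned(),
    })
}

fn main() {
    let precompiled = Replacer::Static(Regex::new("a+").unwrap());
    assert_eq!(replace_all(&precompiled, "baaad", "x").as_deref(), Some("bxd"));
}
```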
+ | b'~' => result.push(c), + _ => { + result.push(b'%'); + let high = c / 16; + let low = c % 16; + result.push(if high < 10 { + b'0' + high + } else { + b'A' + (high - 10) + }); + result.push(if low < 10 { + b'0' + low + } else { + b'A' + (low - 10) + }); + } + } + } + Some(build_string_literal( + &dataset, + str::from_utf8(&result).ok()?, + )) + }) + } + Function::StrEnds => { + let arg1 = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let arg2 = self.expression_evaluator( + ¶meters[1], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let (arg1, arg2, _) = to_argument_compatible_strings( + &dataset, + &arg1(tuple)?, + &arg2(tuple)?, + )?; + Some(arg1.ends_with(arg2.as_str()).into()) + }) + } + Function::Contains => { + let arg1 = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let arg2 = self.expression_evaluator( + ¶meters[1], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let (arg1, arg2, _) = to_argument_compatible_strings( + &dataset, + &arg1(tuple)?, + &arg2(tuple)?, + )?; + Some(arg1.contains(arg2.as_str()).into()) + }) + } + Function::StrBefore => { + let arg1 = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let arg2 = self.expression_evaluator( + ¶meters[1], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let (arg1, arg2, language) = to_argument_compatible_strings( + &dataset, + &arg1(tuple)?, + &arg2(tuple)?, + )?; + Some(if let Some(position) = arg1.find(arg2.as_str()) { + build_plain_literal(&dataset, &arg1[..position], language) + } else { + build_string_literal(&dataset, "") + }) + }) + } + Function::StrAfter => { + let arg1 = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let arg2 = self.expression_evaluator( + ¶meters[1], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let (arg1, arg2, language) = to_argument_compatible_strings( + &dataset, + &arg1(tuple)?, + &arg2(tuple)?, + )?; + Some(if let Some(position) = arg1.find(arg2.as_str()) { + build_plain_literal( + &dataset, + &arg1[position + arg2.len()..], + language, + ) + } else { + build_string_literal(&dataset, "") + }) + }) + } + Function::Year => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::DateTimeLiteral(date_time) => { + Some(date_time.year().into()) + } + EncodedTerm::DateLiteral(date) => Some(date.year().into()), + EncodedTerm::GYearMonthLiteral(year_month) => { + Some(year_month.year().into()) + } + EncodedTerm::GYearLiteral(year) => Some(year.year().into()), + _ => None, + }) + } + Function::Month => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| match e(tuple)? 
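`EncodeForUri` above keeps the RFC 3986 unreserved bytes and percent-encodes everything else, byte by byte over the UTF-8 form, with uppercase hex digits. The same transformation written with `format!` instead of manual nibble arithmetic:

```rust
fn encode_for_uri(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    for b in input.bytes() {
        match b {
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' => {
                out.push(b as char) // unreserved: copied through
            }
            _ => out.push_str(&format!("%{b:02X}")), // everything else: %-escaped
        }
    }
    out
}

fn main() {
    assert_eq!(encode_for_uri("Los Angeles"), "Los%20Angeles");
    assert_eq!(encode_for_uri("café"), "caf%C3%A9"); // multi-byte UTF-8, two escapes
}
```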
{ + EncodedTerm::DateTimeLiteral(date_time) => { + Some(date_time.month().into()) + } + EncodedTerm::DateLiteral(date) => Some(date.month().into()), + EncodedTerm::GYearMonthLiteral(year_month) => { + Some(year_month.month().into()) + } + EncodedTerm::GMonthDayLiteral(month_day) => { + Some(month_day.month().into()) + } + EncodedTerm::GMonthLiteral(month) => Some(month.month().into()), + _ => None, + }) + } + Function::Day => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::DateTimeLiteral(date_time) => Some(date_time.day().into()), + EncodedTerm::DateLiteral(date) => Some(date.day().into()), + EncodedTerm::GMonthDayLiteral(month_day) => { + Some(month_day.day().into()) + } + EncodedTerm::GDayLiteral(day) => Some(day.day().into()), + _ => None, + }) + } + Function::Hours => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::DateTimeLiteral(date_time) => { + Some(date_time.hour().into()) + } + EncodedTerm::TimeLiteral(time) => Some(time.hour().into()), + _ => None, + }) + } + Function::Minutes => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::DateTimeLiteral(date_time) => { + Some(date_time.minute().into()) + } + EncodedTerm::TimeLiteral(time) => Some(time.minute().into()), + _ => None, + }) + } + Function::Seconds => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::DateTimeLiteral(date_time) => { + Some(date_time.second().into()) + } + EncodedTerm::TimeLiteral(time) => Some(time.second().into()), + _ => None, + }) + } + Function::Timezone => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| { + Some( + match e(tuple)? { + EncodedTerm::DateTimeLiteral(date_time) => date_time.timezone(), + EncodedTerm::TimeLiteral(time) => time.timezone(), + EncodedTerm::DateLiteral(date) => date.timezone(), + EncodedTerm::GYearMonthLiteral(year_month) => { + year_month.timezone() + } + EncodedTerm::GYearLiteral(year) => year.timezone(), + EncodedTerm::GMonthDayLiteral(month_day) => { + month_day.timezone() + } + EncodedTerm::GDayLiteral(day) => day.timezone(), + EncodedTerm::GMonthLiteral(month) => month.timezone(), + _ => None, + }? + .into(), + ) + }) + } + Function::Tz => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let timezone_offset = match e(tuple)? 
{ + EncodedTerm::DateTimeLiteral(date_time) => { + date_time.timezone_offset() + } + EncodedTerm::TimeLiteral(time) => time.timezone_offset(), + EncodedTerm::DateLiteral(date) => date.timezone_offset(), + EncodedTerm::GYearMonthLiteral(year_month) => { + year_month.timezone_offset() + } + EncodedTerm::GYearLiteral(year) => year.timezone_offset(), + EncodedTerm::GMonthDayLiteral(month_day) => { + month_day.timezone_offset() + } + EncodedTerm::GDayLiteral(day) => day.timezone_offset(), + EncodedTerm::GMonthLiteral(month) => month.timezone_offset(), + _ => return None, + }; + Some(match timezone_offset { + Some(timezone_offset) => { + build_string_literal(&dataset, &timezone_offset.to_string()) + } + None => build_string_literal(&dataset, ""), + }) + }) + } + Function::Adjust => { + let dt = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let tz = self.expression_evaluator( + ¶meters[1], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| { + let timezone_offset = Some( + match tz(tuple)? { + EncodedTerm::DayTimeDurationLiteral(tz) => { + TimezoneOffset::try_from(tz) + } + EncodedTerm::DurationLiteral(tz) => { + TimezoneOffset::try_from(tz) + } + _ => return None, + } + .ok()?, + ); + Some(match dt(tuple)? { + EncodedTerm::DateTimeLiteral(date_time) => { + date_time.adjust(timezone_offset)?.into() + } + EncodedTerm::TimeLiteral(time) => { + time.adjust(timezone_offset)?.into() + } + EncodedTerm::DateLiteral(date) => { + date.adjust(timezone_offset)?.into() + } + EncodedTerm::GYearMonthLiteral(year_month) => { + year_month.adjust(timezone_offset)?.into() + } + EncodedTerm::GYearLiteral(year) => { + year.adjust(timezone_offset)?.into() + } + EncodedTerm::GMonthDayLiteral(month_day) => { + month_day.adjust(timezone_offset)?.into() + } + EncodedTerm::GDayLiteral(day) => { + day.adjust(timezone_offset)?.into() + } + EncodedTerm::GMonthLiteral(month) => { + month.adjust(timezone_offset)?.into() + } + _ => return None, + }) + }) + } + Function::Now => { + let now = self.now; + Rc::new(move |_| Some(now.into())) + } + Function::Uuid => { + let dataset = Rc::clone(&self.dataset); + Rc::new(move |_| { + let mut buffer = String::with_capacity(44); + buffer.push_str("urn:uuid:"); + generate_uuid(&mut buffer); + Some(build_named_node(&dataset, &buffer)) + }) + } + Function::StrUuid => { + let dataset = Rc::clone(&self.dataset); + Rc::new(move |_| { + let mut buffer = String::with_capacity(36); + generate_uuid(&mut buffer); + Some(build_string_literal(&dataset, &buffer)) + }) + } + Function::Md5 => self.hash::(parameters, encoded_variables, stat_children), + Function::Sha1 => { + self.hash::(parameters, encoded_variables, stat_children) + } + Function::Sha256 => { + self.hash::(parameters, encoded_variables, stat_children) + } + Function::Sha384 => { + self.hash::(parameters, encoded_variables, stat_children) + } + Function::Sha512 => { + self.hash::(parameters, encoded_variables, stat_children) + } + Function::StrLang => { + let lexical_form = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let lang_tag = self.expression_evaluator( + ¶meters[1], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + Some(build_lang_string_literal_from_id( + to_simple_string_id(&lexical_form(tuple)?)?, + build_language_id(&dataset, &lang_tag(tuple)?)?, + )) + }) + } + Function::StrDt => { + let lexical_form = self.expression_evaluator( + ¶meters[0], + encoded_variables, + 
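The hash functions above are one-liners because they share a generic helper parameterized by the digest type; the stripped type arguments read `self.hash::<Md5>(...)`, `self.hash::<Sha1>(...)`, and so on (the helper itself appears at the end of this file). A standalone equivalent using the `digest`, `hex`, and `md-5` crates this library already depends on:

```rust
use digest::Digest;

// One helper covers MD5, SHA1, SHA256, SHA384 and SHA512.
fn hash_hex<H: Digest>(input: &str) -> String {
    hex::encode(H::new().chain_update(input).finalize())
}

fn main() {
    // RFC 1321 test vector for MD5.
    assert_eq!(
        hash_hex::<md5::Md5>("abc"),
        "900150983cd24fb0d6963f7d28e17f72"
    );
}
```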
stat_children, + ); + let datatype = self.expression_evaluator( + ¶meters[1], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let value = to_simple_string(&dataset, &lexical_form(tuple)?)?; + let datatype = + if let EncodedTerm::NamedNode { iri_id } = datatype(tuple)? { + dataset.get_str(&iri_id).ok()? + } else { + None + }?; + Some(dataset.encode_term(LiteralRef::new_typed_literal( + &value, + NamedNodeRef::new_unchecked(&datatype), + ))) + }) + } + Function::IsIri => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| Some(e(tuple)?.is_named_node().into())) + } + Function::IsBlank => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| Some(e(tuple)?.is_blank_node().into())) + } + Function::IsLiteral => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| Some(e(tuple)?.is_literal().into())) + } + Function::IsNumeric => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| { + Some( + matches!( + e(tuple)?, + EncodedTerm::FloatLiteral(_) + | EncodedTerm::DoubleLiteral(_) + | EncodedTerm::IntegerLiteral(_) + | EncodedTerm::DecimalLiteral(_) + ) + .into(), + ) + }) + } + Function::Regex => { + let text = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + if let Some(regex) = + compile_static_pattern_if_exists(¶meters[1], parameters.get(2)) + { + Rc::new(move |tuple| { + let text = to_string(&dataset, &text(tuple)?)?; + Some(regex.is_match(&text).into()) + }) + } else { + let pattern = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let flags = parameters.get(2).map(|flags| { + self.expression_evaluator(flags, encoded_variables, stat_children) + }); + Rc::new(move |tuple| { + let pattern = to_simple_string(&dataset, &pattern(tuple)?)?; + let options = if let Some(flags) = &flags { + Some(to_simple_string(&dataset, &flags(tuple)?)?) + } else { + None + }; + let regex = compile_pattern(&pattern, options.as_deref())?; + let text = to_string(&dataset, &text(tuple)?)?; + Some(regex.is_match(&text).into()) + }) + } + } + Function::Triple => { + let s = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let p = self.expression_evaluator( + ¶meters[1], + encoded_variables, + stat_children, + ); + let o = self.expression_evaluator( + ¶meters[2], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| { + let s = s(tuple)?; + let p = p(tuple)?; + let o = o(tuple)?; + (!s.is_literal() + && !s.is_default_graph() + && p.is_named_node() + && !o.is_default_graph()) + .then(|| EncodedTriple::new(s, p, o).into()) + }) + } + Function::Subject => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| { + if let EncodedTerm::Triple(t) = e(tuple)? { + Some(t.subject.clone()) + } else { + None + } + }) + } + Function::Predicate => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| { + if let EncodedTerm::Triple(t) = e(tuple)? 
{ + Some(t.predicate.clone()) + } else { + None + } + }) + } + Function::Object => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| { + if let EncodedTerm::Triple(t) = e(tuple)? { + Some(t.object.clone()) + } else { + None + } + }) + } + Function::IsTriple => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| Some(e(tuple)?.is_triple().into())) + } + Function::Custom(function_name) => { + if let Some(function) = self.custom_functions.get(function_name).cloned() { + let args = parameters + .iter() + .map(|e| { + self.expression_evaluator(e, encoded_variables, stat_children) + }) + .collect::>(); + let dataset = Rc::clone(&self.dataset); + return Rc::new(move |tuple| { + let args = args + .iter() + .map(|f| dataset.decode_term(&f(tuple)?).ok()) + .collect::>>()?; + Some(dataset.encode_term(&function(&args)?)) + }); + } + match function_name.as_ref() { + xsd::STRING => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + Some(build_string_literal_from_id(to_string_id( + &dataset, + &e(tuple)?, + )?)) + }) + } + xsd::BOOLEAN => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::BooleanLiteral(value) => Some(value.into()), + EncodedTerm::FloatLiteral(value) => { + Some(Boolean::from(value).into()) + } + EncodedTerm::DoubleLiteral(value) => { + Some(Boolean::from(value).into()) + } + EncodedTerm::IntegerLiteral(value) => { + Some(Boolean::from(value).into()) + } + EncodedTerm::DecimalLiteral(value) => { + Some(Boolean::from(value).into()) + } + EncodedTerm::SmallStringLiteral(value) => { + parse_boolean_str(&value) + } + _ => None, + }) + } + xsd::DOUBLE => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::FloatLiteral(value) => { + Some(Double::from(value).into()) + } + EncodedTerm::DoubleLiteral(value) => Some(value.into()), + EncodedTerm::IntegerLiteral(value) => { + Some(Double::from(value).into()) + } + EncodedTerm::DecimalLiteral(value) => { + Some(Double::from(value).into()) + } + EncodedTerm::BooleanLiteral(value) => { + Some(Double::from(value).into()) + } + EncodedTerm::SmallStringLiteral(value) => { + parse_double_str(&value) + } + EncodedTerm::BigStringLiteral { value_id } => { + parse_double_str(&dataset.get_str(&value_id).ok()??) + } + _ => None, + }) + } + xsd::FLOAT => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::FloatLiteral(value) => Some(value.into()), + EncodedTerm::DoubleLiteral(value) => { + Some(Float::from(value).into()) + } + EncodedTerm::IntegerLiteral(value) => { + Some(Float::from(value).into()) + } + EncodedTerm::DecimalLiteral(value) => { + Some(Float::from(value).into()) + } + EncodedTerm::BooleanLiteral(value) => { + Some(Float::from(value).into()) + } + EncodedTerm::SmallStringLiteral(value) => { + parse_float_str(&value) + } + EncodedTerm::BigStringLiteral { value_id } => { + parse_float_str(&dataset.get_str(&value_id).ok()??) 
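`Function::Custom` above consults a user-supplied registry keyed by function IRI before falling back to the built-in XSD casts. The registry idea in a few lines (a toy over `f64` instead of decoded RDF terms):

```rust
use std::collections::HashMap;
use std::rc::Rc;

// Toy registry: function IRI -> n-ary function over plain numbers, with
// `None` signalling an evaluation error (here: an empty argument list).
type CustomFn = Rc<dyn Fn(&[f64]) -> Option<f64>>;

fn main() {
    let mut registry: HashMap<&str, CustomFn> = HashMap::new();
    registry.insert(
        "http://example.com/mean",
        Rc::new(|args| (!args.is_empty()).then(|| args.iter().sum::<f64>() / args.len() as f64)),
    );
    let mean = registry.get("http://example.com/mean").cloned().unwrap();
    assert_eq!(mean(&[1.0, 2.0, 3.0]), Some(2.0));
}
```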
+ } + _ => None, + }) + } + xsd::INTEGER => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::FloatLiteral(value) => { + Some(Integer::try_from(value).ok()?.into()) + } + EncodedTerm::DoubleLiteral(value) => { + Some(Integer::try_from(value).ok()?.into()) + } + EncodedTerm::IntegerLiteral(value) => Some(value.into()), + EncodedTerm::DecimalLiteral(value) => { + Some(Integer::try_from(value).ok()?.into()) + } + EncodedTerm::BooleanLiteral(value) => { + Some(Integer::from(value).into()) + } + EncodedTerm::SmallStringLiteral(value) => { + parse_integer_str(&value) + } + EncodedTerm::BigStringLiteral { value_id } => { + parse_integer_str(&dataset.get_str(&value_id).ok()??) + } + _ => None, + }) + } + xsd::DECIMAL => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::FloatLiteral(value) => { + Some(Decimal::try_from(value).ok()?.into()) + } + EncodedTerm::DoubleLiteral(value) => { + Some(Decimal::try_from(value).ok()?.into()) + } + EncodedTerm::IntegerLiteral(value) => { + Some(Decimal::from(value).into()) + } + EncodedTerm::DecimalLiteral(value) => Some(value.into()), + EncodedTerm::BooleanLiteral(value) => { + Some(Decimal::from(value).into()) + } + EncodedTerm::SmallStringLiteral(value) => { + parse_decimal_str(&value) + } + EncodedTerm::BigStringLiteral { value_id } => { + parse_decimal_str(&dataset.get_str(&value_id).ok()??) + } + _ => None, + }) + } + xsd::DATE => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::DateLiteral(value) => Some(value.into()), + EncodedTerm::DateTimeLiteral(value) => { + Some(Date::try_from(value).ok()?.into()) + } + EncodedTerm::SmallStringLiteral(value) => { + parse_date_str(&value) + } + EncodedTerm::BigStringLiteral { value_id } => { + parse_date_str(&dataset.get_str(&value_id).ok()??) + } + _ => None, + }) + } + xsd::TIME => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::TimeLiteral(value) => Some(value.into()), + EncodedTerm::DateTimeLiteral(value) => { + Some(Time::from(value).into()) + } + EncodedTerm::SmallStringLiteral(value) => { + parse_time_str(&value) + } + EncodedTerm::BigStringLiteral { value_id } => { + parse_time_str(&dataset.get_str(&value_id).ok()??) + } + _ => None, + }) + } + xsd::DATE_TIME => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::DateTimeLiteral(value) => Some(value.into()), + EncodedTerm::DateLiteral(value) => { + Some(DateTime::try_from(value).ok()?.into()) + } + EncodedTerm::SmallStringLiteral(value) => { + parse_date_time_str(&value) + } + EncodedTerm::BigStringLiteral { value_id } => { + parse_date_time_str(&dataset.get_str(&value_id).ok()??) + } + _ => None, + }) + } + xsd::DURATION => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| match e(tuple)? 
{ + EncodedTerm::DurationLiteral(value) => Some(value.into()), + EncodedTerm::YearMonthDurationLiteral(value) => { + Some(Duration::from(value).into()) + } + EncodedTerm::DayTimeDurationLiteral(value) => { + Some(Duration::from(value).into()) + } + EncodedTerm::SmallStringLiteral(value) => { + parse_duration_str(&value) + } + EncodedTerm::BigStringLiteral { value_id } => { + parse_duration_str(&dataset.get_str(&value_id).ok()??) + } + _ => None, + }) + } + xsd::YEAR_MONTH_DURATION => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::DurationLiteral(value) => { + Some(YearMonthDuration::try_from(value).ok()?.into()) + } + EncodedTerm::YearMonthDurationLiteral(value) => { + Some(value.into()) + } + EncodedTerm::SmallStringLiteral(value) => { + parse_year_month_duration_str(&value) + } + EncodedTerm::BigStringLiteral { value_id } => { + parse_year_month_duration_str( + &dataset.get_str(&value_id).ok()??, + ) + } + _ => None, + }) + } + xsd::DAY_TIME_DURATION => { + let e = self.expression_evaluator( + ¶meters[0], + encoded_variables, + stat_children, + ); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| match e(tuple)? { + EncodedTerm::DurationLiteral(value) => { + Some(DayTimeDuration::try_from(value).ok()?.into()) + } + EncodedTerm::DayTimeDurationLiteral(value) => { + Some(value.into()) + } + EncodedTerm::SmallStringLiteral(value) => { + parse_day_time_duration_str(&value) + } + EncodedTerm::BigStringLiteral { value_id } => { + parse_day_time_duration_str( + &dataset.get_str(&value_id).ok()??, + ) + } + _ => None, + }) + } + _ => Rc::new(|_| None), + } + } + } + } + } + } + + fn hash( + &self, + parameters: &[Expression], + encoded_variables: &mut Vec, + stat_children: &mut Vec>, + ) -> Rc Option> { + let arg = self.expression_evaluator(¶meters[0], encoded_variables, stat_children); + let dataset = Rc::clone(&self.dataset); + Rc::new(move |tuple| { + let input = to_simple_string(&dataset, &arg(tuple)?)?; + let hash = hex::encode(H::new().chain_update(input.as_str()).finalize()); + Some(build_string_literal(&dataset, &hash)) + }) + } + + fn encode_term<'b>(&self, term: impl Into>) -> EncodedTerm { + self.dataset.encode_term(term) + } + + fn encode_triple(&self, triple: &GroundTriple) -> EncodedTerm { + EncodedTriple::new( + match &triple.subject { + GroundSubject::NamedNode(node) => self.encode_term(node), + GroundSubject::Triple(triple) => self.encode_triple(triple), + }, + self.encode_term(&triple.predicate), + match &triple.object { + GroundTerm::NamedNode(node) => self.encode_term(node), + GroundTerm::Literal(literal) => self.encode_term(literal), + GroundTerm::Triple(triple) => self.encode_triple(triple), + }, + ) + .into() + } + + fn encode_property_path(&self, path: &PropertyPathExpression) -> Rc { + Rc::new(match path { + PropertyPathExpression::NamedNode(node) => PropertyPath::Path(self.encode_term(node)), + PropertyPathExpression::Reverse(p) => { + PropertyPath::Reverse(self.encode_property_path(p)) + } + PropertyPathExpression::Sequence(a, b) => { + PropertyPath::Sequence(self.encode_property_path(a), self.encode_property_path(b)) + } + PropertyPathExpression::Alternative(a, b) => PropertyPath::Alternative( + self.encode_property_path(a), + self.encode_property_path(b), + ), + PropertyPathExpression::ZeroOrMore(p) => { + PropertyPath::ZeroOrMore(self.encode_property_path(p)) + } + PropertyPathExpression::OneOrMore(p) => { 
+ PropertyPath::OneOrMore(self.encode_property_path(p)) + } + PropertyPathExpression::ZeroOrOne(p) => { + PropertyPath::ZeroOrOne(self.encode_property_path(p)) + } + PropertyPathExpression::NegatedPropertySet(ps) => { + PropertyPath::NegatedPropertySet(ps.iter().map(|p| self.encode_term(p)).collect()) + } + }) + } + + fn template_value_from_term_or_variable( + &self, + term_or_variable: &TermPattern, + variables: &mut Vec, + bnodes: &mut Vec, + ) -> TripleTemplateValue { + match term_or_variable { + TermPattern::Variable(variable) => { + TripleTemplateValue::Variable(encode_variable(variables, variable)) + } + TermPattern::NamedNode(node) => TripleTemplateValue::Constant(self.encode_term(node)), + TermPattern::BlankNode(bnode) => { + TripleTemplateValue::BlankNode(bnode_key(bnodes, bnode)) + } + TermPattern::Literal(literal) => { + TripleTemplateValue::Constant(self.encode_term(literal)) + } + TermPattern::Triple(triple) => match ( + self.template_value_from_term_or_variable(&triple.subject, variables, bnodes), + self.template_value_from_named_node_or_variable(&triple.predicate, variables), + self.template_value_from_term_or_variable(&triple.object, variables, bnodes), + ) { + ( + TripleTemplateValue::Constant(subject), + TripleTemplateValue::Constant(predicate), + TripleTemplateValue::Constant(object), + ) => TripleTemplateValue::Constant( + EncodedTriple { + subject, + predicate, + object, + } + .into(), + ), + (subject, predicate, object) => { + TripleTemplateValue::Triple(Box::new(TripleTemplate { + subject, + predicate, + object, + })) + } + }, + } + } + + fn template_value_from_named_node_or_variable( + &self, + named_node_or_variable: &NamedNodePattern, + variables: &mut Vec, + ) -> TripleTemplateValue { + match named_node_or_variable { + NamedNodePattern::Variable(variable) => { + TripleTemplateValue::Variable(encode_variable(variables, variable)) + } + NamedNodePattern::NamedNode(term) => { + TripleTemplateValue::Constant(self.encode_term(term)) + } + } + } +} + +fn to_bool(term: &EncodedTerm) -> Option { + match term { + EncodedTerm::BooleanLiteral(value) => Some((*value).into()), + EncodedTerm::SmallStringLiteral(value) => Some(!value.is_empty()), + EncodedTerm::BigStringLiteral { .. } => { + Some(false) // A big literal can't be empty + } + EncodedTerm::FloatLiteral(value) => Some(Boolean::from(*value).into()), + EncodedTerm::DoubleLiteral(value) => Some(Boolean::from(*value).into()), + EncodedTerm::IntegerLiteral(value) => Some(Boolean::from(*value).into()), + EncodedTerm::DecimalLiteral(value) => Some(Boolean::from(*value).into()), + _ => None, + } +} + +fn to_string_id(dataset: &DatasetView, term: &EncodedTerm) -> Option { + match term { + EncodedTerm::NamedNode { iri_id } => Some( + if let Ok(value) = SmallString::try_from(dataset.get_str(iri_id).ok()??.as_str()) { + value.into() + } else { + SmallStringOrId::Big(*iri_id) + }, + ), + EncodedTerm::DefaultGraph + | EncodedTerm::NumericalBlankNode { .. } + | EncodedTerm::SmallBlankNode { .. } + | EncodedTerm::BigBlankNode { .. } + | EncodedTerm::Triple(_) => None, + EncodedTerm::SmallStringLiteral(value) + | EncodedTerm::SmallSmallLangStringLiteral { value, .. } + | EncodedTerm::SmallBigLangStringLiteral { value, .. } + | EncodedTerm::SmallTypedLiteral { value, .. } => Some((*value).into()), + EncodedTerm::BigStringLiteral { value_id } + | EncodedTerm::BigSmallLangStringLiteral { value_id, .. } + | EncodedTerm::BigBigLangStringLiteral { value_id, .. } + | EncodedTerm::BigTypedLiteral { value_id, .. 
} => Some((*value_id).into()), + EncodedTerm::BooleanLiteral(value) => Some(build_string_id( + dataset, + if bool::from(*value) { "true" } else { "false" }, + )), + EncodedTerm::FloatLiteral(value) => Some(build_string_id(dataset, &value.to_string())), + EncodedTerm::DoubleLiteral(value) => Some(build_string_id(dataset, &value.to_string())), + EncodedTerm::IntegerLiteral(value) => Some(build_string_id(dataset, &value.to_string())), + EncodedTerm::DecimalLiteral(value) => Some(build_string_id(dataset, &value.to_string())), + EncodedTerm::DateTimeLiteral(value) => Some(build_string_id(dataset, &value.to_string())), + EncodedTerm::TimeLiteral(value) => Some(build_string_id(dataset, &value.to_string())), + EncodedTerm::DateLiteral(value) => Some(build_string_id(dataset, &value.to_string())), + EncodedTerm::GYearMonthLiteral(value) => Some(build_string_id(dataset, &value.to_string())), + EncodedTerm::GYearLiteral(value) => Some(build_string_id(dataset, &value.to_string())), + EncodedTerm::GMonthDayLiteral(value) => Some(build_string_id(dataset, &value.to_string())), + EncodedTerm::GDayLiteral(value) => Some(build_string_id(dataset, &value.to_string())), + EncodedTerm::GMonthLiteral(value) => Some(build_string_id(dataset, &value.to_string())), + EncodedTerm::DurationLiteral(value) => Some(build_string_id(dataset, &value.to_string())), + EncodedTerm::YearMonthDurationLiteral(value) => { + Some(build_string_id(dataset, &value.to_string())) + } + EncodedTerm::DayTimeDurationLiteral(value) => { + Some(build_string_id(dataset, &value.to_string())) + } + } +} + +fn to_simple_string(dataset: &DatasetView, term: &EncodedTerm) -> Option { + match term { + EncodedTerm::SmallStringLiteral(value) => Some((*value).into()), + EncodedTerm::BigStringLiteral { value_id } => dataset.get_str(value_id).ok()?, + _ => None, + } +} + +fn to_simple_string_id(term: &EncodedTerm) -> Option { + match term { + EncodedTerm::SmallStringLiteral(value) => Some((*value).into()), + EncodedTerm::BigStringLiteral { value_id } => Some((*value_id).into()), + _ => None, + } +} + +fn to_string(dataset: &DatasetView, term: &EncodedTerm) -> Option { + match term { + EncodedTerm::SmallStringLiteral(value) + | EncodedTerm::SmallSmallLangStringLiteral { value, .. } + | EncodedTerm::SmallBigLangStringLiteral { value, .. } => Some((*value).into()), + EncodedTerm::BigStringLiteral { value_id } + | EncodedTerm::BigSmallLangStringLiteral { value_id, .. } + | EncodedTerm::BigBigLangStringLiteral { value_id, .. } => { + dataset.get_str(value_id).ok()? 
+ } + _ => None, + } +} + +fn to_string_and_language( + dataset: &DatasetView, + term: &EncodedTerm, +) -> Option<(String, Option)> { + match term { + EncodedTerm::SmallStringLiteral(value) => Some(((*value).into(), None)), + EncodedTerm::BigStringLiteral { value_id } => { + Some((dataset.get_str(value_id).ok()??, None)) + } + EncodedTerm::SmallSmallLangStringLiteral { value, language } => { + Some(((*value).into(), Some((*language).into()))) + } + EncodedTerm::SmallBigLangStringLiteral { value, language_id } => { + Some(((*value).into(), Some((*language_id).into()))) + } + EncodedTerm::BigSmallLangStringLiteral { value_id, language } => { + Some((dataset.get_str(value_id).ok()??, Some((*language).into()))) + } + EncodedTerm::BigBigLangStringLiteral { + value_id, + language_id, + } => Some(( + dataset.get_str(value_id).ok()??, + Some((*language_id).into()), + )), + _ => None, + } +} + +fn build_named_node(dataset: &DatasetView, iri: &str) -> EncodedTerm { + dataset.encode_term(NamedNodeRef::new_unchecked(iri)) +} + +fn encode_named_node(dataset: &DatasetView, node: NamedNodeRef<'_>) -> EncodedTerm { + dataset.encode_term(node) +} + +fn build_string_literal(dataset: &DatasetView, value: &str) -> EncodedTerm { + build_string_literal_from_id(build_string_id(dataset, value)) +} + +fn build_string_literal_from_id(id: SmallStringOrId) -> EncodedTerm { + match id { + SmallStringOrId::Small(value) => EncodedTerm::SmallStringLiteral(value), + SmallStringOrId::Big(value_id) => EncodedTerm::BigStringLiteral { value_id }, + } +} + +fn build_lang_string_literal( + dataset: &DatasetView, + value: &str, + language_id: SmallStringOrId, +) -> EncodedTerm { + build_lang_string_literal_from_id(build_string_id(dataset, value), language_id) +} + +fn build_lang_string_literal_from_id( + value_id: SmallStringOrId, + language_id: SmallStringOrId, +) -> EncodedTerm { + match (value_id, language_id) { + (SmallStringOrId::Small(value), SmallStringOrId::Small(language)) => { + EncodedTerm::SmallSmallLangStringLiteral { value, language } + } + (SmallStringOrId::Small(value), SmallStringOrId::Big(language_id)) => { + EncodedTerm::SmallBigLangStringLiteral { value, language_id } + } + (SmallStringOrId::Big(value_id), SmallStringOrId::Small(language)) => { + EncodedTerm::BigSmallLangStringLiteral { value_id, language } + } + (SmallStringOrId::Big(value_id), SmallStringOrId::Big(language_id)) => { + EncodedTerm::BigBigLangStringLiteral { + value_id, + language_id, + } + } + } +} + +fn build_plain_literal( + dataset: &DatasetView, + value: &str, + language: Option, +) -> EncodedTerm { + if let Some(language_id) = language { + build_lang_string_literal(dataset, value, language_id) + } else { + build_string_literal(dataset, value) + } +} + +fn build_string_id(dataset: &DatasetView, value: &str) -> SmallStringOrId { + if let Ok(value) = SmallString::try_from(value) { + value.into() + } else { + let id = StrHash::new(value); + dataset.insert_str(&id, value); + SmallStringOrId::Big(id) + } +} + +fn build_language_id(dataset: &DatasetView, value: &EncodedTerm) -> Option { + let mut language = to_simple_string(dataset, value)?; + language.make_ascii_lowercase(); + Some(build_string_id( + dataset, + LanguageTag::parse(language).ok()?.as_str(), + )) +} + +fn to_argument_compatible_strings( + dataset: &DatasetView, + arg1: &EncodedTerm, + arg2: &EncodedTerm, +) -> Option<(String, String, Option)> { + let (value1, language1) = to_string_and_language(dataset, arg1)?; + let (value2, language2) = to_string_and_language(dataset, arg2)?; 
+ (language2.is_none() || language1 == language2).then_some((value1, value2, language1)) +} + +fn compile_static_pattern_if_exists( + pattern: &Expression, + options: Option<&Expression>, +) -> Option { + let static_pattern = if let Expression::Literal(pattern) = pattern { + (pattern.datatype() == xsd::STRING).then(|| pattern.value()) + } else { + None + }; + let static_options = if let Some(options) = options { + if let Expression::Literal(options) = options { + (options.datatype() == xsd::STRING).then(|| Some(options.value())) + } else { + None + } + } else { + Some(None) + }; + if let (Some(static_pattern), Some(static_options)) = (static_pattern, static_options) { + compile_pattern(static_pattern, static_options) + } else { + None + } +} + +pub(super) fn compile_pattern(pattern: &str, flags: Option<&str>) -> Option { + let mut regex_builder = RegexBuilder::new(pattern); + regex_builder.size_limit(REGEX_SIZE_LIMIT); + if let Some(flags) = flags { + for flag in flags.chars() { + match flag { + 's' => { + regex_builder.dot_matches_new_line(true); + } + 'm' => { + regex_builder.multi_line(true); + } + 'i' => { + regex_builder.case_insensitive(true); + } + 'x' => { + regex_builder.ignore_whitespace(true); + } + _ => (), // TODO: implement q + } + } + } + regex_builder.build().ok() +} + +fn decode_bindings( + dataset: Rc, + iter: EncodedTuplesIterator, + variables: Arc<[Variable]>, +) -> QuerySolutionIter { + let tuple_size = variables.len(); + QuerySolutionIter::new( + variables, + Box::new(iter.map(move |values| { + let mut result = vec![None; tuple_size]; + for (i, value) in values?.iter().enumerate() { + if let Some(term) = value { + result[i] = Some(dataset.decode_term(&term)?) + } + } + Ok(result) + })), + ) +} + +// this is used to encode results from a BindingIterator into an EncodedTuplesIterator. This happens when SERVICE clauses are evaluated +fn encode_bindings( + dataset: Rc, + variables: Rc<[Variable]>, + iter: QuerySolutionIter, +) -> EncodedTuplesIterator { + Box::new(iter.map(move |solution| { + let mut encoded_terms = EncodedTuple::with_capacity(variables.len()); + for (variable, term) in solution?.iter() { + put_variable_value( + variable, + &variables, + dataset.encode_term(term), + &mut encoded_terms, + ) + } + Ok(encoded_terms) + })) +} + +fn equals(a: &EncodedTerm, b: &EncodedTerm) -> Option { + match a { + EncodedTerm::DefaultGraph + | EncodedTerm::NamedNode { .. } + | EncodedTerm::NumericalBlankNode { .. } + | EncodedTerm::SmallBlankNode { .. } + | EncodedTerm::BigBlankNode { .. } + | EncodedTerm::SmallSmallLangStringLiteral { .. } + | EncodedTerm::SmallBigLangStringLiteral { .. } + | EncodedTerm::BigSmallLangStringLiteral { .. } + | EncodedTerm::BigBigLangStringLiteral { .. } => Some(a == b), + EncodedTerm::SmallStringLiteral(a) => match b { + EncodedTerm::SmallStringLiteral(b) => Some(a == b), + EncodedTerm::SmallTypedLiteral { .. } | EncodedTerm::BigTypedLiteral { .. } => None, + _ => Some(false), + }, + EncodedTerm::BigStringLiteral { value_id: a } => match b { + EncodedTerm::BigStringLiteral { value_id: b } => Some(a == b), + EncodedTerm::SmallTypedLiteral { .. } | EncodedTerm::BigTypedLiteral { .. } => None, + _ => Some(false), + }, + EncodedTerm::SmallTypedLiteral { .. } => match b { + EncodedTerm::SmallTypedLiteral { .. } if a == b => Some(true), + EncodedTerm::NamedNode { .. } + | EncodedTerm::NumericalBlankNode { .. } + | EncodedTerm::SmallBlankNode { .. } + | EncodedTerm::BigBlankNode { .. } + | EncodedTerm::SmallSmallLangStringLiteral { .. 
} + | EncodedTerm::SmallBigLangStringLiteral { .. } + | EncodedTerm::BigSmallLangStringLiteral { .. } + | EncodedTerm::BigBigLangStringLiteral { .. } + | EncodedTerm::BigTypedLiteral { .. } => Some(false), + _ => None, + }, + EncodedTerm::BigTypedLiteral { .. } => match b { + EncodedTerm::BigTypedLiteral { .. } if a == b => Some(true), + EncodedTerm::NamedNode { .. } + | EncodedTerm::NumericalBlankNode { .. } + | EncodedTerm::SmallBlankNode { .. } + | EncodedTerm::BigBlankNode { .. } + | EncodedTerm::SmallSmallLangStringLiteral { .. } + | EncodedTerm::SmallBigLangStringLiteral { .. } + | EncodedTerm::BigSmallLangStringLiteral { .. } + | EncodedTerm::BigBigLangStringLiteral { .. } + | EncodedTerm::SmallTypedLiteral { .. } => Some(false), + _ => None, + }, + EncodedTerm::BooleanLiteral(a) => match b { + EncodedTerm::BooleanLiteral(b) => Some(a == b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::FloatLiteral(a) => match b { + EncodedTerm::FloatLiteral(b) => Some(a == b), + EncodedTerm::DoubleLiteral(b) => Some(Double::from(*a) == *b), + EncodedTerm::IntegerLiteral(b) => Some(*a == (*b).into()), + EncodedTerm::DecimalLiteral(b) => Some(*a == (*b).into()), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::DoubleLiteral(a) => match b { + EncodedTerm::FloatLiteral(b) => Some(*a == (*b).into()), + EncodedTerm::DoubleLiteral(b) => Some(a == b), + EncodedTerm::IntegerLiteral(b) => Some(*a == (*b).into()), + EncodedTerm::DecimalLiteral(b) => Some(*a == (*b).into()), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::IntegerLiteral(a) => match b { + EncodedTerm::FloatLiteral(b) => Some(Float::from(*a) == *b), + EncodedTerm::DoubleLiteral(b) => Some(Double::from(*a) == *b), + EncodedTerm::IntegerLiteral(b) => Some(a == b), + EncodedTerm::DecimalLiteral(b) => Some(Decimal::from(*a) == *b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::DecimalLiteral(a) => match b { + EncodedTerm::FloatLiteral(b) => Some(Float::from(*a) == *b), + EncodedTerm::DoubleLiteral(b) => Some(Double::from(*a) == *b), + EncodedTerm::IntegerLiteral(b) => Some(*a == (*b).into()), + EncodedTerm::DecimalLiteral(b) => Some(a == b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::DateTimeLiteral(a) => match b { + EncodedTerm::DateTimeLiteral(b) => Some(a == b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::TimeLiteral(a) => match b { + EncodedTerm::TimeLiteral(b) => Some(a == b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::DateLiteral(a) => match b { + EncodedTerm::DateLiteral(b) => Some(a == b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::GYearMonthLiteral(a) => match b { + EncodedTerm::GYearMonthLiteral(b) => Some(a == b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::GYearLiteral(a) => match b { + EncodedTerm::GYearLiteral(b) => Some(a == b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::GMonthDayLiteral(a) => match b { + EncodedTerm::GMonthDayLiteral(b) => Some(a == b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::GDayLiteral(a) => match b { + EncodedTerm::GDayLiteral(b) => Some(a == b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::GMonthLiteral(a) => match b { + 
EncodedTerm::GMonthLiteral(b) => Some(a == b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::DurationLiteral(a) => match b { + EncodedTerm::DurationLiteral(b) => Some(a == b), + EncodedTerm::YearMonthDurationLiteral(b) => Some(a == b), + EncodedTerm::DayTimeDurationLiteral(b) => Some(a == b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::YearMonthDurationLiteral(a) => match b { + EncodedTerm::DurationLiteral(b) => Some(a == b), + EncodedTerm::YearMonthDurationLiteral(b) => Some(a == b), + EncodedTerm::DayTimeDurationLiteral(b) => Some(a == b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::DayTimeDurationLiteral(a) => match b { + EncodedTerm::DurationLiteral(b) => Some(a == b), + EncodedTerm::YearMonthDurationLiteral(b) => Some(a == b), + EncodedTerm::DayTimeDurationLiteral(b) => Some(a == b), + _ if b.is_unknown_typed_literal() => None, + _ => Some(false), + }, + EncodedTerm::Triple(a) => { + if let EncodedTerm::Triple(b) = b { + Some( + equals(&a.subject, &b.subject)? + && equals(&a.predicate, &b.predicate)? + && equals(&a.object, &b.object)?, + ) + } else { + Some(false) + } + } + } +} + +fn cmp_terms(dataset: &DatasetView, a: Option<&EncodedTerm>, b: Option<&EncodedTerm>) -> Ordering { + match (a, b) { + (Some(a), Some(b)) => match a { + EncodedTerm::SmallBlankNode(a) => match b { + EncodedTerm::SmallBlankNode(b) => a.cmp(b), + EncodedTerm::BigBlankNode { id_id: b } => { + compare_str_str_id(dataset, a, b).unwrap_or(Ordering::Equal) + } + EncodedTerm::NumericalBlankNode { id: b } => { + a.as_str().cmp(BlankNode::new_from_unique_id(*b).as_str()) + } + _ => Ordering::Less, + }, + EncodedTerm::BigBlankNode { id_id: a } => match b { + EncodedTerm::SmallBlankNode(b) => { + compare_str_id_str(dataset, a, b).unwrap_or(Ordering::Equal) + } + EncodedTerm::BigBlankNode { id_id: b } => { + compare_str_ids(dataset, a, b).unwrap_or(Ordering::Equal) + } + EncodedTerm::NumericalBlankNode { id: b } => { + compare_str_id_str(dataset, a, BlankNode::new_from_unique_id(*b).as_str()) + .unwrap_or(Ordering::Equal) + } + _ => Ordering::Less, + }, + EncodedTerm::NumericalBlankNode { id: a } => { + let a = BlankNode::new_from_unique_id(*a); + match b { + EncodedTerm::SmallBlankNode(b) => a.as_str().cmp(b), + EncodedTerm::BigBlankNode { id_id: b } => { + compare_str_str_id(dataset, a.as_str(), b).unwrap_or(Ordering::Equal) + } + EncodedTerm::NumericalBlankNode { id: b } => { + a.as_str().cmp(BlankNode::new_from_unique_id(*b).as_str()) + } + _ => Ordering::Less, + } + } + EncodedTerm::NamedNode { iri_id: a } => match b { + EncodedTerm::NamedNode { iri_id: b } => { + compare_str_ids(dataset, a, b).unwrap_or(Ordering::Equal) + } + _ if b.is_blank_node() => Ordering::Greater, + _ => Ordering::Less, + }, + EncodedTerm::Triple(a) => match b { + EncodedTerm::Triple(b) => { + match cmp_terms(dataset, Some(&a.subject), Some(&b.subject)) { + Ordering::Equal => { + match cmp_terms(dataset, Some(&a.predicate), Some(&b.predicate)) { + Ordering::Equal => { + cmp_terms(dataset, Some(&a.object), Some(&b.object)) + } + o => o, + } + } + o => o, + } + } + _ => Ordering::Greater, + }, + _ => match b { + _ if b.is_named_node() || b.is_blank_node() => Ordering::Greater, + _ if b.is_triple() => Ordering::Less, + _ => { + if let Some(ord) = partial_cmp_literals(dataset, a, b) { + ord + } else if let (Ok(Term::Literal(a)), Ok(Term::Literal(b))) = + (dataset.decode_term(a), dataset.decode_term(b)) + { + (a.value(), 
a.datatype(), a.language()).cmp(&( + b.value(), + b.datatype(), + b.language(), + )) + } else { + Ordering::Equal // Should never happen + } + } + }, + }, + (Some(_), None) => Ordering::Greater, + (None, Some(_)) => Ordering::Less, + (None, None) => Ordering::Equal, + } +} + +fn partial_cmp(dataset: &DatasetView, a: &EncodedTerm, b: &EncodedTerm) -> Option { + if a == b { + Some(Ordering::Equal) + } else if let EncodedTerm::Triple(a) = a { + if let EncodedTerm::Triple(b) = b { + match partial_cmp(dataset, &a.subject, &b.subject) { + Some(Ordering::Equal) => match partial_cmp(dataset, &a.predicate, &b.predicate) { + Some(Ordering::Equal) => partial_cmp(dataset, &a.object, &b.object), + o => o, + }, + o => o, + } + } else { + None + } + } else { + partial_cmp_literals(dataset, a, b) + } +} + +fn partial_cmp_literals( + dataset: &DatasetView, + a: &EncodedTerm, + b: &EncodedTerm, +) -> Option { + match a { + EncodedTerm::SmallStringLiteral(a) => match b { + EncodedTerm::SmallStringLiteral(b) => a.partial_cmp(b), + EncodedTerm::BigStringLiteral { value_id: b } => compare_str_str_id(dataset, a, b), + _ => None, + }, + EncodedTerm::BigStringLiteral { value_id: a } => match b { + EncodedTerm::SmallStringLiteral(b) => compare_str_id_str(dataset, a, b), + EncodedTerm::BigStringLiteral { value_id: b } => compare_str_ids(dataset, a, b), + _ => None, + }, + EncodedTerm::SmallSmallLangStringLiteral { + value: a, + language: la, + } => match b { + EncodedTerm::SmallSmallLangStringLiteral { + value: b, + language: lb, + } if la == lb => a.partial_cmp(b), + EncodedTerm::BigSmallLangStringLiteral { + value_id: b, + language: lb, + } if la == lb => compare_str_str_id(dataset, a, b), + _ => None, + }, + EncodedTerm::SmallBigLangStringLiteral { + value: a, + language_id: la, + } => match b { + EncodedTerm::SmallBigLangStringLiteral { + value: b, + language_id: lb, + } if la == lb => a.partial_cmp(b), + EncodedTerm::BigBigLangStringLiteral { + value_id: b, + language_id: lb, + } if la == lb => compare_str_str_id(dataset, a, b), + _ => None, + }, + EncodedTerm::BigSmallLangStringLiteral { + value_id: a, + language: la, + } => match b { + EncodedTerm::SmallSmallLangStringLiteral { + value: b, + language: lb, + } if la == lb => compare_str_id_str(dataset, a, b), + EncodedTerm::BigSmallLangStringLiteral { + value_id: b, + language: lb, + } if la == lb => compare_str_ids(dataset, a, b), + _ => None, + }, + EncodedTerm::BigBigLangStringLiteral { + value_id: a, + language_id: la, + } => match b { + EncodedTerm::SmallBigLangStringLiteral { + value: b, + language_id: lb, + } if la == lb => compare_str_id_str(dataset, a, b), + EncodedTerm::BigBigLangStringLiteral { + value_id: b, + language_id: lb, + } if la == lb => compare_str_ids(dataset, a, b), + _ => None, + }, + EncodedTerm::FloatLiteral(a) => match b { + EncodedTerm::FloatLiteral(b) => a.partial_cmp(b), + EncodedTerm::DoubleLiteral(b) => Double::from(*a).partial_cmp(b), + EncodedTerm::IntegerLiteral(b) => a.partial_cmp(&Float::from(*b)), + EncodedTerm::DecimalLiteral(b) => a.partial_cmp(&(*b).into()), + _ => None, + }, + EncodedTerm::DoubleLiteral(a) => match b { + EncodedTerm::FloatLiteral(b) => a.partial_cmp(&(*b).into()), + EncodedTerm::DoubleLiteral(b) => a.partial_cmp(b), + EncodedTerm::IntegerLiteral(b) => a.partial_cmp(&Double::from(*b)), + EncodedTerm::DecimalLiteral(b) => a.partial_cmp(&(*b).into()), + _ => None, + }, + EncodedTerm::IntegerLiteral(a) => match b { + EncodedTerm::FloatLiteral(b) => Float::from(*a).partial_cmp(b), + 
EncodedTerm::DoubleLiteral(b) => Double::from(*a).partial_cmp(b), + EncodedTerm::IntegerLiteral(b) => a.partial_cmp(b), + EncodedTerm::DecimalLiteral(b) => Decimal::from(*a).partial_cmp(b), + _ => None, + }, + EncodedTerm::DecimalLiteral(a) => match b { + EncodedTerm::FloatLiteral(b) => Float::from(*a).partial_cmp(b), + EncodedTerm::DoubleLiteral(b) => Double::from(*a).partial_cmp(b), + EncodedTerm::IntegerLiteral(b) => a.partial_cmp(&Decimal::from(*b)), + EncodedTerm::DecimalLiteral(b) => a.partial_cmp(b), + _ => None, + }, + EncodedTerm::DateTimeLiteral(a) => { + if let EncodedTerm::DateTimeLiteral(b) = b { + a.partial_cmp(b) + } else { + None + } + } + EncodedTerm::TimeLiteral(a) => { + if let EncodedTerm::TimeLiteral(b) = b { + a.partial_cmp(b) + } else { + None + } + } + EncodedTerm::DateLiteral(a) => { + if let EncodedTerm::DateLiteral(b) = b { + a.partial_cmp(b) + } else { + None + } + } + EncodedTerm::GYearMonthLiteral(a) => { + if let EncodedTerm::GYearMonthLiteral(b) = b { + a.partial_cmp(b) + } else { + None + } + } + EncodedTerm::GYearLiteral(a) => { + if let EncodedTerm::GYearLiteral(b) = b { + a.partial_cmp(b) + } else { + None + } + } + EncodedTerm::GMonthDayLiteral(a) => { + if let EncodedTerm::GMonthDayLiteral(b) = b { + a.partial_cmp(b) + } else { + None + } + } + EncodedTerm::GDayLiteral(a) => { + if let EncodedTerm::GDayLiteral(b) = b { + a.partial_cmp(b) + } else { + None + } + } + EncodedTerm::GMonthLiteral(a) => { + if let EncodedTerm::GMonthLiteral(b) = b { + a.partial_cmp(b) + } else { + None + } + } + EncodedTerm::DurationLiteral(a) => match b { + EncodedTerm::DurationLiteral(b) => a.partial_cmp(b), + EncodedTerm::YearMonthDurationLiteral(b) => a.partial_cmp(b), + EncodedTerm::DayTimeDurationLiteral(b) => a.partial_cmp(b), + _ => None, + }, + EncodedTerm::YearMonthDurationLiteral(a) => match b { + EncodedTerm::DurationLiteral(b) => a.partial_cmp(b), + EncodedTerm::YearMonthDurationLiteral(b) => a.partial_cmp(b), + EncodedTerm::DayTimeDurationLiteral(b) => a.partial_cmp(b), + _ => None, + }, + EncodedTerm::DayTimeDurationLiteral(a) => match b { + EncodedTerm::DurationLiteral(b) => a.partial_cmp(b), + EncodedTerm::YearMonthDurationLiteral(b) => a.partial_cmp(b), + EncodedTerm::DayTimeDurationLiteral(b) => a.partial_cmp(b), + _ => None, + }, + _ => None, + } +} + +fn compare_str_ids(dataset: &DatasetView, a: &StrHash, b: &StrHash) -> Option { + Some(dataset.get_str(a).ok()??.cmp(&dataset.get_str(b).ok()??)) +} + +fn compare_str_id_str(dataset: &DatasetView, a: &StrHash, b: &str) -> Option { + Some(dataset.get_str(a).ok()??.as_str().cmp(b)) +} + +fn compare_str_str_id(dataset: &DatasetView, a: &str, b: &StrHash) -> Option { + Some(a.cmp(dataset.get_str(b).ok()??.as_str())) +} + +fn datatype(dataset: &DatasetView, value: &EncodedTerm) -> Option { + // TODO: optimize? + match value { + EncodedTerm::NamedNode { .. } + | EncodedTerm::SmallBlankNode { .. } + | EncodedTerm::BigBlankNode { .. } + | EncodedTerm::NumericalBlankNode { .. } + | EncodedTerm::DefaultGraph + | EncodedTerm::Triple(_) => None, + EncodedTerm::SmallStringLiteral(_) | EncodedTerm::BigStringLiteral { .. } => { + Some(encode_named_node(dataset, xsd::STRING)) + } + EncodedTerm::SmallSmallLangStringLiteral { .. } + | EncodedTerm::SmallBigLangStringLiteral { .. } + | EncodedTerm::BigSmallLangStringLiteral { .. } + | EncodedTerm::BigBigLangStringLiteral { .. } => { + Some(encode_named_node(dataset, rdf::LANG_STRING)) + } + EncodedTerm::SmallTypedLiteral { datatype_id, .. 
} + | EncodedTerm::BigTypedLiteral { datatype_id, .. } => Some(EncodedTerm::NamedNode { + iri_id: *datatype_id, + }), + EncodedTerm::BooleanLiteral(..) => Some(encode_named_node(dataset, xsd::BOOLEAN)), + EncodedTerm::FloatLiteral(..) => Some(encode_named_node(dataset, xsd::FLOAT)), + EncodedTerm::DoubleLiteral(..) => Some(encode_named_node(dataset, xsd::DOUBLE)), + EncodedTerm::IntegerLiteral(..) => Some(encode_named_node(dataset, xsd::INTEGER)), + EncodedTerm::DecimalLiteral(..) => Some(encode_named_node(dataset, xsd::DECIMAL)), + EncodedTerm::DateTimeLiteral(..) => Some(encode_named_node(dataset, xsd::DATE_TIME)), + EncodedTerm::TimeLiteral(..) => Some(encode_named_node(dataset, xsd::TIME)), + EncodedTerm::DateLiteral(..) => Some(encode_named_node(dataset, xsd::DATE)), + EncodedTerm::GYearMonthLiteral(..) => Some(encode_named_node(dataset, xsd::G_YEAR_MONTH)), + EncodedTerm::GYearLiteral(..) => Some(encode_named_node(dataset, xsd::G_YEAR)), + EncodedTerm::GMonthDayLiteral(..) => Some(encode_named_node(dataset, xsd::G_MONTH_DAY)), + EncodedTerm::GDayLiteral(..) => Some(encode_named_node(dataset, xsd::G_DAY)), + EncodedTerm::GMonthLiteral(..) => Some(encode_named_node(dataset, xsd::G_MONTH)), + EncodedTerm::DurationLiteral(..) => Some(encode_named_node(dataset, xsd::DURATION)), + EncodedTerm::YearMonthDurationLiteral(..) => { + Some(encode_named_node(dataset, xsd::YEAR_MONTH_DURATION)) + } + EncodedTerm::DayTimeDurationLiteral(..) => { + Some(encode_named_node(dataset, xsd::DAY_TIME_DURATION)) + } + } +} + +enum NumericBinaryOperands { + Float(Float, Float), + Double(Double, Double), + Integer(Integer, Integer), + Decimal(Decimal, Decimal), + Duration(Duration, Duration), + YearMonthDuration(YearMonthDuration, YearMonthDuration), + DayTimeDuration(DayTimeDuration, DayTimeDuration), + DateTime(DateTime, DateTime), + Time(Time, Time), + Date(Date, Date), + DateTimeDuration(DateTime, Duration), + DateTimeYearMonthDuration(DateTime, YearMonthDuration), + DateTimeDayTimeDuration(DateTime, DayTimeDuration), + DateDuration(Date, Duration), + DateYearMonthDuration(Date, YearMonthDuration), + DateDayTimeDuration(Date, DayTimeDuration), + TimeDuration(Time, Duration), + TimeDayTimeDuration(Time, DayTimeDuration), +} + +impl NumericBinaryOperands { + fn new(a: EncodedTerm, b: EncodedTerm) -> Option { + match (a, b) { + (EncodedTerm::FloatLiteral(v1), EncodedTerm::FloatLiteral(v2)) => { + Some(Self::Float(v1, v2)) + } + (EncodedTerm::FloatLiteral(v1), EncodedTerm::DoubleLiteral(v2)) => { + Some(Self::Double(v1.into(), v2)) + } + (EncodedTerm::FloatLiteral(v1), EncodedTerm::IntegerLiteral(v2)) => { + Some(Self::Float(v1, v2.into())) + } + (EncodedTerm::FloatLiteral(v1), EncodedTerm::DecimalLiteral(v2)) => { + Some(Self::Float(v1, v2.into())) + } + (EncodedTerm::DoubleLiteral(v1), EncodedTerm::FloatLiteral(v2)) => { + Some(Self::Double(v1, v2.into())) + } + (EncodedTerm::DoubleLiteral(v1), EncodedTerm::DoubleLiteral(v2)) => { + Some(Self::Double(v1, v2)) + } + (EncodedTerm::DoubleLiteral(v1), EncodedTerm::IntegerLiteral(v2)) => { + Some(Self::Double(v1, v2.into())) + } + (EncodedTerm::DoubleLiteral(v1), EncodedTerm::DecimalLiteral(v2)) => { + Some(Self::Double(v1, v2.into())) + } + (EncodedTerm::IntegerLiteral(v1), EncodedTerm::FloatLiteral(v2)) => { + Some(Self::Float(v1.into(), v2)) + } + (EncodedTerm::IntegerLiteral(v1), EncodedTerm::DoubleLiteral(v2)) => { + Some(Self::Double(v1.into(), v2)) + } + (EncodedTerm::IntegerLiteral(v1), EncodedTerm::IntegerLiteral(v2)) => { + Some(Self::Integer(v1, v2)) 
+ } + (EncodedTerm::IntegerLiteral(v1), EncodedTerm::DecimalLiteral(v2)) => { + Some(Self::Decimal(v1.into(), v2)) + } + (EncodedTerm::DecimalLiteral(v1), EncodedTerm::FloatLiteral(v2)) => { + Some(Self::Float(v1.into(), v2)) + } + (EncodedTerm::DecimalLiteral(v1), EncodedTerm::DoubleLiteral(v2)) => { + Some(Self::Double(v1.into(), v2)) + } + (EncodedTerm::DecimalLiteral(v1), EncodedTerm::IntegerLiteral(v2)) => { + Some(Self::Decimal(v1, v2.into())) + } + (EncodedTerm::DecimalLiteral(v1), EncodedTerm::DecimalLiteral(v2)) => { + Some(Self::Decimal(v1, v2)) + } + (EncodedTerm::DurationLiteral(v1), EncodedTerm::DurationLiteral(v2)) => { + Some(Self::Duration(v1, v2)) + } + (EncodedTerm::DurationLiteral(v1), EncodedTerm::YearMonthDurationLiteral(v2)) => { + Some(Self::Duration(v1, v2.into())) + } + (EncodedTerm::DurationLiteral(v1), EncodedTerm::DayTimeDurationLiteral(v2)) => { + Some(Self::Duration(v1, v2.into())) + } + (EncodedTerm::YearMonthDurationLiteral(v1), EncodedTerm::DurationLiteral(v2)) => { + Some(Self::Duration(v1.into(), v2)) + } + ( + EncodedTerm::YearMonthDurationLiteral(v1), + EncodedTerm::YearMonthDurationLiteral(v2), + ) => Some(Self::YearMonthDuration(v1, v2)), + ( + EncodedTerm::YearMonthDurationLiteral(v1), + EncodedTerm::DayTimeDurationLiteral(v2), + ) => Some(Self::Duration(v1.into(), v2.into())), + (EncodedTerm::DayTimeDurationLiteral(v1), EncodedTerm::DurationLiteral(v2)) => { + Some(Self::Duration(v1.into(), v2)) + } + ( + EncodedTerm::DayTimeDurationLiteral(v1), + EncodedTerm::YearMonthDurationLiteral(v2), + ) => Some(Self::Duration(v1.into(), v2.into())), + (EncodedTerm::DayTimeDurationLiteral(v1), EncodedTerm::DayTimeDurationLiteral(v2)) => { + Some(Self::DayTimeDuration(v1, v2)) + } + (EncodedTerm::DateTimeLiteral(v1), EncodedTerm::DateTimeLiteral(v2)) => { + Some(Self::DateTime(v1, v2)) + } + (EncodedTerm::DateLiteral(v1), EncodedTerm::DateLiteral(v2)) => { + Some(Self::Date(v1, v2)) + } + (EncodedTerm::TimeLiteral(v1), EncodedTerm::TimeLiteral(v2)) => { + Some(Self::Time(v1, v2)) + } + (EncodedTerm::DateTimeLiteral(v1), EncodedTerm::DurationLiteral(v2)) => { + Some(Self::DateTimeDuration(v1, v2)) + } + (EncodedTerm::DateTimeLiteral(v1), EncodedTerm::YearMonthDurationLiteral(v2)) => { + Some(Self::DateTimeYearMonthDuration(v1, v2)) + } + (EncodedTerm::DateTimeLiteral(v1), EncodedTerm::DayTimeDurationLiteral(v2)) => { + Some(Self::DateTimeDayTimeDuration(v1, v2)) + } + (EncodedTerm::DateLiteral(v1), EncodedTerm::DurationLiteral(v2)) => { + Some(Self::DateDuration(v1, v2)) + } + (EncodedTerm::DateLiteral(v1), EncodedTerm::YearMonthDurationLiteral(v2)) => { + Some(Self::DateYearMonthDuration(v1, v2)) + } + (EncodedTerm::DateLiteral(v1), EncodedTerm::DayTimeDurationLiteral(v2)) => { + Some(Self::DateDayTimeDuration(v1, v2)) + } + (EncodedTerm::TimeLiteral(v1), EncodedTerm::DurationLiteral(v2)) => { + Some(Self::TimeDuration(v1, v2)) + } + (EncodedTerm::TimeLiteral(v1), EncodedTerm::DayTimeDurationLiteral(v2)) => { + Some(Self::TimeDayTimeDuration(v1, v2)) + } + _ => None, + } + } +} + +#[derive(Clone)] +enum TupleSelector { + Constant(EncodedTerm), + Variable(usize), + TriplePattern(Rc), +} + +impl TupleSelector { + fn from_ground_term_pattern( + term_pattern: &GroundTermPattern, + variables: &mut Vec, + dataset: &DatasetView, + ) -> Self { + match term_pattern { + GroundTermPattern::Variable(variable) => { + Self::Variable(encode_variable(variables, variable)) + } + GroundTermPattern::NamedNode(term) => Self::Constant(dataset.encode_term(term)), + 
GroundTermPattern::Literal(term) => Self::Constant(dataset.encode_term(term)), + GroundTermPattern::Triple(triple) => { + match ( + Self::from_ground_term_pattern(&triple.subject, variables, dataset), + Self::from_named_node_pattern(&triple.predicate, variables, dataset), + Self::from_ground_term_pattern(&triple.object, variables, dataset), + ) { + ( + Self::Constant(subject), + Self::Constant(predicate), + Self::Constant(object), + ) => Self::Constant( + EncodedTriple { + subject, + predicate, + object, + } + .into(), + ), + (subject, predicate, object) => { + Self::TriplePattern(Rc::new(TripleTupleSelector { + subject, + predicate, + object, + })) + } + } + } + } + } + + fn from_named_node_pattern( + named_node_pattern: &NamedNodePattern, + variables: &mut Vec, + dataset: &DatasetView, + ) -> Self { + match named_node_pattern { + NamedNodePattern::Variable(variable) => { + Self::Variable(encode_variable(variables, variable)) + } + NamedNodePattern::NamedNode(term) => Self::Constant(dataset.encode_term(term)), + } + } + + fn from_graph_name_pattern( + graph_name_pattern: &Option, + variables: &mut Vec, + dataset: &DatasetView, + ) -> Self { + if let Some(graph_name_pattern) = graph_name_pattern { + Self::from_named_node_pattern(graph_name_pattern, variables, dataset) + } else { + Self::Constant(EncodedTerm::DefaultGraph) + } + } + + fn get_pattern_value(&self, tuple: &EncodedTuple) -> Option { + match self { + Self::Constant(c) => Some(c.clone()), + Self::Variable(v) => tuple.get(*v).cloned(), + Self::TriplePattern(triple) => Some( + EncodedTriple { + subject: triple.subject.get_pattern_value(tuple)?, + predicate: triple.predicate.get_pattern_value(tuple)?, + object: triple.object.get_pattern_value(tuple)?, + } + .into(), + ), + } + } +} + +struct TripleTupleSelector { + subject: TupleSelector, + predicate: TupleSelector, + object: TupleSelector, +} + +fn put_pattern_value( + selector: &TupleSelector, + value: EncodedTerm, + tuple: &mut EncodedTuple, +) -> Option<()> { + match selector { + TupleSelector::Constant(c) => (*c == value).then_some(()), + TupleSelector::Variable(v) => { + if let Some(old) = tuple.get(*v) { + (value == *old).then_some(()) + } else { + tuple.set(*v, value); + Some(()) + } + } + TupleSelector::TriplePattern(triple) => { + if let EncodedTerm::Triple(value) = value { + put_pattern_value(&triple.subject, value.subject.clone(), tuple)?; + put_pattern_value(&triple.predicate, value.predicate.clone(), tuple)?; + put_pattern_value(&triple.object, value.object.clone(), tuple) + } else { + None + } + } + } +} + +fn put_variable_value( + selector: &Variable, + variables: &[Variable], + value: EncodedTerm, + tuple: &mut EncodedTuple, +) { + for (i, v) in variables.iter().enumerate() { + if selector == v { + tuple.set(i, value); + break; + } + } +} + +pub fn are_compatible_and_not_disjointed(a: &EncodedTuple, b: &EncodedTuple) -> bool { + let mut found_intersection = false; + for (a_value, b_value) in a.iter().zip(b.iter()) { + if let (Some(a_value), Some(b_value)) = (a_value, b_value) { + if a_value != b_value { + return false; + } + found_intersection = true; + } + } + found_intersection +} + +pub enum PropertyPath { + Path(EncodedTerm), + Reverse(Rc), + Sequence(Rc, Rc), + Alternative(Rc, Rc), + ZeroOrMore(Rc), + OneOrMore(Rc), + ZeroOrOne(Rc), + NegatedPropertySet(Rc<[EncodedTerm]>), +} + +#[derive(Clone)] +struct PathEvaluator { + dataset: Rc, +} + +impl PathEvaluator { + fn eval_closed_in_graph( + &self, + path: &PropertyPath, + start: &EncodedTerm, + end: &EncodedTerm, 
+ graph_name: &EncodedTerm, + ) -> Result { + Ok(match path { + PropertyPath::Path(p) => self + .dataset + .encoded_quads_for_pattern(Some(start), Some(p), Some(end), Some(graph_name)) + .next() + .transpose()? + .is_some(), + PropertyPath::Reverse(p) => self.eval_closed_in_graph(p, end, start, graph_name)?, + PropertyPath::Sequence(a, b) => self + .eval_from_in_graph(a, start, graph_name) + .find_map(|middle| { + middle + .and_then(|middle| { + Ok(self + .eval_closed_in_graph(b, &middle, end, graph_name)? + .then_some(())) + }) + .transpose() + }) + .transpose()? + .is_some(), + PropertyPath::Alternative(a, b) => { + self.eval_closed_in_graph(a, start, end, graph_name)? + || self.eval_closed_in_graph(b, start, end, graph_name)? + } + PropertyPath::ZeroOrMore(p) => { + if start == end { + self.is_subject_or_object_in_graph(start, graph_name)? + } else { + look_in_transitive_closure( + self.eval_from_in_graph(p, start, graph_name), + move |e| self.eval_from_in_graph(p, &e, graph_name), + end, + )? + } + } + PropertyPath::OneOrMore(p) => look_in_transitive_closure( + self.eval_from_in_graph(p, start, graph_name), + move |e| self.eval_from_in_graph(p, &e, graph_name), + end, + )?, + PropertyPath::ZeroOrOne(p) => { + if start == end { + self.is_subject_or_object_in_graph(start, graph_name) + } else { + self.eval_closed_in_graph(p, start, end, graph_name) + }? + } + PropertyPath::NegatedPropertySet(ps) => self + .dataset + .encoded_quads_for_pattern(Some(start), None, Some(end), Some(graph_name)) + .find_map(move |t| match t { + Ok(t) => { + if ps.iter().any(|p| *p == t.predicate) { + None + } else { + Some(Ok(())) + } + } + Err(e) => Some(Err(e)), + }) + .transpose()? + .is_some(), + }) + } + + fn eval_closed_in_unknown_graph( + &self, + path: &PropertyPath, + start: &EncodedTerm, + end: &EncodedTerm, + ) -> Box>> { + match path { + PropertyPath::Path(p) => Box::new( + self.dataset + .encoded_quads_for_pattern(Some(start), Some(p), Some(end), None) + .map(|t| Ok(t?.graph_name)), + ), + PropertyPath::Reverse(p) => self.eval_closed_in_unknown_graph(p, end, start), + PropertyPath::Sequence(a, b) => { + let eval = self.clone(); + let b = Rc::clone(b); + let end = end.clone(); + Box::new(self.eval_from_in_unknown_graph(a, start).flat_map_ok( + move |(middle, graph_name)| { + eval.eval_closed_in_graph(&b, &middle, &end, &graph_name) + .map(|is_found| is_found.then_some(graph_name)) + .transpose() + }, + )) + } + PropertyPath::Alternative(a, b) => Box::new(hash_deduplicate( + self.eval_closed_in_unknown_graph(a, start, end) + .chain(self.eval_closed_in_unknown_graph(b, start, end)), + )), + PropertyPath::ZeroOrMore(p) => { + let eval = self.clone(); + let start2 = start.clone(); + let end = end.clone(); + let p = Rc::clone(p); + self.run_if_term_is_a_dataset_node(start, move |graph_name| { + look_in_transitive_closure( + Some(Ok(start2.clone())), + |e| eval.eval_from_in_graph(&p, &e, &graph_name), + &end, + ) + .map(|is_found| is_found.then_some(graph_name)) + .transpose() + }) + } + PropertyPath::OneOrMore(p) => { + let eval = self.clone(); + let end = end.clone(); + let p = Rc::clone(p); + Box::new( + self.eval_from_in_unknown_graph(&p, start) + .filter_map(move |r| { + r.and_then(|(start, graph_name)| { + look_in_transitive_closure( + Some(Ok(start)), + |e| eval.eval_from_in_graph(&p, &e, &graph_name), + &end, + ) + .map(|is_found| is_found.then_some(graph_name)) + }) + .transpose() + }), + ) + } + PropertyPath::ZeroOrOne(p) => { + if start == end { + self.run_if_term_is_a_dataset_node(start, 
|graph_name| Some(Ok(graph_name))) + } else { + let eval = self.clone(); + let start2 = start.clone(); + let end = end.clone(); + let p = Rc::clone(p); + self.run_if_term_is_a_dataset_node(start, move |graph_name| { + eval.eval_closed_in_graph(&p, &start2, &end, &graph_name) + .map(|is_found| is_found.then_some(graph_name)) + .transpose() + }) + } + } + PropertyPath::NegatedPropertySet(ps) => { + let ps = Rc::clone(ps); + Box::new( + self.dataset + .encoded_quads_for_pattern(Some(start), None, Some(end), None) + .filter_map(move |t| match t { + Ok(t) => { + if ps.iter().any(|p| *p == t.predicate) { + None + } else { + Some(Ok(t.graph_name)) + } + } + Err(e) => Some(Err(e)), + }), + ) + } + } + } + + fn eval_from_in_graph( + &self, + path: &PropertyPath, + start: &EncodedTerm, + graph_name: &EncodedTerm, + ) -> Box>> { + match path { + PropertyPath::Path(p) => Box::new( + self.dataset + .encoded_quads_for_pattern(Some(start), Some(p), None, Some(graph_name)) + .map(|t| Ok(t?.object)), + ), + PropertyPath::Reverse(p) => self.eval_to_in_graph(p, start, graph_name), + PropertyPath::Sequence(a, b) => { + let eval = self.clone(); + let b = Rc::clone(b); + let graph_name2 = graph_name.clone(); + Box::new( + self.eval_from_in_graph(a, start, graph_name) + .flat_map_ok(move |middle| { + eval.eval_from_in_graph(&b, &middle, &graph_name2) + }), + ) + } + PropertyPath::Alternative(a, b) => Box::new(hash_deduplicate( + self.eval_from_in_graph(a, start, graph_name) + .chain(self.eval_from_in_graph(b, start, graph_name)), + )), + PropertyPath::ZeroOrMore(p) => { + self.run_if_term_is_a_graph_node(start, graph_name, || { + let eval = self.clone(); + let p = Rc::clone(p); + let graph_name2 = graph_name.clone(); + transitive_closure(Some(Ok(start.clone())), move |e| { + eval.eval_from_in_graph(&p, &e, &graph_name2) + }) + }) + } + PropertyPath::OneOrMore(p) => { + let eval = self.clone(); + let p = Rc::clone(p); + let graph_name2 = graph_name.clone(); + Box::new(transitive_closure( + self.eval_from_in_graph(&p, start, graph_name), + move |e| eval.eval_from_in_graph(&p, &e, &graph_name2), + )) + } + PropertyPath::ZeroOrOne(p) => { + self.run_if_term_is_a_graph_node(start, graph_name, || { + hash_deduplicate( + once(Ok(start.clone())) + .chain(self.eval_from_in_graph(p, start, graph_name)), + ) + }) + } + PropertyPath::NegatedPropertySet(ps) => { + let ps = Rc::clone(ps); + Box::new( + self.dataset + .encoded_quads_for_pattern(Some(start), None, None, Some(graph_name)) + .filter_map(move |t| match t { + Ok(t) => { + if ps.iter().any(|p| *p == t.predicate) { + None + } else { + Some(Ok(t.object)) + } + } + Err(e) => Some(Err(e)), + }), + ) + } + } + } + + fn eval_from_in_unknown_graph( + &self, + path: &PropertyPath, + start: &EncodedTerm, + ) -> Box>> { + match path { + PropertyPath::Path(p) => Box::new( + self.dataset + .encoded_quads_for_pattern(Some(start), Some(p), None, None) + .map(|t| { + let t = t?; + Ok((t.object, t.graph_name)) + }), + ), + PropertyPath::Reverse(p) => self.eval_to_in_unknown_graph(p, start), + PropertyPath::Sequence(a, b) => { + let eval = self.clone(); + let b = Rc::clone(b); + Box::new(self.eval_from_in_unknown_graph(a, start).flat_map_ok( + move |(middle, graph_name)| { + eval.eval_from_in_graph(&b, &middle, &graph_name) + .map(move |end| Ok((end?, graph_name.clone()))) + }, + )) + } + PropertyPath::Alternative(a, b) => Box::new(hash_deduplicate( + self.eval_from_in_unknown_graph(a, start) + .chain(self.eval_from_in_unknown_graph(b, start)), + )), + PropertyPath::ZeroOrMore(p) => 
{ + let start2 = start.clone(); + let eval = self.clone(); + let p = Rc::clone(p); + self.run_if_term_is_a_dataset_node(start, move |graph_name| { + let eval = eval.clone(); + let p = Rc::clone(&p); + let graph_name2 = graph_name.clone(); + transitive_closure(Some(Ok(start2.clone())), move |e| { + eval.eval_from_in_graph(&p, &e, &graph_name2) + }) + .map(move |e| Ok((e?, graph_name.clone()))) + }) + } + PropertyPath::OneOrMore(p) => { + let eval = self.clone(); + let p = Rc::clone(p); + Box::new(transitive_closure( + self.eval_from_in_unknown_graph(&p, start), + move |(e, graph_name)| { + eval.eval_from_in_graph(&p, &e, &graph_name) + .map(move |e| Ok((e?, graph_name.clone()))) + }, + )) + } + PropertyPath::ZeroOrOne(p) => { + let eval = self.clone(); + let start2 = start.clone(); + let p = Rc::clone(p); + self.run_if_term_is_a_dataset_node(start, move |graph_name| { + hash_deduplicate(once(Ok(start2.clone())).chain(eval.eval_from_in_graph( + &p, + &start2, + &graph_name, + ))) + .map(move |e| Ok((e?, graph_name.clone()))) + }) + } + PropertyPath::NegatedPropertySet(ps) => { + let ps = Rc::clone(ps); + Box::new( + self.dataset + .encoded_quads_for_pattern(Some(start), None, None, None) + .filter_map(move |t| match t { + Ok(t) => { + if ps.iter().any(|p| *p == t.predicate) { + None + } else { + Some(Ok((t.object, t.graph_name))) + } + } + Err(e) => Some(Err(e)), + }), + ) + } + } + } + + fn eval_to_in_graph( + &self, + path: &PropertyPath, + end: &EncodedTerm, + graph_name: &EncodedTerm, + ) -> Box>> { + match path { + PropertyPath::Path(p) => Box::new( + self.dataset + .encoded_quads_for_pattern(None, Some(p), Some(end), Some(graph_name)) + .map(|t| Ok(t?.subject)), + ), + PropertyPath::Reverse(p) => self.eval_from_in_graph(p, end, graph_name), + PropertyPath::Sequence(a, b) => { + let eval = self.clone(); + let a = Rc::clone(a); + let graph_name2 = graph_name.clone(); + Box::new( + self.eval_to_in_graph(b, end, graph_name) + .flat_map_ok(move |middle| { + eval.eval_to_in_graph(&a, &middle, &graph_name2) + }), + ) + } + PropertyPath::Alternative(a, b) => Box::new(hash_deduplicate( + self.eval_to_in_graph(a, end, graph_name) + .chain(self.eval_to_in_graph(b, end, graph_name)), + )), + PropertyPath::ZeroOrMore(p) => { + self.run_if_term_is_a_graph_node(end, graph_name, || { + let eval = self.clone(); + let p = Rc::clone(p); + let graph_name2 = graph_name.clone(); + transitive_closure(Some(Ok(end.clone())), move |e| { + eval.eval_to_in_graph(&p, &e, &graph_name2) + }) + }) + } + PropertyPath::OneOrMore(p) => { + let eval = self.clone(); + let p = Rc::clone(p); + let graph_name2 = graph_name.clone(); + Box::new(transitive_closure( + self.eval_to_in_graph(&p, end, graph_name), + move |e| eval.eval_to_in_graph(&p, &e, &graph_name2), + )) + } + PropertyPath::ZeroOrOne(p) => self.run_if_term_is_a_graph_node(end, graph_name, || { + hash_deduplicate( + once(Ok(end.clone())).chain(self.eval_to_in_graph(p, end, graph_name)), + ) + }), + PropertyPath::NegatedPropertySet(ps) => { + let ps = Rc::clone(ps); + Box::new( + self.dataset + .encoded_quads_for_pattern(None, None, Some(end), Some(graph_name)) + .filter_map(move |t| match t { + Ok(t) => { + if ps.iter().any(|p| *p == t.predicate) { + None + } else { + Some(Ok(t.subject)) + } + } + Err(e) => Some(Err(e)), + }), + ) + } + } + } + + fn eval_to_in_unknown_graph( + &self, + path: &PropertyPath, + end: &EncodedTerm, + ) -> Box>> { + match path { + PropertyPath::Path(p) => Box::new( + self.dataset + .encoded_quads_for_pattern(None, Some(p), Some(end), 
None) + .map(|t| { + let t = t?; + Ok((t.subject, t.graph_name)) + }), + ), + PropertyPath::Reverse(p) => self.eval_from_in_unknown_graph(p, end), + PropertyPath::Sequence(a, b) => { + let eval = self.clone(); + let a = Rc::clone(a); + Box::new(self.eval_to_in_unknown_graph(b, end).flat_map_ok( + move |(middle, graph_name)| { + eval.eval_from_in_graph(&a, &middle, &graph_name) + .map(move |start| Ok((start?, graph_name.clone()))) + }, + )) + } + PropertyPath::Alternative(a, b) => Box::new(hash_deduplicate( + self.eval_to_in_unknown_graph(a, end) + .chain(self.eval_to_in_unknown_graph(b, end)), + )), + PropertyPath::ZeroOrMore(p) => { + let end2 = end.clone(); + let eval = self.clone(); + let p = Rc::clone(p); + self.run_if_term_is_a_dataset_node(end, move |graph_name| { + let eval = eval.clone(); + let p = Rc::clone(&p); + let graph_name2 = graph_name.clone(); + transitive_closure(Some(Ok(end2.clone())), move |e| { + eval.eval_to_in_graph(&p, &e, &graph_name2) + }) + .map(move |e| Ok((e?, graph_name.clone()))) + }) + } + PropertyPath::OneOrMore(p) => { + let eval = self.clone(); + let p = Rc::clone(p); + Box::new(transitive_closure( + self.eval_to_in_unknown_graph(&p, end), + move |(e, graph_name)| { + eval.eval_to_in_graph(&p, &e, &graph_name) + .map(move |e| Ok((e?, graph_name.clone()))) + }, + )) + } + PropertyPath::ZeroOrOne(p) => { + let eval = self.clone(); + let end2 = end.clone(); + let p = Rc::clone(p); + self.run_if_term_is_a_dataset_node(end, move |graph_name| { + hash_deduplicate(once(Ok(end2.clone())).chain(eval.eval_to_in_graph( + &p, + &end2, + &graph_name, + ))) + .map(move |e| Ok((e?, graph_name.clone()))) + }) + } + PropertyPath::NegatedPropertySet(ps) => { + let ps = Rc::clone(ps); + Box::new( + self.dataset + .encoded_quads_for_pattern(Some(end), None, None, None) + .filter_map(move |t| match t { + Ok(t) => { + if ps.iter().any(|p| *p == t.predicate) { + None + } else { + Some(Ok((t.subject, t.graph_name))) + } + } + Err(e) => Some(Err(e)), + }), + ) + } + } + } + + fn eval_open_in_graph( + &self, + path: &PropertyPath, + graph_name: &EncodedTerm, + ) -> Box>> { + match path { + PropertyPath::Path(p) => Box::new( + self.dataset + .encoded_quads_for_pattern(None, Some(p), None, Some(graph_name)) + .map(|t| t.map(|t| (t.subject, t.object))), + ), + PropertyPath::Reverse(p) => Box::new( + self.eval_open_in_graph(p, graph_name) + .map(|t| t.map(|(s, o)| (o, s))), + ), + PropertyPath::Sequence(a, b) => { + let eval = self.clone(); + let b = Rc::clone(b); + let graph_name2 = graph_name.clone(); + Box::new(self.eval_open_in_graph(a, graph_name).flat_map_ok( + move |(start, middle)| { + eval.eval_from_in_graph(&b, &middle, &graph_name2) + .map(move |end| Ok((start.clone(), end?))) + }, + )) + } + PropertyPath::Alternative(a, b) => Box::new(hash_deduplicate( + self.eval_open_in_graph(a, graph_name) + .chain(self.eval_open_in_graph(b, graph_name)), + )), + PropertyPath::ZeroOrMore(p) => { + let eval = self.clone(); + let p = Rc::clone(p); + let graph_name2 = graph_name.clone(); + Box::new(transitive_closure( + self.get_subject_or_object_identity_pairs_in_graph(graph_name), + move |(start, middle)| { + eval.eval_from_in_graph(&p, &middle, &graph_name2) + .map(move |end| Ok((start.clone(), end?))) + }, + )) + } + PropertyPath::OneOrMore(p) => { + let eval = self.clone(); + let p = Rc::clone(p); + let graph_name2 = graph_name.clone(); + Box::new(transitive_closure( + self.eval_open_in_graph(&p, graph_name), + move |(start, middle)| { + eval.eval_from_in_graph(&p, &middle, 
&graph_name2) + .map(move |end| Ok((start.clone(), end?))) + }, + )) + } + PropertyPath::ZeroOrOne(p) => Box::new(hash_deduplicate( + self.get_subject_or_object_identity_pairs_in_graph(graph_name) + .chain(self.eval_open_in_graph(p, graph_name)), + )), + PropertyPath::NegatedPropertySet(ps) => { + let ps = Rc::clone(ps); + Box::new( + self.dataset + .encoded_quads_for_pattern(None, None, None, Some(graph_name)) + .filter_map(move |t| match t { + Ok(t) => { + if ps.iter().any(|p| *p == t.predicate) { + None + } else { + Some(Ok((t.subject, t.object))) + } + } + Err(e) => Some(Err(e)), + }), + ) + } + } + } + + fn eval_open_in_unknown_graph( + &self, + path: &PropertyPath, + ) -> Box>> + { + match path { + PropertyPath::Path(p) => Box::new( + self.dataset + .encoded_quads_for_pattern(None, Some(p), None, None) + .map(|t| t.map(|t| (t.subject, t.object, t.graph_name))), + ), + PropertyPath::Reverse(p) => Box::new( + self.eval_open_in_unknown_graph(p) + .map(|t| t.map(|(s, o, g)| (o, s, g))), + ), + PropertyPath::Sequence(a, b) => { + let eval = self.clone(); + let b = Rc::clone(b); + Box::new(self.eval_open_in_unknown_graph(a).flat_map_ok( + move |(start, middle, graph_name)| { + eval.eval_from_in_graph(&b, &middle, &graph_name) + .map(move |end| Ok((start.clone(), end?, graph_name.clone()))) + }, + )) + } + PropertyPath::Alternative(a, b) => Box::new(hash_deduplicate( + self.eval_open_in_unknown_graph(a) + .chain(self.eval_open_in_unknown_graph(b)), + )), + PropertyPath::ZeroOrMore(p) => { + let eval = self.clone(); + let p = Rc::clone(p); + Box::new(transitive_closure( + self.get_subject_or_object_identity_pairs_in_dataset(), + move |(start, middle, graph_name)| { + eval.eval_from_in_graph(&p, &middle, &graph_name) + .map(move |end| Ok((start.clone(), end?, graph_name.clone()))) + }, + )) + } + PropertyPath::OneOrMore(p) => { + let eval = self.clone(); + let p = Rc::clone(p); + Box::new(transitive_closure( + self.eval_open_in_unknown_graph(&p), + move |(start, middle, graph_name)| { + eval.eval_from_in_graph(&p, &middle, &graph_name) + .map(move |end| Ok((start.clone(), end?, graph_name.clone()))) + }, + )) + } + PropertyPath::ZeroOrOne(p) => Box::new(hash_deduplicate( + self.get_subject_or_object_identity_pairs_in_dataset() + .chain(self.eval_open_in_unknown_graph(p)), + )), + PropertyPath::NegatedPropertySet(ps) => { + let ps = Rc::clone(ps); + Box::new( + self.dataset + .encoded_quads_for_pattern(None, None, None, None) + .filter_map(move |t| match t { + Ok(t) => { + if ps.iter().any(|p| *p == t.predicate) { + None + } else { + Some(Ok((t.subject, t.object, t.graph_name))) + } + } + Err(e) => Some(Err(e)), + }), + ) + } + } + } + + fn get_subject_or_object_identity_pairs_in_graph( + &self, + graph_name: &EncodedTerm, + ) -> impl Iterator> { + self.dataset + .encoded_quads_for_pattern(None, None, None, Some(graph_name)) + .flat_map_ok(|t| { + [ + Ok((t.subject.clone(), t.subject)), + Ok((t.object.clone(), t.object)), + ] + }) + } + + fn get_subject_or_object_identity_pairs_in_dataset( + &self, + ) -> impl Iterator> + { + self.dataset + .encoded_quads_for_pattern(None, None, None, None) + .flat_map_ok(|t| { + [ + Ok((t.subject.clone(), t.subject, t.graph_name.clone())), + Ok((t.object.clone(), t.object, t.graph_name)), + ] + }) + } + + fn run_if_term_is_a_graph_node< + T: 'static, + I: Iterator> + 'static, + >( + &self, + term: &EncodedTerm, + graph_name: &EncodedTerm, + f: impl FnOnce() -> I, + ) -> Box>> { + match self.is_subject_or_object_in_graph(term, graph_name) { + Ok(true) => 
Box::new(f()), + Ok(false) => { + Box::new(empty()) // Not in the database + } + Err(error) => Box::new(once(Err(error))), + } + } + + fn is_subject_or_object_in_graph( + &self, + term: &EncodedTerm, + graph_name: &EncodedTerm, + ) -> Result { + Ok(self + .dataset + .encoded_quads_for_pattern(Some(term), None, None, Some(graph_name)) + .next() + .transpose()? + .is_some() + || self + .dataset + .encoded_quads_for_pattern(None, None, Some(term), Some(graph_name)) + .next() + .transpose()? + .is_some()) + } + + fn run_if_term_is_a_dataset_node< + T: 'static, + I: IntoIterator> + 'static, + >( + &self, + term: &EncodedTerm, + f: impl FnMut(EncodedTerm) -> I + 'static, + ) -> Box>> { + match self + .find_graphs_where_the_node_is_in(term) + .collect::, _>>() + { + Ok(graph_names) => Box::new(graph_names.into_iter().flat_map(f)), + Err(error) => Box::new(once(Err(error))), + } + } + + fn find_graphs_where_the_node_is_in( + &self, + term: &EncodedTerm, + ) -> impl Iterator> { + self.dataset + .encoded_quads_for_pattern(Some(term), None, None, None) + .chain( + self.dataset + .encoded_quads_for_pattern(None, None, Some(term), None), + ) + .map(|q| Ok(q?.graph_name)) + } +} + +struct CartesianProductJoinIterator { + probe_iter: EncodedTuplesIterator, + built: Vec, + buffered_results: Vec>, +} + +impl Iterator for CartesianProductJoinIterator { + type Item = Result; + + fn next(&mut self) -> Option { + loop { + if let Some(result) = self.buffered_results.pop() { + return Some(result); + } + let probe_tuple = match self.probe_iter.next()? { + Ok(probe_tuple) => probe_tuple, + Err(error) => return Some(Err(error)), + }; + for built_tuple in &self.built { + if let Some(result_tuple) = probe_tuple.combine_with(built_tuple) { + self.buffered_results.push(Ok(result_tuple)) + } + } + } + } + + fn size_hint(&self) -> (usize, Option) { + let (min, max) = self.probe_iter.size_hint(); + ( + min.saturating_mul(self.built.len()), + max.map(|v| v.saturating_mul(self.built.len())), + ) + } +} + +struct HashJoinIterator { + probe_iter: EncodedTuplesIterator, + built: EncodedTupleSet, + buffered_results: Vec>, +} + +impl Iterator for HashJoinIterator { + type Item = Result; + + fn next(&mut self) -> Option { + loop { + if let Some(result) = self.buffered_results.pop() { + return Some(result); + } + let probe_tuple = match self.probe_iter.next()? { + Ok(probe_tuple) => probe_tuple, + Err(error) => return Some(Err(error)), + }; + self.buffered_results.extend( + self.built + .get(&probe_tuple) + .iter() + .filter_map(|built_tuple| probe_tuple.combine_with(built_tuple).map(Ok)), + ) + } + } + + fn size_hint(&self) -> (usize, Option) { + ( + 0, + self.probe_iter + .size_hint() + .1 + .map(|v| v.saturating_mul(self.built.len())), + ) + } +} + +struct HashLeftJoinIterator { + left_iter: EncodedTuplesIterator, + right: EncodedTupleSet, + buffered_results: Vec>, + expression: Rc Option>, +} + +impl Iterator for HashLeftJoinIterator { + type Item = Result; + + fn next(&mut self) -> Option { + loop { + if let Some(result) = self.buffered_results.pop() { + return Some(result); + } + let left_tuple = match self.left_iter.next()? 
{
+                Ok(left_tuple) => left_tuple,
+                Err(error) => return Some(Err(error)),
+            };
+            self.buffered_results.extend(
+                self.right
+                    .get(&left_tuple)
+                    .iter()
+                    .filter_map(|right_tuple| left_tuple.combine_with(right_tuple))
+                    .filter(|tuple| {
+                        (self.expression)(tuple)
+                            .and_then(|term| to_bool(&term))
+                            .unwrap_or(false)
+                    })
+                    .map(Ok),
+            );
+            if self.buffered_results.is_empty() {
+                // We have not managed to join with anything
+                return Some(Ok(left_tuple));
+            }
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (
+            0,
+            self.left_iter
+                .size_hint()
+                .1
+                .map(|v| v.saturating_mul(self.right.len())),
+        )
+    }
+}
+
+struct ForLoopLeftJoinIterator {
+    right_evaluator: Rc<dyn Fn(EncodedTuple) -> EncodedTuplesIterator>,
+    left_iter: EncodedTuplesIterator,
+    current_right: EncodedTuplesIterator,
+}
+
+impl Iterator for ForLoopLeftJoinIterator {
+    type Item = Result<EncodedTuple, EvaluationError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if let Some(tuple) = self.current_right.next() {
+            return Some(tuple);
+        }
+        let left_tuple = match self.left_iter.next()? {
+            Ok(left_tuple) => left_tuple,
+            Err(error) => return Some(Err(error)),
+        };
+        self.current_right = (self.right_evaluator)(left_tuple.clone());
+        if let Some(right_tuple) = self.current_right.next() {
+            Some(right_tuple)
+        } else {
+            Some(Ok(left_tuple))
+        }
+    }
+}
+
+struct UnionIterator {
+    plans: Vec<Rc<dyn Fn(EncodedTuple) -> EncodedTuplesIterator>>,
+    input: EncodedTuple,
+    current_iterator: EncodedTuplesIterator,
+    current_plan: usize,
+}
+
+impl Iterator for UnionIterator {
+    type Item = Result<EncodedTuple, EvaluationError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            if let Some(tuple) = self.current_iterator.next() {
+                return Some(tuple);
+            }
+            if self.current_plan >= self.plans.len() {
+                return None;
+            }
+            self.current_iterator = self.plans[self.current_plan](self.input.clone());
+            self.current_plan += 1;
+        }
+    }
+}
+
+struct ConsecutiveDeduplication {
+    inner: EncodedTuplesIterator,
+    current: Option<EncodedTuple>,
+}
+
+impl Iterator for ConsecutiveDeduplication {
+    type Item = Result<EncodedTuple, EvaluationError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // Basic idea: we buffer the previous result and only emit it once we know the next one (or the input is exhausted)
+        loop {
+            if let Some(next) = self.inner.next() {
+                match next {
+                    Ok(next) => match self.current.take() {
+                        Some(current) if current != next => {
+                            // We found a new value
+                            self.current = Some(next);
+                            return Some(Ok(current));
+                        }
+                        _ => {
+                            // We discard the value and move to the next one
+                            self.current = Some(next);
+                        }
+                    },
+                    Err(error) => return Some(Err(error)), // Errors are emitted immediately, even if this reorders them with respect to the buffered value
+                }
+            } else {
+                return self.current.take().map(Ok);
+            }
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let (min, max) = self.inner.size_hint();
+        ((min != 0).into(), max)
+    }
+}
+
+struct ConstructIterator {
+    eval: SimpleEvaluator,
+    iter: EncodedTuplesIterator,
+    template: Vec<TripleTemplate>,
+    buffered_results: Vec<Result<Triple, EvaluationError>>,
+    bnodes: Vec<EncodedTerm>,
+}
+
+impl Iterator for ConstructIterator {
+    type Item = Result<Triple, EvaluationError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            if let Some(result) = self.buffered_results.pop() {
+                return Some(result);
+            }
+            {
+                let tuple = match self.iter.next()?
{ + Ok(tuple) => tuple, + Err(error) => return Some(Err(error)), + }; + for template in &self.template { + if let (Some(subject), Some(predicate), Some(object)) = ( + get_triple_template_value(&template.subject, &tuple, &mut self.bnodes), + get_triple_template_value(&template.predicate, &tuple, &mut self.bnodes), + get_triple_template_value(&template.object, &tuple, &mut self.bnodes), + ) { + self.buffered_results.push(decode_triple( + &*self.eval.dataset, + &subject, + &predicate, + &object, + )); + } + } + self.bnodes.clear(); // We do not reuse old bnodes + } + } + } + + fn size_hint(&self) -> (usize, Option) { + let (min, max) = self.iter.size_hint(); + ( + min.saturating_mul(self.template.len()), + max.map(|v| v.saturating_mul(self.template.len())), + ) + } +} + +pub struct TripleTemplate { + pub subject: TripleTemplateValue, + pub predicate: TripleTemplateValue, + pub object: TripleTemplateValue, +} + +pub enum TripleTemplateValue { + Constant(EncodedTerm), + BlankNode(usize), + Variable(usize), + Triple(Box), +} + +fn get_triple_template_value<'a>( + selector: &'a TripleTemplateValue, + tuple: &'a EncodedTuple, + bnodes: &'a mut Vec, +) -> Option { + match selector { + TripleTemplateValue::Constant(term) => Some(term.clone()), + TripleTemplateValue::Variable(v) => tuple.get(*v).cloned(), + TripleTemplateValue::BlankNode(bnode) => { + if *bnode >= bnodes.len() { + bnodes.resize_with(*bnode + 1, new_bnode) + } + Some(bnodes[*bnode].clone()) + } + TripleTemplateValue::Triple(triple) => Some( + EncodedTriple { + subject: get_triple_template_value(&triple.subject, tuple, bnodes)?, + predicate: get_triple_template_value(&triple.predicate, tuple, bnodes)?, + object: get_triple_template_value(&triple.object, tuple, bnodes)?, + } + .into(), + ), + } +} + +fn new_bnode() -> EncodedTerm { + EncodedTerm::NumericalBlankNode { id: random() } +} + +fn decode_triple( + decoder: &D, + subject: &EncodedTerm, + predicate: &EncodedTerm, + object: &EncodedTerm, +) -> Result { + Ok(Triple::new( + decoder.decode_subject(subject)?, + decoder.decode_named_node(predicate)?, + decoder.decode_term(object)?, + )) +} + +struct DescribeIterator { + eval: SimpleEvaluator, + iter: EncodedTuplesIterator, + quads: Box>>, +} + +impl Iterator for DescribeIterator { + type Item = Result; + + fn next(&mut self) -> Option { + loop { + if let Some(quad) = self.quads.next() { + return Some(match quad { + Ok(quad) => self + .eval + .dataset + .decode_quad(&quad) + .map(Into::into) + .map_err(Into::into), + Err(error) => Err(error), + }); + } + let tuple = match self.iter.next()? 
{ + Ok(tuple) => tuple, + Err(error) => return Some(Err(error)), + }; + let eval = self.eval.clone(); + self.quads = Box::new(tuple.into_iter().flatten().flat_map(move |subject| { + eval.dataset + .encoded_quads_for_pattern( + Some(&subject), + None, + None, + Some(&EncodedTerm::DefaultGraph), + ) + .chain( + eval.dataset + .encoded_quads_for_pattern(Some(&subject), None, None, None), + ) + })); + } + } +} + +struct ZipLongest, I2: Iterator> { + a: I1, + b: I2, +} + +impl, I2: Iterator> ZipLongest { + fn new(a: I1, b: I2) -> Self { + Self { a, b } + } +} + +impl, I2: Iterator> Iterator + for ZipLongest +{ + type Item = (Option, Option); + + fn next(&mut self) -> Option { + match (self.a.next(), self.b.next()) { + (None, None) => None, + r => Some(r), + } + } +} + +fn transitive_closure>>( + start: impl IntoIterator>, + mut next: impl FnMut(T) -> NI, +) -> impl Iterator> { + let mut errors = Vec::new(); + let mut todo = start + .into_iter() + .filter_map(|e| match e { + Ok(e) => Some(e), + Err(e) => { + errors.push(e); + None + } + }) + .collect::>(); + let mut all = todo.iter().cloned().collect::>(); + while let Some(e) = todo.pop() { + for e in next(e) { + match e { + Ok(e) => { + if all.insert(e.clone()) { + todo.push(e) + } + } + Err(e) => errors.push(e), + } + } + } + errors.into_iter().map(Err).chain(all.into_iter().map(Ok)) +} + +fn look_in_transitive_closure< + T: Clone + Eq + Hash, + NI: Iterator>, +>( + start: impl IntoIterator>, + mut next: impl FnMut(T) -> NI, + target: &T, +) -> Result { + let mut todo = start.into_iter().collect::, _>>()?; + let mut all = todo.iter().cloned().collect::>(); + while let Some(e) = todo.pop() { + if e == *target { + return Ok(true); + } + for e in next(e) { + let e = e?; + if all.insert(e.clone()) { + todo.push(e); + } + } + } + Ok(false) +} + +fn hash_deduplicate( + iter: impl Iterator>, +) -> impl Iterator> { + let mut already_seen = HashSet::with_capacity(iter.size_hint().0); + iter.filter(move |e| { + if let Ok(e) = e { + if already_seen.contains(e) { + false + } else { + already_seen.insert(e.clone()); + true + } + } else { + true + } + }) +} + +trait ResultIterator: Iterator> + Sized { + fn flat_map_ok U, U: IntoIterator>>( + self, + f: F, + ) -> FlatMapOk; +} + +impl> + Sized> ResultIterator for I { + fn flat_map_ok U, U: IntoIterator>>( + self, + f: F, + ) -> FlatMapOk { + FlatMapOk { + inner: self, + f, + current: None, + } + } +} + +struct FlatMapOk< + T, + O, + I: Iterator>, + F: FnMut(T) -> U, + U: IntoIterator>, +> { + inner: I, + f: F, + current: Option, +} + +impl< + T, + O, + I: Iterator>, + F: FnMut(T) -> U, + U: IntoIterator>, + > Iterator for FlatMapOk +{ + type Item = Result; + + fn next(&mut self) -> Option { + loop { + if let Some(current) = &mut self.current { + if let Some(next) = current.next() { + return Some(next); + } + } + self.current = None; + match self.inner.next()? 
{
+                Ok(e) => self.current = Some((self.f)(e).into_iter()),
+                Err(error) => return Some(Err(error)),
+            }
+        }
+    }
+}
+
+trait Accumulator {
+    fn add(&mut self, element: Option<EncodedTerm>);
+
+    fn state(&self) -> Option<EncodedTerm>;
+}
+
+struct Deduplicate {
+    seen: HashSet<Option<EncodedTerm>>,
+    inner: Box<dyn Accumulator>,
+}
+
+impl Deduplicate {
+    fn new(inner: Box<dyn Accumulator>) -> Self {
+        Self {
+            seen: HashSet::default(),
+            inner,
+        }
+    }
+}
+
+impl Accumulator for Deduplicate {
+    fn add(&mut self, element: Option<EncodedTerm>) {
+        if self.seen.insert(element.clone()) {
+            self.inner.add(element)
+        }
+    }
+
+    fn state(&self) -> Option<EncodedTerm> {
+        self.inner.state()
+    }
+}
+
+#[derive(Default, Debug)]
+struct CountAccumulator {
+    count: i64,
+}
+
+impl Accumulator for CountAccumulator {
+    fn add(&mut self, _element: Option<EncodedTerm>) {
+        self.count += 1;
+    }
+
+    fn state(&self) -> Option<EncodedTerm> {
+        Some(self.count.into())
+    }
+}
+
+struct SumAccumulator {
+    sum: Option<EncodedTerm>,
+}
+
+impl Default for SumAccumulator {
+    fn default() -> Self {
+        Self {
+            sum: Some(0.into()),
+        }
+    }
+}
+
+impl Accumulator for SumAccumulator {
+    fn add(&mut self, element: Option<EncodedTerm>) {
+        if let Some(sum) = &self.sum {
+            if let Some(operands) = element.and_then(|e| NumericBinaryOperands::new(sum.clone(), e))
+            {
+                // TODO: unify with addition?
+                self.sum = match operands {
+                    NumericBinaryOperands::Float(v1, v2) => Some((v1 + v2).into()),
+                    NumericBinaryOperands::Double(v1, v2) => Some((v1 + v2).into()),
+                    NumericBinaryOperands::Integer(v1, v2) => v1.checked_add(v2).map(Into::into),
+                    NumericBinaryOperands::Decimal(v1, v2) => v1.checked_add(v2).map(Into::into),
+                    NumericBinaryOperands::Duration(v1, v2) => v1.checked_add(v2).map(Into::into),
+                    NumericBinaryOperands::YearMonthDuration(v1, v2) => {
+                        v1.checked_add(v2).map(Into::into)
+                    }
+                    NumericBinaryOperands::DayTimeDuration(v1, v2) => {
+                        v1.checked_add(v2).map(Into::into)
+                    }
+                    _ => None,
+                };
+            } else {
+                self.sum = None;
+            }
+        }
+    }
+
+    fn state(&self) -> Option<EncodedTerm> {
+        self.sum.clone()
+    }
+}
+
+#[derive(Default)]
+struct AvgAccumulator {
+    sum: SumAccumulator,
+    count: i64,
+}
+
+impl Accumulator for AvgAccumulator {
+    fn add(&mut self, element: Option<EncodedTerm>) {
+        self.sum.add(element);
+        self.count += 1;
+    }
+
+    fn state(&self) -> Option<EncodedTerm> {
+        let sum = self.sum.state()?;
+        if self.count == 0 {
+            Some(0.into())
+        } else {
+            // TODO: deduplicate?
+            // TODO: duration?
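+            // AVG is the running SUM divided by the COUNT: float and double sums
+            // divide directly, while an integer sum is first promoted to decimal so
+            // that e.g. the average of 1 and 2 is 1.5 rather than a truncated 1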
+            let count = Integer::from(self.count);
+            match sum {
+                EncodedTerm::FloatLiteral(sum) => Some((sum / Float::from(count)).into()),
+                EncodedTerm::DoubleLiteral(sum) => Some((sum / Double::from(count)).into()),
+                EncodedTerm::IntegerLiteral(sum) => {
+                    Some(Decimal::from(sum).checked_div(count)?.into())
+                }
+                EncodedTerm::DecimalLiteral(sum) => Some(sum.checked_div(count)?.into()),
+                _ => None,
+            }
+        }
+    }
+}
+
+#[allow(clippy::option_option)]
+struct MinAccumulator {
+    dataset: Rc<DatasetView>,
+    min: Option<Option<EncodedTerm>>,
+}
+
+impl MinAccumulator {
+    fn new(dataset: Rc<DatasetView>) -> Self {
+        Self { dataset, min: None }
+    }
+}
+
+impl Accumulator for MinAccumulator {
+    fn add(&mut self, element: Option<EncodedTerm>) {
+        if let Some(min) = &self.min {
+            if cmp_terms(&self.dataset, element.as_ref(), min.as_ref()) == Ordering::Less {
+                self.min = Some(element)
+            }
+        } else {
+            self.min = Some(element)
+        }
+    }
+
+    fn state(&self) -> Option<EncodedTerm> {
+        self.min.clone().and_then(|v| v)
+    }
+}
+
+#[allow(clippy::option_option)]
+struct MaxAccumulator {
+    dataset: Rc<DatasetView>,
+    max: Option<Option<EncodedTerm>>,
+}
+
+impl MaxAccumulator {
+    fn new(dataset: Rc<DatasetView>) -> Self {
+        Self { dataset, max: None }
+    }
+}
+
+impl Accumulator for MaxAccumulator {
+    fn add(&mut self, element: Option<EncodedTerm>) {
+        if let Some(max) = &self.max {
+            if cmp_terms(&self.dataset, element.as_ref(), max.as_ref()) == Ordering::Greater {
+                self.max = Some(element)
+            }
+        } else {
+            self.max = Some(element)
+        }
+    }
+
+    fn state(&self) -> Option<EncodedTerm> {
+        self.max.clone().and_then(|v| v)
+    }
+}
+
+#[derive(Default)]
+struct SampleAccumulator {
+    value: Option<EncodedTerm>,
+}
+
+impl Accumulator for SampleAccumulator {
+    fn add(&mut self, element: Option<EncodedTerm>) {
+        if element.is_some() {
+            self.value = element
+        }
+    }
+
+    fn state(&self) -> Option<EncodedTerm> {
+        self.value.clone()
+    }
+}
+
+#[allow(clippy::option_option)]
+struct GroupConcatAccumulator {
+    dataset: Rc<DatasetView>,
+    concat: Option<String>,
+    language: Option<Option<SmallStringOrId>>,
+    separator: Rc<String>,
+}
+
+impl GroupConcatAccumulator {
+    fn new(dataset: Rc<DatasetView>, separator: Rc<String>) -> Self {
+        Self {
+            dataset,
+            concat: Some(String::new()),
+            language: None,
+            separator,
+        }
+    }
+}
+
+impl Accumulator for GroupConcatAccumulator {
+    fn add(&mut self, element: Option<EncodedTerm>) {
+        if let Some(concat) = self.concat.as_mut() {
+            if let Some(element) = element {
+                if let Some((value, e_language)) = to_string_and_language(&self.dataset, &element) {
+                    if let Some(lang) = self.language {
+                        if lang != e_language {
+                            self.language = Some(None)
+                        }
+                        concat.push_str(&self.separator);
+                    } else {
+                        self.language = Some(e_language)
+                    }
+                    concat.push_str(&value);
+                }
+            }
+        }
+    }
+
+    fn state(&self) -> Option<EncodedTerm> {
+        self.concat
+            .as_ref()
+            .map(|result| build_plain_literal(&self.dataset, result, self.language.and_then(|v| v)))
+    }
+}
+
+struct FailingAccumulator;
+
+impl Accumulator for FailingAccumulator {
+    fn add(&mut self, _: Option<EncodedTerm>) {}
+
+    fn state(&self) -> Option<EncodedTerm> {
+        None
+    }
+}
+
+fn encode_variable(variables: &mut Vec<Variable>, variable: &Variable) -> usize {
+    if let Some(key) = slice_key(variables, variable) {
+        key
+    } else {
+        variables.push(variable.clone());
+        variables.len() - 1
+    }
+}
+
+fn bnode_key(blank_nodes: &mut Vec<BlankNode>, blank_node: &BlankNode) -> usize {
+    if let Some(key) = slice_key(blank_nodes, blank_node) {
+        key
+    } else {
+        blank_nodes.push(blank_node.clone());
+        blank_nodes.len() - 1
+    }
+}
+
+fn slice_key<T: Eq>(slice: &[T], element: &T) -> Option<usize> {
+    for (i, item) in slice.iter().enumerate() {
+        if item == element {
+            return Some(i);
+        }
+    }
+    None
+}
+
+fn generate_uuid(buffer: &mut String) {
+    let mut uuid = random::<u128>().to_le_bytes();
+    uuid[6] =
(uuid[6] & 0x0F) | 0x40; + uuid[8] = (uuid[8] & 0x3F) | 0x80; + + write_hexa_bytes(&uuid[0..4], buffer); + buffer.push('-'); + write_hexa_bytes(&uuid[4..6], buffer); + buffer.push('-'); + write_hexa_bytes(&uuid[6..8], buffer); + buffer.push('-'); + write_hexa_bytes(&uuid[8..10], buffer); + buffer.push('-'); + write_hexa_bytes(&uuid[10..16], buffer); +} + +fn write_hexa_bytes(bytes: &[u8], buffer: &mut String) { + for b in bytes { + let high = b / 16; + buffer.push(char::from(if high < 10 { + b'0' + high + } else { + b'a' + (high - 10) + })); + let low = b % 16; + buffer.push(char::from(if low < 10 { + b'0' + low + } else { + b'a' + (low - 10) + })); + } +} + +#[derive(Eq, PartialEq, Clone, Copy)] +enum SmallStringOrId { + Small(SmallString), + Big(StrHash), +} + +impl From for SmallStringOrId { + fn from(value: SmallString) -> Self { + Self::Small(value) + } +} + +impl From for SmallStringOrId { + fn from(value: StrHash) -> Self { + Self::Big(value) + } +} + +pub enum ComparatorFunction { + Asc(Rc Option>), + Desc(Rc Option>), +} + +struct EncodedTupleSet { + key: Vec, + map: HashMap>, + len: usize, +} + +impl EncodedTupleSet { + fn new(key: Vec) -> Self { + Self { + key, + map: HashMap::new(), + len: 0, + } + } + + fn insert(&mut self, tuple: EncodedTuple) { + self.map + .entry(self.tuple_key(&tuple)) + .or_default() + .push(tuple); + self.len += 1; + } + + fn get(&self, tuple: &EncodedTuple) -> &[EncodedTuple] { + self.map.get(&self.tuple_key(tuple)).map_or(&[], |v| v) + } + + fn tuple_key(&self, tuple: &EncodedTuple) -> u64 { + let mut hasher = DefaultHasher::default(); + for v in &self.key { + if let Some(val) = tuple.get(*v) { + val.hash(&mut hasher); + } + } + hasher.finish() + } + + fn len(&self) -> usize { + self.len + } +} + +impl Extend for EncodedTupleSet { + fn extend>(&mut self, iter: T) { + let iter = iter.into_iter(); + self.map.reserve(iter.size_hint().0); + for tuple in iter { + self.insert(tuple); + } + } +} + +struct StatsIterator { + inner: EncodedTuplesIterator, + stats: Rc, +} + +impl Iterator for StatsIterator { + type Item = Result; + + fn next(&mut self) -> Option { + let start = Timer::now(); + let result = self.inner.next(); + self.stats.exec_duration.set( + self.stats + .exec_duration + .get() + .and_then(|stat| stat.checked_add(start.elapsed()?)), + ); + if matches!(result, Some(Ok(_))) { + self.stats.exec_count.set(self.stats.exec_count.get() + 1); + } + result + } +} + +pub struct EvalNodeWithStats { + pub label: String, + pub children: Vec>, + pub exec_count: Cell, + pub exec_duration: Cell>, +} + +impl EvalNodeWithStats { + pub fn json_node( + &self, + writer: &mut ToWriteJsonWriter, + with_stats: bool, + ) -> io::Result<()> { + writer.write_event(JsonEvent::StartObject)?; + writer.write_event(JsonEvent::ObjectKey("name".into()))?; + writer.write_event(JsonEvent::String((&self.label).into()))?; + if with_stats { + writer.write_event(JsonEvent::ObjectKey("number of results".into()))?; + writer.write_event(JsonEvent::Number(self.exec_count.get().to_string().into()))?; + if let Some(duration) = self.exec_duration.get() { + writer.write_event(JsonEvent::ObjectKey("duration in seconds".into()))?; + writer.write_event(JsonEvent::Number(duration.as_seconds().to_string().into()))?; + } + } + writer.write_event(JsonEvent::ObjectKey("children".into()))?; + writer.write_event(JsonEvent::StartArray)?; + for child in &self.children { + child.json_node(writer, with_stats)?; + } + writer.write_event(JsonEvent::EndArray)?; + writer.write_event(JsonEvent::EndObject) + } +} 
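+
+// Illustrative shape of the JSON emitted by `json_node` above when `with_stats` is set:
+// {"name": "...", "number of results": 123, "duration in seconds": 0.45, "children": [...]}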
+ +impl fmt::Debug for EvalNodeWithStats { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut obj = f.debug_struct("Node"); + obj.field("name", &self.label); + if let Some(exec_duration) = self.exec_duration.get() { + obj.field("number of results", &self.exec_count.get()); + obj.field( + "duration in seconds", + &f32::from(Float::from(exec_duration.as_seconds())), + ); + } + if !self.children.is_empty() { + obj.field("children", &self.children); + } + obj.finish() + } +} + +fn eval_node_label(node: &GraphPattern) -> String { + match node { + GraphPattern::Distinct { .. } => "Distinct(Hash)".to_owned(), + GraphPattern::Extend { + expression, + variable, + .. + } => format!( + "Extend({} -> {variable})", + spargebra::algebra::Expression::from(expression) + ), + GraphPattern::Filter { expression, .. } => format!( + "Filter({})", + spargebra::algebra::Expression::from(expression) + ), + GraphPattern::Group { + variables, + aggregates, + .. + } => { + format!( + "Aggregate({})", + format_list(variables.iter().map(ToString::to_string).chain( + aggregates.iter().map(|(v, agg)| format!( + "{} -> {v}", + spargebra::algebra::AggregateExpression::from(agg) + )) + )) + ) + } + GraphPattern::Join { algorithm, .. } => match algorithm { + JoinAlgorithm::HashBuildLeftProbeRight { keys } => format!( + "LeftJoin(HashBuildLeftProbeRight, keys = {})", + format_list(keys) + ), + }, + GraphPattern::Lateral { right, .. } => { + if let GraphPattern::LeftJoin { + left: nested_left, + expression, + .. + } = right.as_ref() + { + if nested_left.is_empty_singleton() { + // We are in a ForLoopLeftJoin + return format!( + "ForLoopLeftJoin(expression = {})", + spargebra::algebra::Expression::from(expression) + ); + } + } + "Lateral".to_owned() + } + GraphPattern::LeftJoin { + algorithm, + expression, + .. + } => match algorithm { + LeftJoinAlgorithm::HashBuildRightProbeLeft { keys } => format!( + "LeftJoin(HashBuildRightProbeLeft, keys = {}, expression = {})", + format_list(keys), + spargebra::algebra::Expression::from(expression) + ), + }, + GraphPattern::Minus { algorithm, .. } => match algorithm { + MinusAlgorithm::HashBuildRightProbeLeft { keys } => format!( + "AntiJoin(HashBuildRightProbeLeft, keys = {})", + format_list(keys) + ), + }, + GraphPattern::OrderBy { expression, .. } => { + format!( + "Sort({})", + format_list( + expression + .iter() + .map(spargebra::algebra::OrderExpression::from) + ) + ) + } + GraphPattern::Path { + subject, + path, + object, + graph_name, + } => { + if let Some(graph_name) = graph_name { + format!("Path({subject} {path} {object} {graph_name})") + } else { + format!("Path({subject} {path} {object})") + } + } + GraphPattern::Project { variables, .. } => { + format!("Project({})", format_list(variables)) + } + GraphPattern::QuadPattern { + subject, + predicate, + object, + graph_name, + } => { + if let Some(graph_name) = graph_name { + format!("QuadPattern({subject} {predicate} {object} {graph_name})") + } else { + format!("QuadPattern({subject} {predicate} {object})") + } + } + GraphPattern::Reduced { .. } => "Reduced".to_owned(), + GraphPattern::Service { name, silent, .. } => { + if *silent { + format!("Service({name}, Silent)") + } else { + format!("Service({name})") + } + } + GraphPattern::Slice { start, length, .. } => { + if let Some(length) = length { + format!("Slice(start = {start}, length = {length})") + } else { + format!("Slice(start = {start})") + } + } + GraphPattern::Union { .. } => "Union".to_owned(), + GraphPattern::Values { variables, .. 
} => { + format!("StaticBindings({})", format_list(variables)) + } + } +} + +fn format_list(values: impl IntoIterator) -> String { + values + .into_iter() + .map(|v| v.to_string()) + .collect::>() + .join(", ") +} + +pub struct Timer { + start: DateTime, +} + +impl Timer { + pub fn now() -> Self { + Self { + start: DateTime::now(), + } + } + + pub fn elapsed(&self) -> Option { + DateTime::now().checked_sub(self.start) + } +} + +#[cfg(test)] +#[allow(clippy::panic_in_result_fn)] +mod tests { + use super::*; + + #[test] + fn uuid() { + let mut buffer = String::default(); + generate_uuid(&mut buffer); + assert!( + Regex::new("^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$") + .unwrap() + .is_match(&buffer), + "{buffer} is not a valid UUID" + ); + } +} diff --git a/src/sparql/http/dummy.rs b/src/sparql/http/dummy.rs new file mode 100644 index 0000000..7b3a551 --- /dev/null +++ b/src/sparql/http/dummy.rs @@ -0,0 +1,34 @@ +//! Simple HTTP client + +use std::io::{Empty, Error, ErrorKind, Result}; +use std::time::Duration; + +pub struct Client; + +impl Client { + pub fn new(_timeout: Option, _redirection_limit: usize) -> Self { + Self + } + + #[allow(clippy::unused_self)] + pub fn get(&self, _url: &str, _accept: &'static str) -> Result<(String, Empty)> { + Err(Error::new( + ErrorKind::Unsupported, + "HTTP client is not available. Enable the feature 'http-client'", + )) + } + + #[allow(clippy::unused_self, clippy::needless_pass_by_value)] + pub fn post( + &self, + _url: &str, + _payload: Vec, + _content_type: &'static str, + _accept: &'static str, + ) -> Result<(String, Empty)> { + Err(Error::new( + ErrorKind::Unsupported, + "HTTP client is not available. Enable the feature 'http-client'", + )) + } +} diff --git a/src/sparql/http/mod.rs b/src/sparql/http/mod.rs new file mode 100644 index 0000000..b309cf5 --- /dev/null +++ b/src/sparql/http/mod.rs @@ -0,0 +1,9 @@ +#[cfg(not(feature = "http-client"))] +mod dummy; +#[cfg(feature = "http-client")] +mod simple; + +#[cfg(not(feature = "http-client"))] +pub use dummy::Client; +#[cfg(feature = "http-client")] +pub use simple::Client; diff --git a/src/sparql/http/simple.rs b/src/sparql/http/simple.rs new file mode 100644 index 0000000..bd81d7c --- /dev/null +++ b/src/sparql/http/simple.rs @@ -0,0 +1,90 @@ +use oxhttp::model::{Body, HeaderName, Method, Request}; +use std::io::{Error, ErrorKind, Result}; +use std::time::Duration; + +pub struct Client { + client: oxhttp::Client, +} + +impl Client { + pub fn new(timeout: Option, redirection_limit: usize) -> Self { + let mut client = oxhttp::Client::new() + .with_redirection_limit(redirection_limit) + .with_user_agent(concat!("Oxigraph/", env!("CARGO_PKG_VERSION"))) + .unwrap(); + if let Some(timeout) = timeout { + client = client.with_global_timeout(timeout); + } + Self { client } + } + + pub fn get(&self, url: &str, accept: &'static str) -> Result<(String, Body)> { + let request = Request::builder(Method::GET, url.parse().map_err(invalid_input_error)?) + .with_header(HeaderName::ACCEPT, accept) + .map_err(invalid_input_error)? + .build(); + let response = self.client.request(request)?; + let status = response.status(); + if !status.is_successful() { + return Err(Error::new( + ErrorKind::Other, + format!( + "Error {} returned by {} with payload:\n{}", + status, + url, + response.into_body().to_string()? + ), + )); + } + let content_type = response + .header(&HeaderName::CONTENT_TYPE) + .ok_or_else(|| invalid_data_error(format!("No Content-Type returned by {url}")))? 
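+            // the Content-Type header value must be valid UTF-8 to be interpreted
+            // as a media type; anything else is reported as invalid data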
+ .to_str() + .map_err(invalid_data_error)? + .to_owned(); + Ok((content_type, response.into_body())) + } + + pub fn post( + &self, + url: &str, + payload: Vec, + content_type: &'static str, + accept: &'static str, + ) -> Result<(String, Body)> { + let request = Request::builder(Method::POST, url.parse().map_err(invalid_input_error)?) + .with_header(HeaderName::ACCEPT, accept) + .map_err(invalid_input_error)? + .with_header(HeaderName::CONTENT_TYPE, content_type) + .map_err(invalid_input_error)? + .with_body(payload); + let response = self.client.request(request)?; + let status = response.status(); + if !status.is_successful() { + return Err(Error::new( + ErrorKind::Other, + format!( + "Error {} returned by {} with payload:\n{}", + status, + url, + response.into_body().to_string()? + ), + )); + } + let content_type = response + .header(&HeaderName::CONTENT_TYPE) + .ok_or_else(|| invalid_data_error(format!("No Content-Type returned by {url}")))? + .to_str() + .map_err(invalid_data_error)? + .to_owned(); + Ok((content_type, response.into_body())) + } +} + +fn invalid_data_error(error: impl Into>) -> Error { + Error::new(ErrorKind::InvalidData, error) +} + +fn invalid_input_error(error: impl Into>) -> Error { + Error::new(ErrorKind::InvalidInput, error) +} diff --git a/src/sparql/mod.rs b/src/sparql/mod.rs new file mode 100644 index 0000000..089f84b --- /dev/null +++ b/src/sparql/mod.rs @@ -0,0 +1,328 @@ +//! [SPARQL](https://www.w3.org/TR/sparql11-overview/) implementation. +//! +//! Stores execute SPARQL. See [`Store`](crate::store::Store::query()) for an example. + +mod algebra; +mod dataset; +mod error; +mod eval; +mod http; +mod model; +pub mod results; +mod service; +mod update; + +use crate::model::{NamedNode, Term}; +pub use crate::sparql::algebra::{Query, QueryDataset, Update}; +use crate::sparql::dataset::DatasetView; +pub use crate::sparql::error::EvaluationError; +use crate::sparql::eval::{EvalNodeWithStats, SimpleEvaluator, Timer}; +pub use crate::sparql::model::{QueryResults, QuerySolution, QuerySolutionIter, QueryTripleIter}; +pub use crate::sparql::service::ServiceHandler; +use crate::sparql::service::{EmptyServiceHandler, ErrorConversionServiceHandler}; +pub(crate) use crate::sparql::update::evaluate_update; +use crate::storage::StorageReader; +use json_event_parser::{JsonEvent, ToWriteJsonWriter}; +pub use oxrdf::{Variable, VariableNameParseError}; +use oxsdatatypes::{DayTimeDuration, Float}; +pub use spargebra::SparqlSyntaxError; +use sparopt::algebra::GraphPattern; +use sparopt::Optimizer; +use std::collections::HashMap; +use std::rc::Rc; +use std::sync::Arc; +use std::time::Duration; +use std::{fmt, io}; + +#[allow(clippy::needless_pass_by_value)] +pub(crate) fn evaluate_query( + reader: StorageReader, + query: impl TryInto>, + options: QueryOptions, + run_stats: bool, +) -> Result<(Result, QueryExplanation), EvaluationError> { + let query = query.try_into().map_err(Into::into)?; + let dataset = DatasetView::new(reader, &query.dataset); + let start_planning = Timer::now(); + let (results, plan_node_with_stats, planning_duration) = match query.inner { + spargebra::Query::Select { + pattern, base_iri, .. 
+        } => {
+            let mut pattern = GraphPattern::from(&pattern);
+            if !options.without_optimizations {
+                pattern = Optimizer::optimize_graph_pattern(pattern);
+            }
+            let planning_duration = start_planning.elapsed();
+            let (results, explanation) = SimpleEvaluator::new(
+                Rc::new(dataset),
+                base_iri.map(Rc::new),
+                options.service_handler(),
+                Arc::new(options.custom_functions),
+                run_stats,
+            )
+            .evaluate_select(&pattern);
+            (Ok(results), explanation, planning_duration)
+        }
+        spargebra::Query::Ask {
+            pattern, base_iri, ..
+        } => {
+            let mut pattern = GraphPattern::from(&pattern);
+            if !options.without_optimizations {
+                pattern = Optimizer::optimize_graph_pattern(GraphPattern::Reduced {
+                    inner: Box::new(pattern),
+                });
+            }
+            let planning_duration = start_planning.elapsed();
+            let (results, explanation) = SimpleEvaluator::new(
+                Rc::new(dataset),
+                base_iri.map(Rc::new),
+                options.service_handler(),
+                Arc::new(options.custom_functions),
+                run_stats,
+            )
+            .evaluate_ask(&pattern);
+            (results, explanation, planning_duration)
+        }
+        spargebra::Query::Construct {
+            template,
+            pattern,
+            base_iri,
+            ..
+        } => {
+            let mut pattern = GraphPattern::from(&pattern);
+            if !options.without_optimizations {
+                pattern = Optimizer::optimize_graph_pattern(GraphPattern::Reduced {
+                    inner: Box::new(pattern),
+                });
+            }
+            let planning_duration = start_planning.elapsed();
+            let (results, explanation) = SimpleEvaluator::new(
+                Rc::new(dataset),
+                base_iri.map(Rc::new),
+                options.service_handler(),
+                Arc::new(options.custom_functions),
+                run_stats,
+            )
+            .evaluate_construct(&pattern, &template);
+            (Ok(results), explanation, planning_duration)
+        }
+        spargebra::Query::Describe {
+            pattern, base_iri, ..
+        } => {
+            let mut pattern = GraphPattern::from(&pattern);
+            if !options.without_optimizations {
+                pattern = Optimizer::optimize_graph_pattern(GraphPattern::Reduced {
+                    inner: Box::new(pattern),
+                });
+            }
+            let planning_duration = start_planning.elapsed();
+            let (results, explanation) = SimpleEvaluator::new(
+                Rc::new(dataset),
+                base_iri.map(Rc::new),
+                options.service_handler(),
+                Arc::new(options.custom_functions),
+                run_stats,
+            )
+            .evaluate_describe(&pattern);
+            (Ok(results), explanation, planning_duration)
+        }
+    };
+    let explanation = QueryExplanation {
+        inner: plan_node_with_stats,
+        with_stats: run_stats,
+        parsing_duration: query.parsing_duration,
+        planning_duration,
+    };
+    Ok((results, explanation))
+}
+
+/// Options for SPARQL query evaluation.
+///
+/// If the `"http-client"` optional feature is enabled,
+/// a simple HTTP 1.1 client is used to execute [SPARQL 1.1 Federated Query](https://www.w3.org/TR/sparql11-federated-query/) SERVICE calls.
+///
+/// Usage example disabling the federated query support:
+/// ```
+/// use oxigraph::sparql::QueryOptions;
+/// use oxigraph::store::Store;
+///
+/// let store = Store::new()?;
+/// store.query_opt(
+///     "SELECT * WHERE { SERVICE <https://query.wikidata.org/sparql> {} }",
+///     QueryOptions::default().without_service_handler(),
+/// )?;
+/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
+/// ```
+#[derive(Clone, Default)]
+pub struct QueryOptions {
+    service_handler: Option<Arc<dyn ServiceHandler<Error = EvaluationError>>>,
+    custom_functions: CustomFunctionRegistry,
+    http_timeout: Option<Duration>,
+    http_redirection_limit: usize,
+    without_optimizations: bool,
+}
+
+pub(crate) type CustomFunctionRegistry =
+    HashMap<NamedNode, Arc<dyn (Fn(&[Term]) -> Option<Term>) + Send + Sync>>;
+
+impl QueryOptions {
+    /// Use a given [`ServiceHandler`] to execute [SPARQL 1.1 Federated Query](https://www.w3.org/TR/sparql11-federated-query/) SERVICE calls.
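+    ///
+    /// See the [`ServiceHandler`] documentation for a complete example implementation.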
+    #[inline]
+    #[must_use]
+    pub fn with_service_handler(mut self, service_handler: impl ServiceHandler + 'static) -> Self {
+        self.service_handler = Some(Arc::new(ErrorConversionServiceHandler::wrap(
+            service_handler,
+        )));
+        self
+    }
+
+    /// Disables `SERVICE` calls: evaluating them fails with an error.
+    #[inline]
+    #[must_use]
+    pub fn without_service_handler(mut self) -> Self {
+        self.service_handler = Some(Arc::new(EmptyServiceHandler));
+        self
+    }
+
+    /// Sets a timeout for HTTP requests done during SPARQL evaluation.
+    #[cfg(feature = "http-client")]
+    #[inline]
+    #[must_use]
+    pub fn with_http_timeout(mut self, timeout: Duration) -> Self {
+        self.http_timeout = Some(timeout);
+        self
+    }
+
+    /// Sets an upper bound on the number of HTTP redirections followed per HTTP request during SPARQL evaluation.
+    ///
+    /// By default this value is `0`.
+    #[cfg(feature = "http-client")]
+    #[inline]
+    #[must_use]
+    pub fn with_http_redirection_limit(mut self, redirection_limit: usize) -> Self {
+        self.http_redirection_limit = redirection_limit;
+        self
+    }
+
+    /// Adds a custom SPARQL evaluation function.
+    ///
+    /// Example with a function serializing terms to N-Triples:
+    /// ```
+    /// use oxigraph::model::*;
+    /// use oxigraph::sparql::{QueryOptions, QueryResults};
+    /// use oxigraph::store::Store;
+    ///
+    /// let store = Store::new()?;
+    ///
+    /// if let QueryResults::Solutions(mut solutions) = store.query_opt(
+    ///     "SELECT (<http://www.w3.org/ns/formats/N-Triples>(1) AS ?nt) WHERE {}",
+    ///     QueryOptions::default().with_custom_function(
+    ///         NamedNode::new("http://www.w3.org/ns/formats/N-Triples")?,
+    ///         |args| args.get(0).map(|t| Literal::from(t.to_string()).into()),
+    ///     ),
+    /// )? {
+    ///     assert_eq!(
+    ///         solutions.next().unwrap()?.get("nt"),
+    ///         Some(&Literal::from("\"1\"^^<http://www.w3.org/2001/XMLSchema#integer>").into())
+    ///     );
+    /// }
+    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
+    /// ```
+    #[inline]
+    #[must_use]
+    pub fn with_custom_function(
+        mut self,
+        name: NamedNode,
+        evaluator: impl Fn(&[Term]) -> Option<Term> + Send + Sync + 'static,
+    ) -> Self {
+        self.custom_functions.insert(name, Arc::new(evaluator));
+        self
+    }
+
+    fn service_handler(&self) -> Arc<dyn ServiceHandler<Error = EvaluationError>> {
+        self.service_handler.clone().unwrap_or_else(|| {
+            if cfg!(feature = "http-client") {
+                Arc::new(service::SimpleServiceHandler::new(
+                    self.http_timeout,
+                    self.http_redirection_limit,
+                ))
+            } else {
+                Arc::new(EmptyServiceHandler)
+            }
+        })
+    }
+
+    #[doc(hidden)]
+    #[inline]
+    #[must_use]
+    pub fn without_optimizations(mut self) -> Self {
+        self.without_optimizations = true;
+        self
+    }
+}
+
+/// Options for SPARQL update evaluation.
+#[derive(Clone, Default)]
+pub struct UpdateOptions {
+    query_options: QueryOptions,
+}
+
+impl From<QueryOptions> for UpdateOptions {
+    #[inline]
+    fn from(query_options: QueryOptions) -> Self {
+        Self { query_options }
+    }
+}
+
+/// The explanation of a query.
+#[derive(Clone)]
+pub struct QueryExplanation {
+    inner: Rc<EvalNodeWithStats>,
+    with_stats: bool,
+    parsing_duration: Option<DayTimeDuration>,
+    planning_duration: Option<DayTimeDuration>,
+}
+
+impl QueryExplanation {
+    /// Writes the explanation as JSON.
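+    ///
+    /// A minimal usage sketch (assuming an `explanation` value obtained from a
+    /// query explanation API, which lives outside this module):
+    /// ```ignore
+    /// let mut buffer = Vec::new();
+    /// explanation.write_in_json(&mut buffer)?;
+    /// // buffer now holds {"parsing duration in seconds": ..., "planning duration in seconds": ..., "plan": {...}}
+    /// ```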
+ pub fn write_in_json(&self, write: impl io::Write) -> io::Result<()> { + let mut writer = ToWriteJsonWriter::new(write); + writer.write_event(JsonEvent::StartObject)?; + if let Some(parsing_duration) = self.parsing_duration { + writer.write_event(JsonEvent::ObjectKey("parsing duration in seconds".into()))?; + writer.write_event(JsonEvent::Number( + parsing_duration.as_seconds().to_string().into(), + ))?; + } + if let Some(planning_duration) = self.planning_duration { + writer.write_event(JsonEvent::ObjectKey("planning duration in seconds".into()))?; + writer.write_event(JsonEvent::Number( + planning_duration.as_seconds().to_string().into(), + ))?; + } + writer.write_event(JsonEvent::ObjectKey("plan".into()))?; + self.inner.json_node(&mut writer, self.with_stats)?; + writer.write_event(JsonEvent::EndObject) + } +} + +impl fmt::Debug for QueryExplanation { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut obj = f.debug_struct("QueryExplanation"); + if let Some(parsing_duration) = self.parsing_duration { + obj.field( + "parsing duration in seconds", + &f32::from(Float::from(parsing_duration.as_seconds())), + ); + } + if let Some(planning_duration) = self.planning_duration { + obj.field( + "planning duration in seconds", + &f32::from(Float::from(planning_duration.as_seconds())), + ); + } + obj.field("tree", &self.inner); + obj.finish_non_exhaustive() + } +} diff --git a/src/sparql/model.rs b/src/sparql/model.rs new file mode 100644 index 0000000..0fca83e --- /dev/null +++ b/src/sparql/model.rs @@ -0,0 +1,371 @@ +use crate::io::{RdfFormat, RdfSerializer}; +use crate::model::*; +use crate::sparql::error::EvaluationError; +use crate::sparql::results::{ + FromReadQueryResultsReader, FromReadSolutionsReader, QueryResultsFormat, + QueryResultsParseError, QueryResultsParser, QueryResultsSerializer, +}; +pub use sparesults::QuerySolution; +use std::io::{Read, Write}; +use std::sync::Arc; + +/// Results of a [SPARQL query](https://www.w3.org/TR/sparql11-query/). +pub enum QueryResults { + /// Results of a [SELECT](https://www.w3.org/TR/sparql11-query/#select) query. + Solutions(QuerySolutionIter), + /// Result of a [ASK](https://www.w3.org/TR/sparql11-query/#ask) query. + Boolean(bool), + /// Results of a [CONSTRUCT](https://www.w3.org/TR/sparql11-query/#construct) or [DESCRIBE](https://www.w3.org/TR/sparql11-query/#describe) query. + Graph(QueryTripleIter), +} + +impl QueryResults { + /// Reads a SPARQL query results serialization. + pub fn read( + read: impl Read + 'static, + format: QueryResultsFormat, + ) -> Result { + Ok(QueryResultsParser::from_format(format) + .parse_read(read)? + .into()) + } + + /// Writes the query results (solutions or boolean). + /// + /// This method fails if it is called on the `Graph` results. 
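+    /// Use [`Self::write_graph`] instead if the results come from a `CONSTRUCT` or `DESCRIBE` query.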
+    ///
+    /// ```
+    /// use oxigraph::store::Store;
+    /// use oxigraph::model::*;
+    /// use oxigraph::sparql::results::QueryResultsFormat;
+    ///
+    /// let store = Store::new()?;
+    /// let ex = NamedNodeRef::new("http://example.com")?;
+    /// store.insert(QuadRef::new(ex, ex, ex, GraphNameRef::DefaultGraph))?;
+    ///
+    /// let results = store.query("SELECT ?s WHERE { ?s ?p ?o }")?;
+    /// assert_eq!(
+    ///     results.write(Vec::new(), QueryResultsFormat::Json)?,
+    ///     r#"{"head":{"vars":["s"]},"results":{"bindings":[{"s":{"type":"uri","value":"http://example.com"}}]}}"#.as_bytes()
+    /// );
+    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
+    /// ```
+    pub fn write<W: Write>(
+        self,
+        write: W,
+        format: QueryResultsFormat,
+    ) -> Result<W, EvaluationError> {
+        let serializer = QueryResultsSerializer::from_format(format);
+        match self {
+            Self::Boolean(value) => serializer.serialize_boolean_to_write(write, value),
+            Self::Solutions(solutions) => {
+                let mut writer = serializer
+                    .serialize_solutions_to_write(write, solutions.variables().to_vec())
+                    .map_err(EvaluationError::ResultsSerialization)?;
+                for solution in solutions {
+                    writer
+                        .write(&solution?)
+                        .map_err(EvaluationError::ResultsSerialization)?;
+                }
+                writer.finish()
+            }
+            Self::Graph(triples) => {
+                let s = VariableRef::new_unchecked("subject");
+                let p = VariableRef::new_unchecked("predicate");
+                let o = VariableRef::new_unchecked("object");
+                let mut writer = serializer
+                    .serialize_solutions_to_write(
+                        write,
+                        vec![s.into_owned(), p.into_owned(), o.into_owned()],
+                    )
+                    .map_err(EvaluationError::ResultsSerialization)?;
+                for triple in triples {
+                    let triple = triple?;
+                    writer
+                        .write([
+                            (s, &triple.subject.into()),
+                            (p, &triple.predicate.into()),
+                            (o, &triple.object),
+                        ])
+                        .map_err(EvaluationError::ResultsSerialization)?;
+                }
+                writer.finish()
+            }
+        }
+        .map_err(EvaluationError::ResultsSerialization)
+    }
+
+    /// Writes the graph query results.
+    ///
+    /// This method fails if it is called on the `Solution` or `Boolean` results.
+    ///
+    /// ```
+    /// use oxigraph::io::RdfFormat;
+    /// use oxigraph::model::*;
+    /// use oxigraph::store::Store;
+    ///
+    /// let graph = "<http://example.com> <http://example.com> <http://example.com> .\n";
+    ///
+    /// let store = Store::new()?;
+    /// store.load_graph(
+    ///     graph.as_bytes(),
+    ///     RdfFormat::NTriples,
+    ///     GraphName::DefaultGraph,
+    ///     None,
+    /// )?;
+    ///
+    /// let results = store.query("CONSTRUCT WHERE { ?s ?p ?o }")?;
+    /// assert_eq!(
+    ///     results.write_graph(Vec::new(), RdfFormat::NTriples)?,
+    ///     graph.as_bytes()
+    /// );
+    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
+    /// ```
+    pub fn write_graph<W: Write>(
+        self,
+        write: W,
+        format: impl Into<RdfFormat>,
+    ) -> Result<W, EvaluationError> {
+        if let Self::Graph(triples) = self {
+            let mut writer = RdfSerializer::from_format(format.into()).serialize_to_write(write);
+            for triple in triples {
+                writer
+                    .write_triple(&triple?)
+                    .map_err(EvaluationError::ResultsSerialization)?;
+            }
+            writer
+                .finish()
+                .map_err(EvaluationError::ResultsSerialization)
+        } else {
+            Err(EvaluationError::NotAGraph)
+        }
+    }
+}
+
+impl From<QuerySolutionIter> for QueryResults {
+    #[inline]
+    fn from(value: QuerySolutionIter) -> Self {
+        Self::Solutions(value)
+    }
+}
+
+impl<R: Read + 'static> From<FromReadQueryResultsReader<R>> for QueryResults {
+    fn from(reader: FromReadQueryResultsReader<R>) -> Self {
+        match reader {
+            FromReadQueryResultsReader::Solutions(s) => Self::Solutions(s.into()),
+            FromReadQueryResultsReader::Boolean(v) => Self::Boolean(v),
+        }
+    }
+}
+
+/// An iterator over [`QuerySolution`]s.
+/// +/// ``` +/// use oxigraph::sparql::QueryResults; +/// use oxigraph::store::Store; +/// +/// let store = Store::new()?; +/// if let QueryResults::Solutions(solutions) = store.query("SELECT ?s WHERE { ?s ?p ?o }")? { +/// for solution in solutions { +/// println!("{:?}", solution?.get("s")); +/// } +/// } +/// # Result::<_,Box>::Ok(()) +/// ``` +pub struct QuerySolutionIter { + variables: Arc<[Variable]>, + iter: Box>>, +} + +impl QuerySolutionIter { + /// Construct a new iterator of solution from an ordered list of solution variables and an iterator of solution tuples + /// (each tuple using the same ordering as the variable list such that tuple element 0 is the value for the variable 0...) + pub fn new( + variables: Arc<[Variable]>, + iter: impl Iterator>, EvaluationError>> + 'static, + ) -> Self { + Self { + variables: Arc::clone(&variables), + iter: Box::new( + iter.map(move |t| t.map(|values| (Arc::clone(&variables), values).into())), + ), + } + } + + /// The variables used in the solutions. + /// + /// ``` + /// use oxigraph::sparql::{QueryResults, Variable}; + /// use oxigraph::store::Store; + /// + /// let store = Store::new()?; + /// if let QueryResults::Solutions(solutions) = store.query("SELECT ?s ?o WHERE { ?s ?p ?o }")? { + /// assert_eq!( + /// solutions.variables(), + /// &[Variable::new("s")?, Variable::new("o")?] + /// ); + /// } + /// # Result::<_,Box>::Ok(()) + /// ``` + #[inline] + pub fn variables(&self) -> &[Variable] { + &self.variables + } +} + +impl From> for QuerySolutionIter { + fn from(reader: FromReadSolutionsReader) -> Self { + Self { + variables: reader.variables().into(), + iter: Box::new(reader.map(|t| t.map_err(EvaluationError::from))), + } + } +} + +impl Iterator for QuerySolutionIter { + type Item = Result; + + #[inline] + fn next(&mut self) -> Option { + self.iter.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } +} + +/// An iterator over the triples that compose a graph solution. +/// +/// ``` +/// use oxigraph::sparql::QueryResults; +/// use oxigraph::store::Store; +/// +/// let store = Store::new()?; +/// if let QueryResults::Graph(triples) = store.query("CONSTRUCT WHERE { ?s ?p ?o }")? 
{ +/// for triple in triples { +/// println!("{}", triple?); +/// } +/// } +/// # Result::<_,Box>::Ok(()) +/// ``` +pub struct QueryTripleIter { + pub(crate) iter: Box>>, +} + +impl Iterator for QueryTripleIter { + type Item = Result; + + #[inline] + fn next(&mut self) -> Option { + self.iter.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } + + #[inline] + fn fold(self, init: Acc, g: G) -> Acc + where + G: FnMut(Acc, Self::Item) -> Acc, + { + self.iter.fold(init, g) + } +} + +#[cfg(test)] +#[allow(clippy::panic_in_result_fn)] +mod tests { + use super::*; + use std::io::Cursor; + + #[test] + fn test_serialization_roundtrip() -> Result<(), EvaluationError> { + use std::str; + + for format in [ + QueryResultsFormat::Json, + QueryResultsFormat::Xml, + QueryResultsFormat::Tsv, + ] { + let results = vec![ + QueryResults::Boolean(true), + QueryResults::Boolean(false), + QueryResults::Solutions(QuerySolutionIter::new( + [ + Variable::new_unchecked("foo"), + Variable::new_unchecked("bar"), + ] + .as_ref() + .into(), + Box::new( + vec![ + Ok(vec![None, None]), + Ok(vec![ + Some(NamedNode::new_unchecked("http://example.com").into()), + None, + ]), + Ok(vec![ + None, + Some(NamedNode::new_unchecked("http://example.com").into()), + ]), + Ok(vec![ + Some(BlankNode::new_unchecked("foo").into()), + Some(BlankNode::new_unchecked("bar").into()), + ]), + Ok(vec![Some(Literal::new_simple_literal("foo").into()), None]), + Ok(vec![ + Some( + Literal::new_language_tagged_literal_unchecked("foo", "fr") + .into(), + ), + None, + ]), + Ok(vec![ + Some(Literal::from(1).into()), + Some(Literal::from(true).into()), + ]), + Ok(vec![ + Some(Literal::from(1.33).into()), + Some(Literal::from(false).into()), + ]), + Ok(vec![ + Some( + Triple::new( + NamedNode::new_unchecked("http://example.com/s"), + NamedNode::new_unchecked("http://example.com/p"), + Triple::new( + NamedNode::new_unchecked("http://example.com/os"), + NamedNode::new_unchecked("http://example.com/op"), + NamedNode::new_unchecked("http://example.com/oo"), + ), + ) + .into(), + ), + None, + ]), + ] + .into_iter(), + ), + )), + ]; + + for ex in results { + let mut buffer = Vec::new(); + ex.write(&mut buffer, format)?; + let ex2 = QueryResults::read(Cursor::new(buffer.clone()), format)?; + let mut buffer2 = Vec::new(); + ex2.write(&mut buffer2, format)?; + assert_eq!( + str::from_utf8(&buffer).unwrap(), + str::from_utf8(&buffer2).unwrap() + ); + } + } + + Ok(()) + } +} diff --git a/src/sparql/results.rs b/src/sparql/results.rs new file mode 100644 index 0000000..00f8cc3 --- /dev/null +++ b/src/sparql/results.rs @@ -0,0 +1,44 @@ +//! Utilities to read and write RDF results formats using [sparesults](https://crates.io/crates/sparesults). +//! +//! It supports [SPARQL Query Results XML Format (Second Edition)](https://www.w3.org/TR/rdf-sparql-XMLres/), [SPARQL 1.1 Query Results JSON Format](https://www.w3.org/TR/sparql11-results-json/) and [SPARQL 1.1 Query Results CSV and TSV Formats](https://www.w3.org/TR/sparql11-results-csv-tsv/). +//! +//! Usage example converting a JSON result file into a TSV result file: +//! +//! ``` +//! use oxigraph::sparql::results::{QueryResultsFormat, QueryResultsParser, FromReadQueryResultsReader, QueryResultsSerializer}; +//! use std::io::Result; +//! +//! fn convert_json_to_tsv(json_file: &[u8]) -> Result> { +//! let json_parser = QueryResultsParser::from_format(QueryResultsFormat::Json); +//! let tsv_serializer = QueryResultsSerializer::from_format(QueryResultsFormat::Tsv); +//! 
// We start to read the JSON file and see which kind of results it is
+//!     match json_parser.parse_read(json_file)? {
+//!         FromReadQueryResultsReader::Boolean(value) => {
+//!             // it's a boolean result, we copy it in TSV to the output buffer
+//!             tsv_serializer.serialize_boolean_to_write(Vec::new(), value)
+//!         }
+//!         FromReadQueryResultsReader::Solutions(solutions_reader) => {
+//!             // it's a set of solutions, we create a writer and we write to it while reading in streaming from the JSON file
+//!             let mut serialize_solutions_to_write = tsv_serializer.serialize_solutions_to_write(Vec::new(), solutions_reader.variables().to_vec())?;
+//!             for solution in solutions_reader {
+//!                 serialize_solutions_to_write.write(&solution?)?;
+//!             }
+//!             serialize_solutions_to_write.finish()
+//!         }
+//!     }
+//! }
+//!
+//! // Let's test with a boolean
+//! assert_eq!(
+//!     convert_json_to_tsv(br#"{"boolean":true}"#.as_slice()).unwrap(),
+//!     b"true"
+//! );
+//!
+//! // And with a set of solutions
+//! assert_eq!(
+//!     convert_json_to_tsv(br#"{"head":{"vars":["foo","bar"]},"results":{"bindings":[{"foo":{"type":"literal","value":"test"}}]}}"#.as_slice()).unwrap(),
+//!     b"?foo\t?bar\n\"test\"\t\n"
+//! );
+//! ```
+
+pub use sparesults::*;
diff --git a/src/sparql/service.rs b/src/sparql/service.rs
new file mode 100644
index 0000000..e3dd560
--- /dev/null
+++ b/src/sparql/service.rs
@@ -0,0 +1,124 @@
+use crate::model::NamedNode;
+use crate::sparql::algebra::Query;
+use crate::sparql::error::EvaluationError;
+use crate::sparql::http::Client;
+use crate::sparql::model::QueryResults;
+use crate::sparql::results::QueryResultsFormat;
+use std::error::Error;
+use std::time::Duration;
+
+/// Handler for [SPARQL 1.1 Federated Query](https://www.w3.org/TR/sparql11-federated-query/) SERVICE.
+///
+/// Should be given to [`QueryOptions`](super::QueryOptions::with_service_handler())
+/// before evaluating a SPARQL query that uses SERVICE calls.
+///
+/// ```
+/// use oxigraph::model::*;
+/// use oxigraph::sparql::{EvaluationError, Query, QueryOptions, QueryResults, ServiceHandler};
+/// use oxigraph::store::Store;
+///
+/// struct TestServiceHandler {
+///     store: Store,
+/// }
+///
+/// impl ServiceHandler for TestServiceHandler {
+///     type Error = EvaluationError;
+///
+///     fn handle(
+///         &self,
+///         service_name: NamedNode,
+///         query: Query,
+///     ) -> Result<QueryResults, EvaluationError> {
+///         if service_name == "http://example.com/service" {
+///             self.store.query(query)
+///         } else {
+///             panic!()
+///         }
+///     }
+/// }
+///
+/// let store = Store::new()?;
+/// let service = TestServiceHandler {
+///     store: Store::new()?,
+/// };
+/// let ex = NamedNodeRef::new("http://example.com")?;
+/// service
+///     .store
+///     .insert(QuadRef::new(ex, ex, ex, GraphNameRef::DefaultGraph))?;
+///
+/// if let QueryResults::Solutions(mut solutions) = store.query_opt(
+///     "SELECT ?s WHERE { SERVICE <http://example.com/service> { ?s ?p ?o } }",
+///     QueryOptions::default().with_service_handler(service),
+/// )? {
+///     assert_eq!(solutions.next().unwrap()?.get("s"), Some(&ex.into()));
+/// }
+/// # Result::<_,Box<dyn std::error::Error>>::Ok(())
+/// ```
+pub trait ServiceHandler: Send + Sync {
+    /// The service evaluation error.
+    type Error: Error + Send + Sync + 'static;
+
+    /// Evaluates a [`Query`] against a given service identified by a [`NamedNode`].
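+    ///
+    /// Errors returned by the handler are wrapped into [`EvaluationError::Service`]
+    /// before being surfaced to the caller (see `ErrorConversionServiceHandler` below).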
+ fn handle(&self, service_name: NamedNode, query: Query) -> Result; +} + +pub struct EmptyServiceHandler; + +impl ServiceHandler for EmptyServiceHandler { + type Error = EvaluationError; + + fn handle(&self, name: NamedNode, _: Query) -> Result { + Err(EvaluationError::UnsupportedService(name)) + } +} + +pub struct ErrorConversionServiceHandler { + handler: S, +} + +impl ErrorConversionServiceHandler { + pub fn wrap(handler: S) -> Self { + Self { handler } + } +} + +impl ServiceHandler for ErrorConversionServiceHandler { + type Error = EvaluationError; + + fn handle(&self, service_name: NamedNode, query: Query) -> Result { + self.handler + .handle(service_name, query) + .map_err(|e| EvaluationError::Service(Box::new(e))) + } +} + +pub struct SimpleServiceHandler { + client: Client, +} + +impl SimpleServiceHandler { + pub fn new(http_timeout: Option, http_redirection_limit: usize) -> Self { + Self { + client: Client::new(http_timeout, http_redirection_limit), + } + } +} + +impl ServiceHandler for SimpleServiceHandler { + type Error = EvaluationError; + + fn handle(&self, service_name: NamedNode, query: Query) -> Result { + let (content_type, body) = self + .client + .post( + service_name.as_str(), + query.to_string().into_bytes(), + "application/sparql-query", + "application/sparql-results+json, application/sparql-results+xml", + ) + .map_err(|e| EvaluationError::Service(Box::new(e)))?; + let format = QueryResultsFormat::from_media_type(&content_type) + .ok_or_else(|| EvaluationError::UnsupportedContentType(content_type))?; + Ok(QueryResults::read(body, format)?) + } +} diff --git a/src/sparql/update.rs b/src/sparql/update.rs new file mode 100644 index 0000000..967de82 --- /dev/null +++ b/src/sparql/update.rs @@ -0,0 +1,565 @@ +use crate::io::{RdfFormat, RdfParser}; +use crate::model::{GraphName as OxGraphName, GraphNameRef, Quad as OxQuad}; +use crate::sparql::algebra::QueryDataset; +use crate::sparql::dataset::DatasetView; +use crate::sparql::eval::{EncodedTuple, SimpleEvaluator}; +use crate::sparql::http::Client; +use crate::sparql::{EvaluationError, Update, UpdateOptions}; +use crate::storage::numeric_encoder::{Decoder, EncodedTerm}; +use crate::storage::StorageWriter; +use oxiri::Iri; +use spargebra::algebra::{GraphPattern, GraphTarget}; +use spargebra::term::{ + BlankNode, GraphName, GraphNamePattern, GroundQuad, GroundQuadPattern, GroundSubject, + GroundTerm, GroundTermPattern, GroundTriple, GroundTriplePattern, NamedNode, NamedNodePattern, + Quad, QuadPattern, Subject, Term, TermPattern, Triple, TriplePattern, Variable, +}; +use spargebra::GraphUpdateOperation; +use sparopt::Optimizer; +use std::collections::HashMap; +use std::io; +use std::rc::Rc; +use std::sync::Arc; + +pub fn evaluate_update<'a, 'b: 'a>( + transaction: &'a mut StorageWriter<'b>, + update: &Update, + options: &UpdateOptions, +) -> Result<(), EvaluationError> { + SimpleUpdateEvaluator { + transaction, + base_iri: update.inner.base_iri.clone().map(Rc::new), + options: options.clone(), + client: Client::new( + options.query_options.http_timeout, + options.query_options.http_redirection_limit, + ), + } + .eval_all(&update.inner.operations, &update.using_datasets) +} + +struct SimpleUpdateEvaluator<'a, 'b> { + transaction: &'a mut StorageWriter<'b>, + base_iri: Option>>, + options: UpdateOptions, + client: Client, +} + +impl<'a, 'b: 'a> SimpleUpdateEvaluator<'a, 'b> { + fn eval_all( + &mut self, + updates: &[GraphUpdateOperation], + using_datasets: &[Option], + ) -> Result<(), EvaluationError> { + for (update, 
using_dataset) in updates.iter().zip(using_datasets) { + self.eval(update, using_dataset)?; + } + Ok(()) + } + + fn eval( + &mut self, + update: &GraphUpdateOperation, + using_dataset: &Option, + ) -> Result<(), EvaluationError> { + match update { + GraphUpdateOperation::InsertData { data } => self.eval_insert_data(data), + GraphUpdateOperation::DeleteData { data } => self.eval_delete_data(data), + GraphUpdateOperation::DeleteInsert { + delete, + insert, + pattern, + .. + } => self.eval_delete_insert( + delete, + insert, + using_dataset.as_ref().unwrap_or(&QueryDataset::new()), + pattern, + ), + GraphUpdateOperation::Load { + silent, + source, + destination, + } => { + if let Err(error) = self.eval_load(source, destination) { + if *silent { + Ok(()) + } else { + Err(error) + } + } else { + Ok(()) + } + } + GraphUpdateOperation::Clear { graph, silent } => self.eval_clear(graph, *silent), + GraphUpdateOperation::Create { graph, silent } => self.eval_create(graph, *silent), + GraphUpdateOperation::Drop { graph, silent } => self.eval_drop(graph, *silent), + } + } + + fn eval_insert_data(&mut self, data: &[Quad]) -> Result<(), EvaluationError> { + let mut bnodes = HashMap::new(); + for quad in data { + let quad = Self::convert_quad(quad, &mut bnodes); + self.transaction.insert(quad.as_ref())?; + } + Ok(()) + } + + fn eval_delete_data(&mut self, data: &[GroundQuad]) -> Result<(), EvaluationError> { + for quad in data { + let quad = Self::convert_ground_quad(quad); + self.transaction.remove(quad.as_ref())?; + } + Ok(()) + } + + fn eval_delete_insert( + &mut self, + delete: &[GroundQuadPattern], + insert: &[QuadPattern], + using: &QueryDataset, + algebra: &GraphPattern, + ) -> Result<(), EvaluationError> { + let dataset = Rc::new(DatasetView::new(self.transaction.reader(), using)); + let mut pattern = sparopt::algebra::GraphPattern::from(algebra); + if !self.options.query_options.without_optimizations { + pattern = Optimizer::optimize_graph_pattern(sparopt::algebra::GraphPattern::Reduced { + inner: Box::new(pattern), + }); + } + let evaluator = SimpleEvaluator::new( + Rc::clone(&dataset), + self.base_iri.clone(), + self.options.query_options.service_handler(), + Arc::new(self.options.query_options.custom_functions.clone()), + false, + ); + let mut variables = Vec::new(); + let mut bnodes = HashMap::new(); + let (eval, _) = evaluator.graph_pattern_evaluator(&pattern, &mut variables); + let tuples = + eval(EncodedTuple::with_capacity(variables.len())).collect::, _>>()?; // TODO: would be much better to stream + for tuple in tuples { + for quad in delete { + if let Some(quad) = + Self::convert_ground_quad_pattern(quad, &variables, &tuple, &dataset)? + { + self.transaction.remove(quad.as_ref())?; + } + } + for quad in insert { + if let Some(quad) = + Self::convert_quad_pattern(quad, &variables, &tuple, &dataset, &mut bnodes)? 
+ { + self.transaction.insert(quad.as_ref())?; + } + } + bnodes.clear(); + } + Ok(()) + } + + fn eval_load(&mut self, from: &NamedNode, to: &GraphName) -> Result<(), EvaluationError> { + let (content_type, body) = self + .client + .get( + from.as_str(), + "application/n-triples, text/turtle, application/rdf+xml", + ) + .map_err(|e| EvaluationError::Service(Box::new(e)))?; + let format = RdfFormat::from_media_type(&content_type) + .ok_or_else(|| EvaluationError::UnsupportedContentType(content_type))?; + let to_graph_name = match to { + GraphName::NamedNode(graph_name) => graph_name.into(), + GraphName::DefaultGraph => GraphNameRef::DefaultGraph, + }; + let mut parser = RdfParser::from_format(format) + .rename_blank_nodes() + .without_named_graphs() + .with_default_graph(to_graph_name); + parser = parser.with_base_iri(from.as_str()).map_err(|e| { + EvaluationError::Service(Box::new(io::Error::new( + io::ErrorKind::InvalidInput, + format!("Invalid URL: {from}: {e}"), + ))) + })?; + for q in parser.parse_read(body) { + self.transaction.insert(q?.as_ref())?; + } + Ok(()) + } + + fn eval_create(&mut self, graph_name: &NamedNode, silent: bool) -> Result<(), EvaluationError> { + if self.transaction.insert_named_graph(graph_name.into())? || silent { + Ok(()) + } else { + Err(EvaluationError::GraphAlreadyExists(graph_name.clone())) + } + } + + fn eval_clear(&mut self, graph: &GraphTarget, silent: bool) -> Result<(), EvaluationError> { + match graph { + GraphTarget::NamedNode(graph_name) => { + if self + .transaction + .reader() + .contains_named_graph(&graph_name.as_ref().into())? + { + Ok(self.transaction.clear_graph(graph_name.into())?) + } else if silent { + Ok(()) + } else { + Err(EvaluationError::GraphDoesNotExist(graph_name.clone())) + } + } + GraphTarget::DefaultGraph => { + self.transaction.clear_graph(GraphNameRef::DefaultGraph)?; + Ok(()) + } + GraphTarget::NamedGraphs => Ok(self.transaction.clear_all_named_graphs()?), + GraphTarget::AllGraphs => Ok(self.transaction.clear_all_graphs()?), + } + } + + fn eval_drop(&mut self, graph: &GraphTarget, silent: bool) -> Result<(), EvaluationError> { + match graph { + GraphTarget::NamedNode(graph_name) => { + if self.transaction.remove_named_graph(graph_name.into())? || silent { + Ok(()) + } else { + Err(EvaluationError::GraphDoesNotExist(graph_name.clone())) + } + } + GraphTarget::DefaultGraph => { + Ok(self.transaction.clear_graph(GraphNameRef::DefaultGraph)?) 
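+                // DROP DEFAULT degenerates to CLEAR DEFAULT: the default graph always
+                // exists, so only its triples can be removed (cf. SPARQL 1.1 Update).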
+ } + GraphTarget::NamedGraphs => Ok(self.transaction.remove_all_named_graphs()?), + GraphTarget::AllGraphs => Ok(self.transaction.clear()?), + } + } + + fn convert_quad(quad: &Quad, bnodes: &mut HashMap) -> OxQuad { + OxQuad { + subject: match &quad.subject { + Subject::NamedNode(subject) => subject.clone().into(), + Subject::BlankNode(subject) => Self::convert_blank_node(subject, bnodes).into(), + Subject::Triple(subject) => Self::convert_triple(subject, bnodes).into(), + }, + predicate: quad.predicate.clone(), + object: match &quad.object { + Term::NamedNode(object) => object.clone().into(), + Term::BlankNode(object) => Self::convert_blank_node(object, bnodes).into(), + Term::Literal(object) => object.clone().into(), + Term::Triple(subject) => Self::convert_triple(subject, bnodes).into(), + }, + graph_name: match &quad.graph_name { + GraphName::NamedNode(graph_name) => graph_name.clone().into(), + GraphName::DefaultGraph => OxGraphName::DefaultGraph, + }, + } + } + + fn convert_triple(triple: &Triple, bnodes: &mut HashMap) -> Triple { + Triple { + subject: match &triple.subject { + Subject::NamedNode(subject) => subject.clone().into(), + Subject::BlankNode(subject) => Self::convert_blank_node(subject, bnodes).into(), + Subject::Triple(subject) => Self::convert_triple(subject, bnodes).into(), + }, + predicate: triple.predicate.clone(), + object: match &triple.object { + Term::NamedNode(object) => object.clone().into(), + Term::BlankNode(object) => Self::convert_blank_node(object, bnodes).into(), + Term::Literal(object) => object.clone().into(), + Term::Triple(subject) => Self::convert_triple(subject, bnodes).into(), + }, + } + } + + fn convert_blank_node( + node: &BlankNode, + bnodes: &mut HashMap, + ) -> BlankNode { + bnodes.entry(node.clone()).or_default().clone() + } + + fn convert_ground_quad(quad: &GroundQuad) -> OxQuad { + OxQuad { + subject: match &quad.subject { + GroundSubject::NamedNode(subject) => subject.clone().into(), + GroundSubject::Triple(subject) => Self::convert_ground_triple(subject).into(), + }, + predicate: quad.predicate.clone(), + object: match &quad.object { + GroundTerm::NamedNode(object) => object.clone().into(), + GroundTerm::Literal(object) => object.clone().into(), + GroundTerm::Triple(subject) => Self::convert_ground_triple(subject).into(), + }, + graph_name: match &quad.graph_name { + GraphName::NamedNode(graph_name) => graph_name.clone().into(), + GraphName::DefaultGraph => OxGraphName::DefaultGraph, + }, + } + } + + fn convert_ground_triple(triple: &GroundTriple) -> Triple { + Triple { + subject: match &triple.subject { + GroundSubject::NamedNode(subject) => subject.clone().into(), + GroundSubject::Triple(subject) => Self::convert_ground_triple(subject).into(), + }, + predicate: triple.predicate.clone(), + object: match &triple.object { + GroundTerm::NamedNode(object) => object.clone().into(), + GroundTerm::Literal(object) => object.clone().into(), + GroundTerm::Triple(subject) => Self::convert_ground_triple(subject).into(), + }, + } + } + + fn convert_quad_pattern( + quad: &QuadPattern, + variables: &[Variable], + values: &EncodedTuple, + dataset: &DatasetView, + bnodes: &mut HashMap, + ) -> Result, EvaluationError> { + Ok(Some(OxQuad { + subject: match Self::convert_term_or_var( + &quad.subject, + variables, + values, + dataset, + bnodes, + )? 
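+            // A subject must be an IRI, a blank node or a quoted triple; if the
+            // bound term is a literal, the match below yields `Ok(None)`.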
{ + Some(Term::NamedNode(node)) => node.into(), + Some(Term::BlankNode(node)) => node.into(), + Some(Term::Triple(triple)) => triple.into(), + Some(Term::Literal(_)) | None => return Ok(None), + }, + predicate: if let Some(predicate) = + Self::convert_named_node_or_var(&quad.predicate, variables, values, dataset)? + { + predicate + } else { + return Ok(None); + }, + object: if let Some(object) = + Self::convert_term_or_var(&quad.object, variables, values, dataset, bnodes)? + { + object + } else { + return Ok(None); + }, + graph_name: if let Some(graph_name) = + Self::convert_graph_name_or_var(&quad.graph_name, variables, values, dataset)? + { + graph_name + } else { + return Ok(None); + }, + })) + } + + fn convert_term_or_var( + term: &TermPattern, + variables: &[Variable], + values: &EncodedTuple, + dataset: &DatasetView, + bnodes: &mut HashMap, + ) -> Result, EvaluationError> { + Ok(match term { + TermPattern::NamedNode(term) => Some(term.clone().into()), + TermPattern::BlankNode(bnode) => Some(Self::convert_blank_node(bnode, bnodes).into()), + TermPattern::Literal(term) => Some(term.clone().into()), + TermPattern::Triple(triple) => { + Self::convert_triple_pattern(triple, variables, values, dataset, bnodes)? + .map(Into::into) + } + TermPattern::Variable(v) => Self::lookup_variable(v, variables, values) + .map(|node| dataset.decode_term(&node)) + .transpose()?, + }) + } + + fn convert_named_node_or_var( + term: &NamedNodePattern, + variables: &[Variable], + values: &EncodedTuple, + dataset: &DatasetView, + ) -> Result, EvaluationError> { + Ok(match term { + NamedNodePattern::NamedNode(term) => Some(term.clone()), + NamedNodePattern::Variable(v) => Self::lookup_variable(v, variables, values) + .map(|node| dataset.decode_named_node(&node)) + .transpose()?, + }) + } + + fn convert_graph_name_or_var( + term: &GraphNamePattern, + variables: &[Variable], + values: &EncodedTuple, + dataset: &DatasetView, + ) -> Result, EvaluationError> { + match term { + GraphNamePattern::NamedNode(term) => Ok(Some(term.clone().into())), + GraphNamePattern::DefaultGraph => Ok(Some(OxGraphName::DefaultGraph)), + GraphNamePattern::Variable(v) => Self::lookup_variable(v, variables, values) + .map(|node| { + Ok(if node == EncodedTerm::DefaultGraph { + OxGraphName::DefaultGraph + } else { + dataset.decode_named_node(&node)?.into() + }) + }) + .transpose(), + } + } + + fn convert_triple_pattern( + triple: &TriplePattern, + variables: &[Variable], + values: &EncodedTuple, + dataset: &DatasetView, + bnodes: &mut HashMap, + ) -> Result, EvaluationError> { + Ok(Some(Triple { + subject: match Self::convert_term_or_var( + &triple.subject, + variables, + values, + dataset, + bnodes, + )? { + Some(Term::NamedNode(node)) => node.into(), + Some(Term::BlankNode(node)) => node.into(), + Some(Term::Triple(triple)) => triple.into(), + Some(Term::Literal(_)) | None => return Ok(None), + }, + predicate: if let Some(predicate) = + Self::convert_named_node_or_var(&triple.predicate, variables, values, dataset)? + { + predicate + } else { + return Ok(None); + }, + object: if let Some(object) = + Self::convert_term_or_var(&triple.object, variables, values, dataset, bnodes)? + { + object + } else { + return Ok(None); + }, + })) + } + + fn convert_ground_quad_pattern( + quad: &GroundQuadPattern, + variables: &[Variable], + values: &EncodedTuple, + dataset: &DatasetView, + ) -> Result, EvaluationError> { + Ok(Some(OxQuad { + subject: match Self::convert_ground_term_or_var( + &quad.subject, + variables, + values, + dataset, + )? 
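+            // DELETE templates are "ground": SPARQL 1.1 Update forbids blank nodes
+            // there, hence no blank-node mapping is threaded through this variant.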
{ + Some(Term::NamedNode(node)) => node.into(), + Some(Term::BlankNode(node)) => node.into(), + Some(Term::Triple(triple)) => triple.into(), + Some(Term::Literal(_)) | None => return Ok(None), + }, + predicate: if let Some(predicate) = + Self::convert_named_node_or_var(&quad.predicate, variables, values, dataset)? + { + predicate + } else { + return Ok(None); + }, + object: if let Some(object) = + Self::convert_ground_term_or_var(&quad.object, variables, values, dataset)? + { + object + } else { + return Ok(None); + }, + graph_name: if let Some(graph_name) = + Self::convert_graph_name_or_var(&quad.graph_name, variables, values, dataset)? + { + graph_name + } else { + return Ok(None); + }, + })) + } + + fn convert_ground_term_or_var( + term: &GroundTermPattern, + variables: &[Variable], + values: &EncodedTuple, + dataset: &DatasetView, + ) -> Result, EvaluationError> { + Ok(match term { + GroundTermPattern::NamedNode(term) => Some(term.clone().into()), + GroundTermPattern::Literal(term) => Some(term.clone().into()), + GroundTermPattern::Triple(triple) => { + Self::convert_ground_triple_pattern(triple, variables, values, dataset)? + .map(Into::into) + } + GroundTermPattern::Variable(v) => Self::lookup_variable(v, variables, values) + .map(|node| dataset.decode_term(&node)) + .transpose()?, + }) + } + + fn convert_ground_triple_pattern( + triple: &GroundTriplePattern, + variables: &[Variable], + values: &EncodedTuple, + dataset: &DatasetView, + ) -> Result, EvaluationError> { + Ok(Some(Triple { + subject: match Self::convert_ground_term_or_var( + &triple.subject, + variables, + values, + dataset, + )? { + Some(Term::NamedNode(node)) => node.into(), + Some(Term::BlankNode(node)) => node.into(), + Some(Term::Triple(triple)) => triple.into(), + Some(Term::Literal(_)) | None => return Ok(None), + }, + predicate: if let Some(predicate) = + Self::convert_named_node_or_var(&triple.predicate, variables, values, dataset)? + { + predicate + } else { + return Ok(None); + }, + object: if let Some(object) = + Self::convert_ground_term_or_var(&triple.object, variables, values, dataset)? + { + object + } else { + return Ok(None); + }, + })) + } + + fn lookup_variable( + v: &Variable, + variables: &[Variable], + values: &EncodedTuple, + ) -> Option { + variables + .iter() + .position(|v2| v == v2) + .and_then(|i| values.get(i)) + .cloned() + } +} diff --git a/src/storage/backend/fallback.rs b/src/storage/backend/fallback.rs new file mode 100644 index 0000000..7214851 --- /dev/null +++ b/src/storage/backend/fallback.rs @@ -0,0 +1,310 @@ +//! TODO: This storage is dramatically naive. + +use crate::storage::StorageError; +use crate::store::CorruptionError; +use std::cell::RefCell; +use std::collections::{BTreeMap, HashMap}; +use std::error::Error; +use std::mem::transmute; +use std::rc::{Rc, Weak}; +use std::sync::{Arc, RwLock, RwLockWriteGuard}; + +pub struct ColumnFamilyDefinition { + pub name: &'static str, + pub use_iter: bool, + pub min_prefix_size: usize, + pub unordered_writes: bool, +} + +#[derive(Clone)] +pub struct Db(Arc, Vec>>>>); + +impl Db { + #[allow(clippy::unnecessary_wraps)] + pub fn new(column_families: Vec) -> Result { + let mut trees = HashMap::new(); + for cf in column_families { + trees.insert(ColumnFamily(cf.name), BTreeMap::default()); + } + trees.entry(ColumnFamily("default")).or_default(); // We make sure that "default" key exists. 
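+        // Mirrors RocksDB, which always exposes a "default" column family.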
+ Ok(Self(Arc::new(RwLock::new(trees)))) + } + + #[allow(clippy::unwrap_in_result)] + pub fn column_family(&self, name: &'static str) -> Result { + let column_family = ColumnFamily(name); + if self.0.read().unwrap().contains_key(&column_family) { + Ok(column_family) + } else { + Err(CorruptionError::from_missing_column_family_name(name).into()) + } + } + + #[must_use] + pub fn snapshot(&self) -> Reader { + Reader(InnerReader::Simple(Arc::clone(&self.0))) + } + + #[allow(clippy::unwrap_in_result)] + pub fn transaction<'a, 'b: 'a, T, E: Error + 'static + From>( + &'b self, + f: impl Fn(Transaction<'a>) -> Result, + ) -> Result { + f(Transaction(Rc::new(RefCell::new(self.0.write().unwrap())))) + } +} + +#[derive(Clone, Ord, PartialOrd, Eq, PartialEq, Hash)] +pub struct ColumnFamily(&'static str); + +pub struct Reader(InnerReader); + +enum InnerReader { + Simple(Arc, Vec>>>>), + Transaction( + Weak, Vec>>>>>, + ), +} + +impl Reader { + #[allow(clippy::unwrap_in_result)] + pub fn get( + &self, + column_family: &ColumnFamily, + key: &[u8], + ) -> Result>, StorageError> { + match &self.0 { + InnerReader::Simple(reader) => Ok(reader + .read() + .unwrap() + .get(column_family) + .and_then(|cf| cf.get(key).cloned())), + InnerReader::Transaction(reader) => { + if let Some(reader) = reader.upgrade() { + Ok((*reader) + .borrow() + .get(column_family) + .and_then(|cf| cf.get(key).cloned())) + } else { + Err(StorageError::Other( + "The transaction is already ended".into(), + )) + } + } + } + } + + #[allow(clippy::unwrap_in_result)] + pub fn contains_key( + &self, + column_family: &ColumnFamily, + key: &[u8], + ) -> Result { + match &self.0 { + InnerReader::Simple(reader) => Ok(reader + .read() + .unwrap() + .get(column_family) + .map_or(false, |cf| cf.contains_key(key))), + InnerReader::Transaction(reader) => { + if let Some(reader) = reader.upgrade() { + Ok((*reader) + .borrow() + .get(column_family) + .map_or(false, |cf| cf.contains_key(key))) + } else { + Err(StorageError::Other( + "The transaction is already ended".into(), + )) + } + } + } + } + + #[allow(clippy::iter_not_returning_iterator)] + pub fn iter(&self, column_family: &ColumnFamily) -> Result { + self.scan_prefix(column_family, &[]) + } + + #[allow(clippy::unwrap_in_result)] + pub fn scan_prefix( + &self, + column_family: &ColumnFamily, + prefix: &[u8], + ) -> Result { + let data: Vec<_> = match &self.0 { + InnerReader::Simple(reader) => { + let trees = reader.read().unwrap(); + let Some(tree) = trees.get(column_family) else { + return Ok(Iter { + iter: Vec::new().into_iter(), + current: None, + }); + }; + if prefix.is_empty() { + tree.iter().map(|(k, v)| (k.clone(), v.clone())).collect() + } else { + tree.range(prefix.to_vec()..) + .take_while(|(k, _)| k.starts_with(prefix)) + .map(|(k, v)| (k.clone(), v.clone())) + .collect() + } + } + InnerReader::Transaction(reader) => { + let Some(reader) = reader.upgrade() else { + return Err(StorageError::Other( + "The transaction is already ended".into(), + )); + }; + let trees = (*reader).borrow(); + let Some(tree) = trees.get(column_family) else { + return Ok(Iter { + iter: Vec::new().into_iter(), + current: None, + }); + }; + if prefix.is_empty() { + tree.iter().map(|(k, v)| (k.clone(), v.clone())).collect() + } else { + tree.range(prefix.to_vec()..) 
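+                    // Range scan starting at the prefix; BTreeMap ordering guarantees
+                    // all matching keys are contiguous, so we stop at the first miss.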
+ .take_while(|(k, _)| k.starts_with(prefix)) + .map(|(k, v)| (k.clone(), v.clone())) + .collect() + } + } + }; + let mut iter = data.into_iter(); + let current = iter.next(); + Ok(Iter { iter, current }) + } + + #[allow(clippy::unwrap_in_result)] + pub fn len(&self, column_family: &ColumnFamily) -> Result { + match &self.0 { + InnerReader::Simple(reader) => Ok(reader + .read() + .unwrap() + .get(column_family) + .map_or(0, BTreeMap::len)), + InnerReader::Transaction(reader) => { + if let Some(reader) = reader.upgrade() { + Ok((*reader) + .borrow() + .get(column_family) + .map_or(0, BTreeMap::len)) + } else { + Err(StorageError::Other( + "The transaction is already ended".into(), + )) + } + } + } + } + + #[allow(clippy::unwrap_in_result)] + pub fn is_empty(&self, column_family: &ColumnFamily) -> Result { + match &self.0 { + InnerReader::Simple(reader) => Ok(reader + .read() + .unwrap() + .get(column_family) + .map_or(true, BTreeMap::is_empty)), + InnerReader::Transaction(reader) => { + if let Some(reader) = reader.upgrade() { + Ok((*reader) + .borrow() + .get(column_family) + .map_or(true, BTreeMap::is_empty)) + } else { + Err(StorageError::Other( + "The transaction is already ended".into(), + )) + } + } + } + } +} + +pub struct Transaction<'a>( + Rc, Vec>>>>>, +); + +impl Transaction<'_> { + #[allow(unsafe_code, clippy::useless_transmute)] + pub fn reader(&self) -> Reader { + // SAFETY: This transmute is safe because we take a weak reference and the only Rc reference used is guarded by the lifetime. + Reader(InnerReader::Transaction(Rc::downgrade(unsafe { + transmute(&self.0) + }))) + } + + #[allow(clippy::unnecessary_wraps)] + pub fn contains_key_for_update( + &self, + column_family: &ColumnFamily, + key: &[u8], + ) -> Result { + Ok((*self.0) + .borrow() + .get(column_family) + .map_or(false, |cf| cf.contains_key(key))) + } + + #[allow(clippy::unnecessary_wraps, clippy::unwrap_in_result)] + pub fn insert( + &mut self, + column_family: &ColumnFamily, + key: &[u8], + value: &[u8], + ) -> Result<(), StorageError> { + self.0 + .borrow_mut() + .get_mut(column_family) + .unwrap() + .insert(key.into(), value.into()); + Ok(()) + } + + pub fn insert_empty( + &mut self, + column_family: &ColumnFamily, + key: &[u8], + ) -> Result<(), StorageError> { + self.insert(column_family, key, &[]) + } + + #[allow(clippy::unnecessary_wraps, clippy::unwrap_in_result)] + pub fn remove(&mut self, column_family: &ColumnFamily, key: &[u8]) -> Result<(), StorageError> { + self.0 + .borrow_mut() + .get_mut(column_family) + .unwrap() + .remove(key); + Ok(()) + } +} + +pub struct Iter { + iter: std::vec::IntoIter<(Vec, Vec)>, + current: Option<(Vec, Vec)>, +} + +impl Iter { + pub fn key(&self) -> Option<&[u8]> { + Some(&self.current.as_ref()?.0) + } + + #[allow(dead_code)] + pub fn value(&self) -> Option<&[u8]> { + Some(&self.current.as_ref()?.1) + } + + pub fn next(&mut self) { + self.current = self.iter.next(); + } + + #[allow(clippy::unnecessary_wraps, clippy::unused_self)] + pub fn status(&self) -> Result<(), StorageError> { + Ok(()) + } +} diff --git a/src/storage/backend/mod.rs b/src/storage/backend/mod.rs new file mode 100644 index 0000000..b94eb65 --- /dev/null +++ b/src/storage/backend/mod.rs @@ -0,0 +1,12 @@ +//! A storage backend +//! 
RocksDB on native targets, a simple in-memory fallback on WASM
+
+#[cfg(target_family = "wasm")]
+pub use fallback::{ColumnFamily, ColumnFamilyDefinition, Db, Iter, Reader, Transaction};
+#[cfg(not(target_family = "wasm"))]
+pub use oxi_rocksdb::{ColumnFamily, ColumnFamilyDefinition, Db, Iter, Reader, Transaction};
+
+#[cfg(target_family = "wasm")]
+mod fallback;
+#[cfg(not(target_family = "wasm"))]
+mod oxi_rocksdb;
diff --git a/src/storage/backend/oxi_rocksdb.rs b/src/storage/backend/oxi_rocksdb.rs
new file mode 100644
index 0000000..1019d97
--- /dev/null
+++ b/src/storage/backend/oxi_rocksdb.rs
@@ -0,0 +1,1445 @@
+//! Code inspired by [Rust RocksDB](https://github.com/rust-rocksdb/rust-rocksdb) under Apache License 2.0.
+
+#![allow(
+    unsafe_code,
+    trivial_casts,
+    clippy::undocumented_unsafe_blocks,
+    clippy::panic_in_result_fn,
+    clippy::unwrap_in_result
+)]
+
+use crate::storage::error::{CorruptionError, StorageError};
+use libc::{c_char, c_void};
+use rand::random;
+use rocksdb::ffi::*;
+use std::borrow::Borrow;
+#[cfg(unix)]
+use std::cmp::min;
+use std::collections::HashMap;
+use std::env::temp_dir;
+use std::error::Error;
+use std::ffi::{CStr, CString};
+use std::fs::remove_dir_all;
+use std::marker::PhantomData;
+use std::ops::Deref;
+use std::path::{Path, PathBuf};
+use std::rc::{Rc, Weak};
+use std::sync::{Arc, OnceLock};
+use std::thread::{available_parallelism, yield_now};
+use std::{fmt, io, ptr, slice};
+
+pub fn opt_bytes_to_ptr<T: AsRef<[u8]>>(opt: Option<T>) -> *const c_char {
+    match opt {
+        Some(v) => v.as_ref().as_ptr() as *const c_char,
+        None => ptr::null(),
+    }
+}
+
+macro_rules! ffi_result {
+    ( $($function:ident)::*( $arg1:expr $(, $arg:expr)* $(,)? ) ) => {{
+        let mut status = rocksdb_status_t {
+            code: rocksdb_status_code_t_rocksdb_status_code_ok,
+            subcode: rocksdb_status_subcode_t_rocksdb_status_subcode_none,
+            severity: rocksdb_status_severity_t_rocksdb_status_severity_none,
+            string: ptr::null()
+        };
+        let result = $($function)::*($arg1 $(, $arg)* , &mut status);
+        if status.code == rocksdb_status_code_t_rocksdb_status_code_ok {
+            Ok(result)
+        } else {
+            Err(ErrorStatus(status))
+        }
+    }}
+}
+
+pub struct ColumnFamilyDefinition {
+    pub name: &'static str,
+    pub use_iter: bool,
+    pub min_prefix_size: usize,
+    pub unordered_writes: bool,
+}
+
+#[derive(Clone)]
+pub struct Db {
+    inner: DbKind,
+}
+
+#[derive(Clone)]
+enum DbKind {
+    ReadOnly(Arc<RoDbHandler>),
+    ReadWrite(Arc<RwDbHandler>),
+}
+
+struct RwDbHandler {
+    db: *mut rocksdb_transactiondb_t,
+    options: *mut rocksdb_options_t,
+    transaction_options: *mut rocksdb_transaction_options_t,
+    transactiondb_options: *mut rocksdb_transactiondb_options_t,
+    read_options: *mut rocksdb_readoptions_t,
+    write_options: *mut rocksdb_writeoptions_t,
+    flush_options: *mut rocksdb_flushoptions_t,
+    env_options: *mut rocksdb_envoptions_t,
+    ingest_external_file_options: *mut rocksdb_ingestexternalfileoptions_t,
+    compaction_options: *mut rocksdb_compactoptions_t,
+    block_based_table_options: *mut rocksdb_block_based_table_options_t,
+    column_family_names: Vec<&'static str>,
+    cf_handles: Vec<*mut rocksdb_column_family_handle_t>,
+    cf_options: Vec<*mut rocksdb_options_t>,
+    in_memory: bool,
+    path: PathBuf,
+}
+
+unsafe impl Send for RwDbHandler {}
+
+unsafe impl Sync for RwDbHandler {}
+
+impl Drop for RwDbHandler {
+    fn drop(&mut self) {
+        unsafe {
+            // Destruction order matters: free the column family handles, then
+            // close the DB itself, then destroy the option objects it used.
+            for cf_handle in &self.cf_handles {
+                rocksdb_column_family_handle_destroy(*cf_handle);
+            }
+            rocksdb_transactiondb_close(self.db);
+            for cf_option in &self.cf_options {
+                rocksdb_options_destroy(*cf_option);
+            }
+
rocksdb_readoptions_destroy(self.read_options); + rocksdb_writeoptions_destroy(self.write_options); + rocksdb_flushoptions_destroy(self.flush_options); + rocksdb_envoptions_destroy(self.env_options); + rocksdb_ingestexternalfileoptions_destroy(self.ingest_external_file_options); + rocksdb_compactoptions_destroy(self.compaction_options); + rocksdb_transaction_options_destroy(self.transaction_options); + rocksdb_transactiondb_options_destroy(self.transactiondb_options); + rocksdb_options_destroy(self.options); + rocksdb_block_based_options_destroy(self.block_based_table_options); + } + if self.in_memory { + drop(remove_dir_all(&self.path)); + } + } +} + +struct RoDbHandler { + db: *mut rocksdb_t, + options: *mut rocksdb_options_t, + read_options: *mut rocksdb_readoptions_t, + column_family_names: Vec<&'static str>, + cf_handles: Vec<*mut rocksdb_column_family_handle_t>, + cf_options: Vec<*mut rocksdb_options_t>, + is_secondary: bool, + path_to_remove: Option, +} + +unsafe impl Send for RoDbHandler {} + +unsafe impl Sync for RoDbHandler {} + +impl Drop for RoDbHandler { + fn drop(&mut self) { + unsafe { + for cf_handle in &self.cf_handles { + rocksdb_column_family_handle_destroy(*cf_handle); + } + rocksdb_close(self.db); + for cf_option in &self.cf_options { + rocksdb_options_destroy(*cf_option); + } + rocksdb_readoptions_destroy(self.read_options); + rocksdb_options_destroy(self.options); + } + if let Some(path) = &self.path_to_remove { + drop(remove_dir_all(path)); + } + } +} + +impl Db { + pub fn new(column_families: Vec) -> Result { + Self::open_read_write(None, column_families, None) + } + + pub fn open_read_write( + path: Option<&Path>, + column_families: Vec, + key: Option<[u8; 32]>, + ) -> Result { + let (path, in_memory) = if let Some(path) = path { + (path.to_path_buf(), false) + } else { + (tmp_path(), true) + }; + let c_path = path_to_cstring(&path)?; + unsafe { + let options = Self::db_options(true, in_memory, key)?; + rocksdb_options_set_create_if_missing(options, 1); + rocksdb_options_set_create_missing_column_families(options, 1); + rocksdb_options_set_compression( + options, + if in_memory { + rocksdb_no_compression + } else { + rocksdb_lz4_compression + } + .try_into() + .unwrap(), + ); + let block_based_table_options = rocksdb_block_based_options_create(); + assert!( + !block_based_table_options.is_null(), + "rocksdb_block_based_options_create returned null" + ); + rocksdb_block_based_options_set_format_version(block_based_table_options, 5); + rocksdb_block_based_options_set_index_block_restart_interval( + block_based_table_options, + 16, + ); + rocksdb_options_set_block_based_table_factory(options, block_based_table_options); + #[cfg(feature = "rocksdb-debug")] + { + rocksdb_options_set_info_log_level(options, 0); + rocksdb_options_enable_statistics(options); + rocksdb_options_set_stats_dump_period_sec(options, 60); + } + + let (column_family_names, c_column_family_names, cf_options) = + Self::column_families_names_and_options(column_families, options); + let mut cf_handles: Vec<*mut rocksdb_column_family_handle_t> = + vec![ptr::null_mut(); column_family_names.len()]; + let c_num_column_families = c_column_family_names.len().try_into().unwrap(); + + let transactiondb_options = rocksdb_transactiondb_options_create(); + assert!( + !transactiondb_options.is_null(), + "rocksdb_transactiondb_options_create returned null" + ); + + let db = ffi_result!(rocksdb_transactiondb_open_column_families_with_status( + options, + transactiondb_options, + c_path.as_ptr(), + 
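+                // The names, options and handles arrays are index-aligned; the opened
+                // column family handles are written back into `cf_handles`.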
c_num_column_families, + c_column_family_names + .iter() + .map(|cf| cf.as_ptr()) + .collect::>() + .as_ptr(), + cf_options.as_ptr().cast(), + cf_handles.as_mut_ptr(), + )) + .map_err(|e| { + rocksdb_transactiondb_options_destroy(transactiondb_options); + for cf_option in &cf_options { + rocksdb_options_destroy(*cf_option); + } + rocksdb_options_destroy(options); + rocksdb_block_based_options_destroy(block_based_table_options); + e + })?; + assert!(!db.is_null(), "rocksdb_create returned null"); + for handle in &cf_handles { + assert!( + !handle.is_null(), + "rocksdb_readoptions_create returned a null column family" + ); + } + + let read_options = rocksdb_readoptions_create(); + assert!( + !read_options.is_null(), + "rocksdb_readoptions_create returned null" + ); + + let write_options = rocksdb_writeoptions_create(); + assert!( + !write_options.is_null(), + "rocksdb_writeoptions_create returned null" + ); + rocksdb_writeoptions_set_sync(write_options, 1); + if in_memory { + rocksdb_writeoptions_disable_WAL(write_options, 1); // No need for WAL + } + + let transaction_options = rocksdb_transaction_options_create(); + assert!( + !transaction_options.is_null(), + "rocksdb_transaction_options_create returned null" + ); + rocksdb_transaction_options_set_set_snapshot(transaction_options, 1); + + let flush_options = rocksdb_flushoptions_create(); + assert!( + !flush_options.is_null(), + "rocksdb_flushoptions_create returned null" + ); + + let env_options = rocksdb_envoptions_create(); + assert!( + !env_options.is_null(), + "rocksdb_envoptions_create returned null" + ); + + let ingest_external_file_options = rocksdb_ingestexternalfileoptions_create(); + assert!( + !ingest_external_file_options.is_null(), + "rocksdb_ingestexternalfileoptions_create returned null" + ); + + let compaction_options = rocksdb_compactoptions_create(); + assert!( + !compaction_options.is_null(), + "rocksdb_compactoptions_create returned null" + ); + + Ok(Self { + inner: DbKind::ReadWrite(Arc::new(RwDbHandler { + db, + options, + transaction_options, + transactiondb_options, + read_options, + write_options, + flush_options, + env_options, + ingest_external_file_options, + compaction_options, + block_based_table_options, + column_family_names, + cf_handles, + cf_options, + in_memory, + path, + })), + }) + } + } + + // pub fn open_secondary( + // primary_path: &Path, + // secondary_path: Option<&Path>, + // column_families: Vec, + // ) -> Result { + // let c_primary_path = path_to_cstring(primary_path)?; + // let (secondary_path, in_memory) = if let Some(path) = secondary_path { + // (path.to_path_buf(), false) + // } else { + // (tmp_path(), true) + // }; + // let c_secondary_path = path_to_cstring(&secondary_path)?; + // unsafe { + // let options = Self::db_options(false, false)?; + // let (column_family_names, c_column_family_names, cf_options) = + // Self::column_families_names_and_options(column_families, options); + // let mut cf_handles: Vec<*mut rocksdb_column_family_handle_t> = + // vec![ptr::null_mut(); column_family_names.len()]; + // let c_num_column_families = c_column_family_names.len().try_into().unwrap(); + // let db = ffi_result!(rocksdb_open_as_secondary_column_families_with_status( + // options, + // c_primary_path.as_ptr(), + // c_secondary_path.as_ptr(), + // c_num_column_families, + // c_column_family_names + // .iter() + // .map(|cf| cf.as_ptr()) + // .collect::>() + // .as_ptr(), + // cf_options.as_ptr().cast(), + // cf_handles.as_mut_ptr(), + // )) + // .map_err(|e| { + // for cf_option in 
&cf_options { + // rocksdb_options_destroy(*cf_option); + // } + // rocksdb_options_destroy(options); + // e + // })?; + // assert!( + // !db.is_null(), + // "rocksdb_open_for_read_only_column_families_with_status returned null" + // ); + // for handle in &cf_handles { + // assert!( + // !handle.is_null(), + // "rocksdb_open_for_read_only_column_families_with_status returned a null column family" + // ); + // } + // let read_options = rocksdb_readoptions_create(); + // assert!( + // !read_options.is_null(), + // "rocksdb_readoptions_create returned null" + // ); + // Ok(Self { + // inner: DbKind::ReadOnly(Arc::new(RoDbHandler { + // db, + // options, + // read_options, + // column_family_names, + // cf_handles, + // cf_options, + // is_secondary: true, + // path_to_remove: in_memory.then_some(secondary_path), + // })), + // }) + // } + // } + + pub fn open_read_only( + path: &Path, + column_families: Vec, + key: Option<[u8; 32]>, + ) -> Result { + unsafe { + let c_path = path_to_cstring(path)?; + let options = Self::db_options(true, false, key)?; + let (column_family_names, c_column_family_names, cf_options) = + Self::column_families_names_and_options(column_families, options); + let mut cf_handles: Vec<*mut rocksdb_column_family_handle_t> = + vec![ptr::null_mut(); column_family_names.len()]; + let c_num_column_families = c_column_family_names.len().try_into().unwrap(); + let db = ffi_result!(rocksdb_open_for_read_only_column_families_with_status( + options, + c_path.as_ptr(), + c_num_column_families, + c_column_family_names + .iter() + .map(|cf| cf.as_ptr()) + .collect::>() + .as_ptr(), + cf_options.as_ptr().cast(), + cf_handles.as_mut_ptr(), + 0, // false + )) + .map_err(|e| { + for cf_option in &cf_options { + rocksdb_options_destroy(*cf_option); + } + rocksdb_options_destroy(options); + e + })?; + assert!( + !db.is_null(), + "rocksdb_open_for_read_only_column_families_with_status returned null" + ); + for handle in &cf_handles { + assert!( + !handle.is_null(), + "rocksdb_open_for_read_only_column_families_with_status returned a null column family" + ); + } + let read_options = rocksdb_readoptions_create(); + assert!( + !read_options.is_null(), + "rocksdb_readoptions_create returned null" + ); + + Ok(Self { + inner: DbKind::ReadOnly(Arc::new(RoDbHandler { + db, + options, + read_options, + column_family_names, + cf_handles, + cf_options, + is_secondary: false, + path_to_remove: None, + })), + }) + } + } + + fn db_options( + limit_max_open_files: bool, + in_memory: bool, + key: Option<[u8; 32]>, + ) -> Result<*mut rocksdb_options_t, StorageError> { + static ROCKSDB_ENV: OnceLock = OnceLock::new(); + static ROCKSDB_MEM_ENV: OnceLock = OnceLock::new(); + + unsafe { + let options = rocksdb_options_create(); + assert!(!options.is_null(), "rocksdb_options_create returned null"); + rocksdb_options_optimize_level_style_compaction(options, 512 * 1024 * 1024); + rocksdb_options_increase_parallelism( + options, + available_parallelism()?.get().try_into().unwrap(), + ); + if limit_max_open_files { + if let Some(available_fd) = available_file_descriptors()? { + if available_fd < 96 { + rocksdb_options_destroy(options); + return Err(io::Error::new( + io::ErrorKind::Other, + format!( + "Oxigraph needs at least 96 file descriptors, \ + only {available_fd} allowed. \ + Run e.g. 
`ulimit -n 512` to allow 512 opened files" + ), + ) + .into()); + } + rocksdb_options_set_max_open_files( + options, + (available_fd - 48).try_into().unwrap(), + ) + } + } else { + rocksdb_options_set_max_open_files(options, -1); + } + rocksdb_options_set_info_log_level(options, 2); // We only log warnings + rocksdb_options_set_max_log_file_size(options, 1024 * 1024); // Only 1MB log size + rocksdb_options_set_recycle_log_file_num(options, 10); // We do not keep more than 10 log files + rocksdb_options_set_env( + options, + if in_memory { + ROCKSDB_MEM_ENV.get_or_init(|| { + let env = rocksdb_create_mem_env(); + assert!(!env.is_null(), "rocksdb_create_mem_env returned null"); + UnsafeEnv(env) + }) + } else { + ROCKSDB_ENV.get_or_init(|| { + let env = match key { + Some(_) => rocksdb_create_encrypted_env(opt_bytes_to_ptr(key.as_ref())), + None => rocksdb_create_default_env(), + }; + assert!(!env.is_null(), "rocksdb_create_encrypted_env returned null"); + UnsafeEnv(env) + }) + } + .0, + ); + Ok(options) + } + } + + fn column_families_names_and_options( + mut column_families: Vec, + base_options: *mut rocksdb_options_t, + ) -> (Vec<&'static str>, Vec, Vec<*mut rocksdb_options_t>) { + if !column_families.iter().any(|c| c.name == "default") { + column_families.push(ColumnFamilyDefinition { + name: "default", + use_iter: true, + min_prefix_size: 0, + unordered_writes: false, + }) + } + let column_family_names = column_families.iter().map(|c| c.name).collect::>(); + let c_column_family_names = column_family_names + .iter() + .map(|name| CString::new(*name).unwrap()) + .collect(); + + let cf_options = column_families + .into_iter() + .map(|cf| unsafe { + let options = rocksdb_options_create_copy(base_options); + if !cf.use_iter { + rocksdb_options_optimize_for_point_lookup(options, 128); + } + if cf.min_prefix_size > 0 { + rocksdb_options_set_prefix_extractor( + options, + rocksdb_slicetransform_create_fixed_prefix(cf.min_prefix_size), + ); + } + if cf.unordered_writes { + rocksdb_options_set_unordered_write(options, 1); + } + options + }) + .collect::>(); + (column_family_names, c_column_family_names, cf_options) + } + + pub fn column_family(&self, name: &'static str) -> Result { + let (column_family_names, cf_handles) = match &self.inner { + DbKind::ReadOnly(db) => (&db.column_family_names, &db.cf_handles), + DbKind::ReadWrite(db) => (&db.column_family_names, &db.cf_handles), + }; + for (cf, cf_handle) in column_family_names.iter().zip(cf_handles) { + if *cf == name { + return Ok(ColumnFamily(*cf_handle)); + } + } + Err(CorruptionError::from_missing_column_family_name(name).into()) + } + + #[must_use] + pub fn snapshot(&self) -> Reader { + unsafe { + match &self.inner { + DbKind::ReadOnly(db) => { + if db.is_secondary { + // We try to refresh (and ignore the errors) + drop(ffi_result!(rocksdb_try_catch_up_with_primary_with_status( + db.db + ))); + } + let options = rocksdb_readoptions_create_copy(db.read_options); + Reader { + inner: InnerReader::PlainDb(Arc::clone(db)), + options, + } + } + DbKind::ReadWrite(db) => { + let options = rocksdb_readoptions_create_copy(db.read_options); + let snapshot = rocksdb_transactiondb_create_snapshot(db.db); + assert!( + !snapshot.is_null(), + "rocksdb_transactiondb_create_snapshot returned null" + ); + rocksdb_readoptions_set_snapshot(options, snapshot); + Reader { + inner: InnerReader::TransactionalSnapshot(Rc::new(TransactionalSnapshot { + db: Arc::clone(db), + snapshot, + })), + options, + } + } + } + } + } + + pub fn transaction<'a, 'b: 'a, T, E: Error + 
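+    // NOTE: the closure may run more than once, as the commit below is retried
+    // when RocksDB reports a busy/timed-out/try-again conflict status.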
'static + From>( + &'b self, + f: impl Fn(Transaction<'a>) -> Result, + ) -> Result { + if let DbKind::ReadWrite(db) = &self.inner { + loop { + let transaction = unsafe { + let transaction = rocksdb_transaction_begin( + db.db, + db.write_options, + db.transaction_options, + ptr::null_mut(), + ); + assert!( + !transaction.is_null(), + "rocksdb_transaction_begin returned null" + ); + transaction + }; + let (read_options, snapshot) = unsafe { + let options = rocksdb_readoptions_create_copy(db.read_options); + let snapshot = rocksdb_transaction_get_snapshot(transaction); + rocksdb_readoptions_set_snapshot(options, snapshot); + (options, snapshot) + }; + let result = f(Transaction { + inner: Rc::new(transaction), + read_options, + _lifetime: PhantomData, + }); + match result { + Ok(result) => { + unsafe { + let r = + ffi_result!(rocksdb_transaction_commit_with_status(transaction)); + rocksdb_transaction_destroy(transaction); + rocksdb_readoptions_destroy(read_options); + rocksdb_free(snapshot as *mut c_void); + r.map_err(StorageError::from)?; // We make sure to also run destructors if the commit fails + } + return Ok(result); + } + Err(e) => { + unsafe { + let r = + ffi_result!(rocksdb_transaction_rollback_with_status(transaction)); + rocksdb_transaction_destroy(transaction); + rocksdb_readoptions_destroy(read_options); + rocksdb_free(snapshot as *mut c_void); + r.map_err(StorageError::from)?; // We make sure to also run destructors if the commit fails + } + // We look for the root error + let mut error: &(dyn Error + 'static) = &e; + while let Some(e) = error.source() { + error = e; + } + let is_conflict_error = + error.downcast_ref::().map_or(false, |e| { + e.0.code == rocksdb_status_code_t_rocksdb_status_code_busy + || e.0.code + == rocksdb_status_code_t_rocksdb_status_code_timed_out + || e.0.code + == rocksdb_status_code_t_rocksdb_status_code_try_again + }); + if is_conflict_error { + // We give a chance to the OS to do something else before retrying in order to help avoiding another conflict + yield_now(); + } else { + // We raise the error + return Err(e); + } + } + } + } + } else { + Err( + StorageError::Other("Transaction are only possible on read-write instances".into()) + .into(), + ) + } + } + + pub fn get( + &self, + column_family: &ColumnFamily, + key: &[u8], + ) -> Result, StorageError> { + unsafe { + let slice = match &self.inner { + DbKind::ReadOnly(db) => { + ffi_result!(rocksdb_get_pinned_cf_with_status( + db.db, + db.read_options, + column_family.0, + key.as_ptr().cast(), + key.len(), + )) + } + DbKind::ReadWrite(db) => { + ffi_result!(rocksdb_transactiondb_get_pinned_cf_with_status( + db.db, + db.read_options, + column_family.0, + key.as_ptr().cast(), + key.len() + )) + } + }?; + Ok(if slice.is_null() { + None + } else { + Some(PinnableSlice(slice)) + }) + } + } + + pub fn contains_key( + &self, + column_family: &ColumnFamily, + key: &[u8], + ) -> Result { + Ok(self.get(column_family, key)?.is_some()) // TODO: optimize + } + + pub fn insert( + &self, + column_family: &ColumnFamily, + key: &[u8], + value: &[u8], + ) -> Result<(), StorageError> { + if let DbKind::ReadWrite(db) = &self.inner { + unsafe { + ffi_result!(rocksdb_transactiondb_put_cf_with_status( + db.db, + db.write_options, + column_family.0, + key.as_ptr().cast(), + key.len(), + value.as_ptr().cast(), + value.len(), + )) + }?; + Ok(()) + } else { + Err(StorageError::Other( + "Inserts are only possible on read-write instances".into(), + )) + } + } + + pub fn flush(&self) -> Result<(), StorageError> { + if let 
DbKind::ReadWrite(db) = &self.inner { + unsafe { + ffi_result!(rocksdb_transactiondb_flush_cfs_with_status( + db.db, + db.flush_options, + db.cf_handles.as_ptr().cast_mut(), + db.cf_handles.len().try_into().unwrap() + )) + }?; + Ok(()) + } else { + Err(StorageError::Other( + "Flush is only possible on read-write instances".into(), + )) + } + } + + pub fn compact(&self, column_family: &ColumnFamily) -> Result<(), StorageError> { + if let DbKind::ReadWrite(db) = &self.inner { + unsafe { + ffi_result!(rocksdb_transactiondb_compact_range_cf_opt_with_status( + db.db, + column_family.0, + db.compaction_options, + ptr::null(), + 0, + ptr::null(), + 0, + )) + }?; + Ok(()) + } else { + Err(StorageError::Other( + "Compaction is only possible on read-write instances".into(), + )) + } + } + + pub fn new_sst_file(&self) -> Result { + if let DbKind::ReadWrite(db) = &self.inner { + let path = db.path.join(random::().to_string()); + unsafe { + let writer = rocksdb_sstfilewriter_create(db.env_options, db.options); + ffi_result!(rocksdb_sstfilewriter_open_with_status( + writer, + path_to_cstring(&path)?.as_ptr() + )) + .map_err(|e| { + rocksdb_sstfilewriter_destroy(writer); + e + })?; + Ok(SstFileWriter { writer, path }) + } + } else { + Err(StorageError::Other( + "SST creation is only possible on read-write instances".into(), + )) + } + } + + pub fn insert_stt_files( + &self, + ssts_for_cf: &[(&ColumnFamily, PathBuf)], + ) -> Result<(), StorageError> { + if ssts_for_cf.is_empty() { + return Ok(()); // Rocksdb does not support empty lists + } + if let DbKind::ReadWrite(db) = &self.inner { + let mut paths_by_cf = HashMap::<_, Vec<_>>::new(); + for (cf, path) in ssts_for_cf { + paths_by_cf + .entry(*cf) + .or_default() + .push(path_to_cstring(path)?); + } + let cpaths_by_cf = paths_by_cf + .iter() + .map(|(cf, paths)| (*cf, paths.iter().map(|p| p.as_ptr()).collect::>())) + .collect::>(); + let args = cpaths_by_cf + .iter() + .map(|(cf, p)| rocksdb_ingestexternalfilearg_t { + column_family: cf.0, + external_files: p.as_ptr(), + external_files_len: p.len(), + options: db.ingest_external_file_options, + }) + .collect::>(); + unsafe { + ffi_result!(rocksdb_transactiondb_ingest_external_files_with_status( + db.db, + args.as_ptr(), + args.len() + ))?; + } + Ok(()) + } else { + Err(StorageError::Other( + "SST ingestion is only possible on read-write instances".into(), + )) + } + } + + pub fn backup(&self, target_directory: &Path) -> Result<(), StorageError> { + let path = path_to_cstring(target_directory)?; + match &self.inner { + DbKind::ReadOnly(db) => unsafe { + if db.is_secondary { + ffi_result!(rocksdb_try_catch_up_with_primary_with_status(db.db))?; + } + ffi_result!(rocksdb_create_checkpoint_with_status(db.db, path.as_ptr())) + }, + DbKind::ReadWrite(db) => { + if db.in_memory { + return Err(StorageError::Other( + "It is not possible to backup an in-memory database".into(), + )); + } + unsafe { + ffi_result!(rocksdb_transactiondb_create_checkpoint_with_status( + db.db, + path.as_ptr() + )) + } + } + }?; + Ok(()) + } +} + +// It is fine to not keep a lifetime: there is no way to use this type without the database being still in scope. +// So, no use after free possible. 
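+// `ColumnFamily` wraps a raw handle and is marked `Send`/`Sync` below since
+// RocksDB itself allows concurrent use of column family handles.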
+#[derive(Clone, Eq, PartialEq, Hash)] +pub struct ColumnFamily(*mut rocksdb_column_family_handle_t); + +unsafe impl Send for ColumnFamily {} +unsafe impl Sync for ColumnFamily {} + +pub struct Reader { + inner: InnerReader, + options: *mut rocksdb_readoptions_t, +} + +#[derive(Clone)] +enum InnerReader { + TransactionalSnapshot(Rc), + Transaction(Weak<*mut rocksdb_transaction_t>), + PlainDb(Arc), +} + +struct TransactionalSnapshot { + db: Arc, + snapshot: *const rocksdb_snapshot_t, +} + +impl Drop for TransactionalSnapshot { + fn drop(&mut self) { + unsafe { rocksdb_transactiondb_release_snapshot(self.db.db, self.snapshot) } + } +} + +impl Clone for Reader { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + options: unsafe { rocksdb_readoptions_create_copy(self.options) }, + } + } +} + +impl Drop for Reader { + fn drop(&mut self) { + unsafe { rocksdb_readoptions_destroy(self.options) } + } +} + +impl Reader { + pub fn get( + &self, + column_family: &ColumnFamily, + key: &[u8], + ) -> Result, StorageError> { + unsafe { + let slice = match &self.inner { + InnerReader::TransactionalSnapshot(inner) => { + ffi_result!(rocksdb_transactiondb_get_pinned_cf_with_status( + inner.db.db, + self.options, + column_family.0, + key.as_ptr().cast(), + key.len() + )) + } + InnerReader::Transaction(inner) => { + let Some(inner) = inner.upgrade() else { + return Err(StorageError::Other( + "The transaction is already ended".into(), + )); + }; + ffi_result!(rocksdb_transaction_get_pinned_cf_with_status( + *inner, + self.options, + column_family.0, + key.as_ptr().cast(), + key.len() + )) + } + InnerReader::PlainDb(inner) => { + ffi_result!(rocksdb_get_pinned_cf_with_status( + inner.db, + self.options, + column_family.0, + key.as_ptr().cast(), + key.len() + )) + } + }?; + Ok(if slice.is_null() { + None + } else { + Some(PinnableSlice(slice)) + }) + } + } + + pub fn contains_key( + &self, + column_family: &ColumnFamily, + key: &[u8], + ) -> Result { + Ok(self.get(column_family, key)?.is_some()) // TODO: optimize + } + + #[allow(clippy::iter_not_returning_iterator)] + pub fn iter(&self, column_family: &ColumnFamily) -> Result { + self.scan_prefix(column_family, &[]) + } + + pub fn scan_prefix( + &self, + column_family: &ColumnFamily, + prefix: &[u8], + ) -> Result { + // We generate the upper bound + let upper_bound = { + let mut bound = prefix.to_vec(); + let mut found = false; + for c in bound.iter_mut().rev() { + if *c < u8::MAX { + *c += 1; + found = true; + break; + } + } + found.then_some(bound) + }; + + unsafe { + let options = rocksdb_readoptions_create_copy(self.options); + assert!( + !options.is_null(), + "rocksdb_readoptions_create returned null" + ); + if let Some(upper_bound) = &upper_bound { + rocksdb_readoptions_set_iterate_upper_bound( + options, + upper_bound.as_ptr().cast(), + upper_bound.len(), + ); + } + let iter = match &self.inner { + InnerReader::TransactionalSnapshot(inner) => { + rocksdb_transactiondb_create_iterator_cf(inner.db.db, options, column_family.0) + } + InnerReader::Transaction(inner) => { + let Some(inner) = inner.upgrade() else { + return Err(StorageError::Other( + "The transaction is already ended".into(), + )); + }; + rocksdb_transaction_create_iterator_cf(*inner, options, column_family.0) + } + InnerReader::PlainDb(inner) => { + rocksdb_create_iterator_cf(inner.db, options, column_family.0) + } + }; + assert!(!iter.is_null(), "rocksdb_create_iterator returned null"); + if prefix.is_empty() { + rocksdb_iter_seek_to_first(iter); + } else { + 
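+                // Seek to the first key >= prefix; the iterate upper bound computed
+                // above makes RocksDB stop at the end of the prefix range.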
rocksdb_iter_seek(iter, prefix.as_ptr().cast(), prefix.len()); + } + let is_currently_valid = rocksdb_iter_valid(iter) != 0; + Ok(Iter { + inner: iter, + options, + _upper_bound: upper_bound, + _reader: self.clone(), + is_currently_valid, + }) + } + } + + pub fn len(&self, column_family: &ColumnFamily) -> Result { + let mut count = 0; + let mut iter = self.iter(column_family)?; + while iter.is_valid() { + count += 1; + iter.next(); + } + iter.status()?; // We makes sure there is no read problem + Ok(count) + } + + pub fn is_empty(&self, column_family: &ColumnFamily) -> Result { + let iter = self.iter(column_family)?; + iter.status()?; // We makes sure there is no read problem + Ok(!iter.is_valid()) + } +} + +pub struct Transaction<'a> { + inner: Rc<*mut rocksdb_transaction_t>, + read_options: *mut rocksdb_readoptions_t, + _lifetime: PhantomData<&'a ()>, +} + +impl Transaction<'_> { + pub fn reader(&self) -> Reader { + Reader { + inner: InnerReader::Transaction(Rc::downgrade(&self.inner)), + options: unsafe { rocksdb_readoptions_create_copy(self.read_options) }, + } + } + + pub fn get_for_update( + &self, + column_family: &ColumnFamily, + key: &[u8], + ) -> Result, StorageError> { + unsafe { + let slice = ffi_result!(rocksdb_transaction_get_for_update_pinned_cf_with_status( + *self.inner, + self.read_options, + column_family.0, + key.as_ptr().cast(), + key.len() + ))?; + Ok(if slice.is_null() { + None + } else { + Some(PinnableSlice(slice)) + }) + } + } + + pub fn contains_key_for_update( + &self, + column_family: &ColumnFamily, + key: &[u8], + ) -> Result { + Ok(self.get_for_update(column_family, key)?.is_some()) // TODO: optimize + } + + pub fn insert( + &mut self, + column_family: &ColumnFamily, + key: &[u8], + value: &[u8], + ) -> Result<(), StorageError> { + unsafe { + ffi_result!(rocksdb_transaction_put_cf_with_status( + *self.inner, + column_family.0, + key.as_ptr().cast(), + key.len(), + value.as_ptr().cast(), + value.len(), + ))?; + } + Ok(()) + } + + pub fn insert_empty( + &mut self, + column_family: &ColumnFamily, + key: &[u8], + ) -> Result<(), StorageError> { + self.insert(column_family, key, &[]) + } + + pub fn remove(&mut self, column_family: &ColumnFamily, key: &[u8]) -> Result<(), StorageError> { + unsafe { + ffi_result!(rocksdb_transaction_delete_cf_with_status( + *self.inner, + column_family.0, + key.as_ptr().cast(), + key.len(), + ))?; + } + Ok(()) + } +} + +pub struct PinnableSlice(*mut rocksdb_pinnableslice_t); + +impl Drop for PinnableSlice { + fn drop(&mut self) { + unsafe { + rocksdb_pinnableslice_destroy(self.0); + } + } +} + +impl Deref for PinnableSlice { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + unsafe { + let mut len = 0; + let val = rocksdb_pinnableslice_value(self.0, &mut len); + slice::from_raw_parts(val.cast(), len) + } + } +} + +impl AsRef<[u8]> for PinnableSlice { + fn as_ref(&self) -> &[u8] { + self + } +} + +impl Borrow<[u8]> for PinnableSlice { + fn borrow(&self) -> &[u8] { + self + } +} + +impl From for Vec { + fn from(value: PinnableSlice) -> Self { + value.to_vec() + } +} + +pub struct Buffer { + base: *mut u8, + len: usize, +} + +impl Drop for Buffer { + fn drop(&mut self) { + unsafe { + rocksdb_free(self.base.cast()); + } + } +} + +impl Deref for Buffer { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + unsafe { slice::from_raw_parts(self.base, self.len) } + } +} + +impl AsRef<[u8]> for Buffer { + fn as_ref(&self) -> &[u8] { + self + } +} + +impl Borrow<[u8]> for Buffer { + fn borrow(&self) -> &[u8] { + self + } 
+} + +impl From for Vec { + fn from(value: Buffer) -> Self { + value.to_vec() + } +} + +pub struct Iter { + inner: *mut rocksdb_iterator_t, + is_currently_valid: bool, + _upper_bound: Option>, + _reader: Reader, // needed to ensure that DB still lives while iter is used + options: *mut rocksdb_readoptions_t, /* needed to ensure that options still lives while iter is used */ +} + +impl Drop for Iter { + fn drop(&mut self) { + unsafe { + rocksdb_iter_destroy(self.inner); + rocksdb_readoptions_destroy(self.options); + } + } +} + +#[allow(clippy::non_send_fields_in_send_ty)] +unsafe impl Send for Iter {} + +unsafe impl Sync for Iter {} + +impl Iter { + pub fn is_valid(&self) -> bool { + self.is_currently_valid + } + + pub fn status(&self) -> Result<(), StorageError> { + unsafe { + ffi_result!(rocksdb_iter_get_status(self.inner))?; + } + Ok(()) + } + + pub fn next(&mut self) { + unsafe { + rocksdb_iter_next(self.inner); + self.is_currently_valid = rocksdb_iter_valid(self.inner) != 0; + } + } + + pub fn key(&self) -> Option<&[u8]> { + if self.is_valid() { + unsafe { + let mut len = 0; + let val = rocksdb_iter_key(self.inner, &mut len); + Some(slice::from_raw_parts(val.cast(), len)) + } + } else { + None + } + } +} + +pub struct SstFileWriter { + writer: *mut rocksdb_sstfilewriter_t, + path: PathBuf, +} + +impl Drop for SstFileWriter { + fn drop(&mut self) { + unsafe { + rocksdb_sstfilewriter_destroy(self.writer); + } + } +} + +impl SstFileWriter { + pub fn insert(&mut self, key: &[u8], value: &[u8]) -> Result<(), StorageError> { + unsafe { + ffi_result!(rocksdb_sstfilewriter_put_with_status( + self.writer, + key.as_ptr().cast(), + key.len(), + value.as_ptr().cast(), + value.len(), + ))?; + } + Ok(()) + } + + pub fn insert_empty(&mut self, key: &[u8]) -> Result<(), StorageError> { + self.insert(key, &[]) + } + + pub fn finish(self) -> Result { + unsafe { + ffi_result!(rocksdb_sstfilewriter_finish_with_status(self.writer))?; + } + Ok(self.path.clone()) + } +} + +#[derive(thiserror::Error)] +#[error("{}", self.message())] +struct ErrorStatus(rocksdb_status_t); + +unsafe impl Send for ErrorStatus {} +unsafe impl Sync for ErrorStatus {} + +impl Drop for ErrorStatus { + fn drop(&mut self) { + if !self.0.string.is_null() { + unsafe { + rocksdb_free(self.0.string as *mut c_void); + } + } + } +} + +impl ErrorStatus { + fn message(&self) -> &str { + if self.0.string.is_null() { + "Unknown error" + } else { + unsafe { CStr::from_ptr(self.0.string) } + .to_str() + .unwrap_or("Invalid error message") + } + } +} + +impl fmt::Debug for ErrorStatus { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("ErrorStatus") + .field("code", &self.0.code) + .field("subcode", &self.0.subcode) + .field("severity", &self.0.severity) + .field("message", &self.message()) + .finish() + } +} + +impl From for StorageError { + fn from(status: ErrorStatus) -> Self { + if status.0.code == rocksdb_status_code_t_rocksdb_status_code_io_error { + let kind = + if status.0.subcode == rocksdb_status_subcode_t_rocksdb_status_subcode_no_space { + io::ErrorKind::Other // TODO ErrorKind::StorageFull + } else if status.0.subcode + == rocksdb_status_subcode_t_rocksdb_status_subcode_path_not_found + { + io::ErrorKind::NotFound + } else { + io::ErrorKind::Other + }; + Self::Io(io::Error::new(kind, status)) + } else if status.0.code == rocksdb_status_code_t_rocksdb_status_code_corruption { + Self::Corruption(CorruptionError::new(status)) + } else { + Self::Other(Box::new(status)) + } + } +} + +struct UnsafeEnv(*mut 
rocksdb_env_t);
+
+// Hack for OnceLock: OK because the env is only written once into the OnceLock
+// and is then used in a thread-safe way by RocksDB.
+unsafe impl Send for UnsafeEnv {}
+unsafe impl Sync for UnsafeEnv {}
+
+fn path_to_cstring(path: &Path) -> Result<CString, StorageError> {
+    Ok(CString::new(path.to_str().ok_or_else(|| {
+        io::Error::new(
+            io::ErrorKind::InvalidInput,
+            "The DB path is not valid UTF-8",
+        )
+    })?)
+    .map_err(|e| {
+        io::Error::new(
+            io::ErrorKind::InvalidInput,
+            format!("The DB path contains null bytes: {e}"),
+        )
+    })?)
+}
+
+#[cfg(unix)]
+fn available_file_descriptors() -> io::Result<Option<u64>> {
+    let mut rlimit = libc::rlimit {
+        rlim_cur: 0,
+        rlim_max: 0,
+    };
+    if unsafe { libc::getrlimit(libc::RLIMIT_NOFILE, &mut rlimit) } == 0 {
+        Ok(Some(min(rlimit.rlim_cur, rlimit.rlim_max)))
+    } else {
+        Err(io::Error::last_os_error())
+    }
+}
+
+#[cfg(windows)]
+fn available_file_descriptors() -> io::Result<Option<u64>> {
+    Ok(Some(512)) // https://docs.microsoft.com/en-us/cpp/c-runtime-library/file-handling
+}
+
+#[cfg(not(any(unix, windows)))]
+fn available_file_descriptors() -> io::Result<Option<u64>> {
+    Ok(None)
+}
+
+fn tmp_path() -> PathBuf {
+    if cfg!(target_os = "linux") {
+        "/dev/shm/".into()
+    } else {
+        temp_dir()
+    }
+    .join(format!("oxigraph-rocksdb-{}", random::<u128>()))
+}
diff --git a/src/storage/binary_encoder.rs b/src/storage/binary_encoder.rs
new file mode 100644
index 0000000..1e789b7
--- /dev/null
+++ b/src/storage/binary_encoder.rs
@@ -0,0 +1,742 @@
+use crate::storage::error::{CorruptionError, StorageError};
+use crate::storage::numeric_encoder::{EncodedQuad, EncodedTerm, EncodedTriple, StrHash};
+use crate::storage::small_string::SmallString;
+use oxsdatatypes::*;
+use std::io::Read;
+use std::mem::size_of;
+
+#[cfg(not(target_family = "wasm"))]
+pub const LATEST_STORAGE_VERSION: u64 = 1;
+pub const WRITTEN_TERM_MAX_SIZE: usize = size_of::<u8>() + 2 * size_of::<StrHash>();
+
+// Encoded term type blocks
+// 1-7: usual named nodes (except prefixes, cf.
later) +// 8-15: blank nodes +// 16-47: literals +// 48-55: triples +// 56-64: future use +// 64-127: default named node prefixes +// 128-255: custom named node prefixes +const TYPE_NAMED_NODE_ID: u8 = 1; +const TYPE_NUMERICAL_BLANK_NODE_ID: u8 = 8; +const TYPE_SMALL_BLANK_NODE_ID: u8 = 9; +const TYPE_BIG_BLANK_NODE_ID: u8 = 10; +const TYPE_SMALL_STRING_LITERAL: u8 = 16; +const TYPE_BIG_STRING_LITERAL: u8 = 17; +const TYPE_SMALL_SMALL_LANG_STRING_LITERAL: u8 = 20; +const TYPE_SMALL_BIG_LANG_STRING_LITERAL: u8 = 21; +const TYPE_BIG_SMALL_LANG_STRING_LITERAL: u8 = 22; +const TYPE_BIG_BIG_LANG_STRING_LITERAL: u8 = 23; +const TYPE_SMALL_TYPED_LITERAL: u8 = 24; +const TYPE_BIG_TYPED_LITERAL: u8 = 25; +const TYPE_BOOLEAN_LITERAL_TRUE: u8 = 28; +const TYPE_BOOLEAN_LITERAL_FALSE: u8 = 29; +const TYPE_FLOAT_LITERAL: u8 = 30; +const TYPE_DOUBLE_LITERAL: u8 = 31; +const TYPE_INTEGER_LITERAL: u8 = 32; +const TYPE_DECIMAL_LITERAL: u8 = 33; +const TYPE_DATE_TIME_LITERAL: u8 = 34; +const TYPE_TIME_LITERAL: u8 = 35; +const TYPE_DATE_LITERAL: u8 = 36; +const TYPE_G_YEAR_MONTH_LITERAL: u8 = 37; +const TYPE_G_YEAR_LITERAL: u8 = 38; +const TYPE_G_MONTH_DAY_LITERAL: u8 = 39; +const TYPE_G_DAY_LITERAL: u8 = 40; +const TYPE_G_MONTH_LITERAL: u8 = 41; +const TYPE_DURATION_LITERAL: u8 = 42; +const TYPE_YEAR_MONTH_DURATION_LITERAL: u8 = 43; +const TYPE_DAY_TIME_DURATION_LITERAL: u8 = 44; +const TYPE_TRIPLE: u8 = 48; + +#[derive(Clone, Copy)] +pub enum QuadEncoding { + Spog, + Posg, + Ospg, + Gspo, + Gpos, + Gosp, + Dspo, + Dpos, + Dosp, +} + +impl QuadEncoding { + pub fn decode(self, mut buffer: &[u8]) -> Result { + match self { + Self::Spog => buffer.read_spog_quad(), + Self::Posg => buffer.read_posg_quad(), + Self::Ospg => buffer.read_ospg_quad(), + Self::Gspo => buffer.read_gspo_quad(), + Self::Gpos => buffer.read_gpos_quad(), + Self::Gosp => buffer.read_gosp_quad(), + Self::Dspo => buffer.read_dspo_quad(), + Self::Dpos => buffer.read_dpos_quad(), + Self::Dosp => buffer.read_dosp_quad(), + } + } +} + +pub fn decode_term(mut buffer: &[u8]) -> Result { + buffer.read_term() +} + +pub trait TermReader { + fn read_term(&mut self) -> Result; + + fn read_spog_quad(&mut self) -> Result { + let subject = self.read_term()?; + let predicate = self.read_term()?; + let object = self.read_term()?; + let graph_name = self.read_term()?; + Ok(EncodedQuad { + subject, + predicate, + object, + graph_name, + }) + } + + fn read_posg_quad(&mut self) -> Result { + let predicate = self.read_term()?; + let object = self.read_term()?; + let subject = self.read_term()?; + let graph_name = self.read_term()?; + Ok(EncodedQuad { + subject, + predicate, + object, + graph_name, + }) + } + + fn read_ospg_quad(&mut self) -> Result { + let object = self.read_term()?; + let subject = self.read_term()?; + let predicate = self.read_term()?; + let graph_name = self.read_term()?; + Ok(EncodedQuad { + subject, + predicate, + object, + graph_name, + }) + } + + fn read_gspo_quad(&mut self) -> Result { + let graph_name = self.read_term()?; + let subject = self.read_term()?; + let predicate = self.read_term()?; + let object = self.read_term()?; + Ok(EncodedQuad { + subject, + predicate, + object, + graph_name, + }) + } + + fn read_gpos_quad(&mut self) -> Result { + let graph_name = self.read_term()?; + let predicate = self.read_term()?; + let object = self.read_term()?; + let subject = self.read_term()?; + Ok(EncodedQuad { + subject, + predicate, + object, + graph_name, + }) + } + + fn read_gosp_quad(&mut self) -> Result { + let graph_name = 
self.read_term()?; + let object = self.read_term()?; + let subject = self.read_term()?; + let predicate = self.read_term()?; + Ok(EncodedQuad { + subject, + predicate, + object, + graph_name, + }) + } + + fn read_dspo_quad(&mut self) -> Result { + let subject = self.read_term()?; + let predicate = self.read_term()?; + let object = self.read_term()?; + Ok(EncodedQuad { + subject, + predicate, + object, + graph_name: EncodedTerm::DefaultGraph, + }) + } + + fn read_dpos_quad(&mut self) -> Result { + let predicate = self.read_term()?; + let object = self.read_term()?; + let subject = self.read_term()?; + Ok(EncodedQuad { + subject, + predicate, + object, + graph_name: EncodedTerm::DefaultGraph, + }) + } + + fn read_dosp_quad(&mut self) -> Result { + let object = self.read_term()?; + let subject = self.read_term()?; + let predicate = self.read_term()?; + Ok(EncodedQuad { + subject, + predicate, + object, + graph_name: EncodedTerm::DefaultGraph, + }) + } +} + +impl TermReader for R { + fn read_term(&mut self) -> Result { + let mut type_buffer = [0]; + self.read_exact(&mut type_buffer)?; + match type_buffer[0] { + TYPE_NAMED_NODE_ID => { + let mut buffer = [0; 16]; + self.read_exact(&mut buffer)?; + Ok(EncodedTerm::NamedNode { + iri_id: StrHash::from_be_bytes(buffer), + }) + } + TYPE_NUMERICAL_BLANK_NODE_ID => { + let mut buffer = [0; 16]; + self.read_exact(&mut buffer)?; + Ok(EncodedTerm::NumericalBlankNode { + id: u128::from_be_bytes(buffer), + }) + } + TYPE_SMALL_BLANK_NODE_ID => { + let mut buffer = [0; 16]; + self.read_exact(&mut buffer)?; + Ok(EncodedTerm::SmallBlankNode( + SmallString::from_be_bytes(buffer).map_err(CorruptionError::new)?, + )) + } + TYPE_BIG_BLANK_NODE_ID => { + let mut buffer = [0; 16]; + self.read_exact(&mut buffer)?; + Ok(EncodedTerm::BigBlankNode { + id_id: StrHash::from_be_bytes(buffer), + }) + } + TYPE_SMALL_SMALL_LANG_STRING_LITERAL => { + let mut language_buffer = [0; 16]; + self.read_exact(&mut language_buffer)?; + let mut value_buffer = [0; 16]; + self.read_exact(&mut value_buffer)?; + Ok(EncodedTerm::SmallSmallLangStringLiteral { + value: SmallString::from_be_bytes(value_buffer) + .map_err(CorruptionError::new)?, + language: SmallString::from_be_bytes(language_buffer) + .map_err(CorruptionError::new)?, + }) + } + TYPE_SMALL_BIG_LANG_STRING_LITERAL => { + let mut language_buffer = [0; 16]; + self.read_exact(&mut language_buffer)?; + let mut value_buffer = [0; 16]; + self.read_exact(&mut value_buffer)?; + Ok(EncodedTerm::SmallBigLangStringLiteral { + value: SmallString::from_be_bytes(value_buffer) + .map_err(CorruptionError::new)?, + language_id: StrHash::from_be_bytes(language_buffer), + }) + } + TYPE_BIG_SMALL_LANG_STRING_LITERAL => { + let mut language_buffer = [0; 16]; + self.read_exact(&mut language_buffer)?; + let mut value_buffer = [0; 16]; + self.read_exact(&mut value_buffer)?; + Ok(EncodedTerm::BigSmallLangStringLiteral { + value_id: StrHash::from_be_bytes(value_buffer), + language: SmallString::from_be_bytes(language_buffer) + .map_err(CorruptionError::new)?, + }) + } + TYPE_BIG_BIG_LANG_STRING_LITERAL => { + let mut language_buffer = [0; 16]; + self.read_exact(&mut language_buffer)?; + let mut value_buffer = [0; 16]; + self.read_exact(&mut value_buffer)?; + Ok(EncodedTerm::BigBigLangStringLiteral { + value_id: StrHash::from_be_bytes(value_buffer), + language_id: StrHash::from_be_bytes(language_buffer), + }) + } + TYPE_SMALL_TYPED_LITERAL => { + let mut datatype_buffer = [0; 16]; + self.read_exact(&mut datatype_buffer)?; + let mut value_buffer = [0; 16]; 
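+                // Note on the small/big split (an inference from these variants,
+                // not a documented format): "small" payloads are 16-byte inline
+                // SmallStrings, while "big" payloads are 16-byte StrHash
+                // references into the id2str column family, so every branch here
+                // reads a fixed 16 bytes per field.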
+ self.read_exact(&mut value_buffer)?; + Ok(EncodedTerm::SmallTypedLiteral { + datatype_id: StrHash::from_be_bytes(datatype_buffer), + value: SmallString::from_be_bytes(value_buffer) + .map_err(CorruptionError::new)?, + }) + } + TYPE_BIG_TYPED_LITERAL => { + let mut datatype_buffer = [0; 16]; + self.read_exact(&mut datatype_buffer)?; + let mut value_buffer = [0; 16]; + self.read_exact(&mut value_buffer)?; + Ok(EncodedTerm::BigTypedLiteral { + datatype_id: StrHash::from_be_bytes(datatype_buffer), + value_id: StrHash::from_be_bytes(value_buffer), + }) + } + TYPE_SMALL_STRING_LITERAL => { + let mut buffer = [0; 16]; + self.read_exact(&mut buffer)?; + Ok(EncodedTerm::SmallStringLiteral( + SmallString::from_be_bytes(buffer).map_err(CorruptionError::new)?, + )) + } + TYPE_BIG_STRING_LITERAL => { + let mut buffer = [0; 16]; + self.read_exact(&mut buffer)?; + Ok(EncodedTerm::BigStringLiteral { + value_id: StrHash::from_be_bytes(buffer), + }) + } + TYPE_BOOLEAN_LITERAL_TRUE => Ok(true.into()), + TYPE_BOOLEAN_LITERAL_FALSE => Ok(false.into()), + TYPE_FLOAT_LITERAL => { + let mut buffer = [0; 4]; + self.read_exact(&mut buffer)?; + Ok(Float::from_be_bytes(buffer).into()) + } + TYPE_DOUBLE_LITERAL => { + let mut buffer = [0; 8]; + self.read_exact(&mut buffer)?; + Ok(Double::from_be_bytes(buffer).into()) + } + TYPE_INTEGER_LITERAL => { + let mut buffer = [0; 8]; + self.read_exact(&mut buffer)?; + Ok(Integer::from_be_bytes(buffer).into()) + } + TYPE_DECIMAL_LITERAL => { + let mut buffer = [0; 16]; + self.read_exact(&mut buffer)?; + Ok(Decimal::from_be_bytes(buffer).into()) + } + TYPE_DATE_TIME_LITERAL => { + let mut buffer = [0; 18]; + self.read_exact(&mut buffer)?; + Ok(DateTime::from_be_bytes(buffer).into()) + } + TYPE_TIME_LITERAL => { + let mut buffer = [0; 18]; + self.read_exact(&mut buffer)?; + Ok(Time::from_be_bytes(buffer).into()) + } + TYPE_DATE_LITERAL => { + let mut buffer = [0; 18]; + self.read_exact(&mut buffer)?; + Ok(Date::from_be_bytes(buffer).into()) + } + TYPE_G_YEAR_MONTH_LITERAL => { + let mut buffer = [0; 18]; + self.read_exact(&mut buffer)?; + Ok(GYearMonth::from_be_bytes(buffer).into()) + } + TYPE_G_YEAR_LITERAL => { + let mut buffer = [0; 18]; + self.read_exact(&mut buffer)?; + Ok(GYear::from_be_bytes(buffer).into()) + } + TYPE_G_MONTH_DAY_LITERAL => { + let mut buffer = [0; 18]; + self.read_exact(&mut buffer)?; + Ok(GMonthDay::from_be_bytes(buffer).into()) + } + TYPE_G_DAY_LITERAL => { + let mut buffer = [0; 18]; + self.read_exact(&mut buffer)?; + Ok(GDay::from_be_bytes(buffer).into()) + } + TYPE_G_MONTH_LITERAL => { + let mut buffer = [0; 18]; + self.read_exact(&mut buffer)?; + Ok(GMonth::from_be_bytes(buffer).into()) + } + TYPE_DURATION_LITERAL => { + let mut buffer = [0; 24]; + self.read_exact(&mut buffer)?; + Ok(Duration::from_be_bytes(buffer).into()) + } + TYPE_YEAR_MONTH_DURATION_LITERAL => { + let mut buffer = [0; 8]; + self.read_exact(&mut buffer)?; + Ok(YearMonthDuration::from_be_bytes(buffer).into()) + } + TYPE_DAY_TIME_DURATION_LITERAL => { + let mut buffer = [0; 16]; + self.read_exact(&mut buffer)?; + Ok(DayTimeDuration::from_be_bytes(buffer).into()) + } + TYPE_TRIPLE => Ok(EncodedTriple { + subject: self.read_term()?, + predicate: self.read_term()?, + object: self.read_term()?, + } + .into()), + _ => Err(CorruptionError::msg("the term buffer has an invalid type id").into()), + } + } +} + +pub fn write_spog_quad(sink: &mut Vec, quad: &EncodedQuad) { + write_term(sink, &quad.subject); + write_term(sink, &quad.predicate); + write_term(sink, &quad.object); + 
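+    // The graph name is written last so that SPOG keys can be prefix-scanned by
+    // (s), (s, p) or (s, p, o) without knowing the graph; the other write_*
+    // orderings below serve the same purpose for their respective indexes.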
write_term(sink, &quad.graph_name); +} + +pub fn write_posg_quad(sink: &mut Vec, quad: &EncodedQuad) { + write_term(sink, &quad.predicate); + write_term(sink, &quad.object); + write_term(sink, &quad.subject); + write_term(sink, &quad.graph_name); +} + +pub fn write_ospg_quad(sink: &mut Vec, quad: &EncodedQuad) { + write_term(sink, &quad.object); + write_term(sink, &quad.subject); + write_term(sink, &quad.predicate); + write_term(sink, &quad.graph_name); +} + +pub fn write_gspo_quad(sink: &mut Vec, quad: &EncodedQuad) { + write_term(sink, &quad.graph_name); + write_term(sink, &quad.subject); + write_term(sink, &quad.predicate); + write_term(sink, &quad.object); +} + +pub fn write_gpos_quad(sink: &mut Vec, quad: &EncodedQuad) { + write_term(sink, &quad.graph_name); + write_term(sink, &quad.predicate); + write_term(sink, &quad.object); + write_term(sink, &quad.subject); +} + +pub fn write_gosp_quad(sink: &mut Vec, quad: &EncodedQuad) { + write_term(sink, &quad.graph_name); + write_term(sink, &quad.object); + write_term(sink, &quad.subject); + write_term(sink, &quad.predicate); +} + +pub fn write_spo_quad(sink: &mut Vec, quad: &EncodedQuad) { + write_term(sink, &quad.subject); + write_term(sink, &quad.predicate); + write_term(sink, &quad.object); +} + +pub fn write_pos_quad(sink: &mut Vec, quad: &EncodedQuad) { + write_term(sink, &quad.predicate); + write_term(sink, &quad.object); + write_term(sink, &quad.subject); +} + +pub fn write_osp_quad(sink: &mut Vec, quad: &EncodedQuad) { + write_term(sink, &quad.object); + write_term(sink, &quad.subject); + write_term(sink, &quad.predicate); +} + +pub fn encode_term(t: &EncodedTerm) -> Vec { + let mut vec = Vec::with_capacity(WRITTEN_TERM_MAX_SIZE); + write_term(&mut vec, t); + vec +} + +pub fn encode_term_pair(t1: &EncodedTerm, t2: &EncodedTerm) -> Vec { + let mut vec = Vec::with_capacity(2 * WRITTEN_TERM_MAX_SIZE); + write_term(&mut vec, t1); + write_term(&mut vec, t2); + vec +} + +pub fn encode_term_triple(t1: &EncodedTerm, t2: &EncodedTerm, t3: &EncodedTerm) -> Vec { + let mut vec = Vec::with_capacity(3 * WRITTEN_TERM_MAX_SIZE); + write_term(&mut vec, t1); + write_term(&mut vec, t2); + write_term(&mut vec, t3); + vec +} + +pub fn encode_term_quad( + t1: &EncodedTerm, + t2: &EncodedTerm, + t3: &EncodedTerm, + t4: &EncodedTerm, +) -> Vec { + let mut vec = Vec::with_capacity(4 * WRITTEN_TERM_MAX_SIZE); + write_term(&mut vec, t1); + write_term(&mut vec, t2); + write_term(&mut vec, t3); + write_term(&mut vec, t4); + vec +} + +pub fn write_term(sink: &mut Vec, term: &EncodedTerm) { + match term { + EncodedTerm::DefaultGraph => (), + EncodedTerm::NamedNode { iri_id } => { + sink.push(TYPE_NAMED_NODE_ID); + sink.extend_from_slice(&iri_id.to_be_bytes()); + } + EncodedTerm::NumericalBlankNode { id } => { + sink.push(TYPE_NUMERICAL_BLANK_NODE_ID); + sink.extend_from_slice(&id.to_be_bytes()) + } + EncodedTerm::SmallBlankNode(id) => { + sink.push(TYPE_SMALL_BLANK_NODE_ID); + sink.extend_from_slice(&id.to_be_bytes()) + } + EncodedTerm::BigBlankNode { id_id } => { + sink.push(TYPE_BIG_BLANK_NODE_ID); + sink.extend_from_slice(&id_id.to_be_bytes()); + } + EncodedTerm::SmallStringLiteral(value) => { + sink.push(TYPE_SMALL_STRING_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::BigStringLiteral { value_id } => { + sink.push(TYPE_BIG_STRING_LITERAL); + sink.extend_from_slice(&value_id.to_be_bytes()); + } + EncodedTerm::SmallSmallLangStringLiteral { value, language } => { + sink.push(TYPE_SMALL_SMALL_LANG_STRING_LITERAL); + 
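+            // The language tag is serialized before the value, mirroring
+            // read_term() above, which fills language_buffer first.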
sink.extend_from_slice(&language.to_be_bytes()); + sink.extend_from_slice(&value.to_be_bytes()); + } + EncodedTerm::SmallBigLangStringLiteral { value, language_id } => { + sink.push(TYPE_SMALL_BIG_LANG_STRING_LITERAL); + sink.extend_from_slice(&language_id.to_be_bytes()); + sink.extend_from_slice(&value.to_be_bytes()); + } + EncodedTerm::BigSmallLangStringLiteral { value_id, language } => { + sink.push(TYPE_BIG_SMALL_LANG_STRING_LITERAL); + sink.extend_from_slice(&language.to_be_bytes()); + sink.extend_from_slice(&value_id.to_be_bytes()); + } + EncodedTerm::BigBigLangStringLiteral { + value_id, + language_id, + } => { + sink.push(TYPE_BIG_BIG_LANG_STRING_LITERAL); + sink.extend_from_slice(&language_id.to_be_bytes()); + sink.extend_from_slice(&value_id.to_be_bytes()); + } + EncodedTerm::SmallTypedLiteral { value, datatype_id } => { + sink.push(TYPE_SMALL_TYPED_LITERAL); + sink.extend_from_slice(&datatype_id.to_be_bytes()); + sink.extend_from_slice(&value.to_be_bytes()); + } + EncodedTerm::BigTypedLiteral { + value_id, + datatype_id, + } => { + sink.push(TYPE_BIG_TYPED_LITERAL); + sink.extend_from_slice(&datatype_id.to_be_bytes()); + sink.extend_from_slice(&value_id.to_be_bytes()); + } + EncodedTerm::BooleanLiteral(value) => sink.push(if bool::from(*value) { + TYPE_BOOLEAN_LITERAL_TRUE + } else { + TYPE_BOOLEAN_LITERAL_FALSE + }), + EncodedTerm::FloatLiteral(value) => { + sink.push(TYPE_FLOAT_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::DoubleLiteral(value) => { + sink.push(TYPE_DOUBLE_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::IntegerLiteral(value) => { + sink.push(TYPE_INTEGER_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::DecimalLiteral(value) => { + sink.push(TYPE_DECIMAL_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::DateTimeLiteral(value) => { + sink.push(TYPE_DATE_TIME_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::TimeLiteral(value) => { + sink.push(TYPE_TIME_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::DurationLiteral(value) => { + sink.push(TYPE_DURATION_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::DateLiteral(value) => { + sink.push(TYPE_DATE_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::GYearMonthLiteral(value) => { + sink.push(TYPE_G_YEAR_MONTH_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::GYearLiteral(value) => { + sink.push(TYPE_G_YEAR_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::GMonthDayLiteral(value) => { + sink.push(TYPE_G_MONTH_DAY_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::GDayLiteral(value) => { + sink.push(TYPE_G_DAY_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::GMonthLiteral(value) => { + sink.push(TYPE_G_MONTH_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::YearMonthDurationLiteral(value) => { + sink.push(TYPE_YEAR_MONTH_DURATION_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::DayTimeDurationLiteral(value) => { + sink.push(TYPE_DAY_TIME_DURATION_LITERAL); + sink.extend_from_slice(&value.to_be_bytes()) + } + EncodedTerm::Triple(value) => { + sink.push(TYPE_TRIPLE); + write_term(sink, &value.subject); + write_term(sink, &value.predicate); + write_term(sink, &value.object); + } + } +} + +#[cfg(test)] +#[allow(clippy::panic_in_result_fn)] +mod tests { + 
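+    // A minimal extra check (not part of the original suite, relying only on
+    // items visible in this file): boolean literals are encoded as a bare type
+    // byte with no payload, so `write_term` should emit a single byte for `true`.
+    #[test]
+    fn test_boolean_literal_is_a_single_type_byte() {
+        let mut buffer = Vec::new();
+        write_term(&mut buffer, &true.into());
+        assert_eq!(buffer, vec![TYPE_BOOLEAN_LITERAL_TRUE]);
+    }
+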
use super::*; + use crate::model::TermRef; + use crate::storage::numeric_encoder::*; + use std::cell::RefCell; + use std::collections::HashMap; + + #[derive(Default)] + struct MemoryStrStore { + id2str: RefCell>, + } + + impl StrLookup for MemoryStrStore { + fn get_str(&self, key: &StrHash) -> Result, StorageError> { + Ok(self.id2str.borrow().get(key).cloned()) + } + } + + impl MemoryStrStore { + fn insert_term(&self, term: TermRef<'_>, encoded: &EncodedTerm) { + insert_term(term, encoded, &mut |h, v| { + self.insert_str(h, v); + Ok(()) + }) + .unwrap(); + } + + fn insert_str(&self, key: &StrHash, value: &str) { + self.id2str + .borrow_mut() + .entry(*key) + .or_insert_with(|| value.to_owned()); + } + } + + #[test] + fn test_encoding() { + use crate::model::vocab::xsd; + use crate::model::*; + + let store = MemoryStrStore::default(); + let terms: Vec = vec![ + NamedNode::new_unchecked("http://foo.com").into(), + NamedNode::new_unchecked("http://bar.com").into(), + NamedNode::new_unchecked("http://foo.com").into(), + BlankNode::default().into(), + BlankNode::new_unchecked("bnode").into(), + BlankNode::new_unchecked("foo-bnode-thisisaverylargeblanknode").into(), + Literal::new_simple_literal("literal").into(), + BlankNode::new_unchecked("foo-literal-thisisaverylargestringliteral").into(), + Literal::from(true).into(), + Literal::from(1.2).into(), + Literal::from(1).into(), + Literal::from("foo-string").into(), + Literal::new_language_tagged_literal_unchecked("foo-fr", "fr").into(), + Literal::new_language_tagged_literal_unchecked( + "foo-fr-literal-thisisaverylargelanguagetaggedstringliteral", + "fr", + ) + .into(), + Literal::new_language_tagged_literal_unchecked( + "foo-big", + "fr-FR-Latn-x-foo-bar-baz-bat-aaaa-bbbb-cccc", + ) + .into(), + Literal::new_language_tagged_literal_unchecked( + "foo-big-literal-thisisaverylargelanguagetaggedstringliteral", + "fr-FR-Latn-x-foo-bar-baz-bat-aaaa-bbbb-cccc", + ) + .into(), + Literal::new_typed_literal("-1.32", xsd::DECIMAL).into(), + Literal::new_typed_literal("2020-01-01T01:01:01Z", xsd::DATE_TIME).into(), + Literal::new_typed_literal("2020-01-01", xsd::DATE).into(), + Literal::new_typed_literal("01:01:01Z", xsd::TIME).into(), + Literal::new_typed_literal("2020-01", xsd::G_YEAR_MONTH).into(), + Literal::new_typed_literal("2020", xsd::G_YEAR).into(), + Literal::new_typed_literal("--01-01", xsd::G_MONTH_DAY).into(), + Literal::new_typed_literal("--01", xsd::G_MONTH).into(), + Literal::new_typed_literal("---01", xsd::G_DAY).into(), + Literal::new_typed_literal("PT1S", xsd::DURATION).into(), + Literal::new_typed_literal("PT1S", xsd::DAY_TIME_DURATION).into(), + Literal::new_typed_literal("P1Y", xsd::YEAR_MONTH_DURATION).into(), + Literal::new_typed_literal("-foo", NamedNode::new_unchecked("http://foo.com")).into(), + Literal::new_typed_literal( + "-foo-thisisaverybigtypedliteralwiththefoodatatype", + NamedNode::new_unchecked("http://foo.com"), + ) + .into(), + Triple::new( + NamedNode::new_unchecked("http://foo.com"), + NamedNode::new_unchecked("http://bar.com"), + Literal::from(true), + ) + .into(), + ]; + for term in terms { + let encoded = term.as_ref().into(); + store.insert_term(term.as_ref(), &encoded); + assert_eq!(encoded, term.as_ref().into()); + assert_eq!(term, store.decode_term(&encoded).unwrap()); + + let mut buffer = Vec::new(); + write_term(&mut buffer, &encoded); + assert_eq!(encoded, buffer.as_slice().read_term().unwrap()); + } + } +} diff --git a/src/storage/error.rs b/src/storage/error.rs new file mode 100644 index 0000000..d58d031 
--- /dev/null
+++ b/src/storage/error.rs
@@ -0,0 +1,139 @@
+use crate::io::{RdfFormat, RdfParseError};
+use crate::storage::numeric_encoder::EncodedTerm;
+use oxiri::IriParseError;
+use oxrdf::TermRef;
+use std::error::Error;
+use std::io;
+
+/// An error related to storage operations (reads, writes...).
+#[derive(Debug, thiserror::Error)]
+#[non_exhaustive]
+pub enum StorageError {
+    /// Error from the OS I/O layer.
+    #[error(transparent)]
+    Io(#[from] io::Error),
+    /// Error related to data corruption.
+    #[error(transparent)]
+    Corruption(#[from] CorruptionError),
+    #[doc(hidden)]
+    #[error("{0}")]
+    Other(#[source] Box<dyn Error + Send + Sync>),
+}
+
+impl From<StorageError> for io::Error {
+    #[inline]
+    fn from(error: StorageError) -> Self {
+        match error {
+            StorageError::Io(error) => error,
+            StorageError::Corruption(error) => error.into(),
+            StorageError::Other(error) => Self::new(io::ErrorKind::Other, error),
+        }
+    }
+}
+
+/// An error returned if some content in the database is corrupted.
+#[derive(Debug, thiserror::Error)]
+#[error(transparent)]
+pub struct CorruptionError(#[from] CorruptionErrorKind);
+
+/// An error returned if some content in the database is corrupted.
+#[derive(Debug, thiserror::Error)]
+enum CorruptionErrorKind {
+    #[error("{0}")]
+    Msg(String),
+    #[error("{0}")]
+    Other(#[source] Box<dyn Error + Send + Sync>),
+}
+
+impl CorruptionError {
+    /// Builds an error from a printable error message.
+    #[inline]
+    pub(crate) fn new(error: impl Into<Box<dyn Error + Send + Sync>>) -> Self {
+        Self(CorruptionErrorKind::Other(error.into()))
+    }
+
+    #[inline]
+    pub(crate) fn from_encoded_term(encoded: &EncodedTerm, term: &TermRef<'_>) -> Self {
+        // TODO: eventually use a dedicated error enum value
+        Self::msg(format!("Invalid term encoding {encoded:?} for {term}"))
+    }
+
+    #[inline]
+    pub(crate) fn from_missing_column_family_name(name: &'static str) -> Self {
+        // TODO: eventually use a dedicated error enum value
+        Self::msg(format!("Column family {name} does not exist"))
+    }
+
+    /// Builds an error from a printable error message.
+    #[inline]
+    pub(crate) fn msg(msg: impl Into<String>) -> Self {
+        Self(CorruptionErrorKind::Msg(msg.into()))
+    }
+}
+
+impl From<CorruptionError> for io::Error {
+    #[inline]
+    fn from(error: CorruptionError) -> Self {
+        Self::new(io::ErrorKind::InvalidData, error)
+    }
+}
+
+/// An error raised while loading a file into a [`Store`](crate::store::Store).
+#[derive(Debug, thiserror::Error)]
+pub enum LoaderError {
+    /// An error raised while reading the file.
+    #[error(transparent)]
+    Parsing(#[from] RdfParseError),
+    /// An error raised during the insertion in the store.
+    #[error(transparent)]
+    Storage(#[from] StorageError),
+    /// The base IRI is invalid.
+    #[error("Invalid base IRI '{iri}': {error}")]
+    InvalidBaseIri {
+        /// The IRI itself.
+        iri: String,
+        /// The parsing error.
+        #[source]
+        error: IriParseError,
+    },
+}
+
+impl From<LoaderError> for io::Error {
+    #[inline]
+    fn from(error: LoaderError) -> Self {
+        match error {
+            LoaderError::Storage(error) => error.into(),
+            LoaderError::Parsing(error) => error.into(),
+            LoaderError::InvalidBaseIri { .. } => {
+                Self::new(io::ErrorKind::InvalidInput, error.to_string())
+            }
+        }
+    }
+}
+
+/// An error raised while writing a file from a [`Store`](crate::store::Store).
+#[derive(Debug, thiserror::Error)]
+pub enum SerializerError {
+    /// An error raised while writing the content.
+    #[error(transparent)]
+    Io(#[from] io::Error),
+    /// An error raised during the lookup in the store.
+    #[error(transparent)]
+    Storage(#[from] StorageError),
+    /// A format compatible with [RDF dataset](https://www.w3.org/TR/rdf11-concepts/#dfn-rdf-dataset) is required.
+    #[error("An RDF format supporting datasets was expected, {0} found")]
+    DatasetFormatExpected(RdfFormat),
+}
+
+impl From<SerializerError> for io::Error {
+    #[inline]
+    fn from(error: SerializerError) -> Self {
+        match error {
+            SerializerError::Storage(error) => error.into(),
+            SerializerError::Io(error) => error,
+            SerializerError::DatasetFormatExpected(_) => {
+                Self::new(io::ErrorKind::InvalidInput, error.to_string())
+            }
+        }
+    }
+}
diff --git a/src/storage/mod.rs b/src/storage/mod.rs
new file mode 100644
index 0000000..8dc332e
--- /dev/null
+++ b/src/storage/mod.rs
@@ -0,0 +1,1552 @@
+#![allow(clippy::same_name_method)]
+#[cfg(all(not(target_family = "wasm")))]
+use crate::model::Quad;
+use crate::model::{GraphNameRef, NamedOrBlankNodeRef, QuadRef, TermRef};
+use crate::storage::backend::{Reader, Transaction};
+#[cfg(all(not(target_family = "wasm")))]
+use crate::storage::binary_encoder::LATEST_STORAGE_VERSION;
+use crate::storage::binary_encoder::{
+    decode_term, encode_term, encode_term_pair, encode_term_quad, encode_term_triple,
+    write_gosp_quad, write_gpos_quad, write_gspo_quad, write_osp_quad, write_ospg_quad,
+    write_pos_quad, write_posg_quad, write_spo_quad, write_spog_quad, write_term, QuadEncoding,
+    WRITTEN_TERM_MAX_SIZE,
+};
+pub use crate::storage::error::{CorruptionError, LoaderError, SerializerError, StorageError};
+#[cfg(all(not(target_family = "wasm")))]
+use crate::storage::numeric_encoder::Decoder;
+use crate::storage::numeric_encoder::{insert_term, EncodedQuad, EncodedTerm, StrHash, StrLookup};
+use backend::{ColumnFamily, ColumnFamilyDefinition, Db, Iter};
+#[cfg(all(not(target_family = "wasm")))]
+use std::collections::VecDeque;
+#[cfg(all(not(target_family = "wasm")))]
+use std::collections::{HashMap, HashSet};
+use std::error::Error;
+#[cfg(all(not(target_family = "wasm")))]
+use std::mem::{swap, take};
+#[cfg(all(not(target_family = "wasm")))]
+use std::path::{Path, PathBuf};
+#[cfg(all(not(target_family = "wasm")))]
+use std::sync::Mutex;
+#[cfg(all(not(target_family = "wasm")))]
+use std::{io, thread};
+
+mod backend;
+mod binary_encoder;
+mod error;
+pub mod numeric_encoder;
+pub mod small_string;
+
+const ID2STR_CF: &str = "id2str";
+const SPOG_CF: &str = "spog";
+const POSG_CF: &str = "posg";
+const OSPG_CF: &str = "ospg";
+const GSPO_CF: &str = "gspo";
+const GPOS_CF: &str = "gpos";
+const GOSP_CF: &str = "gosp";
+const DSPO_CF: &str = "dspo";
+const DPOS_CF: &str = "dpos";
+const DOSP_CF: &str = "dosp";
+const GRAPHS_CF: &str = "graphs";
+#[cfg(all(not(target_family = "wasm")))]
+const DEFAULT_CF: &str = "default";
+#[cfg(all(not(target_family = "wasm")))]
+const DEFAULT_BULK_LOAD_BATCH_SIZE: usize = 1_000_000;
+
+/// Low-level storage primitives
+#[derive(Clone)]
+pub struct Storage {
+    db: Db,
+    #[cfg(all(not(target_family = "wasm")))]
+    default_cf: ColumnFamily,
+    id2str_cf: ColumnFamily,
+    spog_cf: ColumnFamily,
+    posg_cf: ColumnFamily,
+    ospg_cf: ColumnFamily,
+    gspo_cf: ColumnFamily,
+    gpos_cf: ColumnFamily,
+    gosp_cf: ColumnFamily,
+    dspo_cf: ColumnFamily,
+    dpos_cf: ColumnFamily,
+    dosp_cf: ColumnFamily,
+    graphs_cf: ColumnFamily,
+}
+
+impl Storage {
+    pub fn new() -> Result<Self, StorageError> {
+        Self::setup(Db::new(Self::column_families())?)
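+        // new() wires the column families onto the in-memory backend, while the
+        // open* constructors below map the same families onto an on-disk
+        // RocksDB. Their `key: Option<[u8; 32]>` parameter appears to be an
+        // at-rest encryption key added by this fork (inferred from the
+        // signature; not a documented API).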
+ } + + #[cfg(all(not(target_family = "wasm")))] + pub fn open(path: &Path, key: Option<[u8; 32]>) -> Result { + Self::setup(Db::open_read_write( + Some(path), + Self::column_families(), + key, + )?) + } + + // #[cfg(all(not(target_family = "wasm")))] + // pub fn open_secondary(primary_path: &Path) -> Result { + // Self::setup(Db::open_secondary( + // primary_path, + // None, + // Self::column_families(), + // )?) + // } + + // #[cfg(all(not(target_family = "wasm")))] + // pub fn open_persistent_secondary( + // primary_path: &Path, + // secondary_path: &Path, + // ) -> Result { + // Self::setup(Db::open_secondary( + // primary_path, + // Some(secondary_path), + // Self::column_families(), + // )?) + // } + + #[cfg(all(not(target_family = "wasm")))] + pub fn open_read_only(path: &Path, key: Option<[u8; 32]>) -> Result { + Self::setup(Db::open_read_only(path, Self::column_families(), key)?) + } + + fn column_families() -> Vec { + vec![ + ColumnFamilyDefinition { + name: ID2STR_CF, + use_iter: false, + min_prefix_size: 0, + unordered_writes: true, + }, + ColumnFamilyDefinition { + name: SPOG_CF, + use_iter: true, + min_prefix_size: 17, // named or blank node start + unordered_writes: false, + }, + ColumnFamilyDefinition { + name: POSG_CF, + use_iter: true, + min_prefix_size: 17, // named node start + unordered_writes: false, + }, + ColumnFamilyDefinition { + name: OSPG_CF, + use_iter: true, + min_prefix_size: 0, // There are small literals... + unordered_writes: false, + }, + ColumnFamilyDefinition { + name: GSPO_CF, + use_iter: true, + min_prefix_size: 17, // named or blank node start + unordered_writes: false, + }, + ColumnFamilyDefinition { + name: GPOS_CF, + use_iter: true, + min_prefix_size: 17, // named or blank node start + unordered_writes: false, + }, + ColumnFamilyDefinition { + name: GOSP_CF, + use_iter: true, + min_prefix_size: 17, // named or blank node start + unordered_writes: false, + }, + ColumnFamilyDefinition { + name: DSPO_CF, + use_iter: true, + min_prefix_size: 17, // named or blank node start + unordered_writes: false, + }, + ColumnFamilyDefinition { + name: DPOS_CF, + use_iter: true, + min_prefix_size: 17, // named or blank node start + unordered_writes: false, + }, + ColumnFamilyDefinition { + name: DOSP_CF, + use_iter: true, + min_prefix_size: 0, // There are small literals... 
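+                // A reading of these prefix sizes (not a documented contract):
+                // named and blank nodes serialize as 1 type byte + a 16-byte id,
+                // hence the 17 used above, while object-first keys can start with
+                // e.g. a boolean literal that is a single type byte, so no useful
+                // minimum prefix exists here.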
+ unordered_writes: false, + }, + ColumnFamilyDefinition { + name: GRAPHS_CF, + use_iter: true, + min_prefix_size: 17, // named or blank node start + unordered_writes: false, + }, + ] + } + + fn setup(db: Db) -> Result { + let this = Self { + #[cfg(all(not(target_family = "wasm")))] + default_cf: db.column_family(DEFAULT_CF)?, + id2str_cf: db.column_family(ID2STR_CF)?, + spog_cf: db.column_family(SPOG_CF)?, + posg_cf: db.column_family(POSG_CF)?, + ospg_cf: db.column_family(OSPG_CF)?, + gspo_cf: db.column_family(GSPO_CF)?, + gpos_cf: db.column_family(GPOS_CF)?, + gosp_cf: db.column_family(GOSP_CF)?, + dspo_cf: db.column_family(DSPO_CF)?, + dpos_cf: db.column_family(DPOS_CF)?, + dosp_cf: db.column_family(DOSP_CF)?, + graphs_cf: db.column_family(GRAPHS_CF)?, + db, + }; + #[cfg(all(not(target_family = "wasm")))] + this.migrate()?; + Ok(this) + } + + #[cfg(all(not(target_family = "wasm")))] + fn migrate(&self) -> Result<(), StorageError> { + let mut version = self.ensure_version()?; + if version == 0 { + // We migrate to v1 + let mut graph_names = HashSet::new(); + for quad in self.snapshot().quads() { + let quad = quad?; + if !quad.graph_name.is_default_graph() { + graph_names.insert(quad.graph_name); + } + } + let mut graph_names = graph_names + .into_iter() + .map(|g| encode_term(&g)) + .collect::>(); + graph_names.sort_unstable(); + let mut stt_file = self.db.new_sst_file()?; + for k in graph_names { + stt_file.insert_empty(&k)?; + } + self.db + .insert_stt_files(&[(&self.graphs_cf, stt_file.finish()?)])?; + version = 1; + self.update_version(version)?; + } + + match version { + _ if version < LATEST_STORAGE_VERSION => Err(CorruptionError::msg(format!( + "The RocksDB database is using the outdated encoding version {version}. Automated migration is not supported, please dump the store dataset using a compatible Oxigraph version and load it again using the current version" + + )).into()), + LATEST_STORAGE_VERSION => Ok(()), + _ => Err(CorruptionError::msg(format!( + "The RocksDB database is using the too recent version {version}. Upgrade to the latest Oxigraph version to load this database" + + )).into()) + } + } + + #[cfg(all(not(target_family = "wasm")))] + fn ensure_version(&self) -> Result { + Ok( + if let Some(version) = self.db.get(&self.default_cf, b"oxversion")? { + u64::from_be_bytes(version.as_ref().try_into().map_err(|e| { + CorruptionError::new(format!("Error while parsing the version key: {e}")) + })?) 
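+                // The version tag lives under the plain key b"oxversion" in the
+                // default column family, as an 8-byte big-endian u64 (see
+                // update_version below).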
+ } else { + self.update_version(LATEST_STORAGE_VERSION)?; + LATEST_STORAGE_VERSION + }, + ) + } + + #[cfg(all(not(target_family = "wasm")))] + fn update_version(&self, version: u64) -> Result<(), StorageError> { + self.db + .insert(&self.default_cf, b"oxversion", &version.to_be_bytes())?; + self.db.flush() + } + + pub fn snapshot(&self) -> StorageReader { + StorageReader { + reader: self.db.snapshot(), + storage: self.clone(), + } + } + + pub fn transaction<'a, 'b: 'a, T, E: Error + 'static + From>( + &'b self, + f: impl Fn(StorageWriter<'a>) -> Result, + ) -> Result { + self.db.transaction(|transaction| { + f(StorageWriter { + buffer: Vec::new(), + transaction, + storage: self, + }) + }) + } + + #[cfg(all(not(target_family = "wasm")))] + pub fn flush(&self) -> Result<(), StorageError> { + self.db.flush() + } + + #[cfg(all(not(target_family = "wasm")))] + pub fn compact(&self) -> Result<(), StorageError> { + self.db.compact(&self.default_cf)?; + self.db.compact(&self.gspo_cf)?; + self.db.compact(&self.gpos_cf)?; + self.db.compact(&self.gosp_cf)?; + self.db.compact(&self.spog_cf)?; + self.db.compact(&self.posg_cf)?; + self.db.compact(&self.ospg_cf)?; + self.db.compact(&self.dspo_cf)?; + self.db.compact(&self.dpos_cf)?; + self.db.compact(&self.dosp_cf)?; + self.db.compact(&self.id2str_cf) + } + + #[cfg(all(not(target_family = "wasm")))] + pub fn backup(&self, target_directory: &Path) -> Result<(), StorageError> { + self.db.backup(target_directory) + } +} + +pub struct StorageReader { + reader: Reader, + storage: Storage, +} + +impl StorageReader { + pub fn len(&self) -> Result { + Ok(self.reader.len(&self.storage.gspo_cf)? + self.reader.len(&self.storage.dspo_cf)?) + } + + pub fn is_empty(&self) -> Result { + Ok(self.reader.is_empty(&self.storage.gspo_cf)? + && self.reader.is_empty(&self.storage.dspo_cf)?) + } + + pub fn contains(&self, quad: &EncodedQuad) -> Result { + let mut buffer = Vec::with_capacity(4 * WRITTEN_TERM_MAX_SIZE); + if quad.graph_name.is_default_graph() { + write_spo_quad(&mut buffer, quad); + Ok(self.reader.contains_key(&self.storage.dspo_cf, &buffer)?) + } else { + write_gspo_quad(&mut buffer, quad); + Ok(self.reader.contains_key(&self.storage.gspo_cf, &buffer)?) 
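+            // Default-graph triples and named-graph quads live in disjoint
+            // column families (dspo vs. gspo), so a containment check only ever
+            // touches one of the two.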
+ } + } + + pub fn quads_for_pattern( + &self, + subject: Option<&EncodedTerm>, + predicate: Option<&EncodedTerm>, + object: Option<&EncodedTerm>, + graph_name: Option<&EncodedTerm>, + ) -> ChainedDecodingQuadIterator { + match subject { + Some(subject) => match predicate { + Some(predicate) => match object { + Some(object) => match graph_name { + Some(graph_name) => self.quads_for_subject_predicate_object_graph( + subject, predicate, object, graph_name, + ), + None => self.quads_for_subject_predicate_object(subject, predicate, object), + }, + None => match graph_name { + Some(graph_name) => { + self.quads_for_subject_predicate_graph(subject, predicate, graph_name) + } + None => self.quads_for_subject_predicate(subject, predicate), + }, + }, + None => match object { + Some(object) => match graph_name { + Some(graph_name) => { + self.quads_for_subject_object_graph(subject, object, graph_name) + } + None => self.quads_for_subject_object(subject, object), + }, + None => match graph_name { + Some(graph_name) => self.quads_for_subject_graph(subject, graph_name), + None => self.quads_for_subject(subject), + }, + }, + }, + None => match predicate { + Some(predicate) => match object { + Some(object) => match graph_name { + Some(graph_name) => { + self.quads_for_predicate_object_graph(predicate, object, graph_name) + } + None => self.quads_for_predicate_object(predicate, object), + }, + None => match graph_name { + Some(graph_name) => self.quads_for_predicate_graph(predicate, graph_name), + None => self.quads_for_predicate(predicate), + }, + }, + None => match object { + Some(object) => match graph_name { + Some(graph_name) => self.quads_for_object_graph(object, graph_name), + None => self.quads_for_object(object), + }, + None => match graph_name { + Some(graph_name) => self.quads_for_graph(graph_name), + None => self.quads(), + }, + }, + }, + } + } + + pub fn quads(&self) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::pair(self.dspo_quads(&[]), self.gspo_quads(&[])) + } + + fn quads_in_named_graph(&self) -> DecodingQuadIterator { + self.gspo_quads(&[]) + } + + fn quads_for_subject(&self, subject: &EncodedTerm) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::pair( + self.dspo_quads(&encode_term(subject)), + self.spog_quads(&encode_term(subject)), + ) + } + + fn quads_for_subject_predicate( + &self, + subject: &EncodedTerm, + predicate: &EncodedTerm, + ) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::pair( + self.dspo_quads(&encode_term_pair(subject, predicate)), + self.spog_quads(&encode_term_pair(subject, predicate)), + ) + } + + fn quads_for_subject_predicate_object( + &self, + subject: &EncodedTerm, + predicate: &EncodedTerm, + object: &EncodedTerm, + ) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::pair( + self.dspo_quads(&encode_term_triple(subject, predicate, object)), + self.spog_quads(&encode_term_triple(subject, predicate, object)), + ) + } + + fn quads_for_subject_object( + &self, + subject: &EncodedTerm, + object: &EncodedTerm, + ) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::pair( + self.dosp_quads(&encode_term_pair(object, subject)), + self.ospg_quads(&encode_term_pair(object, subject)), + ) + } + + fn quads_for_predicate(&self, predicate: &EncodedTerm) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::pair( + self.dpos_quads(&encode_term(predicate)), + self.posg_quads(&encode_term(predicate)), + ) + } + + fn quads_for_predicate_object( + &self, + predicate: &EncodedTerm, + object: 
&EncodedTerm, + ) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::pair( + self.dpos_quads(&encode_term_pair(predicate, object)), + self.posg_quads(&encode_term_pair(predicate, object)), + ) + } + + fn quads_for_object(&self, object: &EncodedTerm) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::pair( + self.dosp_quads(&encode_term(object)), + self.ospg_quads(&encode_term(object)), + ) + } + + fn quads_for_graph(&self, graph_name: &EncodedTerm) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::new(if graph_name.is_default_graph() { + self.dspo_quads(&Vec::default()) + } else { + self.gspo_quads(&encode_term(graph_name)) + }) + } + + fn quads_for_subject_graph( + &self, + subject: &EncodedTerm, + graph_name: &EncodedTerm, + ) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::new(if graph_name.is_default_graph() { + self.dspo_quads(&encode_term(subject)) + } else { + self.gspo_quads(&encode_term_pair(graph_name, subject)) + }) + } + + fn quads_for_subject_predicate_graph( + &self, + subject: &EncodedTerm, + predicate: &EncodedTerm, + graph_name: &EncodedTerm, + ) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::new(if graph_name.is_default_graph() { + self.dspo_quads(&encode_term_pair(subject, predicate)) + } else { + self.gspo_quads(&encode_term_triple(graph_name, subject, predicate)) + }) + } + + fn quads_for_subject_predicate_object_graph( + &self, + subject: &EncodedTerm, + predicate: &EncodedTerm, + object: &EncodedTerm, + graph_name: &EncodedTerm, + ) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::new(if graph_name.is_default_graph() { + self.dspo_quads(&encode_term_triple(subject, predicate, object)) + } else { + self.gspo_quads(&encode_term_quad(graph_name, subject, predicate, object)) + }) + } + + fn quads_for_subject_object_graph( + &self, + subject: &EncodedTerm, + object: &EncodedTerm, + graph_name: &EncodedTerm, + ) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::new(if graph_name.is_default_graph() { + self.dosp_quads(&encode_term_pair(object, subject)) + } else { + self.gosp_quads(&encode_term_triple(graph_name, object, subject)) + }) + } + + fn quads_for_predicate_graph( + &self, + predicate: &EncodedTerm, + graph_name: &EncodedTerm, + ) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::new(if graph_name.is_default_graph() { + self.dpos_quads(&encode_term(predicate)) + } else { + self.gpos_quads(&encode_term_pair(graph_name, predicate)) + }) + } + + fn quads_for_predicate_object_graph( + &self, + predicate: &EncodedTerm, + object: &EncodedTerm, + graph_name: &EncodedTerm, + ) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::new(if graph_name.is_default_graph() { + self.dpos_quads(&encode_term_pair(predicate, object)) + } else { + self.gpos_quads(&encode_term_triple(graph_name, predicate, object)) + }) + } + + fn quads_for_object_graph( + &self, + object: &EncodedTerm, + graph_name: &EncodedTerm, + ) -> ChainedDecodingQuadIterator { + ChainedDecodingQuadIterator::new(if graph_name.is_default_graph() { + self.dosp_quads(&encode_term(object)) + } else { + self.gosp_quads(&encode_term_pair(graph_name, object)) + }) + } + + pub fn named_graphs(&self) -> DecodingGraphIterator { + DecodingGraphIterator { + iter: self.reader.iter(&self.storage.graphs_cf).unwrap(), // TODO: propagate error? 
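+            // graphs_cf holds one empty-valued key per named graph (see
+            // insert_named_graph below), so listing graphs is a plain scan that
+            // never touches the quad indexes.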
+ } + } + + pub fn contains_named_graph(&self, graph_name: &EncodedTerm) -> Result { + self.reader + .contains_key(&self.storage.graphs_cf, &encode_term(graph_name)) + } + + fn spog_quads(&self, prefix: &[u8]) -> DecodingQuadIterator { + self.inner_quads(&self.storage.spog_cf, prefix, QuadEncoding::Spog) + } + + fn posg_quads(&self, prefix: &[u8]) -> DecodingQuadIterator { + self.inner_quads(&self.storage.posg_cf, prefix, QuadEncoding::Posg) + } + + fn ospg_quads(&self, prefix: &[u8]) -> DecodingQuadIterator { + self.inner_quads(&self.storage.ospg_cf, prefix, QuadEncoding::Ospg) + } + + fn gspo_quads(&self, prefix: &[u8]) -> DecodingQuadIterator { + self.inner_quads(&self.storage.gspo_cf, prefix, QuadEncoding::Gspo) + } + + fn gpos_quads(&self, prefix: &[u8]) -> DecodingQuadIterator { + self.inner_quads(&self.storage.gpos_cf, prefix, QuadEncoding::Gpos) + } + + fn gosp_quads(&self, prefix: &[u8]) -> DecodingQuadIterator { + self.inner_quads(&self.storage.gosp_cf, prefix, QuadEncoding::Gosp) + } + + fn dspo_quads(&self, prefix: &[u8]) -> DecodingQuadIterator { + self.inner_quads(&self.storage.dspo_cf, prefix, QuadEncoding::Dspo) + } + + fn dpos_quads(&self, prefix: &[u8]) -> DecodingQuadIterator { + self.inner_quads(&self.storage.dpos_cf, prefix, QuadEncoding::Dpos) + } + + fn dosp_quads(&self, prefix: &[u8]) -> DecodingQuadIterator { + self.inner_quads(&self.storage.dosp_cf, prefix, QuadEncoding::Dosp) + } + + fn inner_quads( + &self, + column_family: &ColumnFamily, + prefix: &[u8], + encoding: QuadEncoding, + ) -> DecodingQuadIterator { + DecodingQuadIterator { + iter: self.reader.scan_prefix(column_family, prefix).unwrap(), // TODO: propagate error? + encoding, + } + } + + #[cfg(all(not(target_family = "wasm")))] + pub fn get_str(&self, key: &StrHash) -> Result, StorageError> { + Ok(self + .storage + .db + .get(&self.storage.id2str_cf, &key.to_be_bytes())? + .map(|v| String::from_utf8(v.into())) + .transpose() + .map_err(CorruptionError::new)?) + } + + #[cfg(any(target_family = "wasm"))] + pub fn get_str(&self, key: &StrHash) -> Result, StorageError> { + Ok(self + .reader + .get(&self.storage.id2str_cf, &key.to_be_bytes())? + .map(String::from_utf8) + .transpose() + .map_err(CorruptionError::new)?) + } + + #[cfg(all(not(target_family = "wasm")))] + pub fn contains_str(&self, key: &StrHash) -> Result { + self.storage + .db + .contains_key(&self.storage.id2str_cf, &key.to_be_bytes()) + } + + #[cfg(any(target_family = "wasm"))] + pub fn contains_str(&self, key: &StrHash) -> Result { + self.reader + .contains_key(&self.storage.id2str_cf, &key.to_be_bytes()) + } + + /// Validates that all the storage invariants held in the data + #[cfg(all(not(target_family = "wasm")))] + pub fn validate(&self) -> Result<(), StorageError> { + // triples + let dspo_size = self.dspo_quads(&[]).count(); + if dspo_size != self.dpos_quads(&[]).count() || dspo_size != self.dosp_quads(&[]).count() { + return Err(CorruptionError::new( + "Not the same number of triples in dspo, dpos and dosp", + ) + .into()); + } + for spo in self.dspo_quads(&[]) { + let spo = spo?; + self.decode_quad(&spo)?; // We ensure that the quad is readable + if !self.storage.db.contains_key( + &self.storage.dpos_cf, + &encode_term_triple(&spo.predicate, &spo.object, &spo.subject), + )? { + return Err(CorruptionError::new("Quad in dspo and not in dpos").into()); + } + if !self.storage.db.contains_key( + &self.storage.dosp_cf, + &encode_term_triple(&spo.object, &spo.subject, &spo.predicate), + )? 
{
+                return Err(CorruptionError::new("Quad in dspo and not in dosp").into());
+            }
+        }
+
+        // quads
+        let gspo_size = self.gspo_quads(&[]).count();
+        if gspo_size != self.gpos_quads(&[]).count()
+            || gspo_size != self.gosp_quads(&[]).count()
+            || gspo_size != self.spog_quads(&[]).count()
+            || gspo_size != self.posg_quads(&[]).count()
+            || gspo_size != self.ospg_quads(&[]).count()
+        {
+            return Err(CorruptionError::new(
+                "Not the same number of quads in gspo, gpos, gosp, spog, posg and ospg",
+            )
+            .into());
+        }
+        for gspo in self.gspo_quads(&[]) {
+            let gspo = gspo?;
+            self.decode_quad(&gspo)?; // We ensure that the quad is readable
+            if !self.storage.db.contains_key(
+                &self.storage.gpos_cf,
+                &encode_term_quad(
+                    &gspo.graph_name,
+                    &gspo.predicate,
+                    &gspo.object,
+                    &gspo.subject,
+                ),
+            )? {
+                return Err(CorruptionError::new("Quad in gspo and not in gpos").into());
+            }
+            if !self.storage.db.contains_key(
+                &self.storage.gosp_cf,
+                &encode_term_quad(
+                    &gspo.graph_name,
+                    &gspo.object,
+                    &gspo.subject,
+                    &gspo.predicate,
+                ),
+            )? {
+                return Err(CorruptionError::new("Quad in gspo and not in gosp").into());
+            }
+            if !self.storage.db.contains_key(
+                &self.storage.spog_cf,
+                &encode_term_quad(
+                    &gspo.subject,
+                    &gspo.predicate,
+                    &gspo.object,
+                    &gspo.graph_name,
+                ),
+            )? {
+                return Err(CorruptionError::new("Quad in gspo and not in spog").into());
+            }
+            if !self.storage.db.contains_key(
+                &self.storage.posg_cf,
+                &encode_term_quad(
+                    &gspo.predicate,
+                    &gspo.object,
+                    &gspo.subject,
+                    &gspo.graph_name,
+                ),
+            )? {
+                return Err(CorruptionError::new("Quad in gspo and not in posg").into());
+            }
+            if !self.storage.db.contains_key(
+                &self.storage.ospg_cf,
+                &encode_term_quad(
+                    &gspo.object,
+                    &gspo.subject,
+                    &gspo.predicate,
+                    &gspo.graph_name,
+                ),
+            )? {
+                return Err(CorruptionError::new("Quad in gspo and not in ospg").into());
+            }
+            if !self
+                .storage
+                .db
+                .contains_key(&self.storage.graphs_cf, &encode_term(&gspo.graph_name))?
+ { + return Err( + CorruptionError::new("Quad graph name in gspo and not in graphs").into(), + ); + } + } + Ok(()) + } + + /// Validates that all the storage invariants held in the data + #[cfg(any(target_family = "wasm"))] + #[allow(clippy::unused_self, clippy::unnecessary_wraps)] + pub fn validate(&self) -> Result<(), StorageError> { + Ok(()) // TODO + } +} + +pub struct ChainedDecodingQuadIterator { + first: DecodingQuadIterator, + second: Option, +} + +impl ChainedDecodingQuadIterator { + fn new(first: DecodingQuadIterator) -> Self { + Self { + first, + second: None, + } + } + + fn pair(first: DecodingQuadIterator, second: DecodingQuadIterator) -> Self { + Self { + first, + second: Some(second), + } + } +} + +impl Iterator for ChainedDecodingQuadIterator { + type Item = Result; + + fn next(&mut self) -> Option { + if let Some(result) = self.first.next() { + Some(result) + } else if let Some(second) = self.second.as_mut() { + second.next() + } else { + None + } + } +} + +pub struct DecodingQuadIterator { + iter: Iter, + encoding: QuadEncoding, +} + +impl Iterator for DecodingQuadIterator { + type Item = Result; + + fn next(&mut self) -> Option { + if let Err(e) = self.iter.status() { + return Some(Err(e)); + } + let term = self.encoding.decode(self.iter.key()?); + self.iter.next(); + Some(term) + } +} + +pub struct DecodingGraphIterator { + iter: Iter, +} + +impl Iterator for DecodingGraphIterator { + type Item = Result; + + fn next(&mut self) -> Option { + if let Err(e) = self.iter.status() { + return Some(Err(e)); + } + let term = decode_term(self.iter.key()?); + self.iter.next(); + Some(term) + } +} + +impl StrLookup for StorageReader { + fn get_str(&self, key: &StrHash) -> Result, StorageError> { + self.get_str(key) + } +} + +pub struct StorageWriter<'a> { + buffer: Vec, + transaction: Transaction<'a>, + storage: &'a Storage, +} + +impl<'a> StorageWriter<'a> { + pub fn reader(&self) -> StorageReader { + StorageReader { + reader: self.transaction.reader(), + storage: self.storage.clone(), + } + } + + pub fn insert(&mut self, quad: QuadRef<'_>) -> Result { + let encoded = quad.into(); + self.buffer.clear(); + let result = if quad.graph_name.is_default_graph() { + write_spo_quad(&mut self.buffer, &encoded); + if self + .transaction + .contains_key_for_update(&self.storage.dspo_cf, &self.buffer)? + { + false + } else { + self.transaction + .insert_empty(&self.storage.dspo_cf, &self.buffer)?; + + self.buffer.clear(); + write_pos_quad(&mut self.buffer, &encoded); + self.transaction + .insert_empty(&self.storage.dpos_cf, &self.buffer)?; + + self.buffer.clear(); + write_osp_quad(&mut self.buffer, &encoded); + self.transaction + .insert_empty(&self.storage.dosp_cf, &self.buffer)?; + + self.insert_term(quad.subject.into(), &encoded.subject)?; + self.insert_term(quad.predicate.into(), &encoded.predicate)?; + self.insert_term(quad.object, &encoded.object)?; + true + } + } else { + write_spog_quad(&mut self.buffer, &encoded); + if self + .transaction + .contains_key_for_update(&self.storage.spog_cf, &self.buffer)? 
+ { + false + } else { + self.transaction + .insert_empty(&self.storage.spog_cf, &self.buffer)?; + + self.buffer.clear(); + write_posg_quad(&mut self.buffer, &encoded); + self.transaction + .insert_empty(&self.storage.posg_cf, &self.buffer)?; + + self.buffer.clear(); + write_ospg_quad(&mut self.buffer, &encoded); + self.transaction + .insert_empty(&self.storage.ospg_cf, &self.buffer)?; + + self.buffer.clear(); + write_gspo_quad(&mut self.buffer, &encoded); + self.transaction + .insert_empty(&self.storage.gspo_cf, &self.buffer)?; + + self.buffer.clear(); + write_gpos_quad(&mut self.buffer, &encoded); + self.transaction + .insert_empty(&self.storage.gpos_cf, &self.buffer)?; + + self.buffer.clear(); + write_gosp_quad(&mut self.buffer, &encoded); + self.transaction + .insert_empty(&self.storage.gosp_cf, &self.buffer)?; + + self.insert_term(quad.subject.into(), &encoded.subject)?; + self.insert_term(quad.predicate.into(), &encoded.predicate)?; + self.insert_term(quad.object, &encoded.object)?; + + self.buffer.clear(); + write_term(&mut self.buffer, &encoded.graph_name); + if !self + .transaction + .contains_key_for_update(&self.storage.graphs_cf, &self.buffer)? + { + self.transaction + .insert_empty(&self.storage.graphs_cf, &self.buffer)?; + self.insert_graph_name(quad.graph_name, &encoded.graph_name)?; + } + true + } + }; + Ok(result) + } + + pub fn insert_named_graph( + &mut self, + graph_name: NamedOrBlankNodeRef<'_>, + ) -> Result { + let encoded_graph_name = graph_name.into(); + + self.buffer.clear(); + write_term(&mut self.buffer, &encoded_graph_name); + let result = if self + .transaction + .contains_key_for_update(&self.storage.graphs_cf, &self.buffer)? + { + false + } else { + self.transaction + .insert_empty(&self.storage.graphs_cf, &self.buffer)?; + self.insert_term(graph_name.into(), &encoded_graph_name)?; + true + }; + Ok(result) + } + + fn insert_term( + &mut self, + term: TermRef<'_>, + encoded: &EncodedTerm, + ) -> Result<(), StorageError> { + insert_term(term, encoded, &mut |key, value| self.insert_str(key, value)) + } + + fn insert_graph_name( + &mut self, + graph_name: GraphNameRef<'_>, + encoded: &EncodedTerm, + ) -> Result<(), StorageError> { + match graph_name { + GraphNameRef::NamedNode(graph_name) => self.insert_term(graph_name.into(), encoded), + GraphNameRef::BlankNode(graph_name) => self.insert_term(graph_name.into(), encoded), + GraphNameRef::DefaultGraph => Ok(()), + } + } + + #[cfg(all(not(target_family = "wasm")))] + fn insert_str(&mut self, key: &StrHash, value: &str) -> Result<(), StorageError> { + if self + .storage + .db + .contains_key(&self.storage.id2str_cf, &key.to_be_bytes())? + { + return Ok(()); + } + self.storage.db.insert( + &self.storage.id2str_cf, + &key.to_be_bytes(), + value.as_bytes(), + ) + } + + #[cfg(any(target_family = "wasm"))] + fn insert_str(&mut self, key: &StrHash, value: &str) -> Result<(), StorageError> { + self.transaction.insert( + &self.storage.id2str_cf, + &key.to_be_bytes(), + value.as_bytes(), + ) + } + + pub fn remove(&mut self, quad: QuadRef<'_>) -> Result { + self.remove_encoded(&quad.into()) + } + + fn remove_encoded(&mut self, quad: &EncodedQuad) -> Result { + self.buffer.clear(); + let result = if quad.graph_name.is_default_graph() { + write_spo_quad(&mut self.buffer, quad); + + if self + .transaction + .contains_key_for_update(&self.storage.dspo_cf, &self.buffer)? 
+ { + self.transaction + .remove(&self.storage.dspo_cf, &self.buffer)?; + + self.buffer.clear(); + write_pos_quad(&mut self.buffer, quad); + self.transaction + .remove(&self.storage.dpos_cf, &self.buffer)?; + + self.buffer.clear(); + write_osp_quad(&mut self.buffer, quad); + self.transaction + .remove(&self.storage.dosp_cf, &self.buffer)?; + true + } else { + false + } + } else { + write_spog_quad(&mut self.buffer, quad); + + if self + .transaction + .contains_key_for_update(&self.storage.spog_cf, &self.buffer)? + { + self.transaction + .remove(&self.storage.spog_cf, &self.buffer)?; + + self.buffer.clear(); + write_posg_quad(&mut self.buffer, quad); + self.transaction + .remove(&self.storage.posg_cf, &self.buffer)?; + + self.buffer.clear(); + write_ospg_quad(&mut self.buffer, quad); + self.transaction + .remove(&self.storage.ospg_cf, &self.buffer)?; + + self.buffer.clear(); + write_gspo_quad(&mut self.buffer, quad); + self.transaction + .remove(&self.storage.gspo_cf, &self.buffer)?; + + self.buffer.clear(); + write_gpos_quad(&mut self.buffer, quad); + self.transaction + .remove(&self.storage.gpos_cf, &self.buffer)?; + + self.buffer.clear(); + write_gosp_quad(&mut self.buffer, quad); + self.transaction + .remove(&self.storage.gosp_cf, &self.buffer)?; + true + } else { + false + } + }; + Ok(result) + } + + pub fn clear_graph(&mut self, graph_name: GraphNameRef<'_>) -> Result<(), StorageError> { + if graph_name.is_default_graph() { + for quad in self.reader().quads_for_graph(&EncodedTerm::DefaultGraph) { + self.remove_encoded(&quad?)?; + } + } else { + self.buffer.clear(); + write_term(&mut self.buffer, &graph_name.into()); + if self + .transaction + .contains_key_for_update(&self.storage.graphs_cf, &self.buffer)? + { + // The condition is useful to lock the graph itself and ensure no quad is inserted at the same time + for quad in self.reader().quads_for_graph(&graph_name.into()) { + self.remove_encoded(&quad?)?; + } + } + } + Ok(()) + } + + pub fn clear_all_named_graphs(&mut self) -> Result<(), StorageError> { + for quad in self.reader().quads_in_named_graph() { + self.remove_encoded(&quad?)?; + } + Ok(()) + } + + pub fn clear_all_graphs(&mut self) -> Result<(), StorageError> { + for quad in self.reader().quads() { + self.remove_encoded(&quad?)?; + } + Ok(()) + } + + pub fn remove_named_graph( + &mut self, + graph_name: NamedOrBlankNodeRef<'_>, + ) -> Result { + self.remove_encoded_named_graph(&graph_name.into()) + } + + fn remove_encoded_named_graph( + &mut self, + graph_name: &EncodedTerm, + ) -> Result { + self.buffer.clear(); + write_term(&mut self.buffer, graph_name); + let result = if self + .transaction + .contains_key_for_update(&self.storage.graphs_cf, &self.buffer)? 
+ { + // The condition is done ASAP to lock the graph itself + for quad in self.reader().quads_for_graph(graph_name) { + self.remove_encoded(&quad?)?; + } + self.buffer.clear(); + write_term(&mut self.buffer, graph_name); + self.transaction + .remove(&self.storage.graphs_cf, &self.buffer)?; + true + } else { + false + }; + Ok(result) + } + + pub fn remove_all_named_graphs(&mut self) -> Result<(), StorageError> { + for graph_name in self.reader().named_graphs() { + self.remove_encoded_named_graph(&graph_name?)?; + } + Ok(()) + } + + pub fn clear(&mut self) -> Result<(), StorageError> { + for graph_name in self.reader().named_graphs() { + self.remove_encoded_named_graph(&graph_name?)?; + } + for quad in self.reader().quads() { + self.remove_encoded(&quad?)?; + } + Ok(()) + } +} + +#[cfg(all(not(target_family = "wasm")))] +#[must_use] +pub struct StorageBulkLoader { + storage: Storage, + hooks: Vec>, + num_threads: Option, + max_memory_size: Option, +} + +#[cfg(all(not(target_family = "wasm")))] +impl StorageBulkLoader { + pub fn new(storage: Storage) -> Self { + Self { + storage, + hooks: Vec::new(), + num_threads: None, + max_memory_size: None, + } + } + + pub fn with_num_threads(mut self, num_threads: usize) -> Self { + self.num_threads = Some(num_threads); + self + } + + pub fn with_max_memory_size_in_megabytes(mut self, max_memory_size: usize) -> Self { + self.max_memory_size = Some(max_memory_size); + self + } + + pub fn on_progress(mut self, callback: impl Fn(u64) + 'static) -> Self { + self.hooks.push(Box::new(callback)); + self + } + + #[allow(clippy::trait_duplication_in_bounds)] + pub fn load + From>( + &self, + quads: impl IntoIterator>, + ) -> Result<(), EO> { + let num_threads = self.num_threads.unwrap_or(2); + if num_threads < 2 { + return Err( + StorageError::Other("The bulk loader needs at least 2 threads".into()).into(), + ); + } + let batch_size = if let Some(max_memory_size) = self.max_memory_size { + max_memory_size * 1000 / num_threads + } else { + DEFAULT_BULK_LOAD_BATCH_SIZE + }; + if batch_size < 10_000 { + return Err(StorageError::Other( + "The bulk loader memory bound is too low. 
It needs at least 100MB".into(), + ) + .into()); + } + let done_counter = Mutex::new(0); + let mut done_and_displayed_counter = 0; + thread::scope(|thread_scope| { + let mut threads = VecDeque::with_capacity(num_threads - 1); + let mut buffer = Vec::with_capacity(batch_size); + for quad in quads { + let quad = quad?; + buffer.push(quad); + if buffer.len() >= batch_size { + self.spawn_load_thread( + &mut buffer, + &mut threads, + thread_scope, + &done_counter, + &mut done_and_displayed_counter, + num_threads, + batch_size, + )?; + } + } + self.spawn_load_thread( + &mut buffer, + &mut threads, + thread_scope, + &done_counter, + &mut done_and_displayed_counter, + num_threads, + batch_size, + )?; + for thread in threads { + map_thread_result(thread.join()).map_err(StorageError::Io)??; + self.on_possible_progress(&done_counter, &mut done_and_displayed_counter)?; + } + Ok(()) + }) + } + + fn spawn_load_thread<'scope>( + &'scope self, + buffer: &mut Vec, + threads: &mut VecDeque>>, + thread_scope: &'scope thread::Scope<'scope, '_>, + done_counter: &'scope Mutex, + done_and_displayed_counter: &mut u64, + num_threads: usize, + batch_size: usize, + ) -> Result<(), StorageError> { + self.on_possible_progress(done_counter, done_and_displayed_counter)?; + // We avoid to have too many threads + if threads.len() >= num_threads { + if let Some(thread) = threads.pop_front() { + map_thread_result(thread.join()).map_err(StorageError::Io)??; + self.on_possible_progress(done_counter, done_and_displayed_counter)?; + } + } + let mut buffer_to_load = Vec::with_capacity(batch_size); + swap(buffer, &mut buffer_to_load); + let storage = &self.storage; + threads.push_back(thread_scope.spawn(move || { + FileBulkLoader::new(storage, batch_size).load(buffer_to_load, done_counter) + })); + Ok(()) + } + + fn on_possible_progress( + &self, + done: &Mutex, + done_and_displayed: &mut u64, + ) -> Result<(), StorageError> { + let new_counter = *done + .lock() + .map_err(|_| io::Error::new(io::ErrorKind::Other, "Mutex poisoned"))?; + let display_step = DEFAULT_BULK_LOAD_BATCH_SIZE as u64; + if new_counter / display_step > *done_and_displayed / display_step { + for hook in &self.hooks { + hook(new_counter); + } + } + *done_and_displayed = new_counter; + Ok(()) + } +} + +#[cfg(all(not(target_family = "wasm")))] +struct FileBulkLoader<'a> { + storage: &'a Storage, + id2str: HashMap>, + quads: HashSet, + triples: HashSet, + graphs: HashSet, +} + +#[cfg(all(not(target_family = "wasm")))] +impl<'a> FileBulkLoader<'a> { + fn new(storage: &'a Storage, batch_size: usize) -> Self { + Self { + storage, + id2str: HashMap::with_capacity(3 * batch_size), + quads: HashSet::with_capacity(batch_size), + triples: HashSet::with_capacity(batch_size), + graphs: HashSet::default(), + } + } + + fn load(&mut self, quads: Vec, counter: &Mutex) -> Result<(), StorageError> { + self.encode(quads)?; + let size = self.triples.len() + self.quads.len(); + self.save()?; + *counter + .lock() + .map_err(|_| io::Error::new(io::ErrorKind::Other, "Mutex poisoned"))? 
+
+#[cfg(not(target_family = "wasm"))]
+struct FileBulkLoader<'a> {
+    storage: &'a Storage,
+    id2str: HashMap<StrHash, Box<str>>,
+    quads: HashSet<EncodedQuad>,
+    triples: HashSet<EncodedQuad>,
+    graphs: HashSet<EncodedTerm>,
+}
+
+#[cfg(not(target_family = "wasm"))]
+impl<'a> FileBulkLoader<'a> {
+    fn new(storage: &'a Storage, batch_size: usize) -> Self {
+        Self {
+            storage,
+            id2str: HashMap::with_capacity(3 * batch_size),
+            quads: HashSet::with_capacity(batch_size),
+            triples: HashSet::with_capacity(batch_size),
+            graphs: HashSet::default(),
+        }
+    }
+
+    fn load(&mut self, quads: Vec<Quad>, counter: &Mutex<u64>) -> Result<(), StorageError> {
+        self.encode(quads)?;
+        let size = self.triples.len() + self.quads.len();
+        self.save()?;
+        *counter
+            .lock()
+            .map_err(|_| io::Error::new(io::ErrorKind::Other, "Mutex poisoned"))? +=
+            size.try_into().unwrap_or(u64::MAX);
+        Ok(())
+    }
+
+    fn encode(&mut self, quads: Vec<Quad>) -> Result<(), StorageError> {
+        for quad in quads {
+            let encoded = EncodedQuad::from(quad.as_ref());
+            if quad.graph_name.is_default_graph() {
+                if self.triples.insert(encoded.clone()) {
+                    self.insert_term(quad.subject.as_ref().into(), &encoded.subject)?;
+                    self.insert_term(quad.predicate.as_ref().into(), &encoded.predicate)?;
+                    self.insert_term(quad.object.as_ref(), &encoded.object)?;
+                }
+            } else if self.quads.insert(encoded.clone()) {
+                self.insert_term(quad.subject.as_ref().into(), &encoded.subject)?;
+                self.insert_term(quad.predicate.as_ref().into(), &encoded.predicate)?;
+                self.insert_term(quad.object.as_ref(), &encoded.object)?;
+
+                if self.graphs.insert(encoded.graph_name.clone()) {
+                    self.insert_term(
+                        match quad.graph_name.as_ref() {
+                            GraphNameRef::NamedNode(n) => n.into(),
+                            GraphNameRef::BlankNode(n) => n.into(),
+                            GraphNameRef::DefaultGraph => {
+                                return Err(CorruptionError::new(
+                                    "The default graph is not a named graph",
+                                )
+                                .into())
+                            }
+                        },
+                        &encoded.graph_name,
+                    )?;
+                }
+            }
+        }
+        Ok(())
+    }
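+
+    // Illustrative trace of `encode` for a single named-graph quad (values
+    // abbreviated; assumes the graph name has not been seen before):
+    //
+    //     input:   <http://e.com/s> <http://e.com/p> "o" <http://e.com/g>
+    //     quads   += EncodedQuad { subject, predicate, object, graph_name }
+    //     graphs  += encoded graph name (triggers one extra insert_term call)
+    //     id2str  += StrHash -> string, once per newly seen term string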
+
+    fn save(&mut self) -> Result<(), StorageError> {
+        let mut to_load = Vec::new();
+
+        // id2str
+        if !self.id2str.is_empty() {
+            let mut id2str = take(&mut self.id2str)
+                .into_iter()
+                .map(|(k, v)| (k.to_be_bytes(), v))
+                .collect::<Vec<_>>();
+            id2str.sort_unstable();
+            let mut id2str_sst = self.storage.db.new_sst_file()?;
+            for (k, v) in id2str {
+                id2str_sst.insert(&k, v.as_bytes())?;
+            }
+            to_load.push((&self.storage.id2str_cf, id2str_sst.finish()?));
+        }
+
+        if !self.triples.is_empty() {
+            to_load.push((
+                &self.storage.dspo_cf,
+                self.build_sst_for_keys(
+                    self.triples.iter().map(|quad| {
+                        encode_term_triple(&quad.subject, &quad.predicate, &quad.object)
+                    }),
+                )?,
+            ));
+            to_load.push((
+                &self.storage.dpos_cf,
+                self.build_sst_for_keys(
+                    self.triples.iter().map(|quad| {
+                        encode_term_triple(&quad.predicate, &quad.object, &quad.subject)
+                    }),
+                )?,
+            ));
+            to_load.push((
+                &self.storage.dosp_cf,
+                self.build_sst_for_keys(
+                    self.triples.iter().map(|quad| {
+                        encode_term_triple(&quad.object, &quad.subject, &quad.predicate)
+                    }),
+                )?,
+            ));
+            self.triples.clear();
+        }
+
+        if !self.quads.is_empty() {
+            to_load.push((
+                &self.storage.graphs_cf,
+                self.build_sst_for_keys(self.graphs.iter().map(encode_term))?,
+            ));
+            self.graphs.clear();
+
+            to_load.push((
+                &self.storage.gspo_cf,
+                self.build_sst_for_keys(self.quads.iter().map(|quad| {
+                    encode_term_quad(
+                        &quad.graph_name,
+                        &quad.subject,
+                        &quad.predicate,
+                        &quad.object,
+                    )
+                }))?,
+            ));
+            to_load.push((
+                &self.storage.gpos_cf,
+                self.build_sst_for_keys(self.quads.iter().map(|quad| {
+                    encode_term_quad(
+                        &quad.graph_name,
+                        &quad.predicate,
+                        &quad.object,
+                        &quad.subject,
+                    )
+                }))?,
+            ));
+            to_load.push((
+                &self.storage.gosp_cf,
+                self.build_sst_for_keys(self.quads.iter().map(|quad| {
+                    encode_term_quad(
+                        &quad.graph_name,
+                        &quad.object,
+                        &quad.subject,
+                        &quad.predicate,
+                    )
+                }))?,
+            ));
+            to_load.push((
+                &self.storage.spog_cf,
+                self.build_sst_for_keys(self.quads.iter().map(|quad| {
+                    encode_term_quad(
+                        &quad.subject,
+                        &quad.predicate,
+                        &quad.object,
+                        &quad.graph_name,
+                    )
+                }))?,
+            ));
+            to_load.push((
+                &self.storage.posg_cf,
+                self.build_sst_for_keys(self.quads.iter().map(|quad| {
+                    encode_term_quad(
+                        &quad.predicate,
+                        &quad.object,
+                        &quad.subject,
+                        &quad.graph_name,
+                    )
+                }))?,
+            ));
+            to_load.push((
+                &self.storage.ospg_cf,
+                self.build_sst_for_keys(self.quads.iter().map(|quad| {
+                    encode_term_quad(
+                        &quad.object,
+                        &quad.subject,
+                        &quad.predicate,
+                        &quad.graph_name,
+                    )
+                }))?,
+            ));
+            self.quads.clear();
+        }
+
+        self.storage.db.insert_stt_files(&to_load)
+    }
+
+    fn insert_term(
+        &mut self,
+        term: TermRef<'_>,
+        encoded: &EncodedTerm,
+    ) -> Result<(), StorageError> {
+        insert_term(term, encoded, &mut |key, value| {
+            self.id2str.entry(*key).or_insert_with(|| value.into());
+            Ok(())
+        })
+    }
+
+    fn build_sst_for_keys(
+        &self,
+        values: impl Iterator<Item = Vec<u8>>,
+    ) -> Result<PathBuf, StorageError> {
+        let mut values = values.collect::<Vec<_>>();
+        values.sort_unstable();
+        let mut sst = self.storage.db.new_sst_file()?;
+        for value in values {
+            sst.insert_empty(&value)?;
+        }
+        sst.finish()
+    }
+}
+
+#[cfg(not(target_family = "wasm"))]
+fn map_thread_result<R>(result: thread::Result<R>) -> io::Result<R> {
+    result.map_err(|e| {
+        io::Error::new(
+            io::ErrorKind::Other,
+            if let Ok(e) = e.downcast::<&dyn std::fmt::Display>() {
+                format!("A loader thread crashed with {e}")
+            } else {
+                "A loader thread crashed with an unknown error".into()
+            },
+        )
+    })
+}
diff --git a/src/storage/numeric_encoder.rs b/src/storage/numeric_encoder.rs
new file mode 100644
index 0000000..bf4b070
--- /dev/null
+++ b/src/storage/numeric_encoder.rs
@@ -0,0 +1,1031 @@
+#![allow(clippy::unreadable_literal)]
+
+use crate::model::*;
+use crate::storage::error::{CorruptionError, StorageError};
+use crate::storage::small_string::SmallString;
+use oxsdatatypes::*;
+use siphasher::sip128::{Hasher128, SipHasher24};
+use std::fmt::Debug;
+use std::hash::{Hash, Hasher};
+use std::str;
+use std::sync::Arc;
+
+#[derive(Eq, PartialEq, Debug, Clone, Copy, Hash)]
+#[repr(transparent)]
+pub struct StrHash {
+    hash: u128,
+}
+
+impl StrHash {
+    pub fn new(value: &str) -> Self {
+        let mut hasher = SipHasher24::new();
+        hasher.write(value.as_bytes());
+        Self {
+            hash: hasher.finish128().into(),
+        }
+    }
+
+    #[inline]
+    pub fn from_be_bytes(bytes: [u8; 16]) -> Self {
+        Self {
+            hash: u128::from_be_bytes(bytes),
+        }
+    }
+
+    #[inline]
+    pub fn to_be_bytes(self) -> [u8; 16] {
+        self.hash.to_be_bytes()
+    }
+}
+
+#[derive(Debug, Clone)]
+pub enum EncodedTerm {
+    DefaultGraph,
+    NamedNode {
+        iri_id: StrHash,
+    },
+    NumericalBlankNode {
+        id: u128,
+    },
+    SmallBlankNode(SmallString),
+    BigBlankNode {
+        id_id: StrHash,
+    },
+    SmallStringLiteral(SmallString),
+    BigStringLiteral {
+        value_id: StrHash,
+    },
+    SmallSmallLangStringLiteral {
+        value: SmallString,
+        language: SmallString,
+    },
+    SmallBigLangStringLiteral {
+        value: SmallString,
+        language_id: StrHash,
+    },
+    BigSmallLangStringLiteral {
+        value_id: StrHash,
+        language: SmallString,
+    },
+    BigBigLangStringLiteral {
+        value_id: StrHash,
+        language_id: StrHash,
+    },
+    SmallTypedLiteral {
+        value: SmallString,
+        datatype_id: StrHash,
+    },
+    BigTypedLiteral {
+        value_id: StrHash,
+        datatype_id: StrHash,
+    },
+    BooleanLiteral(Boolean),
+    FloatLiteral(Float),
+    DoubleLiteral(Double),
+    IntegerLiteral(Integer),
+    DecimalLiteral(Decimal),
+    DateTimeLiteral(DateTime),
+    TimeLiteral(Time),
+    DateLiteral(Date),
+    GYearMonthLiteral(GYearMonth),
+    GYearLiteral(GYear),
+    GMonthDayLiteral(GMonthDay),
+    GDayLiteral(GDay),
+    GMonthLiteral(GMonth),
+    DurationLiteral(Duration),
+    YearMonthDurationLiteral(YearMonthDuration),
+    DayTimeDurationLiteral(DayTimeDuration),
+    Triple(Arc<EncodedTriple>),
+}
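+
+// Explanatory note (inferred from the variants above, not upstream
+// documentation): strings short enough for `SmallString`'s inline buffer are
+// embedded directly in the term, while longer strings are replaced by their
+// 128-bit SipHash-2-4 `StrHash` and stored once in the id2str column family.
+// For example, assuming "chat" and "en" fit inline:
+//
+//     "chat"@en            -> SmallSmallLangStringLiteral { value: "chat", language: "en" }
+//     <http://example.com> -> NamedNode { iri_id: StrHash::new("http://example.com") }
+//
+// IRIs are always interned through `StrHash`; only literal and blank-node
+// strings get the small/big split.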
+
+impl PartialEq for EncodedTerm {
+    fn eq(&self, other: &Self) -> bool {
+        match (self, other) {
+            (Self::DefaultGraph, Self::DefaultGraph) => true,
+            (Self::NamedNode { iri_id: iri_id_a }, Self::NamedNode { iri_id: iri_id_b }) => {
+                iri_id_a == iri_id_b
+            }
+            (Self::NumericalBlankNode { id: id_a }, Self::NumericalBlankNode { id: id_b }) => {
+                id_a == id_b
+            }
+            (Self::SmallBlankNode(id_a), Self::SmallBlankNode(id_b)) => id_a == id_b,
+            (Self::BigBlankNode { id_id: id_a }, Self::BigBlankNode { id_id: id_b }) => {
+                id_a == id_b
+            }
+            (Self::SmallStringLiteral(a), Self::SmallStringLiteral(b)) => a == b,
+            (
+                Self::BigStringLiteral {
+                    value_id: value_id_a,
+                },
+                Self::BigStringLiteral {
+                    value_id: value_id_b,
+                },
+            ) => value_id_a == value_id_b,
+            (
+                Self::SmallSmallLangStringLiteral {
+                    value: value_a,
+                    language: language_a,
+                },
+                Self::SmallSmallLangStringLiteral {
+                    value: value_b,
+                    language: language_b,
+                },
+            ) => value_a == value_b && language_a == language_b,
+            (
+                Self::SmallBigLangStringLiteral {
+                    value: value_a,
+                    language_id: language_id_a,
+                },
+                Self::SmallBigLangStringLiteral {
+                    value: value_b,
+                    language_id: language_id_b,
+                },
+            ) => value_a == value_b && language_id_a == language_id_b,
+            (
+                Self::BigSmallLangStringLiteral {
+                    value_id: value_id_a,
+                    language: language_a,
+                },
+                Self::BigSmallLangStringLiteral {
+                    value_id: value_id_b,
+                    language: language_b,
+                },
+            ) => value_id_a == value_id_b && language_a == language_b,
+            (
+                Self::BigBigLangStringLiteral {
+                    value_id: value_id_a,
+                    language_id: language_id_a,
+                },
+                Self::BigBigLangStringLiteral {
+                    value_id: value_id_b,
+                    language_id: language_id_b,
+                },
+            ) => value_id_a == value_id_b && language_id_a == language_id_b,
+            (
+                Self::SmallTypedLiteral {
+                    value: value_a,
+                    datatype_id: datatype_id_a,
+                },
+                Self::SmallTypedLiteral {
+                    value: value_b,
+                    datatype_id: datatype_id_b,
+                },
+            ) => value_a == value_b && datatype_id_a == datatype_id_b,
+            (
+                Self::BigTypedLiteral {
+                    value_id: value_id_a,
+                    datatype_id: datatype_id_a,
+                },
+                Self::BigTypedLiteral {
+                    value_id: value_id_b,
+                    datatype_id: datatype_id_b,
+                },
+            ) => value_id_a == value_id_b && datatype_id_a == datatype_id_b,
+            (Self::BooleanLiteral(a), Self::BooleanLiteral(b)) => a == b,
+            (Self::FloatLiteral(a), Self::FloatLiteral(b)) => a.is_identical_with(*b),
+            (Self::DoubleLiteral(a), Self::DoubleLiteral(b)) => a.is_identical_with(*b),
+            (Self::IntegerLiteral(a), Self::IntegerLiteral(b)) => a.is_identical_with(*b),
+            (Self::DecimalLiteral(a), Self::DecimalLiteral(b)) => a.is_identical_with(*b),
+            (Self::DateTimeLiteral(a), Self::DateTimeLiteral(b)) => a.is_identical_with(*b),
+            (Self::TimeLiteral(a), Self::TimeLiteral(b)) => a.is_identical_with(*b),
+            (Self::DateLiteral(a), Self::DateLiteral(b)) => a.is_identical_with(*b),
+            (Self::GYearMonthLiteral(a), Self::GYearMonthLiteral(b)) => a.is_identical_with(*b),
+            (Self::GYearLiteral(a), Self::GYearLiteral(b)) => a.is_identical_with(*b),
+            (Self::GMonthDayLiteral(a), Self::GMonthDayLiteral(b)) => a.is_identical_with(*b),
+            (Self::GMonthLiteral(a), Self::GMonthLiteral(b)) => a.is_identical_with(*b),
+            (Self::GDayLiteral(a), Self::GDayLiteral(b)) => a.is_identical_with(*b),
+            (Self::DurationLiteral(a), Self::DurationLiteral(b)) => a.is_identical_with(*b),
+            (Self::YearMonthDurationLiteral(a), Self::YearMonthDurationLiteral(b)) => {
+                a.is_identical_with(*b)
+            }
+            (Self::DayTimeDurationLiteral(a), Self::DayTimeDurationLiteral(b)) => {
+                a.is_identical_with(*b)
+            }
+            (Self::Triple(a), Self::Triple(b)) => a == b,
+            (_, _) => false,
+        }
+    }
+}
+
+impl Eq for EncodedTerm {}
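+
+// Equality above is identity, not XSD value equality: `is_identical_with`
+// compares the exact value representation, so `0.0` and `-0.0` are distinct
+// terms and a NaN is identical with itself, unlike IEEE 754 `==`:
+//
+//     Double::from(0.0) == Double::from(-0.0)                  // true
+//     Double::from(0.0).is_identical_with(Double::from(-0.0))  // false
+//
+// This keeps `Eq` consistent with the manual `Hash` impl below, which hashes
+// floats through `to_be_bytes` since `f32`/`f64` implement no `Hash`.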
+
+impl Hash for EncodedTerm {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        match self {
+            Self::NamedNode { iri_id } => iri_id.hash(state),
+            Self::NumericalBlankNode { id } => id.hash(state),
+            Self::SmallBlankNode(id) => id.hash(state),
+            Self::BigBlankNode { id_id } => id_id.hash(state),
+            Self::DefaultGraph => (),
+            Self::SmallStringLiteral(value) => value.hash(state),
+            Self::BigStringLiteral { value_id } => value_id.hash(state),
+            Self::SmallSmallLangStringLiteral { value, language } => {
+                value.hash(state);
+                language.hash(state);
+            }
+            Self::SmallBigLangStringLiteral { value, language_id } => {
+                value.hash(state);
+                language_id.hash(state);
+            }
+            Self::BigSmallLangStringLiteral { value_id, language } => {
+                value_id.hash(state);
+                language.hash(state);
+            }
+            Self::BigBigLangStringLiteral {
+                value_id,
+                language_id,
+            } => {
+                value_id.hash(state);
+                language_id.hash(state);
+            }
+            Self::SmallTypedLiteral { value, datatype_id } => {
+                value.hash(state);
+                datatype_id.hash(state);
+            }
+            Self::BigTypedLiteral {
+                value_id,
+                datatype_id,
+            } => {
+                value_id.hash(state);
+                datatype_id.hash(state);
+            }
+            Self::BooleanLiteral(value) => value.hash(state),
+            Self::FloatLiteral(value) => value.to_be_bytes().hash(state),
+            Self::DoubleLiteral(value) => value.to_be_bytes().hash(state),
+            Self::IntegerLiteral(value) => value.hash(state),
+            Self::DecimalLiteral(value) => value.hash(state),
+            Self::DateTimeLiteral(value) => value.hash(state),
+            Self::TimeLiteral(value) => value.hash(state),
+            Self::DateLiteral(value) => value.hash(state),
+            Self::GYearMonthLiteral(value) => value.hash(state),
+            Self::GYearLiteral(value) => value.hash(state),
+            Self::GMonthDayLiteral(value) => value.hash(state),
+            Self::GDayLiteral(value) => value.hash(state),
+            Self::GMonthLiteral(value) => value.hash(state),
+            Self::DurationLiteral(value) => value.hash(state),
+            Self::YearMonthDurationLiteral(value) => value.hash(state),
+            Self::DayTimeDurationLiteral(value) => value.hash(state),
+            Self::Triple(value) => value.hash(state),
+        }
+    }
+}
+
+impl EncodedTerm {
+    pub fn is_named_node(&self) -> bool {
+        matches!(self, Self::NamedNode { .. })
+    }
+
+    pub fn is_blank_node(&self) -> bool {
+        matches!(
+            self,
+            Self::NumericalBlankNode { .. }
+                | Self::SmallBlankNode { .. }
+                | Self::BigBlankNode { .. }
+        )
+    }
+
+    pub fn is_literal(&self) -> bool {
+        matches!(
+            self,
+            Self::SmallStringLiteral { .. }
+                | Self::BigStringLiteral { .. }
+                | Self::SmallSmallLangStringLiteral { .. }
+                | Self::SmallBigLangStringLiteral { .. }
+                | Self::BigSmallLangStringLiteral { .. }
+                | Self::BigBigLangStringLiteral { .. }
+                | Self::SmallTypedLiteral { .. }
+                | Self::BigTypedLiteral { .. }
+                | Self::BooleanLiteral(_)
+                | Self::FloatLiteral(_)
+                | Self::DoubleLiteral(_)
+                | Self::IntegerLiteral(_)
+                | Self::DecimalLiteral(_)
+                | Self::DateTimeLiteral(_)
+                | Self::TimeLiteral(_)
+                | Self::DateLiteral(_)
+                | Self::GYearMonthLiteral(_)
+                | Self::GYearLiteral(_)
+                | Self::GMonthDayLiteral(_)
+                | Self::GDayLiteral(_)
+                | Self::GMonthLiteral(_)
+                | Self::DurationLiteral(_)
+                | Self::YearMonthDurationLiteral(_)
+                | Self::DayTimeDurationLiteral(_)
+        )
+    }
+
+    pub fn is_unknown_typed_literal(&self) -> bool {
+        matches!(
+            self,
+            Self::SmallTypedLiteral { .. } | Self::BigTypedLiteral { .. }
+        )
+    }
+
+    pub fn is_default_graph(&self) -> bool {
+        matches!(self, Self::DefaultGraph)
+    }
+
+    pub fn is_triple(&self) -> bool {
+        matches!(self, Self::Triple { .. })
+    }
+}
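+
+// The `From` impls below build already-encoded literals directly from Rust
+// primitives and oxsdatatypes values, with no string interning involved.
+// Illustrative usage:
+//
+//     let n: EncodedTerm = 42_i64.into(); // EncodedTerm::IntegerLiteral(42.into())
+//     let b: EncodedTerm = true.into();   // EncodedTerm::BooleanLiteral(true.into())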
+
+impl From<bool> for EncodedTerm {
+    fn from(value: bool) -> Self {
+        Self::BooleanLiteral(value.into())
+    }
+}
+
+impl From<i64> for EncodedTerm {
+    fn from(value: i64) -> Self {
+        Self::IntegerLiteral(value.into())
+    }
+}
+
+impl From<i32> for EncodedTerm {
+    fn from(value: i32) -> Self {
+        Self::IntegerLiteral(value.into())
+    }
+}
+
+impl From<u32> for EncodedTerm {
+    fn from(value: u32) -> Self {
+        Self::IntegerLiteral(value.into())
+    }
+}
+
+impl From<u8> for EncodedTerm {
+    fn from(value: u8) -> Self {
+        Self::IntegerLiteral(value.into())
+    }
+}
+
+impl From<f32> for EncodedTerm {
+    fn from(value: f32) -> Self {
+        Self::FloatLiteral(value.into())
+    }
+}
+
+impl From<Float> for EncodedTerm {
+    fn from(value: Float) -> Self {
+        Self::FloatLiteral(value)
+    }
+}
+
+impl From<f64> for EncodedTerm {
+    fn from(value: f64) -> Self {
+        Self::DoubleLiteral(value.into())
+    }
+}
+
+impl From<Boolean> for EncodedTerm {
+    fn from(value: Boolean) -> Self {
+        Self::BooleanLiteral(value)
+    }
+}
+
+impl From<Double> for EncodedTerm {
+    fn from(value: Double) -> Self {
+        Self::DoubleLiteral(value)
+    }
+}
+
+impl From<Integer> for EncodedTerm {
+    fn from(value: Integer) -> Self {
+        Self::IntegerLiteral(value)
+    }
+}
+
+impl From<Decimal> for EncodedTerm {
+    fn from(value: Decimal) -> Self {
+        Self::DecimalLiteral(value)
+    }
+}
+
+impl From<DateTime> for EncodedTerm {
+    fn from(value: DateTime) -> Self {
+        Self::DateTimeLiteral(value)
+    }
+}
+
+impl From