From f7023a381ead141dc97e41c03c62c8ac949fdec6 Mon Sep 17 00:00:00 2001 From: Tpt Date: Tue, 19 Mar 2024 21:19:18 +0100 Subject: [PATCH] Python: exposes the Dataset class --- lib/oxrdf/src/dataset.rs | 3 +- lib/oxrdf/src/graph.rs | 3 +- python/docs/model.rst | 9 ++ python/src/dataset.rs | 322 +++++++++++++++++++++++++++++++++++++++ python/src/lib.rs | 4 + 5 files changed, 337 insertions(+), 4 deletions(-) create mode 100644 python/src/dataset.rs diff --git a/lib/oxrdf/src/dataset.rs b/lib/oxrdf/src/dataset.rs index 3e6592a3..791136c9 100644 --- a/lib/oxrdf/src/dataset.rs +++ b/lib/oxrdf/src/dataset.rs @@ -504,8 +504,7 @@ impl Dataset { } } - /// Applies on the dataset the canonicalization process described in - /// [Canonical Forms for Isomorphic and Equivalent RDF Graphs: Algorithms for Leaning and Labelling Blank Nodes, Aidan Hogan, 2017](http://aidanhogan.com/docs/rdf-canonicalisation.pdf). + /// Canonicalizes the dataset by renaming blank nodes. /// /// Usage example ([Dataset isomorphism](https://www.w3.org/TR/rdf11-concepts/#dfn-dataset-isomorphism)): /// ``` diff --git a/lib/oxrdf/src/graph.rs b/lib/oxrdf/src/graph.rs index b6002ea2..270a19c9 100644 --- a/lib/oxrdf/src/graph.rs +++ b/lib/oxrdf/src/graph.rs @@ -184,8 +184,7 @@ impl Graph { self.dataset.clear() } - /// Applies on the graph the canonicalization process described in - /// [Canonical Forms for Isomorphic and Equivalent RDF Graphs: Algorithms for Leaning and Labelling Blank Nodes, Aidan Hogan, 2017](http://aidanhogan.com/docs/rdf-canonicalisation.pdf). + /// Canonicalizes the graph by renaming blank nodes. /// /// Usage example ([Graph isomorphism](https://www.w3.org/TR/rdf11-concepts/#dfn-graph-isomorphism)): /// ``` diff --git a/python/docs/model.rst b/python/docs/model.rst index 28a7279c..99893190 100644 --- a/python/docs/model.rst +++ b/python/docs/model.rst @@ -36,3 +36,12 @@ Quads (`triples `_ in a `R .. 
autoclass:: DefaultGraph :members: + + +`Datasets `_ +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" +.. autoclass:: Dataset + :members: + +.. autoclass:: CanonicalizationAlgorithm + :members: diff --git a/python/src/dataset.rs b/python/src/dataset.rs new file mode 100644 index 00000000..bfccca79 --- /dev/null +++ b/python/src/dataset.rs @@ -0,0 +1,322 @@ +use crate::model::{hash, PyGraphNameRef, PyNamedNodeRef, PyQuad, PySubjectRef, PyTermRef}; +use oxigraph::model::dataset::{CanonicalizationAlgorithm, Dataset}; +use oxigraph::model::{Quad, QuadRef}; +use pyo3::exceptions::PyKeyError; +use pyo3::prelude::*; + +/// An in-memory `RDF dataset `_. +/// +/// It can accommodate a fairly large number of quads (in the few millions). +/// +/// Use :py:class:`Store` if you need on-disk persistence or SPARQL. +/// +/// Warning: It interns the strings and does not do any garbage collection yet: +/// if you insert and remove a lot of different terms, memory will grow without any reduction. +/// +/// :param quads: some quads to initialize the dataset with. +/// :type quads: collections.abc.Iterable[Quad] +/// +/// The :py:class:`str` function provides an N-Quads serialization: +/// +/// >>> str(Dataset([Quad(NamedNode('http://example.com/s'), NamedNode('http://example.com/p'), NamedNode('http://example.com/o'), NamedNode('http://example.com/g'))])) +/// ' .\n' +#[pyclass(name = "Dataset", module = "pyoxigraph")] +#[derive(Eq, PartialEq, Debug, Clone)] +pub struct PyDataset { + inner: Dataset, +} + +#[pymethods] +impl PyDataset { + #[new] + #[pyo3(signature = (quads = None))] + fn new(quads: Option<&PyAny>) -> PyResult { + let mut inner = Dataset::new(); + if let Some(quads) = quads { + for quad in quads.iter()? { + inner.insert(&*quad?.extract::>()?); + } + } + Ok(Self { inner }) + } + + /// Looks for the quads with the given subject. + /// + /// :param subject: the quad subject. 
+ /// :type subject: NamedNode or BlankNode or Triple + /// :return: an iterator of the quads. + /// :rtype: collections.abc.Iterator[Quad] + /// + /// >>> store = Dataset([Quad(NamedNode('http://example.com'), NamedNode('http://example.com/p'), Literal('1'), NamedNode('http://example.com/g'))]) + /// >>> list(store.quads_for_subject(NamedNode('http://example.com'))) + /// [ predicate= object=> graph_name=>] + pub fn quads_for_subject(&self, subject: &PyAny) -> PyResult { + Ok(QuadIter { + inner: self + .inner + .quads_for_subject(&PySubjectRef::try_from(subject)?) + .map(QuadRef::into_owned) + .collect::>() + .into_iter(), + }) + } + + /// Looks for the quads with the given predicate. + /// + /// :param predicate: the quad predicate. + /// :type predicate: NamedNode + /// :return: an iterator of the quads. + /// :rtype: collections.abc.Iterator[Quad] + /// + /// >>> store = Dataset([Quad(NamedNode('http://example.com'), NamedNode('http://example.com/p'), Literal('1'), NamedNode('http://example.com/g'))]) + /// >>> list(store.quads_for_predicate(NamedNode('http://example.com/p'))) + /// [ predicate= object=> graph_name=>] + pub fn quads_for_predicate(&self, predicate: &PyAny) -> PyResult { + Ok(QuadIter { + inner: self + .inner + .quads_for_predicate(&PyNamedNodeRef::try_from(predicate)?) + .map(QuadRef::into_owned) + .collect::>() + .into_iter(), + }) + } + + /// Looks for the quads with the given object. + /// + /// :param object: the quad object. + /// :type object: NamedNode or BlankNode or Literal or Triple + /// :return: an iterator of the quads. 
+ /// :rtype: collections.abc.Iterator[Quad] + /// + /// >>> store = Dataset([Quad(NamedNode('http://example.com'), NamedNode('http://example.com/p'), Literal('1'), NamedNode('http://example.com/g'))]) + /// >>> list(store.quads_for_object(Literal('1'))) + /// [ predicate= object=> graph_name=>] + pub fn quads_for_object(&self, object: &PyAny) -> PyResult { + Ok(QuadIter { + inner: self + .inner + .quads_for_object(&PyTermRef::try_from(object)?) + .map(QuadRef::into_owned) + .collect::>() + .into_iter(), + }) + } + + /// Looks for the quads with the given graph name. + /// + /// :param graph_name: the quad graph name. + /// :type graph_name: NamedNode or BlankNode or DefaultGraph + /// :return: an iterator of the quads. + /// :rtype: collections.abc.Iterator[Quad] + /// + /// >>> store = Dataset([Quad(NamedNode('http://example.com'), NamedNode('http://example.com/p'), Literal('1'), NamedNode('http://example.com/g'))]) + /// >>> list(store.quads_for_graph_name(NamedNode('http://example.com/g'))) + /// [ predicate= object=> graph_name=>] + pub fn quads_for_graph_name(&self, graph_name: &PyAny) -> PyResult { + Ok(QuadIter { + inner: self + .inner + .quads_for_graph_name(&PyGraphNameRef::try_from(graph_name)?) + .map(QuadRef::into_owned) + .collect::>() + .into_iter(), + }) + } + + /// Adds a quad to the dataset. + /// + /// :param quad: the quad to add. + /// :type quad: Quad + /// :rtype: None + /// + /// >>> quad = Quad(NamedNode('http://example.com/s'), NamedNode('http://example.com/p'), NamedNode('http://example.com/o'), NamedNode('http://example.com/g')) + /// >>> dataset = Dataset() + /// >>> dataset.add(quad) + /// >>> quad in dataset + /// True + fn add(&mut self, quad: &PyQuad) { + self.inner.insert(quad); + } + + /// Removes a quad from the dataset and raises an exception if it is not in the set. + /// + /// :param quad: the quad to remove. + /// :type quad: Quad + /// :rtype: None + /// :raises KeyError: if the element was not in the set. 
+ /// + /// >>> quad = Quad(NamedNode('http://example.com/s'), NamedNode('http://example.com/p'), NamedNode('http://example.com/o'), NamedNode('http://example.com/g')) + /// >>> dataset = Dataset([quad]) + /// >>> dataset.remove(quad) + /// >>> quad in dataset + /// False + fn remove(&mut self, quad: &PyQuad) -> PyResult<()> { + if self.inner.remove(quad) { + Ok(()) + } else { + Err(PyKeyError::new_err(format!( + "{} is not in the Dataset", + QuadRef::from(quad) + ))) + } + } + + /// Removes a quad from the dataset if it is present. + /// + /// :param quad: the quad to remove. + /// :type quad: Quad + /// :rtype: None + /// + /// >>> quad = Quad(NamedNode('http://example.com/s'), NamedNode('http://example.com/p'), NamedNode('http://example.com/o'), NamedNode('http://example.com/g')) + /// >>> dataset = Dataset([quad]) + /// >>> dataset.discard(quad) + /// >>> quad in dataset + /// False + fn discard(&mut self, quad: &PyQuad) { + self.inner.remove(quad); + } + + /// Removes all quads from the dataset. + /// + /// :rtype: None + /// + /// >>> quad = Quad(NamedNode('http://example.com/s'), NamedNode('http://example.com/p'), NamedNode('http://example.com/o'), NamedNode('http://example.com/g')) + /// >>> dataset = Dataset([quad]) + /// >>> dataset.clear() + /// >>> len(dataset) + /// 0 + fn clear(&mut self) { + self.inner.clear() + } + + /// Canonicalizes the dataset by renaming blank nodes. + /// + /// Warning: Blank node ids depend on the current shape of the graph. Adding a new quad might change the ids of a lot of blank nodes. + /// Hence, this canonicalization might not be suitable for diffs. + /// + /// Warning: This implementation's worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset. + /// + /// :param algorithm: the canonicalization algorithm to use. 
+ /// :type algorithm: CanonicalizationAlgorithm + /// :rtype: None + /// + /// >>> d1 = Dataset([Quad(BlankNode(), NamedNode('http://example.com/p'), BlankNode())]) + /// >>> d2 = Dataset([Quad(BlankNode(), NamedNode('http://example.com/p'), BlankNode())]) + /// >>> d1 == d2 + /// False + /// >>> d1.canonicalize(CanonicalizationAlgorithm.UNSTABLE) + /// >>> d2.canonicalize(CanonicalizationAlgorithm.UNSTABLE) + /// >>> d1 == d2 + /// True + fn canonicalize(&mut self, algorithm: &PyCanonicalizationAlgorithm) { + self.inner.canonicalize(algorithm.inner) + } + + fn __str__(&self) -> String { + self.inner.to_string() + } + + fn __bool__(&self) -> bool { + !self.inner.is_empty() + } + + fn __eq__(&self, other: &Self) -> bool { + self.inner == other.inner + } + + fn __ne__(&self, other: &Self) -> bool { + self.inner != other.inner + } + + fn __len__(&self) -> usize { + self.inner.len() + } + + fn __contains__(&self, quad: &PyQuad) -> bool { + self.inner.contains(quad) + } + + fn __iter__(&self) -> QuadIter { + // TODO: very inefficient + QuadIter { + inner: self + .inner + .iter() + .map(QuadRef::into_owned) + .collect::>() + .into_iter(), + } + } +} + +#[pyclass(unsendable, module = "pyoxigraph")] +pub struct QuadIter { + inner: std::vec::IntoIter, +} + +#[pymethods] +impl QuadIter { + fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { + slf + } + + fn __next__(&mut self) -> Option { + Some(self.inner.next()?.into()) + } +} + +/// RDF canonicalization algorithms. +/// +/// The following algorithms are supported: +/// * :py:attr:`CanonicalizationAlgorithm.UNSTABLE`: an unstable algorithm preferred by PyOxigraph. +#[pyclass(name = "CanonicalizationAlgorithm", module = "pyoxigraph")] +#[derive(Clone)] +pub struct PyCanonicalizationAlgorithm { + inner: CanonicalizationAlgorithm, +} + +#[pymethods] +impl PyCanonicalizationAlgorithm { + /// The algorithm preferred by PyOxigraph. + /// + /// Warning: Might change between Oxigraph versions. No stability guarantees. 
+ #[classattr] + const UNSTABLE: Self = Self { + inner: CanonicalizationAlgorithm::Unstable, + }; + + fn __repr__(&self) -> String { + format!( + "", + match self.inner { + CanonicalizationAlgorithm::Unstable => "unstable", + _ => "unknown", + } + ) + } + + fn __hash__(&self) -> u64 { + hash(&self.inner) + } + + fn __eq__(&self, other: &Self) -> bool { + self.inner == other.inner + } + + fn __ne__(&self, other: &Self) -> bool { + self.inner != other.inner + } + + /// :rtype: CanonicalizationAlgorithm + fn __copy__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { + slf + } + + /// :type memo: typing.Any + /// :rtype: CanonicalizationAlgorithm + #[allow(unused_variables)] + fn __deepcopy__<'a>(slf: PyRef<'a, Self>, memo: &'_ PyAny) -> PyRef<'a, Self> { + slf + } +} diff --git a/python/src/lib.rs b/python/src/lib.rs index cabb8ee2..9cbf780c 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -4,11 +4,13 @@ unused_qualifications )] +mod dataset; mod io; mod model; mod sparql; mod store; +use crate::dataset::*; use crate::io::*; use crate::model::*; use crate::sparql::*; @@ -28,6 +30,7 @@ fn pyoxigraph(_py: Python<'_>, module: &PyModule) -> PyResult<()> { module.add_class::()?; module.add_class::()?; module.add_class::()?; + module.add_class::()?; module.add_class::()?; module.add_class::()?; module.add_class::()?; @@ -36,6 +39,7 @@ fn pyoxigraph(_py: Python<'_>, module: &PyModule) -> PyResult<()> { module.add_class::()?; module.add_class::()?; module.add_class::()?; + module.add_class::()?; module.add_wrapped(wrap_pyfunction!(parse))?; module.add_wrapped(wrap_pyfunction!(parse_query_results))?; module.add_wrapped(wrap_pyfunction!(serialize))?;