From f7023a381ead141dc97e41c03c62c8ac949fdec6 Mon Sep 17 00:00:00 2001
From: Tpt <thomaspt@hotmail.fr>
Date: Tue, 19 Mar 2024 21:19:18 +0100
Subject: [PATCH] Python: exposes the Dataset class

---
 lib/oxrdf/src/dataset.rs |   3 +-
 lib/oxrdf/src/graph.rs   |   3 +-
 python/docs/model.rst    |   9 ++
 python/src/dataset.rs    | 322 +++++++++++++++++++++++++++++++++++++++
 python/src/lib.rs        |   4 +
 5 files changed, 337 insertions(+), 4 deletions(-)
 create mode 100644 python/src/dataset.rs
diff --git a/lib/oxrdf/src/dataset.rs b/lib/oxrdf/src/dataset.rs
index 3e6592a3..791136c9 100644
--- a/lib/oxrdf/src/dataset.rs
+++ b/lib/oxrdf/src/dataset.rs
@@ -504,8 +504,7 @@ impl Dataset {
         }
     }
 
-    /// Applies on the dataset the canonicalization process described in
-    /// [Canonical Forms for Isomorphic and Equivalent RDF Graphs: Algorithms for Leaning and Labelling Blank Nodes, Aidan Hogan, 2017](http://aidanhogan.com/docs/rdf-canonicalisation.pdf).
+    /// Canonicalizes the dataset by renaming blank nodes.
     ///
     /// Usage example ([Dataset isomorphism](https://www.w3.org/TR/rdf11-concepts/#dfn-dataset-isomorphism)):
     /// ```
diff --git a/lib/oxrdf/src/graph.rs b/lib/oxrdf/src/graph.rs
index b6002ea2..270a19c9 100644
--- a/lib/oxrdf/src/graph.rs
+++ b/lib/oxrdf/src/graph.rs
@@ -184,8 +184,7 @@ impl Graph {
         self.dataset.clear()
     }
 
-    /// Applies on the graph the canonicalization process described in
-    /// [Canonical Forms for Isomorphic and Equivalent RDF Graphs: Algorithms for Leaning and Labelling Blank Nodes, Aidan Hogan, 2017](http://aidanhogan.com/docs/rdf-canonicalisation.pdf).
+    /// Canonicalizes the graph by renaming blank nodes.
     ///
     /// Usage example ([Graph isomorphism](https://www.w3.org/TR/rdf11-concepts/#dfn-graph-isomorphism)):
     /// ```
diff --git a/python/docs/model.rst b/python/docs/model.rst
index 28a7279c..99893190 100644
--- a/python/docs/model.rst
+++ b/python/docs/model.rst
@@ -36,3 +36,12 @@ Quads (`triples <https://www.w3.org/TR/rdf11-concepts/#dfn-rdf-triple>`_ in a `R
 
 .. autoclass:: DefaultGraph
     :members:
+
+
+`Datasets <https://www.w3.org/TR/rdf11-concepts/#dfn-rdf-dataset>`_
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+.. autoclass:: Dataset
+    :members:
+
+.. autoclass:: CanonicalizationAlgorithm
+    :members:
diff --git a/python/src/dataset.rs b/python/src/dataset.rs
new file mode 100644
index 00000000..bfccca79
--- /dev/null
+++ b/python/src/dataset.rs
@@ -0,0 +1,322 @@
+use crate::model::{hash, PyGraphNameRef, PyNamedNodeRef, PyQuad, PySubjectRef, PyTermRef};
+use oxigraph::model::dataset::{CanonicalizationAlgorithm, Dataset};
+use oxigraph::model::{Quad, QuadRef};
+use pyo3::exceptions::PyKeyError;
+use pyo3::prelude::*;
+
+/// An in-memory `RDF dataset <https://www.w3.org/TR/rdf11-concepts/#dfn-rdf-dataset>`_.
+///
+/// It can accommodate a fairly large number of quads (a few million).
+///
+/// Use :py:class:`Store` if you need on-disk persistence or SPARQL.
+///
+/// Warning: It interns the strings and does not do any garbage collection yet:
+/// if you insert and remove a lot of different terms, memory will grow without any reduction.
+///
+/// :param quads: some quads to initialize the dataset with.
+/// :type quads: collections.abc.Iterable[Quad] or None, optional
+///
+/// The :py:class:`str` function provides an N-Quads serialization:
+///
+/// >>> str(Dataset([Quad(NamedNode('http://example.com/s'), NamedNode('http://example.com/p'), NamedNode('http://example.com/o'), NamedNode('http://example.com/g'))]))
+/// '<http://example.com/s> <http://example.com/p> <http://example.com/o> <http://example.com/g> .\n'
+#[pyclass(name = "Dataset", module = "pyoxigraph")]
+#[derive(Eq, PartialEq, Debug, Clone)]
+pub struct PyDataset {
+    inner: Dataset,
+}
+
+#[pymethods]
+impl PyDataset {
+    #[new]
+    #[pyo3(signature = (quads = None))]
+    fn new(quads: Option<&PyAny>) -> PyResult<Self> {
+        let mut dataset = Dataset::new(); // stays empty when `quads` is `None`
+        if let Some(iterable) = quads {
+            for item in iterable.iter()? {
+                dataset.insert(&*item?.extract::<PyRef<'_, PyQuad>>()?);
+            }
+        }
+        Ok(Self { inner: dataset })
+    }
+
+    /// Retrieves the quads of the dataset that have the given subject.
+    ///
+    /// :param subject: the subject to look for.
+    /// :type subject: NamedNode or BlankNode or Triple
+    /// :return: an iterator over the matching quads.
+    /// :rtype: collections.abc.Iterator[Quad]
+    ///
+    /// >>> dataset = Dataset([Quad(NamedNode('http://example.com'), NamedNode('http://example.com/p'), Literal('1'), NamedNode('http://example.com/g'))])
+    /// >>> list(dataset.quads_for_subject(NamedNode('http://example.com')))
+    /// [<Quad subject=<NamedNode value=http://example.com> predicate=<NamedNode value=http://example.com/p> object=<Literal value=1 datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>> graph_name=<NamedNode value=http://example.com/g>>]
+    pub fn quads_for_subject(&self, subject: &PyAny) -> PyResult<QuadIter> {
+        let quads: Vec<_> = self
+            .inner
+            .quads_for_subject(&PySubjectRef::try_from(subject)?)
+            .map(QuadRef::into_owned)
+            .collect(); // copied out eagerly: `QuadIter` owns its quads
+        Ok(QuadIter {
+            inner: quads.into_iter(),
+        })
+    }
+
+    /// Retrieves the quads of the dataset that have the given predicate.
+    ///
+    /// :param predicate: the predicate to look for.
+    /// :type predicate: NamedNode
+    /// :return: an iterator over the matching quads.
+    /// :rtype: collections.abc.Iterator[Quad]
+    ///
+    /// >>> dataset = Dataset([Quad(NamedNode('http://example.com'), NamedNode('http://example.com/p'), Literal('1'), NamedNode('http://example.com/g'))])
+    /// >>> list(dataset.quads_for_predicate(NamedNode('http://example.com/p')))
+    /// [<Quad subject=<NamedNode value=http://example.com> predicate=<NamedNode value=http://example.com/p> object=<Literal value=1 datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>> graph_name=<NamedNode value=http://example.com/g>>]
+    pub fn quads_for_predicate(&self, predicate: &PyAny) -> PyResult<QuadIter> {
+        let quads: Vec<_> = self
+            .inner
+            .quads_for_predicate(&PyNamedNodeRef::try_from(predicate)?)
+            .map(QuadRef::into_owned)
+            .collect(); // copied out eagerly: `QuadIter` owns its quads
+        Ok(QuadIter {
+            inner: quads.into_iter(),
+        })
+    }
+
+    /// Retrieves the quads of the dataset that have the given object.
+    ///
+    /// :param object: the object to look for.
+    /// :type object: NamedNode or BlankNode or Literal or Triple
+    /// :return: an iterator over the matching quads.
+    /// :rtype: collections.abc.Iterator[Quad]
+    ///
+    /// >>> dataset = Dataset([Quad(NamedNode('http://example.com'), NamedNode('http://example.com/p'), Literal('1'), NamedNode('http://example.com/g'))])
+    /// >>> list(dataset.quads_for_object(Literal('1')))
+    /// [<Quad subject=<NamedNode value=http://example.com> predicate=<NamedNode value=http://example.com/p> object=<Literal value=1 datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>> graph_name=<NamedNode value=http://example.com/g>>]
+    pub fn quads_for_object(&self, object: &PyAny) -> PyResult<QuadIter> {
+        let quads: Vec<_> = self
+            .inner
+            .quads_for_object(&PyTermRef::try_from(object)?)
+            .map(QuadRef::into_owned)
+            .collect(); // copied out eagerly: `QuadIter` owns its quads
+        Ok(QuadIter {
+            inner: quads.into_iter(),
+        })
+    }
+
+    /// Retrieves the quads of the dataset that have the given graph name.
+    ///
+    /// :param graph_name: the graph name to look for.
+    /// :type graph_name: NamedNode or BlankNode or DefaultGraph
+    /// :return: an iterator over the matching quads.
+    /// :rtype: collections.abc.Iterator[Quad]
+    ///
+    /// >>> dataset = Dataset([Quad(NamedNode('http://example.com'), NamedNode('http://example.com/p'), Literal('1'), NamedNode('http://example.com/g'))])
+    /// >>> list(dataset.quads_for_graph_name(NamedNode('http://example.com/g')))
+    /// [<Quad subject=<NamedNode value=http://example.com> predicate=<NamedNode value=http://example.com/p> object=<Literal value=1 datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>> graph_name=<NamedNode value=http://example.com/g>>]
+    pub fn quads_for_graph_name(&self, graph_name: &PyAny) -> PyResult<QuadIter> {
+        let quads: Vec<_> = self
+            .inner
+            .quads_for_graph_name(&PyGraphNameRef::try_from(graph_name)?)
+            .map(QuadRef::into_owned)
+            .collect(); // copied out eagerly: `QuadIter` owns its quads
+        Ok(QuadIter {
+            inner: quads.into_iter(),
+        })
+    }
+
+    /// Adds a quad to the dataset.
+    ///
+    /// :param quad: the quad to add.
+    /// :type quad: Quad
+    /// :rtype: None
+    ///
+    /// >>> quad = Quad(NamedNode('http://example.com/s'), NamedNode('http://example.com/p'), NamedNode('http://example.com/o'), NamedNode('http://example.com/g'))
+    /// >>> dataset = Dataset()
+    /// >>> dataset.add(quad)
+    /// >>> quad in dataset
+    /// True
+    fn add(&mut self, quad: &PyQuad) {
+        self.inner.insert(quad); // NOTE(review): presumably a no-op if already present; confirm
+    }
+
+    /// Removes a quad from the dataset, raising an exception if the quad is absent.
+    ///
+    /// :param quad: the quad to remove.
+    /// :type quad: Quad
+    /// :rtype: None
+    /// :raises KeyError: if the quad was not in the dataset.
+    ///
+    /// >>> quad = Quad(NamedNode('http://example.com/s'), NamedNode('http://example.com/p'), NamedNode('http://example.com/o'), NamedNode('http://example.com/g'))
+    /// >>> dataset = Dataset([quad])
+    /// >>> dataset.remove(quad)
+    /// >>> quad in dataset
+    /// False
+    fn remove(&mut self, quad: &PyQuad) -> PyResult<()> {
+        if self.inner.remove(quad) {
+            return Ok(());
+        }
+        // Nothing was removed, so the quad was absent: report it as a `KeyError`.
+        Err(PyKeyError::new_err(format!(
+            "{} is not in the Dataset",
+            QuadRef::from(quad)
+        )))
+    }
+
+    /// Removes a quad from the dataset if it is present.
+    ///
+    /// :param quad: the quad to remove.
+    /// :type quad: Quad
+    /// :rtype: None
+    ///
+    /// >>> quad = Quad(NamedNode('http://example.com/s'), NamedNode('http://example.com/p'), NamedNode('http://example.com/o'), NamedNode('http://example.com/g'))
+    /// >>> dataset = Dataset([quad])
+    /// >>> dataset.discard(quad)
+    /// >>> quad in dataset
+    /// False
+    fn discard(&mut self, quad: &PyQuad) {
+        self.inner.remove(quad); // outcome ignored: unlike `remove`, an absent quad is fine
+    }
+
+    /// Empties the dataset by removing every quad it contains.
+    ///
+    /// :rtype: None
+    ///
+    /// >>> quad = Quad(NamedNode('http://example.com/s'), NamedNode('http://example.com/p'), NamedNode('http://example.com/o'), NamedNode('http://example.com/g'))
+    /// >>> dataset = Dataset([quad])
+    /// >>> dataset.clear()
+    /// >>> len(dataset)
+    /// 0
+    fn clear(&mut self) {
+        self.inner.clear();
+    }
+
+    /// Canonicalizes the dataset by renaming blank nodes.
+    ///
+    /// Warning: Blank node ids depend on the current shape of the graph. Adding a new quad might change the ids of a lot of blank nodes.
+    /// Hence, this canonicalization might not be suitable for diffs.
+    ///
+    /// Warning: This implementation's worst-case complexity is in *O(b!)* with *b* the number of blank nodes in the input dataset.
+    ///
+    /// :param algorithm: the canonicalization algorithm to use.
+    /// :type algorithm: CanonicalizationAlgorithm
+    /// :rtype: None
+    ///
+    /// >>> d1 = Dataset([Quad(BlankNode(), NamedNode('http://example.com/p'), BlankNode())])
+    /// >>> d2 = Dataset([Quad(BlankNode(), NamedNode('http://example.com/p'), BlankNode())])
+    /// >>> d1 == d2
+    /// False
+    /// >>> d1.canonicalize(CanonicalizationAlgorithm.UNSTABLE)
+    /// >>> d2.canonicalize(CanonicalizationAlgorithm.UNSTABLE)
+    /// >>> d1 == d2
+    /// True
+    fn canonicalize(&mut self, algorithm: &PyCanonicalizationAlgorithm) {
+        self.inner.canonicalize(algorithm.inner)
+    }
+
+    fn __str__(&self) -> String {
+        self.inner.to_string() // N-Quads text, as shown in the class-level example
+    }
+
+    fn __bool__(&self) -> bool {
+        !self.inner.is_empty() // Python truthiness: a dataset is falsy only when empty
+    }
+
+    fn __eq__(&self, other: &Self) -> bool {
+        self.inner == other.inner // exact match: blank node ids matter (see `canonicalize`)
+    }
+
+    fn __ne__(&self, other: &Self) -> bool {
+        self.inner != other.inner // mirrors `__eq__`
+    }
+
+    fn __len__(&self) -> usize {
+        self.inner.len() // number of quads in the dataset
+    }
+
+    fn __contains__(&self, quad: &PyQuad) -> bool {
+        self.inner.contains(quad) // backs the `quad in dataset` expression
+    }
+
+    fn __iter__(&self) -> QuadIter {
+        // TODO: very inefficient
+        // `QuadIter` wraps a `std::vec::IntoIter<Quad>`, i.e. it owns the quads
+        // it yields. Every quad of the dataset is therefore converted to an
+        // owned `Quad` and buffered in a `Vec` before iteration starts, which
+        // costs time and memory proportional to the size of the dataset.
+        let quads: Vec<_> = self.inner.iter().map(QuadRef::into_owned).collect();
+        QuadIter {
+            inner: quads.into_iter(),
+        }
+    }
+}
+
+#[pyclass(unsendable, module = "pyoxigraph")]
+pub struct QuadIter {
+    inner: std::vec::IntoIter<Quad>, // owned quads, yielded one by one by `__next__`
+}
+
+#[pymethods]
+impl QuadIter {
+    fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
+        slf // an iterator is its own iterable
+    }
+
+    fn __next__(&mut self) -> Option<PyQuad> {
+        self.inner.next().map(Into::into) // `None` ends the Python iteration
+    }
+}
+
+/// RDF canonicalization algorithms.
+/// The following algorithms are supported:
+///
+/// * :py:attr:`CanonicalizationAlgorithm.UNSTABLE`: an unstable algorithm preferred by PyOxigraph.
+#[pyclass(name = "CanonicalizationAlgorithm", module = "pyoxigraph")]
+#[derive(Clone)]
+pub struct PyCanonicalizationAlgorithm {
+    inner: CanonicalizationAlgorithm,
+}
+
+#[pymethods]
+impl PyCanonicalizationAlgorithm {
+    /// The algorithm preferred by PyOxigraph.
+    ///
+    /// Warning: Might change between Oxigraph versions. No stability guarantees.
+    #[classattr]
+    const UNSTABLE: Self = Self {
+        inner: CanonicalizationAlgorithm::Unstable,
+    };
+
+    fn __repr__(&self) -> String {
+        // The fallback arm covers any variant that this binding does not name
+        // explicitly.
+        let name = match self.inner {
+            CanonicalizationAlgorithm::Unstable => "unstable",
+            _ => "unknown",
+        };
+        format!("<CanonicalizationAlgorithm {name}>")
+    }
+
+    fn __hash__(&self) -> u64 {
+        hash(&self.inner)
+    }
+
+    fn __eq__(&self, other: &Self) -> bool {
+        self.inner == other.inner
+    }
+
+    fn __ne__(&self, other: &Self) -> bool {
+        self.inner != other.inner
+    }
+
+    /// :rtype: CanonicalizationAlgorithm
+    fn __copy__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
+        slf
+    }
+
+    /// :type memo: typing.Any
+    /// :rtype: CanonicalizationAlgorithm
+    #[allow(unused_variables)] // `memo` is part of the `__deepcopy__` signature but unused here
+    fn __deepcopy__<'a>(slf: PyRef<'a, Self>, memo: &'_ PyAny) -> PyRef<'a, Self> {
+        slf
+    }
+}
diff --git a/python/src/lib.rs b/python/src/lib.rs
index cabb8ee2..9cbf780c 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -4,11 +4,13 @@
     unused_qualifications
 )]
 
+mod dataset;
 mod io;
 mod model;
 mod sparql;
 mod store;
 
+use crate::dataset::*;
 use crate::io::*;
 use crate::model::*;
 use crate::sparql::*;
@@ -28,6 +30,7 @@ fn pyoxigraph(_py: Python<'_>, module: &PyModule) -> PyResult<()> {
     module.add_class::<PyDefaultGraph>()?;
     module.add_class::<PyTriple>()?;
     module.add_class::<PyQuad>()?;
+    module.add_class::<PyDataset>()?;
     module.add_class::<PyStore>()?;
     module.add_class::<PyVariable>()?;
     module.add_class::<PyQuerySolutions>()?;
@@ -36,6 +39,7 @@ fn pyoxigraph(_py: Python<'_>, module: &PyModule) -> PyResult<()> {
     module.add_class::<PyQueryTriples>()?;
     module.add_class::<PyRdfFormat>()?;
     module.add_class::<PyQueryResultsFormat>()?;
+    module.add_class::<PyCanonicalizationAlgorithm>()?;
     module.add_wrapped(wrap_pyfunction!(parse))?;
     module.add_wrapped(wrap_pyfunction!(parse_query_results))?;
     module.add_wrapped(wrap_pyfunction!(serialize))?;