#![allow(clippy::needless_option_as_deref)] use crate::model::{hash, PyQuad, PyTriple}; use oxigraph::io::{FromReadQuadReader, RdfFormat, RdfParseError, RdfParser, RdfSerializer}; use oxigraph::model::QuadRef; use pyo3::exceptions::{PyDeprecationWarning, PySyntaxError, PyValueError}; use pyo3::intern; use pyo3::prelude::*; use pyo3::types::PyBytes; use std::cmp::max; use std::ffi::OsStr; use std::fs::File; use std::io::{self, BufWriter, Cursor, Read, Write}; use std::path::{Path, PathBuf}; use std::sync::OnceLock; /// Parses RDF graph and dataset serialization formats. /// /// It currently supports the following formats: /// /// * `N-Triples `_ (:py:attr:`RdfFormat.N_TRIPLES`) /// * `N-Quads `_ (:py:attr:`RdfFormat.N_QUADS`) /// * `Turtle `_ (:py:attr:`RdfFormat.TURTLE`) /// * `TriG `_ (:py:attr:`RdfFormat.TRIG`) /// * `N3 `_ (:py:attr:`RdfFormat.N3`) /// * `RDF/XML `_ (:py:attr:`RdfFormat.RDF_XML`) /// /// It supports also some media type and extension aliases. /// For example, ``application/turtle`` could also be used for `Turtle `_ /// and ``application/xml`` or ``xml`` for `RDF/XML `_. /// /// :param input: The :py:class:`str`, :py:class:`bytes` or I/O object to read from. For example, it could be the file content as a string or a file reader opened in binary mode with ``open('my_file.ttl', 'rb')``. /// :type input: bytes or str or typing.IO[bytes] or typing.IO[str] or None, optional /// :param format: the format of the RDF serialization. If :py:const:`None`, the format is guessed from the file name extension. /// :type format: RdfFormat or None, optional /// :param path: The file path to read from. Replaces the ``input`` parameter. /// :type path: str or os.PathLike[str] or None, optional /// :param base_iri: the base IRI used to resolve the relative IRIs in the file or :py:const:`None` if relative IRI resolution should not be done. /// :type base_iri: str or None, optional /// :param without_named_graphs: Sets that the parser must fail when parsing a named graph. /// :type without_named_graphs: bool, optional /// :param rename_blank_nodes: Renames the blank nodes identifiers from the ones set in the serialization to random ids. This allows to avoid identifier conflicts when merging graphs together. /// :type rename_blank_nodes: bool, optional /// :return: an iterator of RDF triples or quads depending on the format. /// :rtype: collections.abc.Iterator[Quad] /// :raises ValueError: if the format is not supported. /// :raises SyntaxError: if the provided data is invalid. /// :raises OSError: if a system error happens while reading the file. /// /// >>> list(parse(input=b'

"1" .', format=RdfFormat.TURTLE, base_iri="http://example.com/")) /// [ predicate= object=> graph_name=>] #[pyfunction] #[pyo3(signature = (input = None, format = None, *, path = None, base_iri = None, without_named_graphs = false, rename_blank_nodes = false))] pub fn parse( input: Option, format: Option, path: Option, base_iri: Option<&str>, without_named_graphs: bool, rename_blank_nodes: bool, py: Python<'_>, ) -> PyResult { let input = PyReadable::from_args(&path, input, py)?; let format = lookup_rdf_format(format, path.as_deref())?; let mut parser = RdfParser::from_format(format); if let Some(base_iri) = base_iri { parser = parser .with_base_iri(base_iri) .map_err(|e| PyValueError::new_err(e.to_string()))?; } if without_named_graphs { parser = parser.without_named_graphs(); } if rename_blank_nodes { parser = parser.rename_blank_nodes(); } Ok(PyQuadReader { inner: parser.parse_read(input), file_path: path, } .into_py(py)) } /// Serializes an RDF graph or dataset. /// /// It currently supports the following formats: /// /// * `canonical `_ `N-Triples `_ (:py:attr:`RdfFormat.N_TRIPLES`) /// * `N-Quads `_ (:py:attr:`RdfFormat.N_QUADS`) /// * `Turtle `_ (:py:attr:`RdfFormat.TURTLE`) /// * `TriG `_ (:py:attr:`RdfFormat.TRIG`) /// * `N3 `_ (:py:attr:`RdfFormat.N3`) /// * `RDF/XML `_ (:py:attr:`RdfFormat.RDF_XML`) /// /// It supports also some media type and extension aliases. /// For example, ``application/turtle`` could also be used for `Turtle `_ /// and ``application/xml`` or ``xml`` for `RDF/XML `_. /// /// :param input: the RDF triples and quads to serialize. /// :type input: collections.abc.Iterable[Triple] or collections.abc.Iterable[Quad] /// :param output: The binary I/O object or file path to write to. For example, it could be a file path as a string or a file writer opened in binary mode with ``open('my_file.ttl', 'wb')``. If :py:const:`None`, a :py:class:`bytes` buffer is returned with the serialized content. /// :type output: typing.IO[bytes] or str or os.PathLike[str] or None, optional /// :param format: the format of the RDF serialization. If :py:const:`None`, the format is guessed from the file name extension. /// :type format: RdfFormat or None, optional /// :return: :py:class:`bytes` with the serialization if the ``output`` parameter is :py:const:`None`, :py:const:`None` if ``output`` is set. /// :rtype: bytes or None /// :raises ValueError: if the format is not supported. /// :raises TypeError: if a triple is given during a quad format serialization or reverse. /// :raises OSError: if a system error happens while writing the file. /// /// >>> serialize([Triple(NamedNode('http://example.com'), NamedNode('http://example.com/p'), Literal('1'))], format=RdfFormat.TURTLE) /// b' "1" .\n' /// /// >>> import io /// >>> output = io.BytesIO() /// >>> serialize([Triple(NamedNode('http://example.com'), NamedNode('http://example.com/p'), Literal('1'))], output, RdfFormat.TURTLE) /// >>> output.getvalue() /// b' "1" .\n' #[pyfunction] #[pyo3(signature = (input, output = None, format = None))] pub fn serialize<'a>( input: &PyAny, output: Option, format: Option, py: Python<'a>, ) -> PyResult> { PyWritable::do_write( |output, file_path| { let format = lookup_rdf_format(format, file_path.as_deref())?; let mut writer = RdfSerializer::from_format(format).serialize_to_write(output); for i in input.iter()? { let i = i?; if let Ok(triple) = i.extract::>() { writer.write_triple(&*triple) } else { let quad = i.extract::>()?; let quad = QuadRef::from(&*quad); if !quad.graph_name.is_default_graph() && !format.supports_datasets() { return Err(PyValueError::new_err( "The {format} format does not support named graphs", )); } writer.write_quad(quad) }?; } Ok(writer.finish()?) }, output, py, ) } #[pyclass(name = "QuadReader", module = "pyoxigraph")] pub struct PyQuadReader { inner: FromReadQuadReader, file_path: Option, } #[pymethods] impl PyQuadReader { fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { slf } fn __next__(&mut self, py: Python<'_>) -> PyResult> { py.allow_threads(|| { Ok(self .inner .next() .transpose() .map_err(|e| map_parse_error(e, self.file_path.clone()))? .map(PyQuad::from)) }) } } /// RDF serialization formats. /// /// The following formats are supported: /// * `N-Triples `_ (:py:attr:`RdfFormat.N_TRIPLES`) /// * `N-Quads `_ (:py:attr:`RdfFormat.N_QUADS`) /// * `Turtle `_ (:py:attr:`RdfFormat.TURTLE`) /// * `TriG `_ (:py:attr:`RdfFormat.TRIG`) /// * `N3 `_ (:py:attr:`RdfFormat.N3`) /// * `RDF/XML `_ (:py:attr:`RdfFormat.RDF_XML`) /// /// >>> RdfFormat.N3.media_type /// 'text/n3' #[pyclass(name = "RdfFormat", module = "pyoxigraph")] #[derive(Clone)] pub struct PyRdfFormat { inner: RdfFormat, } #[pymethods] impl PyRdfFormat { /// `N3 `_ #[classattr] const N3: Self = Self { inner: RdfFormat::N3, }; /// `N-Quads `_ #[classattr] const N_QUADS: Self = Self { inner: RdfFormat::NQuads, }; /// `N-Triples `_ #[classattr] const N_TRIPLES: Self = Self { inner: RdfFormat::NTriples, }; /// `RDF/XML `_ #[classattr] const RDF_XML: Self = Self { inner: RdfFormat::RdfXml, }; /// `TriG `_ #[classattr] const TRIG: Self = Self { inner: RdfFormat::TriG, }; /// `Turtle `_ #[classattr] const TURTLE: Self = Self { inner: RdfFormat::Turtle, }; /// :return: the format canonical IRI according to the `Unique URIs for file formats registry `_. /// :rtype: str /// /// >>> RdfFormat.N_TRIPLES.iri /// 'http://www.w3.org/ns/formats/N-Triples' #[getter] fn iri(&self) -> &'static str { self.inner.iri() } /// :return: the format `IANA media type `_. /// :rtype: str /// /// >>> RdfFormat.N_TRIPLES.media_type /// 'application/n-triples' #[getter] fn media_type(&self) -> &'static str { self.inner.media_type() } /// :return: the format `IANA-registered `_ file extension. /// :rtype: str /// /// >>> RdfFormat.N_TRIPLES.file_extension /// 'nt' #[getter] pub fn file_extension(&self) -> &'static str { self.inner.file_extension() } /// :return: the format name. /// :rtype: str /// /// >>> RdfFormat.N_TRIPLES.name /// 'N-Triples' #[getter] pub const fn name(&self) -> &'static str { self.inner.name() } /// :return: if the formats supports `RDF datasets `_ and not only `RDF graphs `_. /// :rtype: bool /// /// >>> RdfFormat.N_TRIPLES.supports_datasets /// False /// >>> RdfFormat.N_QUADS.supports_datasets /// True #[getter] pub fn supports_datasets(&self) -> bool { self.inner.supports_datasets() } /// :return: if the formats supports `RDF-star quoted triples `_. /// :rtype: bool /// /// >>> RdfFormat.N_TRIPLES.supports_rdf_star /// True /// >>> RdfFormat.RDF_XML.supports_rdf_star /// False #[getter] pub const fn supports_rdf_star(&self) -> bool { self.inner.supports_rdf_star() } /// Looks for a known format from a media type. /// /// It supports some media type aliases. /// For example, "application/xml" is going to return RDF/XML even if it is not its canonical media type. /// /// :param media_type: the media type. /// :type media_type: str /// :return: :py:class:`RdfFormat` if the media type is known or :py:const:`None` if not. /// :rtype: RdfFormat or None /// /// >>> RdfFormat.from_media_type("text/turtle; charset=utf-8") /// #[staticmethod] pub fn from_media_type(media_type: &str) -> Option { Some(Self { inner: RdfFormat::from_media_type(media_type)?, }) } /// Looks for a known format from an extension. /// /// It supports some aliases. /// /// :param extension: the extension. /// :type extension: str /// :return: :py:class:`RdfFormat` if the extension is known or :py:const:`None` if not. /// :rtype: RdfFormat or None /// /// >>> RdfFormat.from_extension("nt") /// #[staticmethod] pub fn from_extension(extension: &str) -> Option { Some(Self { inner: RdfFormat::from_extension(extension)?, }) } fn __str__(&self) -> &'static str { self.inner.name() } fn __repr__(&self) -> String { format!("", self.inner.name()) } fn __hash__(&self) -> u64 { hash(&self.inner) } fn __eq__(&self, other: &Self) -> bool { self.inner == other.inner } fn __ne__(&self, other: &Self) -> bool { self.inner != other.inner } /// :rtype: RdfFormat fn __copy__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { slf } /// :type memo: typing.Any /// :rtype: RdfFormat #[allow(unused_variables)] fn __deepcopy__<'a>(slf: PyRef<'a, Self>, memo: &'_ PyAny) -> PyRef<'a, Self> { slf } } pub enum PyReadable { Bytes(Cursor>), Io(PyIo), File(File), } impl PyReadable { pub fn from_args( path: &Option, input: Option, py: Python<'_>, ) -> PyResult { match (path, input) { (Some(_), Some(_)) => Err(PyValueError::new_err( "input and file_path can't be both set at the same time", )), (Some(path), None) => Ok(Self::File(py.allow_threads(|| File::open(path))?)), (None, Some(input)) => Ok(input.into()), (None, None) => Err(PyValueError::new_err( "Either input or file_path must be set", )), } } } impl Read for PyReadable { fn read(&mut self, buf: &mut [u8]) -> io::Result { match self { Self::Bytes(bytes) => bytes.read(buf), Self::Io(io) => io.read(buf), Self::File(file) => file.read(buf), } } } #[derive(FromPyObject)] pub enum PyReadableInput { String(String), Bytes(Vec), Io(PyObject), } impl From for PyReadable { fn from(input: PyReadableInput) -> Self { match input { PyReadableInput::String(string) => Self::Bytes(Cursor::new(string.into_bytes())), PyReadableInput::Bytes(bytes) => Self::Bytes(Cursor::new(bytes)), PyReadableInput::Io(io) => Self::Io(PyIo(io)), } } } pub enum PyWritable { Bytes(Vec), Io(PyIo), File(File), } impl PyWritable { pub fn do_write( write: impl FnOnce(BufWriter, Option) -> PyResult>, output: Option, py: Python<'_>, ) -> PyResult> { let (output, file_path) = match output { Some(PyWritableOutput::Path(file_path)) => ( Self::File(py.allow_threads(|| File::create(&file_path))?), Some(file_path), ), Some(PyWritableOutput::Io(object)) => (Self::Io(PyIo(object)), None), None => (Self::Bytes(Vec::new()), None), }; let writer = write(BufWriter::new(output), file_path)?; py.allow_threads(|| writer.into_inner())?.close(py) } fn close(self, py: Python<'_>) -> PyResult> { match self { Self::Bytes(bytes) => Ok(Some(PyBytes::new(py, &bytes))), Self::File(mut file) => { py.allow_threads(|| { file.flush()?; file.sync_all() })?; Ok(None) } Self::Io(mut io) => { py.allow_threads(|| io.flush())?; Ok(None) } } } } impl Write for PyWritable { fn write(&mut self, buf: &[u8]) -> io::Result { match self { Self::Bytes(bytes) => bytes.write(buf), Self::Io(io) => io.write(buf), Self::File(file) => file.write(buf), } } fn flush(&mut self) -> io::Result<()> { match self { Self::Bytes(_) => Ok(()), Self::Io(io) => io.flush(), Self::File(file) => file.flush(), } } } #[derive(FromPyObject)] pub enum PyWritableOutput { Path(PathBuf), Io(PyObject), } pub struct PyIo(PyObject); impl Read for PyIo { fn read(&mut self, buf: &mut [u8]) -> io::Result { Python::with_gil(|py| { if buf.is_empty() { return Ok(0); } let to_read = max(1, buf.len() / 4); // We divide by 4 because TextIO works with number of characters and not with number of bytes let read = self .0 .as_ref(py) .call_method1(intern!(py, "read"), (to_read,))?; let bytes = read .extract::<&[u8]>() .or_else(|_| read.extract::<&str>().map(str::as_bytes))?; buf[..bytes.len()].copy_from_slice(bytes); Ok(bytes.len()) }) } } impl Write for PyIo { fn write(&mut self, buf: &[u8]) -> io::Result { Python::with_gil(|py| { Ok(self .0 .as_ref(py) .call_method1(intern!(py, "write"), (PyBytes::new(py, buf),))? .extract::()?) }) } fn flush(&mut self) -> io::Result<()> { Python::with_gil(|py| { self.0.as_ref(py).call_method0(intern!(py, "flush"))?; Ok(()) }) } } pub fn lookup_rdf_format( format: Option, path: Option<&Path>, ) -> PyResult { if let Some(format) = format { return match format { PyRdfFormatInput::Object(format) => Ok(format.inner), PyRdfFormatInput::MediaType(media_type) => { deprecation_warning("Using string to specify a RDF format is deprecated, please use a RdfFormat object instead.")?; RdfFormat::from_media_type(&media_type).ok_or_else(|| { PyValueError::new_err(format!( "The media type {media_type} is not supported by pyoxigraph" )) }) } }; } let Some(path) = path else { return Err(PyValueError::new_err( "The format parameter is required when a file path is not given", )); }; let Some(ext) = path.extension().and_then(OsStr::to_str) else { return Err(PyValueError::new_err(format!( "The file name {} has no extension to guess a file format from", path.display() ))); }; RdfFormat::from_extension(ext) .ok_or_else(|| PyValueError::new_err(format!("Not supported RDF format extension: {ext}"))) } #[derive(FromPyObject)] pub enum PyRdfFormatInput { Object(PyRdfFormat), MediaType(String), } pub fn map_parse_error(error: RdfParseError, file_path: Option) -> PyErr { match error { RdfParseError::Syntax(error) => { // Python 3.9 does not support end line and end column if python_version() >= (3, 10) { let params = if let Some(location) = error.location() { ( file_path, Some(location.start.line + 1), Some(location.start.column + 1), None::>, Some(location.end.line + 1), Some(location.end.column + 1), ) } else { (None, None, None, None, None, None) }; PySyntaxError::new_err((error.to_string(), params)) } else { let params = if let Some(location) = error.location() { ( file_path, Some(location.start.line + 1), Some(location.start.column + 1), None::>, ) } else { (None, None, None, None) }; PySyntaxError::new_err((error.to_string(), params)) } } RdfParseError::Io(error) => error.into(), } } /// Release the GIL /// There should not be ANY use of pyo3 code inside of this method!!! /// /// Code from pyo3: https://github.com/PyO3/pyo3/blob/a67180c8a42a0bc0fdc45b651b62c0644130cf47/src/python.rs#L366 #[allow(unsafe_code)] pub fn allow_threads_unsafe(_py: Python<'_>, f: impl FnOnce() -> T) -> T { struct RestoreGuard { tstate: *mut pyo3::ffi::PyThreadState, } impl Drop for RestoreGuard { fn drop(&mut self) { // SAFETY: not cloned so called once unsafe { pyo3::ffi::PyEval_RestoreThread(self.tstate); } } } // SAFETY: we have the restore part in Drop to make sure it's properly executed let tstate = unsafe { pyo3::ffi::PyEval_SaveThread() }; let _guard = RestoreGuard { tstate }; f() } pub fn python_version() -> (u8, u8) { static VERSION: OnceLock<(u8, u8)> = OnceLock::new(); *VERSION.get_or_init(|| { Python::with_gil(|py| { let v = py.version_info(); (v.major, v.minor) }) }) } pub fn deprecation_warning(message: &str) -> PyResult<()> { Python::with_gil(|py| PyErr::warn(py, py.get_type::(), message, 0)) }