Python: guess file type from file extension

pull/631/head
Tpt 1 year ago committed by Thomas Tanon
parent 87d2006b6e
commit 1b511ed018
  1. 67
      python/src/io.rs
  2. 53
      python/src/store.rs
  3. 8
      python/tests/test_io.py
  4. 25
      python/tests/test_store.py

@ -9,6 +9,7 @@ use pyo3::types::PyBytes;
use pyo3::{intern, wrap_pyfunction};
use std::cmp::max;
use std::error::Error;
use std::ffi::OsStr;
use std::fs::File;
use std::io::{self, BufWriter, Cursor, Read, Write};
use std::path::{Path, PathBuf};
@ -33,10 +34,10 @@ pub fn add_to_module(module: &PyModule) -> PyResult<()> {
/// For example, ``application/turtle`` could also be used for `Turtle <https://www.w3.org/TR/turtle/>`_
/// and ``application/xml`` or ``xml`` for `RDF/XML <https://www.w3.org/TR/rdf-syntax-grammar/>`_.
///
/// :param input: The binary I/O object or file path to read from. For example, it could be a file path as a string or a file reader opened in binary mode with ``open('my_file.ttl', 'rb')``.
/// :param input: The I/O object or file path to read from. For example, it could be a file path as a string or a file reader opened in binary mode with ``open('my_file.ttl', 'rb')``.
/// :type input: io(bytes) or io(str) or str or pathlib.Path
/// :param format: the format of the RDF serialization using a media type like ``text/turtle`` or an extension like `ttl`.
/// :type format: str
/// :param format: the format of the RDF serialization using a media type like ``text/turtle`` or an extension like ``ttl``. If :py:const:`None`, the format is guessed from the file name extension.
/// :type format: str or None, optional
/// :param base_iri: the base IRI used to resolve the relative IRIs in the file or :py:const:`None` if relative IRI resolution should not be done.
/// :type base_iri: str or None, optional
/// :param without_named_graphs: Sets that the parser must fail if parsing a named graph.
@ -52,21 +53,21 @@ pub fn add_to_module(module: &PyModule) -> PyResult<()> {
/// >>> list(parse(input, "text/turtle", base_iri="http://example.com/"))
/// [<Quad subject=<NamedNode value=http://example.com/foo> predicate=<NamedNode value=http://example.com/p> object=<Literal value=1 datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>> graph_name=<DefaultGraph>>]
#[pyfunction]
#[pyo3(signature = (input, format, *, base_iri = None, without_named_graphs = false, rename_blank_nodes = false))]
#[pyo3(signature = (input, /, format = None, *, base_iri = None, without_named_graphs = false, rename_blank_nodes = false))]
pub fn parse(
input: PyObject,
format: &str,
input: &PyAny,
format: Option<&str>,
base_iri: Option<&str>,
without_named_graphs: bool,
rename_blank_nodes: bool,
py: Python<'_>,
) -> PyResult<PyObject> {
let format = rdf_format(format)?;
let file_path = input.extract::<PathBuf>(py).ok();
let file_path = input.extract::<PathBuf>().ok();
let format = rdf_format(format, file_path.as_deref())?;
let input = if let Some(file_path) = &file_path {
PyReadable::from_file(file_path, py).map_err(map_io_err)?
} else {
PyReadable::from_data(input, py)
PyReadable::from_data(input)
};
let mut parser = RdfParser::from_format(format);
if let Some(base_iri) = base_iri {
@ -106,8 +107,8 @@ pub fn parse(
/// :type input: iterable(Triple) or iterable(Quad)
/// :param output: The binary I/O object or file path to write to. For example, it could be a file path as a string or a file writer opened in binary mode with ``open('my_file.ttl', 'wb')``.
/// :type output: io(bytes) or str or pathlib.Path
/// :param format: the format of the RDF serialization using a media type like ``text/turtle`` or an extension like `ttl`.
/// :type format: str
/// :param format: the format of the RDF serialization using a media type like ``text/turtle`` or an extension like ``ttl``. If :py:const:`None`, the format is guessed from the file name extension.
/// :type format: str or None, optional
/// :rtype: None
/// :raises ValueError: if the format is not supported.
/// :raises TypeError: if a triple is given during a quad format serialization or vice versa.
@ -117,10 +118,16 @@ pub fn parse(
/// >>> output.getvalue()
/// b'<http://example.com> <http://example.com/p> "1" .\n'
#[pyfunction]
pub fn serialize(input: &PyAny, output: PyObject, format: &str, py: Python<'_>) -> PyResult<()> {
let format = rdf_format(format)?;
let output = if let Ok(path) = output.extract::<PathBuf>(py) {
PyWritable::from_file(&path, py).map_err(map_io_err)?
pub fn serialize(
input: &PyAny,
output: &PyAny,
format: Option<&str>,
py: Python<'_>,
) -> PyResult<()> {
let file_path = output.extract::<PathBuf>().ok();
let format = rdf_format(format, file_path.as_deref())?;
let output = if let Some(file_path) = &file_path {
PyWritable::from_file(file_path, py).map_err(map_io_err)?
} else {
PyWritable::from_data(output)
};
@ -186,13 +193,13 @@ impl PyReadable {
Ok(Self::File(py.allow_threads(|| File::open(file))?))
}
pub fn from_data(data: PyObject, py: Python<'_>) -> Self {
if let Ok(bytes) = data.extract::<Vec<u8>>(py) {
pub fn from_data(data: &PyAny) -> Self {
if let Ok(bytes) = data.extract::<Vec<u8>>() {
Self::Bytes(Cursor::new(bytes))
} else if let Ok(string) = data.extract::<String>(py) {
} else if let Ok(string) = data.extract::<String>() {
Self::Bytes(Cursor::new(string.into_bytes()))
} else {
Self::Io(PyIo(data))
Self::Io(PyIo(data.into()))
}
}
}
@ -217,8 +224,8 @@ impl PyWritable {
Ok(Self::File(py.allow_threads(|| File::create(file))?))
}
pub fn from_data(data: PyObject) -> Self {
Self::Io(PyIo(data))
pub fn from_data(data: &PyAny) -> Self {
Self::Io(PyIo(data.into()))
}
pub fn close(mut self) -> io::Result<()> {
@ -293,7 +300,23 @@ impl Write for PyIo {
}
}
pub fn rdf_format(format: &str) -> PyResult<RdfFormat> {
pub fn rdf_format(format: Option<&str>, path: Option<&Path>) -> PyResult<RdfFormat> {
let format = if let Some(format) = format {
format
} else if let Some(path) = path {
if let Some(ext) = path.extension().and_then(OsStr::to_str) {
ext
} else {
return Err(PyValueError::new_err(format!(
"The file name {} has no extension to guess a file format from",
path.display()
)));
}
} else {
return Err(PyValueError::new_err(
"The format parameter is required when a file path is not given",
));
};
if format.contains('/') {
RdfFormat::from_media_type(format).ok_or_else(|| {
PyValueError::new_err(format!("Not supported RDF format media type: {format}"))

@ -360,10 +360,10 @@ impl PyStore {
/// For example, ``application/turtle`` could also be used for `Turtle <https://www.w3.org/TR/turtle/>`_
/// and ``application/xml`` or ``xml`` for `RDF/XML <https://www.w3.org/TR/rdf-syntax-grammar/>`_.
///
/// :param input: The binary I/O object or file path to read from. For example, it could be a file path as a string or a file reader opened in binary mode with ``open('my_file.ttl', 'rb')``.
/// :param input: The I/O object or file path to read from. For example, it could be a file path as a string or a file reader opened in binary mode with ``open('my_file.ttl', 'rb')``.
/// :type input: io(bytes) or io(str) or str or pathlib.Path
/// :param format: the format of the RDF serialization using a media type like ``text/turtle`` or an extension like `ttl`.
/// :type format: str
/// :param format: the format of the RDF serialization using a media type like ``text/turtle`` or an extension like ``ttl``. If :py:const:`None`, the format is guessed from the file name extension.
/// :type format: str or None, optional
/// :param base_iri: the base IRI used to resolve the relative IRIs in the file or :py:const:`None` if relative IRI resolution should not be done.
/// :type base_iri: str or None, optional
/// :param to_graph: if it is a file composed of triples, the graph in which the triples should be stored. By default, the default graph is used.
@ -377,26 +377,26 @@ impl PyStore {
/// >>> store.load(io.BytesIO(b'<foo> <p> "1" .'), "text/turtle", base_iri="http://example.com/", to_graph=NamedNode("http://example.com/g"))
/// >>> list(store)
/// [<Quad subject=<NamedNode value=http://example.com/foo> predicate=<NamedNode value=http://example.com/p> object=<Literal value=1 datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>> graph_name=<NamedNode value=http://example.com/g>>]
#[pyo3(signature = (input, format, *, base_iri = None, to_graph = None))]
#[pyo3(signature = (input, /, format = None, *, base_iri = None, to_graph = None))]
fn load(
&self,
input: PyObject,
format: &str,
input: &PyAny,
format: Option<&str>,
base_iri: Option<&str>,
to_graph: Option<&PyAny>,
py: Python<'_>,
) -> PyResult<()> {
let format = rdf_format(format)?;
let to_graph_name = if let Some(graph_name) = to_graph {
Some(GraphName::from(&PyGraphNameRef::try_from(graph_name)?))
} else {
None
};
let file_path = input.extract::<PathBuf>(py).ok();
let file_path = input.extract::<PathBuf>().ok();
let format = rdf_format(format, file_path.as_deref())?;
let input = if let Some(file_path) = &file_path {
PyReadable::from_file(file_path, py).map_err(map_io_err)?
} else {
PyReadable::from_data(input, py)
PyReadable::from_data(input)
};
py.allow_threads(|| {
if let Some(to_graph_name) = to_graph_name {
@ -429,10 +429,10 @@ impl PyStore {
/// For example, ``application/turtle`` could also be used for `Turtle <https://www.w3.org/TR/turtle/>`_
/// and ``application/xml`` or ``xml`` for `RDF/XML <https://www.w3.org/TR/rdf-syntax-grammar/>`_.
///
/// :param input: The binary I/O object or file path to read from. For example, it could be a file path as a string or a file reader opened in binary mode with ``open('my_file.ttl', 'rb')``.
/// :param input: The I/O object or file path to read from. For example, it could be a file path as a string or a file reader opened in binary mode with ``open('my_file.ttl', 'rb')``.
/// :type input: io(bytes) or io(str) or str or pathlib.Path
/// :param format: the format of the RDF serialization using a media type like ``text/turtle`` or an extension like `ttl`.
/// :type format: str
/// :param format: the format of the RDF serialization using a media type like ``text/turtle`` or an extension like ``ttl``. If :py:const:`None`, the format is guessed from the file name extension.
/// :type format: str or None, optional
/// :param base_iri: the base IRI used to resolve the relative IRIs in the file or :py:const:`None` if relative IRI resolution should not be done.
/// :type base_iri: str or None, optional
/// :param to_graph: if it is a file composed of triples, the graph in which the triples should be stored. By default, the default graph is used.
@ -446,26 +446,26 @@ impl PyStore {
/// >>> store.bulk_load(io.BytesIO(b'<foo> <p> "1" .'), "text/turtle", base_iri="http://example.com/", to_graph=NamedNode("http://example.com/g"))
/// >>> list(store)
/// [<Quad subject=<NamedNode value=http://example.com/foo> predicate=<NamedNode value=http://example.com/p> object=<Literal value=1 datatype=<NamedNode value=http://www.w3.org/2001/XMLSchema#string>> graph_name=<NamedNode value=http://example.com/g>>]
#[pyo3(signature = (input, format, *, base_iri = None, to_graph = None))]
#[pyo3(signature = (input, /, format = None, *, base_iri = None, to_graph = None))]
fn bulk_load(
&self,
input: PyObject,
format: &str,
input: &PyAny,
format: Option<&str>,
base_iri: Option<&str>,
to_graph: Option<&PyAny>,
py: Python<'_>,
) -> PyResult<()> {
let format = rdf_format(format)?;
let to_graph_name = if let Some(graph_name) = to_graph {
Some(GraphName::from(&PyGraphNameRef::try_from(graph_name)?))
} else {
None
};
let file_path = input.extract::<PathBuf>(py).ok();
let file_path = input.extract::<PathBuf>().ok();
let format = rdf_format(format, file_path.as_deref())?;
let input = if let Some(file_path) = &file_path {
PyReadable::from_file(file_path, py).map_err(map_io_err)?
} else {
PyReadable::from_data(input, py)
PyReadable::from_data(input)
};
py.allow_threads(|| {
if let Some(to_graph_name) = to_graph_name {
@ -498,8 +498,8 @@ impl PyStore {
///
/// :param output: The binary I/O object or file path to write to. For example, it could be a file path as a string or a file writer opened in binary mode with ``open('my_file.ttl', 'wb')``.
/// :type output: io(bytes) or str or pathlib.Path
/// :param format: the format of the RDF serialization using a media type like ``text/turtle`` or an extension like `ttl`.
/// :type format: str
/// :param format: the format of the RDF serialization using a media type like ``text/turtle`` or an extension like ``ttl``. If :py:const:`None`, the format is guessed from the file name extension.
/// :type format: str or None, optional
/// :param from_graph: the store graph from which to dump the triples. Required if the serialization format does not support named graphs. If it does support named graphs the full dataset is written.
/// :type from_graph: NamedNode or BlankNode or DefaultGraph or None, optional
/// :rtype: None
@ -512,22 +512,23 @@ impl PyStore {
/// >>> store.dump(output, "text/turtle", from_graph=NamedNode("http://example.com/g"))
/// >>> output.getvalue()
/// b'<http://example.com> <http://example.com/p> "1" .\n'
#[pyo3(signature = (output, format, *, from_graph = None))]
#[pyo3(signature = (output, /, format = None, *, from_graph = None))]
fn dump(
&self,
output: PyObject,
format: &str,
output: &PyAny,
format: Option<&str>,
from_graph: Option<&PyAny>,
py: Python<'_>,
) -> PyResult<()> {
let format = rdf_format(format)?;
let from_graph_name = if let Some(graph_name) = from_graph {
Some(GraphName::from(&PyGraphNameRef::try_from(graph_name)?))
} else {
None
};
let output = if let Ok(path) = output.extract::<PathBuf>(py) {
PyWritable::from_file(&path, py).map_err(map_io_err)?
let file_path = output.extract::<PathBuf>().ok();
let format = rdf_format(format, file_path.as_deref())?;
let output = if let Some(file_path) = &file_path {
PyWritable::from_file(file_path, py).map_err(map_io_err)?
} else {
PyWritable::from_data(output)
};

@ -20,11 +20,11 @@ EXAMPLE_QUAD = Quad(
class TestParse(unittest.TestCase):
def test_parse_file(self) -> None:
with NamedTemporaryFile() as fp:
with NamedTemporaryFile(suffix=".ttl") as fp:
fp.write('<foo> <p> "éù" .'.encode())
fp.flush()
self.assertEqual(
list(parse(fp.name, "text/turtle", base_iri="http://example.com/")),
list(parse(fp.name, base_iri="http://example.com/")),
[EXAMPLE_TRIPLE],
)
@ -138,8 +138,8 @@ class TestSerialize(unittest.TestCase):
)
def test_serialize_to_file(self) -> None:
with NamedTemporaryFile() as fp:
serialize([EXAMPLE_TRIPLE], fp.name, "text/turtle")
with NamedTemporaryFile(suffix=".ttl") as fp:
serialize([EXAMPLE_TRIPLE], fp.name)
self.assertEqual(
fp.read().decode(),
'<http://example.com/foo> <http://example.com/p> "éù" .\n',

@ -265,13 +265,12 @@ class TestStore(unittest.TestCase):
self.assertEqual(set(store), {Quad(foo, bar, baz, graph)})
def test_load_file(self) -> None:
with NamedTemporaryFile(delete=False) as fp:
file_name = Path(fp.name)
with NamedTemporaryFile(suffix=".nq") as fp:
fp.write(b"<http://foo> <http://bar> <http://baz> <http://graph>.")
store = Store()
store.load(file_name, "nq")
file_name.unlink()
self.assertEqual(set(store), {Quad(foo, bar, baz, graph)})
fp.flush()
store = Store()
store.load(fp.name)
self.assertEqual(set(store), {Quad(foo, bar, baz, graph)})
def test_load_with_io_error(self) -> None:
with self.assertRaises(UnsupportedOperation) as _, TemporaryFile("wb") as fp:
@ -311,14 +310,14 @@ class TestStore(unittest.TestCase):
def test_dump_file(self) -> None:
with NamedTemporaryFile(delete=False) as fp:
store = Store()
store.add(Quad(foo, bar, baz, graph))
file_name = Path(fp.name)
store = Store()
store.add(Quad(foo, bar, baz, graph))
store.dump(file_name, "nq")
self.assertEqual(
file_name.read_text(),
"<http://foo> <http://bar> <http://baz> <http://graph> .\n",
)
store.dump(file_name, "nq")
self.assertEqual(
file_name.read_text(),
"<http://foo> <http://bar> <http://baz> <http://graph> .\n",
)
def test_dump_with_io_error(self) -> None:
store = Store()

Loading…
Cancel
Save