Python: fixes parsing from text I/O with non-ASCII characters

Python text I/O counts in numbers of characters
and not in numbers of bytes.
However, we were asking to read a number of bytes
and not a number of characters,
leading to strange I/O exceptions being raised.
pull/516/head
Thomas 2 years ago committed by Thomas Tanon
parent d500614fcc
commit d2804d8a8d
  1. 25
      python/src/io.rs
  2. 29
      python/tests/test_io.py

@ -8,7 +8,8 @@ use oxigraph::io::{
use pyo3::exceptions::{PyIOError, PySyntaxError, PyValueError}; use pyo3::exceptions::{PyIOError, PySyntaxError, PyValueError};
use pyo3::prelude::*; use pyo3::prelude::*;
use pyo3::types::PyBytes; use pyo3::types::PyBytes;
use pyo3::wrap_pyfunction; use pyo3::{intern, wrap_pyfunction};
use std::cmp::max;
use std::error::Error; use std::error::Error;
use std::fs::File; use std::fs::File;
use std::io::{self, BufRead, BufReader, BufWriter, Cursor, Read, Write}; use std::io::{self, BufRead, BufReader, BufWriter, Cursor, Read, Write};
@ -282,17 +283,22 @@ impl Write for PyWritable {
pub struct PyIo(PyObject); pub struct PyIo(PyObject);
impl Read for PyIo { impl Read for PyIo {
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> { fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
Python::with_gil(|py| { Python::with_gil(|py| {
if buf.is_empty() {
return Ok(0);
}
let to_read = max(1, buf.len() / 4); // We divide by 4 because TextIO works with number of characters and not with number of bytes
let read = self let read = self
.0 .0
.call_method(py, "read", (buf.len(),), None) .as_ref(py)
.call_method1(intern!(py, "read"), (to_read,))
.map_err(to_io_err)?; .map_err(to_io_err)?;
let bytes = read let bytes = read
.extract::<&[u8]>(py) .extract::<&[u8]>()
.or_else(|e| read.extract::<&str>(py).map(str::as_bytes).map_err(|_| e)) .or_else(|e| read.extract::<&str>().map(str::as_bytes).map_err(|_| e))
.map_err(to_io_err)?; .map_err(to_io_err)?;
buf.write_all(bytes)?; buf[..bytes.len()].copy_from_slice(bytes);
Ok(bytes.len()) Ok(bytes.len())
}) })
} }
@ -302,16 +308,17 @@ impl Write for PyIo {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> { fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
Python::with_gil(|py| { Python::with_gil(|py| {
self.0 self.0
.call_method(py, "write", (PyBytes::new(py, buf),), None) .as_ref(py)
.call_method1(intern!(py, "write"), (PyBytes::new(py, buf),))
.map_err(to_io_err)? .map_err(to_io_err)?
.extract::<usize>(py) .extract::<usize>()
.map_err(to_io_err) .map_err(to_io_err)
}) })
} }
fn flush(&mut self) -> io::Result<()> { fn flush(&mut self) -> io::Result<()> {
Python::with_gil(|py| { Python::with_gil(|py| {
self.0.call_method(py, "flush", (), None)?; self.0.as_ref(py).call_method0(intern!(py, "flush"))?;
Ok(()) Ok(())
}) })
} }

@ -5,7 +5,9 @@ from tempfile import NamedTemporaryFile, TemporaryFile
from pyoxigraph import Literal, NamedNode, Quad, Triple, parse, serialize from pyoxigraph import Literal, NamedNode, Quad, Triple, parse, serialize
EXAMPLE_TRIPLE = Triple( EXAMPLE_TRIPLE = Triple(
NamedNode("http://example.com/foo"), NamedNode("http://example.com/p"), Literal("1") NamedNode("http://example.com/foo"),
NamedNode("http://example.com/p"),
Literal("éù"),
) )
EXAMPLE_QUAD = Quad( EXAMPLE_QUAD = Quad(
NamedNode("http://example.com/foo"), NamedNode("http://example.com/foo"),
@ -18,7 +20,7 @@ EXAMPLE_QUAD = Quad(
class TestParse(unittest.TestCase): class TestParse(unittest.TestCase):
def test_parse_file(self) -> None: def test_parse_file(self) -> None:
with NamedTemporaryFile() as fp: with NamedTemporaryFile() as fp:
fp.write(b'<foo> <p> "1" .') fp.write('<foo> <p> "éù" .'.encode())
fp.flush() fp.flush()
self.assertEqual( self.assertEqual(
list(parse(fp.name, "text/turtle", base_iri="http://example.com/")), list(parse(fp.name, "text/turtle", base_iri="http://example.com/")),
@ -33,7 +35,7 @@ class TestParse(unittest.TestCase):
self.assertEqual( self.assertEqual(
list( list(
parse( parse(
StringIO('<foo> <p> "1" .'), StringIO('<foo> <p> "éù" .'),
"text/turtle", "text/turtle",
base_iri="http://example.com/", base_iri="http://example.com/",
) )
@ -41,11 +43,23 @@ class TestParse(unittest.TestCase):
[EXAMPLE_TRIPLE], [EXAMPLE_TRIPLE],
) )
def test_parse_long_str_io(self) -> None:
self.assertEqual(
list(
parse(
StringIO('<foo> <p> "éù" .\n' * 1024),
"text/turtle",
base_iri="http://example.com/",
)
),
[EXAMPLE_TRIPLE] * 1024,
)
def test_parse_bytes_io(self) -> None: def test_parse_bytes_io(self) -> None:
self.assertEqual( self.assertEqual(
list( list(
parse( parse(
BytesIO(b'<foo> <p> "1" .'), BytesIO('<foo> <p> "éù" .'.encode()),
"text/turtle", "text/turtle",
base_iri="http://example.com/", base_iri="http://example.com/",
) )
@ -75,15 +89,16 @@ class TestSerialize(unittest.TestCase):
output = BytesIO() output = BytesIO()
serialize([EXAMPLE_TRIPLE], output, "text/turtle") serialize([EXAMPLE_TRIPLE], output, "text/turtle")
self.assertEqual( self.assertEqual(
output.getvalue(), output.getvalue().decode(),
b'<http://example.com/foo> <http://example.com/p> "1" .\n', '<http://example.com/foo> <http://example.com/p> "éù" .\n',
) )
def test_serialize_to_file(self) -> None: def test_serialize_to_file(self) -> None:
with NamedTemporaryFile() as fp: with NamedTemporaryFile() as fp:
serialize([EXAMPLE_TRIPLE], fp.name, "text/turtle") serialize([EXAMPLE_TRIPLE], fp.name, "text/turtle")
self.assertEqual( self.assertEqual(
fp.read(), b'<http://example.com/foo> <http://example.com/p> "1" .\n' fp.read().decode(),
'<http://example.com/foo> <http://example.com/p> "éù" .\n',
) )
def test_serialize_io_error(self) -> None: def test_serialize_io_error(self) -> None:

Loading…
Cancel
Save