Fork of https://github.com/oxigraph/oxigraph.git for the purpose of NextGraph project
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 
oxigraph/lib/src/io/read.rs

557 lines
18 KiB

//! Utilities to read RDF graphs and datasets.
use crate::io::{DatasetFormat, GraphFormat};
use crate::model::*;
use oxiri::{Iri, IriParseError};
use rio_api::model as rio;
use rio_api::parser::{QuadsParser, TriplesParser};
use rio_turtle::{NQuadsParser, NTriplesParser, TriGParser, TurtleError, TurtleParser};
use rio_xml::{RdfXmlError, RdfXmlParser};
use std::collections::HashMap;
use std::error::Error;
use std::io::BufRead;
use std::{fmt, io};
/// A parser for RDF graph serialization formats.
///
/// It currently supports the following formats:
/// * [N-Triples](https://www.w3.org/TR/n-triples/) ([`GraphFormat::NTriples`](super::GraphFormat::NTriples))
/// * [Turtle](https://www.w3.org/TR/turtle/) ([`GraphFormat::Turtle`](super::GraphFormat::Turtle))
/// * [RDF/XML](https://www.w3.org/TR/rdf-syntax-grammar/) ([`GraphFormat::RdfXml`](super::GraphFormat::RdfXml))
///
/// ```
/// use oxigraph::io::{GraphFormat, GraphParser};
/// use std::io::Cursor;
///
/// let file = "<http://example.com/s> <http://example.com/p> <http://example.com/o> .";
///
/// let parser = GraphParser::from_format(GraphFormat::NTriples);
/// let triples = parser.read_triples(Cursor::new(file))?.collect::<Result<Vec<_>,_>>()?;
///
/// assert_eq!(triples.len(), 1);
/// assert_eq!(triples[0].subject.to_string(), "<http://example.com/s>");
/// # std::io::Result::Ok(())
/// ```
pub struct GraphParser {
/// Serialization format to parse, chosen in `from_format`.
format: GraphFormat,
/// Optional base IRI set by `with_base_iri`; `read_triples` hands a copy of it to the
/// Turtle and RDF/XML parsers (the N-Triples parser is built without it).
base_iri: Option<Iri<String>>,
}
impl GraphParser {
    /// Creates a parser for the given `format`, with no base IRI configured.
    pub fn from_format(format: GraphFormat) -> Self {
        let base_iri = None;
        Self { format, base_iri }
    }

    /// Provides an IRI that could be used to resolve the file relative IRIs.
    ///
    /// # Errors
    ///
    /// Returns an [`IriParseError`] if `Iri::parse` rejects the given string.
    ///
    /// ```
    /// use oxigraph::io::{GraphFormat, GraphParser};
    /// use std::io::Cursor;
    ///
    /// let file = "</s> </p> </o> .";
    ///
    /// let parser = GraphParser::from_format(GraphFormat::Turtle).with_base_iri("http://example.com")?;
    /// let triples = parser.read_triples(Cursor::new(file))?.collect::<Result<Vec<_>,_>>()?;
    ///
    /// assert_eq!(triples.len(), 1);
    /// assert_eq!(triples[0].subject.to_string(), "<http://example.com/s>");
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
        let parsed = Iri::parse(base_iri.into())?;
        self.base_iri = Some(parsed);
        Ok(self)
    }

    /// Executes the parsing itself on a [`BufRead`](std::io::BufRead) implementation and returns an iterator of triples.
    ///
    /// The configured base IRI is passed to the Turtle and RDF/XML parsers; the N-Triples
    /// parser is built from the reader alone.
    // The current body always returns `Ok`; the `Result` is part of the public signature.
    #[allow(clippy::unnecessary_wraps)]
    pub fn read_triples<R: BufRead>(&self, reader: R) -> Result<TripleReader<R>, ParserError> {
        let parser = match self.format {
            GraphFormat::NTriples => TripleReaderKind::NTriples(NTriplesParser::new(reader)),
            GraphFormat::Turtle => {
                TripleReaderKind::Turtle(TurtleParser::new(reader, self.base_iri.clone()))
            }
            GraphFormat::RdfXml => {
                TripleReaderKind::RdfXml(RdfXmlParser::new(reader, self.base_iri.clone()))
            }
        };
        Ok(TripleReader {
            mapper: RioMapper::default(),
            parser,
            buffer: Vec::new(),
        })
    }
}
/// An iterator yielding read triples.
/// Could be built using a [`GraphParser`].
///
/// ```
/// use oxigraph::io::{GraphFormat, GraphParser};
/// use std::io::Cursor;
///
/// let file = "<http://example.com/s> <http://example.com/p> <http://example.com/o> .";
///
/// let parser = GraphParser::from_format(GraphFormat::NTriples);
/// let triples = parser.read_triples(Cursor::new(file))?.collect::<Result<Vec<_>,_>>()?;
///
/// assert_eq!(triples.len(), 1);
/// assert_eq!(triples[0].subject.to_string(), "<http://example.com/s>");
/// # std::io::Result::Ok(())
/// ```
#[must_use]
pub struct TripleReader<R: BufRead> {
/// Converts the parser's rio terms into this crate's model types and keeps the blank node mapping.
mapper: RioMapper,
/// The format-specific parser wrapping the reader.
parser: TripleReaderKind<R>,
/// Triples produced by a parsing step that have not been returned yet; the iterator drains it with `pop`.
buffer: Vec<Triple>,
}
/// The concrete parser driven by a `TripleReader`, one variant per supported graph format.
enum TripleReaderKind<R: BufRead> {
/// N-Triples parser.
NTriples(NTriplesParser<R>),
/// Turtle parser.
Turtle(TurtleParser<R>),
/// RDF/XML parser.
RdfXml(RdfXmlParser<R>),
}
impl<R: BufRead> Iterator for TripleReader<R> {
    type Item = Result<Triple, ParserError>;

    /// Returns the next parsed triple, the next parsing error, or `None` once the buffer is
    /// empty and the parser reports the end of its input.
    ///
    /// Buffered triples are handed out with `pop`, i.e. the triples pushed by a single parsing
    /// step come out in reverse order of emission.
    fn next(&mut self) -> Option<Result<Triple, ParserError>> {
        // Run parsing steps until at least one triple is available.
        while self.buffer.is_empty() {
            let step = match &mut self.parser {
                TripleReaderKind::NTriples(parser) => {
                    Self::read(parser, &mut self.buffer, &mut self.mapper)
                }
                TripleReaderKind::Turtle(parser) => {
                    Self::read(parser, &mut self.buffer, &mut self.mapper)
                }
                TripleReaderKind::RdfXml(parser) => {
                    Self::read(parser, &mut self.buffer, &mut self.mapper)
                }
            };
            match step {
                // The parser is at the end of its input and nothing is buffered.
                None => return None,
                Some(Err(error)) => return Some(Err(error)),
                // A step succeeded; it may or may not have produced triples.
                Some(Ok(())) => {}
            }
        }
        self.buffer.pop().map(Ok)
    }
}
impl<R: BufRead> TripleReader<R> {
    /// Runs one parsing step of `parser`.
    ///
    /// Every triple emitted during the step is converted with `mapper` and pushed onto `buffer`.
    ///
    /// Returns `None` if the parser is already at the end of its input, otherwise `Some` with
    /// the outcome of the step (the parser error converted into a [`ParserError`] on failure).
    fn read<P: TriplesParser>(
        parser: &mut P,
        buffer: &mut Vec<Triple>,
        mapper: &mut RioMapper,
    ) -> Option<Result<(), ParserError>>
    where
        ParserError: From<P::Error>,
    {
        if parser.is_end() {
            return None;
        }
        Some(parser.parse_step(&mut |rio_triple| {
            buffer.push(mapper.triple(&rio_triple));
            Ok(())
        }))
    }
}
/// A parser for RDF dataset serialization formats.
///
/// It currently supports the following formats:
/// * [N-Quads](https://www.w3.org/TR/n-quads/) ([`DatasetFormat::NQuads`](super::DatasetFormat::NQuads))
/// * [TriG](https://www.w3.org/TR/trig/) ([`DatasetFormat::TriG`](super::DatasetFormat::TriG))
///
/// ```
/// use oxigraph::io::{DatasetFormat, DatasetParser};
/// use std::io::Cursor;
///
/// let file = "<http://example.com/s> <http://example.com/p> <http://example.com/o> <http://example.com/g> .";
///
/// let parser = DatasetParser::from_format(DatasetFormat::NQuads);
/// let quads = parser.read_quads(Cursor::new(file))?.collect::<Result<Vec<_>,_>>()?;
///
/// assert_eq!(quads.len(), 1);
/// assert_eq!(quads[0].subject.to_string(), "<http://example.com/s>");
/// # std::io::Result::Ok(())
/// ```
pub struct DatasetParser {
/// Serialization format to parse, chosen in `from_format`.
format: DatasetFormat,
/// Optional base IRI set by `with_base_iri`; `read_quads` hands a copy of it to the TriG
/// parser (the N-Quads parser is built without it).
base_iri: Option<Iri<String>>,
}
impl DatasetParser {
    /// Creates a parser for the given `format`, with no base IRI configured.
    pub fn from_format(format: DatasetFormat) -> Self {
        let base_iri = None;
        Self { format, base_iri }
    }

    /// Provides an IRI that could be used to resolve the file relative IRIs.
    ///
    /// # Errors
    ///
    /// Returns an [`IriParseError`] if `Iri::parse` rejects the given string.
    ///
    /// ```
    /// use oxigraph::io::{DatasetFormat, DatasetParser};
    /// use std::io::Cursor;
    ///
    /// let file = "<g> { </s> </p> </o> }";
    ///
    /// let parser = DatasetParser::from_format(DatasetFormat::TriG).with_base_iri("http://example.com")?;
    /// let quads = parser.read_quads(Cursor::new(file))?.collect::<Result<Vec<_>,_>>()?;
    ///
    /// assert_eq!(quads.len(), 1);
    /// assert_eq!(quads[0].subject.to_string(), "<http://example.com/s>");
    /// # Result::<_,Box<dyn std::error::Error>>::Ok(())
    /// ```
    pub fn with_base_iri(mut self, base_iri: impl Into<String>) -> Result<Self, IriParseError> {
        let parsed = Iri::parse(base_iri.into())?;
        self.base_iri = Some(parsed);
        Ok(self)
    }

    /// Executes the parsing itself on a [`BufRead`](std::io::BufRead) implementation and returns an iterator of quads.
    ///
    /// The configured base IRI is passed to the TriG parser; the N-Quads parser is built from
    /// the reader alone.
    // The current body always returns `Ok`; the `Result` is part of the public signature.
    #[allow(clippy::unnecessary_wraps)]
    pub fn read_quads<R: BufRead>(&self, reader: R) -> Result<QuadReader<R>, ParserError> {
        let parser = match self.format {
            DatasetFormat::NQuads => QuadReaderKind::NQuads(NQuadsParser::new(reader)),
            DatasetFormat::TriG => {
                QuadReaderKind::TriG(TriGParser::new(reader, self.base_iri.clone()))
            }
        };
        Ok(QuadReader {
            mapper: RioMapper::default(),
            parser,
            buffer: Vec::new(),
        })
    }
}
/// An iterator yielding read quads.
/// Could be built using a [`DatasetParser`].
///
/// ```
/// use oxigraph::io::{DatasetFormat, DatasetParser};
/// use std::io::Cursor;
///
/// let file = "<http://example.com/s> <http://example.com/p> <http://example.com/o> <http://example.com/g> .";
///
/// let parser = DatasetParser::from_format(DatasetFormat::NQuads);
/// let quads = parser.read_quads(Cursor::new(file))?.collect::<Result<Vec<_>,_>>()?;
///
/// assert_eq!(quads.len(), 1);
/// assert_eq!(quads[0].subject.to_string(), "<http://example.com/s>");
/// # std::io::Result::Ok(())
/// ```
#[must_use]
pub struct QuadReader<R: BufRead> {
/// Converts the parser's rio terms into this crate's model types and keeps the blank node mapping.
mapper: RioMapper,
/// The format-specific parser wrapping the reader.
parser: QuadReaderKind<R>,
/// Quads produced by a parsing step that have not been returned yet; the iterator drains it with `pop`.
buffer: Vec<Quad>,
}
/// The concrete parser driven by a `QuadReader`, one variant per supported dataset format.
enum QuadReaderKind<R: BufRead> {
/// N-Quads parser.
NQuads(NQuadsParser<R>),
/// TriG parser.
TriG(TriGParser<R>),
}
impl<R: BufRead> Iterator for QuadReader<R> {
    type Item = Result<Quad, ParserError>;

    /// Returns the next parsed quad, the next parsing error, or `None` once the buffer is
    /// empty and the parser reports the end of its input.
    ///
    /// Buffered quads are handed out with `pop`, i.e. the quads pushed by a single parsing
    /// step come out in reverse order of emission.
    fn next(&mut self) -> Option<Result<Quad, ParserError>> {
        // Run parsing steps until at least one quad is available.
        while self.buffer.is_empty() {
            let step = match &mut self.parser {
                QuadReaderKind::NQuads(parser) => {
                    Self::read(parser, &mut self.buffer, &mut self.mapper)
                }
                QuadReaderKind::TriG(parser) => {
                    Self::read(parser, &mut self.buffer, &mut self.mapper)
                }
            };
            match step {
                // The parser is at the end of its input and nothing is buffered.
                None => return None,
                Some(Err(error)) => return Some(Err(error)),
                // A step succeeded; it may or may not have produced quads.
                Some(Ok(())) => {}
            }
        }
        self.buffer.pop().map(Ok)
    }
}
impl<R: BufRead> QuadReader<R> {
    /// Runs one parsing step of `parser`.
    ///
    /// Every quad emitted during the step is converted with `mapper` and pushed onto `buffer`.
    ///
    /// Returns `None` if the parser is already at the end of its input, otherwise `Some` with
    /// the outcome of the step (the parser error converted into a [`ParserError`] on failure).
    fn read<P: QuadsParser>(
        parser: &mut P,
        buffer: &mut Vec<Quad>,
        mapper: &mut RioMapper,
    ) -> Option<Result<(), ParserError>>
    where
        ParserError: From<P::Error>,
    {
        if parser.is_end() {
            return None;
        }
        Some(parser.parse_step(&mut |rio_quad| {
            buffer.push(mapper.quad(&rio_quad));
            Ok(())
        }))
    }
}
/// Converts `rio_api` model values into this crate's model types.
#[derive(Default)]
struct RioMapper {
/// Maps each blank node id met in the input to the `BlankNode` created for it, so that
/// repeated occurrences of the same id are converted to the same node.
bnode_map: HashMap<String, BlankNode>,
}
impl<'a> RioMapper {
    /// Converts a rio named node into a [`NamedNode`].
    ///
    /// Uses `NamedNode::new_unchecked`: the IRI is taken as produced by the parser.
    fn named_node(node: rio::NamedNode<'a>) -> NamedNode {
        NamedNode::new_unchecked(node.iri)
    }

    /// Returns the [`BlankNode`] associated with the blank node id `node.id`.
    ///
    /// The first time an id is seen, a `BlankNode::default()` is created and remembered in
    /// `bnode_map`; every later occurrence of the same id gets a clone of that node.
    fn blank_node(&mut self, node: rio::BlankNode<'a>) -> BlankNode {
        // Look the id up by `&str` first so that the owned `String` key is only allocated
        // for ids that have not been seen yet, not for every occurrence of a blank node.
        if let Some(known) = self.bnode_map.get(node.id) {
            return known.clone();
        }
        let fresh = BlankNode::default();
        self.bnode_map.insert(node.id.to_owned(), fresh.clone());
        fresh
    }

    /// Converts a rio literal (simple, language-tagged or typed) into a [`Literal`].
    ///
    /// The language tag is passed to the `_unchecked` constructor as produced by the parser.
    fn literal(literal: rio::Literal<'a>) -> Literal {
        match literal {
            rio::Literal::Simple { value } => Literal::new_simple_literal(value),
            rio::Literal::LanguageTaggedString { value, language } => {
                Literal::new_language_tagged_literal_unchecked(value, language)
            }
            rio::Literal::Typed { value, datatype } => {
                Literal::new_typed_literal(value, Self::named_node(datatype))
            }
        }
    }

    /// Converts a rio subject (named node, blank node or nested triple) into a [`Subject`].
    fn subject(&mut self, node: rio::Subject<'a>) -> Subject {
        match node {
            rio::Subject::NamedNode(node) => Self::named_node(node).into(),
            rio::Subject::BlankNode(node) => self.blank_node(node).into(),
            rio::Subject::Triple(triple) => self.triple(triple).into(),
        }
    }

    /// Converts a rio term (named node, blank node, literal or nested triple) into a [`Term`].
    fn term(&mut self, node: rio::Term<'a>) -> Term {
        match node {
            rio::Term::NamedNode(node) => Self::named_node(node).into(),
            rio::Term::BlankNode(node) => self.blank_node(node).into(),
            rio::Term::Literal(literal) => Self::literal(literal).into(),
            rio::Term::Triple(triple) => self.triple(triple).into(),
        }
    }

    /// Converts a rio triple into a [`Triple`], converting subject, predicate and object in turn.
    fn triple(&mut self, triple: &rio::Triple<'a>) -> Triple {
        Triple {
            subject: self.subject(triple.subject),
            predicate: Self::named_node(triple.predicate),
            object: self.term(triple.object),
        }
    }

    /// Converts an optional rio graph name into a [`GraphName`]; `None` maps to the default graph.
    fn graph_name(&mut self, graph_name: Option<rio::GraphName<'a>>) -> GraphName {
        match graph_name {
            Some(rio::GraphName::NamedNode(node)) => Self::named_node(node).into(),
            Some(rio::GraphName::BlankNode(node)) => self.blank_node(node).into(),
            None => GraphName::DefaultGraph,
        }
    }

    /// Converts a rio quad into a [`Quad`], converting subject, predicate, object and graph
    /// name in turn.
    fn quad(&mut self, quad: &rio::Quad<'a>) -> Quad {
        Quad {
            subject: self.subject(quad.subject),
            predicate: Self::named_node(quad.predicate),
            object: self.term(quad.object),
            graph_name: self.graph_name(quad.graph_name),
        }
    }
}
/// Error returned during RDF format parsing.
///
/// It is either an I/O failure ([`ParserError::Io`]) or a problem with the content being
/// parsed ([`ParserError::Syntax`]).
#[derive(Debug)]
pub enum ParserError {
/// I/O error during parsing (file not found...).
Io(io::Error),
/// An error in the file syntax.
Syntax(SyntaxError),
}
impl ParserError {
    /// Builds the syntax error reporting that `iri` was rejected as a base IRI.
    ///
    /// `iri` is copied into the error together with the parse `error` explaining the rejection.
    pub(crate) fn invalid_base_iri(iri: &str, error: IriParseError) -> Self {
        let inner = SyntaxErrorKind::BaseIri {
            iri: iri.to_owned(),
            error,
        };
        Self::Syntax(SyntaxError { inner })
    }
}
impl fmt::Display for ParserError {
    /// Writes the message of the wrapped I/O or syntax error unchanged.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Io(io_error) => fmt::Display::fmt(io_error, f),
            Self::Syntax(syntax_error) => fmt::Display::fmt(syntax_error, f),
        }
    }
}
impl Error for ParserError {
    /// Exposes the wrapped I/O or syntax error as the source of this error.
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        let cause: &(dyn Error + 'static) = match self {
            Self::Io(io_error) => io_error,
            Self::Syntax(syntax_error) => syntax_error,
        };
        Some(cause)
    }
}
// The `unwrap` calls are only reached after checking that the `io::Error` wraps a `TurtleError`.
#[allow(clippy::fallible_impl_from)]
impl From<TurtleError> for ParserError {
    /// Classifies a `TurtleError` as an I/O or a syntax error.
    ///
    /// The error is first converted into an `io::Error`. If that `io::Error` wraps a
    /// `TurtleError`, the latter is extracted and reported as a syntax error; otherwise the
    /// `io::Error` itself is reported as an I/O error.
    fn from(error: TurtleError) -> Self {
        let io_error = io::Error::from(error);
        let wraps_turtle_error = match io_error.get_ref() {
            Some(inner) => inner.is::<TurtleError>(),
            None => false,
        };
        if !wraps_turtle_error {
            return Self::Io(io_error);
        }
        let turtle_error = io_error
            .into_inner()
            .unwrap()
            .downcast::<TurtleError>()
            .unwrap();
        Self::Syntax(SyntaxError {
            inner: SyntaxErrorKind::Turtle(*turtle_error),
        })
    }
}
// The `unwrap` calls are only reached after checking that the `io::Error` wraps an `RdfXmlError`.
#[allow(clippy::fallible_impl_from)]
impl From<RdfXmlError> for ParserError {
    /// Classifies an `RdfXmlError` as an I/O or a syntax error.
    ///
    /// The error is first converted into an `io::Error`. If that `io::Error` wraps an
    /// `RdfXmlError`, the latter is extracted and reported as a syntax error; otherwise the
    /// `io::Error` itself is reported as an I/O error.
    fn from(error: RdfXmlError) -> Self {
        let io_error = io::Error::from(error);
        let wraps_rdf_xml_error = match io_error.get_ref() {
            Some(inner) => inner.is::<RdfXmlError>(),
            None => false,
        };
        if !wraps_rdf_xml_error {
            return Self::Io(io_error);
        }
        let rdf_xml_error = io_error
            .into_inner()
            .unwrap()
            .downcast::<RdfXmlError>()
            .unwrap();
        Self::Syntax(SyntaxError {
            inner: SyntaxErrorKind::RdfXml(*rdf_xml_error),
        })
    }
}
impl From<TermParseError> for ParserError {
fn from(error: TermParseError) -> Self {
Self::Syntax(SyntaxError {
inner: SyntaxErrorKind::Term(error),
})
}
}
impl From<io::Error> for ParserError {
fn from(error: io::Error) -> Self {
Self::Io(error)
}
}
impl From<SyntaxError> for ParserError {
fn from(error: SyntaxError) -> Self {
Self::Syntax(error)
}
}
impl From<ParserError> for io::Error {
    /// Turns a parser error into an `io::Error`: I/O errors are returned as they are, syntax
    /// errors go through the `SyntaxError` to `io::Error` conversion.
    fn from(error: ParserError) -> Self {
        match error {
            ParserError::Io(io_error) => io_error,
            ParserError::Syntax(syntax_error) => Self::from(syntax_error),
        }
    }
}
impl From<quick_xml::Error> for ParserError {
fn from(error: quick_xml::Error) -> Self {
match error {
quick_xml::Error::Io(error) => Self::Io(error),
error => Self::Syntax(SyntaxError {
inner: SyntaxErrorKind::Xml(error),
}),
}
}
}
/// An error in the syntax of the parsed file.
///
/// The concrete cause is kept in a crate-private field.
#[derive(Debug)]
pub struct SyntaxError {
/// What exactly went wrong.
pub(crate) inner: SyntaxErrorKind,
}
/// The possible causes of a `SyntaxError`.
#[derive(Debug)]
pub(crate) enum SyntaxErrorKind {
/// A `TurtleError` (error type imported from `rio_turtle`).
Turtle(TurtleError),
/// An `RdfXmlError` (error type imported from `rio_xml`).
RdfXml(RdfXmlError),
/// The string `iri` was rejected as a base IRI; `error` is the IRI parsing error.
BaseIri { iri: String, error: IriParseError },
/// A `quick_xml` error.
Xml(quick_xml::Error),
/// A term parsing error.
Term(TermParseError),
/// A free-form error message.
Msg { msg: String },
}
impl SyntaxError {
    /// Creates a syntax error carrying only the given printable message.
    pub(crate) fn msg(msg: impl Into<String>) -> Self {
        let msg = msg.into();
        Self {
            inner: SyntaxErrorKind::Msg { msg },
        }
    }
}
impl fmt::Display for SyntaxError {
    /// Writes the message of the underlying cause.
    ///
    /// Wrapped errors are displayed unchanged, an invalid base IRI is reported together with
    /// the offending IRI, and a plain message is written as is.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match &self.inner {
            SyntaxErrorKind::Msg { msg } => f.write_str(msg),
            SyntaxErrorKind::BaseIri { iri, error } => {
                write!(f, "Invalid base IRI '{}': {}", iri, error)
            }
            SyntaxErrorKind::Turtle(cause) => fmt::Display::fmt(cause, f),
            SyntaxErrorKind::RdfXml(cause) => fmt::Display::fmt(cause, f),
            SyntaxErrorKind::Xml(cause) => fmt::Display::fmt(cause, f),
            SyntaxErrorKind::Term(cause) => fmt::Display::fmt(cause, f),
        }
    }
}
impl Error for SyntaxError {
    /// Exposes the wrapped parser error as the source of this error.
    ///
    /// The `BaseIri` and `Msg` causes report no source.
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        let cause: &(dyn Error + 'static) = match &self.inner {
            SyntaxErrorKind::BaseIri { .. } | SyntaxErrorKind::Msg { .. } => return None,
            SyntaxErrorKind::Turtle(e) => e,
            SyntaxErrorKind::RdfXml(e) => e,
            SyntaxErrorKind::Xml(e) => e,
            SyntaxErrorKind::Term(e) => e,
        };
        Some(cause)
    }
}
impl From<SyntaxError> for io::Error {
fn from(error: SyntaxError) -> Self {
match error.inner {
SyntaxErrorKind::Turtle(error) => error.into(),
SyntaxErrorKind::RdfXml(error) => error.into(),
SyntaxErrorKind::BaseIri { iri, error } => Self::new(
io::ErrorKind::InvalidInput,
format!("Invalid IRI '{}': {}", iri, error),
),
SyntaxErrorKind::Xml(error) => match error {
quick_xml::Error::Io(error) => error,
quick_xml::Error::UnexpectedEof(error) => {
Self::new(io::ErrorKind::UnexpectedEof, error)
}
error => Self::new(io::ErrorKind::InvalidData, error),
},
SyntaxErrorKind::Term(error) => Self::new(io::ErrorKind::InvalidData, error),
SyntaxErrorKind::Msg { msg } => Self::new(io::ErrorKind::InvalidData, msg),
}
}
}