Beginning of RDF/XML parser

Does not support all RDF/XML constructs yet
pull/10/head
Tpt 6 years ago
parent 8760436b19
commit 9e88e27acf
  1. 1
      Cargo.toml
  2. 19
      src/errors.rs
  3. 1
      src/lib.rs
  4. 8
      src/model/vocab.rs
  5. 1
      src/rio/mod.rs
  6. 601
      src/rio/xml.rs
  7. 56
      tests/rdf_test_cases.rs

@ -22,6 +22,7 @@ url = "1"
uuid = { version = "0.7", features = ["v4"] }
bzip2 = "0.3"
byteorder = "1"
quick-xml = "0.12"
[build-dependencies]
peg = "0.5"

@ -1,5 +1,7 @@
use quick_xml::Error as Xml_Error;
use std::fmt;
use std::sync::PoisonError;
error_chain! {
foreign_links {
Url(::url::ParseError);
@ -10,6 +12,13 @@ error_chain! {
Turtle(::rio::turtle::ParseError);
SparqlParser(::sparql::parser::ParseError);
}
errors {
Xml(error: Xml_Error) {
description("XML parsing error")
display("XML parsing error: {:?}", error)
}
}
}
impl<T> From<PoisonError<T>> for Error {
@ -19,6 +28,16 @@ impl<T> From<PoisonError<T>> for Error {
}
}
impl From<Xml_Error> for Error {
fn from(error: Xml_Error) -> Self {
match error {
Xml_Error::Io(error) => error.into(),
Xml_Error::Utf8(error) => error.into(),
error => ErrorKind::Xml(error).into(),
}
}
}
impl From<Error> for fmt::Error {
fn from(_: Error) -> Self {
fmt::Error

@ -3,6 +3,7 @@ extern crate byteorder;
extern crate error_chain;
#[macro_use]
extern crate lazy_static;
extern crate quick_xml;
extern crate rocksdb;
extern crate url;
extern crate uuid;

@ -13,8 +13,16 @@ pub mod rdf {
NamedNode::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#langString").unwrap();
pub static ref NIL: NamedNode =
NamedNode::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#nil").unwrap();
pub static ref OBJECT: NamedNode =
NamedNode::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#object").unwrap();
pub static ref PREDICATE: NamedNode =
NamedNode::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate").unwrap();
pub static ref REST: NamedNode =
NamedNode::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#rest").unwrap();
pub static ref STATEMENT: NamedNode =
NamedNode::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement").unwrap();
pub static ref SUBJECT: NamedNode =
NamedNode::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#subject").unwrap();
pub static ref TYPE: NamedNode =
NamedNode::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#type").unwrap();
pub static ref XML_LITERAL: NamedNode =

@ -1,3 +1,4 @@
pub mod ntriples;
pub mod turtle;
pub(crate) mod utils;
pub mod xml;

@ -0,0 +1,601 @@
use errors::*;
use model::vocab::rdf;
use model::Triple;
use model::*;
use quick_xml::events::BytesEnd;
use quick_xml::events::BytesStart;
use quick_xml::events::BytesText;
use quick_xml::events::Event;
use quick_xml::Reader;
use std::collections::BTreeMap;
use std::io::BufRead;
use std::str::FromStr;
use url::Url;
pub fn read_rdf_xml(
source: impl BufRead,
base_uri: impl Into<Option<Url>>,
) -> impl Iterator<Item = Result<Triple>> {
let mut reader = Reader::from_reader(source);
reader.expand_empty_elements(true);
reader.trim_text(true);
RdfXmlIterator {
reader,
namespace_buffer: Vec::default(),
state: vec![RdfXmlState::Doc {
base_uri: base_uri.into(),
}],
object: None,
bnodes_map: BTreeMap::default(),
triples_cache: Vec::default(),
li_counter: Vec::default(),
}
}
lazy_static! {
static ref RDF_ABOUT: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#about").unwrap();
static ref RDF_DATATYPE: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype").unwrap();
static ref RDF_DESCRIPTION: NamedNode =
NamedNode::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#Description").unwrap();
static ref RDF_ID: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#ID").unwrap();
static ref RDF_LI: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#li").unwrap();
static ref RDF_NODE_ID: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID").unwrap();
static ref RDF_PARSE_TYPE: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#parseType").unwrap();
static ref RDF_RDF: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF").unwrap();
static ref RDF_RESOURCE: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#resource").unwrap();
}
struct RdfXmlIterator<R: BufRead> {
reader: Reader<R>,
namespace_buffer: Vec<u8>,
state: Vec<RdfXmlState>,
object: Option<NodeOrText>,
bnodes_map: BTreeMap<Vec<u8>, BlankNode>,
triples_cache: Vec<Triple>,
li_counter: Vec<usize>,
}
#[derive(Clone)]
enum NodeOrText {
Node(NamedOrBlankNode),
Text(String),
}
enum RdfXmlState {
Doc {
base_uri: Option<Url>,
},
RDF {
base_uri: Option<Url>,
language: String,
},
NodeElt {
base_uri: Option<Url>,
language: String,
subject: NamedOrBlankNode,
},
PropertyElt {
//Resource, Literal or Empty property element
uri: Url,
base_uri: Option<Url>,
language: String,
subject: NamedOrBlankNode,
object: Option<NamedOrBlankNode>,
id_attr: Option<NamedNode>,
datatype_attr: Option<NamedNode>,
},
ParseTypeCollectionPropertyElt {
uri: NamedNode,
base_uri: Option<Url>,
language: String,
subject: NamedOrBlankNode,
id_attr: Option<NamedNode>,
},
//TODO: ParseTypeOtherProperty and ParseTypeLiteralProperty
}
impl RdfXmlState {
fn base_uri(&self) -> &Option<Url> {
match self {
RdfXmlState::Doc { base_uri, .. } => base_uri,
RdfXmlState::RDF { base_uri, .. } => base_uri,
RdfXmlState::NodeElt { base_uri, .. } => base_uri,
RdfXmlState::PropertyElt { base_uri, .. } => base_uri,
RdfXmlState::ParseTypeCollectionPropertyElt { base_uri, .. } => base_uri,
}
}
fn language(&self) -> &str {
match self {
RdfXmlState::Doc { .. } => "",
RdfXmlState::RDF { language, .. } => language,
RdfXmlState::NodeElt { language, .. } => language,
RdfXmlState::PropertyElt { language, .. } => language,
RdfXmlState::ParseTypeCollectionPropertyElt { language, .. } => language,
}
}
}
impl<R: BufRead> Iterator for RdfXmlIterator<R> {
type Item = Result<Triple>;
fn next(&mut self) -> Option<Result<Triple>> {
let mut buffer = Vec::default();
loop {
//Finish the stack
if let Some(triple) = self.triples_cache.pop() {
return Some(Ok(triple));
}
//Read more XML
let (namespace, event) = match self
.reader
.read_namespaced_event(&mut buffer, &mut self.namespace_buffer)
{
Ok((Some(namespace), event)) => {
(Some(self.reader.decode(namespace).to_string()), event)
}
Ok((None, event)) => (None, event),
Err(error) => return Some(Err(error.into())),
};
match event {
Event::Start(event) => {
let uri = match namespace {
Some(namespace) => {
Url::parse(&(namespace + &self.reader.decode(&event.local_name())))
}
None => Url::parse(&self.reader.decode(&event.name())),
};
let uri = match uri {
Ok(uri) => uri,
Err(error) => return Some(Err(error.into())),
};
if let Err(error) = self.parse_start_event(&event, uri) {
return Some(Err(error));
}
}
Event::Text(event) => {
if let Err(error) = self.parse_text_event(&event) {
return Some(Err(error));
}
}
Event::End(event) => {
if let Err(error) = self.parse_end_event(&event) {
return Some(Err(error));
}
}
Event::Eof => return None,
_ => (),
}
}
}
}
/*
fn read_next_event<'a, R: BufRead>(reader: &'a mut Reader<R>, namespace_buffer: &mut Vec<u8>) -> Result<(Option<String>, Event<'a>)> {
let mut buffer = Vec::default();
Ok(match reader.read_namespaced_event(&mut buffer, namespace_buffer)? {
(Some(namespace), event) => (Some(reader.decode(namespace).to_string()), event),
(None, event) => (None, event)
})
}
*/
impl<R: BufRead> RdfXmlIterator<R> {
fn parse_start_event(&mut self, event: &BytesStart, uri: Url) -> Result<()> {
#[derive(PartialEq, Eq)]
enum RdfXmlParseType {
Default,
Collection,
Literal,
Resource,
Other,
}
#[derive(PartialEq, Eq)]
enum RdfXmlNextProduction {
RDF,
NodeElt,
PropertyElt { subject: NamedOrBlankNode },
}
//We read attributes
let mut language = String::default();
let mut base_uri = None;
if let Some(current_state) = self.state.last() {
language = current_state.language().to_string();
base_uri = current_state.base_uri().clone();
}
let mut id_attr = None;
let mut node_id_attr = None;
let mut about_attr = None;
let mut property_attrs = Vec::default();
let mut resource_attr = None;
let mut datatype_attr = None;
let mut parse_type = RdfXmlParseType::Default;
let mut type_attr = None;
for attribute in event.attributes() {
let attribute = attribute?;
match attribute.key {
b"xml:lang" => {
language = attribute.unescape_and_decode_value(&self.reader)?;
}
b"xml:base" => {
base_uri = Some(self.resolve_uri(&attribute.unescaped_value()?, &None)?)
}
key if !key.starts_with(b"xml") => {
let attribute_url = self.resolve_q_name(key)?;
if attribute_url == *RDF_ID {
let mut id = Vec::with_capacity(attribute.value.len() + 1);
id.push(b'#');
id.extend_from_slice(attribute.unescaped_value()?.as_ref());
id_attr = Some(id);
} else if attribute_url == *RDF_NODE_ID {
node_id_attr = Some(
self.bnodes_map
.entry(attribute.unescaped_value()?.to_vec())
.or_insert_with(BlankNode::default)
.clone(),
);
} else if attribute_url == *RDF_ABOUT {
about_attr = Some(attribute.unescaped_value()?.to_vec());
} else if attribute_url == *RDF_RESOURCE {
resource_attr = Some(attribute.unescaped_value()?.to_vec());
} else if attribute_url == *RDF_DATATYPE {
datatype_attr = Some(attribute.unescaped_value()?.to_vec());
} else if attribute_url == *RDF_PARSE_TYPE {
parse_type = match attribute.value.as_ref() {
b"Collection" => RdfXmlParseType::Collection,
b"Literal" => RdfXmlParseType::Literal,
b"Resource" => RdfXmlParseType::Resource,
_ => RdfXmlParseType::Other,
};
} else if attribute_url == **rdf::TYPE {
type_attr = Some(attribute.unescaped_value()?.to_vec());
} else {
property_attrs.push((
NamedNode::from(attribute_url),
attribute.unescape_and_decode_value(&self.reader)?,
));
}
}
_ => (), //We do not fail for unknown tags in the XML namespace
}
}
//Parsing with the base URI
let id_attr = match id_attr {
Some(uri) => Some(NamedNode::from(self.resolve_uri(&uri, &base_uri)?)),
None => None,
};
let about_attr = match about_attr {
Some(uri) => Some(NamedNode::from(self.resolve_uri(&uri, &base_uri)?)),
None => None,
};
let resource_attr = match resource_attr {
Some(uri) => Some(NamedNode::from(self.resolve_uri(&uri, &base_uri)?)),
None => None,
};
let datatype_attr = match datatype_attr {
Some(uri) => Some(NamedNode::from(self.resolve_uri(&uri, &base_uri)?)),
None => None,
};
let type_attr = match type_attr {
Some(uri) => Some(NamedNode::from(self.resolve_uri(&uri, &base_uri)?)),
None => None,
};
let next_production = match self.state.last() {
Some(RdfXmlState::Doc { .. }) => RdfXmlNextProduction::RDF,
Some(RdfXmlState::RDF { .. }) => RdfXmlNextProduction::NodeElt,
Some(RdfXmlState::NodeElt { subject, .. }) => RdfXmlNextProduction::PropertyElt {
subject: subject.clone(),
},
Some(RdfXmlState::PropertyElt { .. }) => RdfXmlNextProduction::NodeElt {},
Some(RdfXmlState::ParseTypeCollectionPropertyElt { .. }) => {
RdfXmlNextProduction::NodeElt {}
}
None => return Err("No state in the stack: the XML is not balanced".into()),
};
let new_state = match next_production {
RdfXmlNextProduction::RDF => {
if uri == *RDF_RDF {
RdfXmlState::RDF { base_uri, language }
} else {
self.build_node_elt(
NamedNode::from(uri),
base_uri,
language,
id_attr,
node_id_attr,
about_attr,
type_attr,
property_attrs,
)
}
}
RdfXmlNextProduction::NodeElt => self.build_node_elt(
NamedNode::from(uri),
base_uri,
language,
id_attr,
node_id_attr,
about_attr,
type_attr,
property_attrs,
),
RdfXmlNextProduction::PropertyElt { subject } => match parse_type {
RdfXmlParseType::Default => {
if resource_attr.is_some()
|| node_id_attr.is_some()
|| !property_attrs.is_empty()
{
let object: NamedOrBlankNode = match resource_attr {
Some(resource_attr) => resource_attr.into(),
None => match node_id_attr {
Some(node_id_attr) => node_id_attr.into(),
None => BlankNode::default().into(),
},
};
self.emit_property_attrs(&object, property_attrs, &language);
if let Some(type_attr) = type_attr {
self.triples_cache.push(Triple::new(
object.clone(),
rdf::TYPE.clone(),
type_attr,
));
}
RdfXmlState::PropertyElt {
uri,
base_uri,
language,
subject,
object: Some(object),
id_attr,
datatype_attr,
}
} else {
RdfXmlState::PropertyElt {
uri,
base_uri,
language,
subject,
object: None,
id_attr,
datatype_attr,
}
}
}
RdfXmlParseType::Literal => {
return Err("rdf:parseType=\"Literal\" is not supported yet".into());
}
RdfXmlParseType::Resource => self.build_parse_type_resource_property_elt(
NamedNode::from(uri),
base_uri,
language,
subject,
id_attr,
),
RdfXmlParseType::Collection => {
return Err("rdf:parseType=\"Collection\" is not supported yet".into());
}
RdfXmlParseType::Other => {
return Err("Arbitrary rdf:parseType are not supported yet".into());
}
},
};
self.state.push(new_state);
Ok(())
}
fn parse_end_event(&mut self, _event: &BytesEnd) -> Result<()> {
if let Some(current_state) = self.state.pop() {
self.end_state(current_state)?;
}
Ok(())
}
fn parse_text_event(&mut self, event: &BytesText) -> Result<()> {
if self.object.is_some() {
return Err(format!(
"There is already an object set at byte {}",
self.reader.buffer_position()
).into());
}
self.object = Some(NodeOrText::Text(event.unescape_and_decode(&self.reader)?));
Ok(())
}
fn resolve_q_name(&self, qname: &[u8]) -> Result<Url> {
let (namespace, local_name) = self.reader.resolve_namespace(qname, &self.namespace_buffer);
Ok(Url::parse(
&(match namespace {
Some(namespace) => self.reader.decode(namespace) + self.reader.decode(local_name),
None => self.reader.decode(local_name),
}),
)?)
}
fn resolve_uri(&self, uri: &[u8], base: &Option<Url>) -> Result<Url> {
Ok(Url::options()
.base_url(base.as_ref())
.parse(&self.reader.decode(uri))?)
}
fn build_node_elt(
&mut self,
uri: NamedNode,
base_uri: Option<Url>,
language: String,
id_attr: Option<NamedNode>,
node_id_attr: Option<BlankNode>,
about_attr: Option<NamedNode>,
type_attr: Option<NamedNode>,
property_attrs: Vec<(NamedNode, String)>,
) -> RdfXmlState {
self.object = None; //We reset object return: we are in a list of elements
let subject = match id_attr {
Some(id_attr) => id_attr.into(),
None => match about_attr {
Some(about_attr) => about_attr.into(),
None => node_id_attr.unwrap_or_else(BlankNode::default).into(),
},
};
self.emit_property_attrs(&subject, property_attrs, &language);
if let Some(type_attr) = type_attr {
self.triples_cache
.push(Triple::new(subject.clone(), rdf::TYPE.clone(), type_attr));
}
if uri != *RDF_DESCRIPTION {
self.triples_cache
.push(Triple::new(subject.clone(), rdf::TYPE.clone(), uri));
}
self.li_counter.push(0);
RdfXmlState::NodeElt {
base_uri,
language,
subject: subject.clone(),
}
}
fn build_parse_type_resource_property_elt(
&mut self,
uri: NamedNode,
base_uri: Option<Url>,
language: String,
subject: NamedOrBlankNode,
id_attr: Option<NamedNode>,
) -> RdfXmlState {
let object = BlankNode::default();
let triple = Triple::new(subject, uri, object.clone());
if let Some(id_attr) = id_attr {
self.reify(&triple, id_attr.into());
}
self.triples_cache.push(triple);
self.li_counter.push(0);
RdfXmlState::NodeElt {
base_uri,
language,
subject: object.into(),
}
}
fn end_state(&mut self, state: RdfXmlState) -> Result<()> {
match state {
RdfXmlState::PropertyElt {
uri,
language,
subject,
id_attr,
datatype_attr,
object,
..
} => {
let predicate = if uri == *RDF_LI {
if let Some(li_counter) = self.li_counter.last_mut() {
*li_counter += 1;
NamedNode::from_str(&format!(
"http://www.w3.org/1999/02/22-rdf-syntax-ns#_{}",
li_counter
))?
} else {
NamedNode::from(uri)
}
} else {
NamedNode::from(uri)
};
let object: Term = match object {
Some(object) => object.into(),
None => match self.object.clone() {
Some(NodeOrText::Node(node)) => node.into(),
Some(NodeOrText::Text(text)) => {
self.new_literal(text, language, datatype_attr).into()
}
None => self
.new_literal(String::default(), language, datatype_attr)
.into(),
},
};
self.object = None; //We have used self.object
let triple = Triple::new(subject, predicate, object);
if let Some(id_attr) = id_attr {
self.reify(&triple, id_attr.into());
}
self.triples_cache.push(triple);
}
RdfXmlState::NodeElt { subject, .. } => {
self.object = Some(NodeOrText::Node(subject));
self.li_counter.pop();
}
_ => (),
}
Ok(())
}
fn new_literal(&self, text: String, language: String, datatype: Option<NamedNode>) -> Literal {
if let Some(datatype) = datatype {
Literal::new_typed_literal(text, datatype)
} else if language.is_empty() {
Literal::new_simple_literal(text)
} else {
Literal::new_language_tagged_literal(text, language)
}
}
fn reify(&mut self, triple: &Triple, statement_id: NamedOrBlankNode) {
self.triples_cache.push(Triple::new(
statement_id.clone(),
rdf::OBJECT.clone(),
triple.object().clone(),
));
self.triples_cache.push(Triple::new(
statement_id.clone(),
rdf::PREDICATE.clone(),
triple.predicate().clone(),
));
self.triples_cache.push(Triple::new(
statement_id.clone(),
rdf::SUBJECT.clone(),
triple.subject().clone(),
));
self.triples_cache.push(Triple::new(
statement_id,
rdf::TYPE.clone(),
rdf::STATEMENT.clone(),
));
}
fn emit_property_attrs(
&mut self,
subject: &NamedOrBlankNode,
literal_attributes: Vec<(NamedNode, String)>,
language: &str,
) {
for (literal_predicate, literal_value) in literal_attributes {
self.triples_cache.push(Triple::new(
subject.clone(),
literal_predicate,
if language.is_empty() {
Literal::new_simple_literal(literal_value)
} else {
Literal::new_language_tagged_literal(literal_value, language)
},
));
}
}
}

@ -14,12 +14,14 @@ use rudf::model::vocab::rdfs;
use rudf::model::*;
use rudf::rio::ntriples::read_ntriples;
use rudf::rio::turtle::read_turtle;
use rudf::rio::xml::read_rdf_xml;
use rudf::sparql::algebra::Query;
use rudf::sparql::parser::read_sparql_query;
use rudf::store::isomorphism::GraphIsomorphism;
use rudf::store::MemoryGraph;
use std::error::Error;
use std::fmt;
use std::io::BufReader;
use url::Url;
#[test]
@ -130,6 +132,56 @@ fn ntriples_w3c_testsuite() {
}
}
#[test]
fn rdf_xml_w3c_testsuite() -> Result<()> {
let manifest_url = Url::parse("http://www.w3.org/2013/RDFXMLTests/manifest.ttl")?;
let client = RDFClient::default();
//TODO: make blacklist pass
let test_blacklist = vec![
NamedNode::new(manifest_url.join("#xml-canon-test001")?),
NamedNode::new(manifest_url.join("#rdfms-seq-representation-test001")?),
NamedNode::new(manifest_url.join("#rdf-containers-syntax-vs-schema-test004")?),
];
for test_result in TestManifest::new(&client, manifest_url) {
let test = test_result?;
if test_blacklist.contains(&test.id) {
continue;
}
if test.kind == "TestXMLNegativeSyntax" {
/*TODO assert!(
client.load_rdf_xml(test.action.clone()).is_err(),
"Failure on {}",
test
);*/
} else if test.kind == "TestXMLEval" {
match client.load_rdf_xml(test.action.clone()) {
Ok(action_graph) => match client.load_ntriples(test.result.clone().unwrap()) {
Ok(result_graph) => assert!(
action_graph.is_isomorphic(&result_graph)?,
"Failure on {}. Expected file:\n{}\nParsed file:\n{}\n",
test,
result_graph,
action_graph
),
Err(error) => assert!(
false,
"Failure to parse the RDF XML result file {} of {} with error: {}",
test.result.clone().unwrap(),
test,
error
),
},
Err(error) => assert!(false, "Failure to parse {} with error: {}", test, error),
}
} else {
assert!(false, "Not supported test: {}", test);
}
}
Ok(())
}
#[test]
fn sparql_w3c_syntax_testsuite() {
let manifest_url = Url::parse(
@ -186,6 +238,10 @@ impl RDFClient {
read_ntriples(self.get(&url)?).collect()
}
pub fn load_rdf_xml(&self, url: Url) -> Result<MemoryGraph> {
read_rdf_xml(BufReader::new(self.get(&url)?), Some(url)).collect()
}
pub fn load_sparql_query(&self, url: Url) -> Result<Query> {
read_sparql_query(self.get(&url)?, Some(url))
}

Loading…
Cancel
Save