Uses Rio RDF XML parser

pull/10/head
Tpt 5 years ago
parent 74dadf5f21
commit 4dee8a9aa2
  1. 5
      lib/Cargo.toml
  2. 608
      lib/src/rio/xml.rs
  3. 15
      lib/tests/rdf_test_cases.rs
  4. 2
      lib/tests/sparql_test_cases.rs

@ -29,8 +29,9 @@ rust_decimal = "1"
chrono = "0.4" chrono = "0.4"
failure = "0.1" failure = "0.1"
regex = "1" regex = "1"
rio_api = "0.1" rio_api = "0.2"
rio_turtle = "0.1" rio_turtle = "0.2"
rio_xml = "0.2"
[build-dependencies] [build-dependencies]
peg = "0.5" peg = "0.5"

@ -1,604 +1,22 @@
//! Implementation of [RDF XML](https://www.w3.org/TR/rdf-syntax-grammar/) syntax //! Implementation of [RDF XML](https://www.w3.org/TR/rdf-syntax-grammar/) syntax
use crate::model::vocab::rdf; use crate::model::Triple;
use crate::model::*; use crate::rio::rio::convert_triple;
use crate::Result; use crate::Result;
use failure::format_err; use rio_api::parser::TripleParser;
use lazy_static::lazy_static; use rio_xml::RdfXmlParser;
use quick_xml::events::BytesEnd;
use quick_xml::events::BytesStart;
use quick_xml::events::BytesText;
use quick_xml::events::Event;
use quick_xml::Reader;
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::io::BufRead; use std::io::BufRead;
use std::str::FromStr;
use url::Url; use url::Url;
/// Reads a [RDF XML](https://www.w3.org/TR/rdf-syntax-grammar/) file from a Rust `Read` and returns an iterator on the read `Triple`s /// Reads a [RDF XML](https://www.w3.org/TR/rdf-syntax-grammar/) file from a Rust `BufRead` and returns an iterator of the read `Triple`s
/// pub fn read_rdf_xml<R: BufRead>(
/// Warning: The `rdf:parseType="Literal"` and `rdf:parseType="Collection"` options are not supported yet reader: R,
pub fn read_rdf_xml( base_url: Option<Url>,
source: impl BufRead, ) -> Result<impl Iterator<Item = Result<Triple>>> {
base_uri: impl Into<Option<Url>>, let mut bnode_map = BTreeMap::default();
) -> impl Iterator<Item = Result<Triple>> { Ok(
let mut reader = Reader::from_reader(source); RdfXmlParser::new(reader, base_url.as_ref().map_or("", |url| url.as_str()))?
reader.expand_empty_elements(true); .into_iter(move |t| convert_triple(t, &mut bnode_map)),
reader.trim_text(true);
RdfXmlIterator {
reader,
namespace_buffer: Vec::default(),
state: vec![RdfXmlState::Doc {
base_uri: base_uri.into(),
}],
object: None,
bnodes_map: BTreeMap::default(),
triples_cache: Vec::default(),
li_counter: Vec::default(),
}
}
// Lazily initialised constants for the terms of the RDF syntax namespace
// (http://www.w3.org/1999/02/22-rdf-syntax-ns#) that this parser compares
// resolved element and attribute names against.
// Every value is built from a fixed string literal, so the `unwrap()` calls
// can only panic if one of those literals is not parseable.
lazy_static! {
// rdf:about
static ref RDF_ABOUT: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#about").unwrap();
// rdf:datatype
static ref RDF_DATATYPE: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype").unwrap();
// rdf:Description (kept as a `NamedNode` because it is compared with the node element type)
static ref RDF_DESCRIPTION: NamedNode =
NamedNode::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#Description").unwrap();
// rdf:ID
static ref RDF_ID: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#ID").unwrap();
// rdf:li
static ref RDF_LI: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#li").unwrap();
// rdf:nodeID
static ref RDF_NODE_ID: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#nodeID").unwrap();
// rdf:parseType
static ref RDF_PARSE_TYPE: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#parseType").unwrap();
// rdf:RDF
static ref RDF_RDF: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#RDF").unwrap();
// rdf:resource
static ref RDF_RESOURCE: Url =
Url::from_str("http://www.w3.org/1999/02/22-rdf-syntax-ns#resource").unwrap();
}
/// Streaming RDF/XML parser exposed as an iterator of `Result<Triple>`.
///
/// XML events are read one at a time from `reader`; each event may queue
/// triples in `triples_cache`, which `next()` drains before reading more input.
struct RdfXmlIterator<R: BufRead> {
/// Underlying XML event reader.
reader: Reader<R>,
/// Scratch buffer handed to the reader for namespace resolution.
namespace_buffer: Vec<u8>,
/// Stack of parser states: one entry is pushed per start tag and popped per end tag.
state: Vec<RdfXmlState>,
/// Last text content or closed node element, consumed when the enclosing property element ends.
object: Option<NodeOrText>,
/// Blank nodes already allocated for `rdf:nodeID` values, keyed by the attribute value.
bnodes_map: BTreeMap<Vec<u8>, BlankNode>,
/// Triples waiting to be returned; `next()` pops from the end, so the last pushed triple is returned first.
triples_cache: Vec<Triple>,
/// Stack of `rdf:li` counters: one is pushed when a node element state is built and popped when it ends.
li_counter: Vec<usize>,
}
/// Candidate object for the property element currently being parsed:
/// either the subject of a nested node element or raw text content.
#[derive(Clone, Debug)]
enum NodeOrText {
/// Subject of a node element that has just been closed.
Node(NamedOrBlankNode),
/// Unescaped and decoded text read from a text event.
Text(String),
}
/// One entry of the parser state stack, describing the element that is currently open.
///
/// Every variant carries the base URI in scope; every variant except `Doc`
/// also carries the language tag in scope.
enum RdfXmlState {
/// Initial state, before any element has been opened.
Doc {
base_uri: Option<Url>,
},
/// Inside the `rdf:RDF` root element.
RDF {
base_uri: Option<Url>,
language: Option<LanguageTag>,
},
/// Inside a node element describing `subject`.
NodeElt {
base_uri: Option<Url>,
language: Option<LanguageTag>,
subject: NamedOrBlankNode,
},
/// Inside a property element whose triple is emitted when the element ends.
PropertyElt {
//Resource, Literal or Empty property element
/// Resolved element name, used as predicate (or replaced by `rdf:_n` for `rdf:li`).
uri: Url,
base_uri: Option<Url>,
language: Option<LanguageTag>,
/// Subject of the enclosing node element.
subject: NamedOrBlankNode,
/// Object already known from the start tag attributes, if any.
object: Option<NamedOrBlankNode>,
/// Resolved `rdf:ID`; when present the emitted triple is also reified under this name.
id_attr: Option<NamedNode>,
/// Resolved `rdf:datatype` applied to a literal object.
datatype_attr: Option<NamedNode>,
},
/// Property element with `rdf:parseType="Collection"`.
/// NOTE(review): no visible code constructs this variant; the start tag handler rejects that parse type.
ParseTypeCollectionPropertyElt {
uri: NamedNode,
base_uri: Option<Url>,
language: Option<LanguageTag>,
subject: NamedOrBlankNode,
id_attr: Option<NamedNode>,
},
//TODO: ParseTypeOtherProperty and ParseTypeLiteralProperty
}
impl RdfXmlState {
    /// Base URI in scope for this state; every variant stores one.
    fn base_uri(&self) -> &Option<Url> {
        match self {
            RdfXmlState::Doc { base_uri, .. }
            | RdfXmlState::RDF { base_uri, .. }
            | RdfXmlState::NodeElt { base_uri, .. }
            | RdfXmlState::PropertyElt { base_uri, .. }
            | RdfXmlState::ParseTypeCollectionPropertyElt { base_uri, .. } => base_uri,
        }
    }

    /// Language tag in scope for this state; the document state never has one.
    fn language(&self) -> Option<&LanguageTag> {
        match self {
            RdfXmlState::Doc { .. } => None,
            RdfXmlState::RDF { language, .. }
            | RdfXmlState::NodeElt { language, .. }
            | RdfXmlState::PropertyElt { language, .. }
            | RdfXmlState::ParseTypeCollectionPropertyElt { language, .. } => language.as_ref(),
        }
    }
}
impl<R: BufRead> Iterator for RdfXmlIterator<R> {
    type Item = Result<Triple>;

    /// Returns the next queued triple, reading further XML events until one
    /// is produced, an error occurs, or the end of the input is reached.
    fn next(&mut self) -> Option<Result<Triple>> {
        let mut buffer = Vec::new();
        loop {
            // Drain triples produced by earlier events before touching the reader.
            if let Some(pending) = self.triples_cache.pop() {
                return Some(Ok(pending));
            }

            // Read one more XML event and dispatch it to the matching handler.
            let handled = match self
                .reader
                .read_namespaced_event(&mut buffer, &mut self.namespace_buffer)
            {
                Ok((_, Event::Start(start))) => self.parse_start_event(&start),
                Ok((_, Event::Text(text))) => self.parse_text_event(&text),
                Ok((_, Event::End(end))) => self.parse_end_event(&end),
                Ok((_, Event::Eof)) => return None,
                // Every other event kind is ignored.
                Ok(_) => Ok(()),
                Err(error) => Err(error.into()),
            };
            if let Err(error) = handled {
                return Some(Err(error));
            }
        }
    }
}
impl<R: BufRead> RdfXmlIterator<R> {
/// Handles an element start tag.
///
/// Resolves the element name, reads its attributes, uses the state on top of
/// the stack to decide which production the element belongs to, and pushes
/// the resulting state. Triples derived from the attributes are queued in
/// `triples_cache`.
///
/// Returns an error if a name, URI, attribute or language tag cannot be
/// parsed, if the state stack is empty, or if the element uses an
/// `rdf:parseType` other than `Resource`.
fn parse_start_event(&mut self, event: &BytesStart<'_>) -> Result<()> {
// Value of the rdf:parseType attribute, `Default` when the attribute is absent.
#[derive(PartialEq, Eq)]
enum RdfXmlParseType {
Default,
Collection,
Literal,
Resource,
Other,
}
// Production the new element must match, derived from the enclosing state.
#[derive(PartialEq, Eq)]
enum RdfXmlNextProduction {
RDF,
NodeElt,
PropertyElt { subject: NamedOrBlankNode },
}
let uri = self.resolve_tag_name(event.name())?;
//We read attributes
// Language and base URI are inherited from the enclosing state unless overridden below.
let mut language = None;
let mut base_uri = None;
if let Some(current_state) = self.state.last() {
language = current_state.language().cloned();
base_uri = current_state.base_uri().clone();
}
let mut id_attr = None;
let mut node_id_attr = None;
let mut about_attr = None;
let mut property_attrs = Vec::default();
let mut resource_attr = None;
let mut datatype_attr = None;
let mut parse_type = RdfXmlParseType::Default;
let mut type_attr = None;
for attribute in event.attributes() {
let attribute = attribute?;
match attribute.key {
b"xml:lang" => {
language = Some(LanguageTag::parse(
&attribute.unescape_and_decode_value(&self.reader)?,
)?);
}
b"xml:base" => {
// The new base is parsed with no base of its own (`&None`), not against the inherited one.
base_uri = Some(self.resolve_uri(&attribute.unescaped_value()?, &None)?)
}
// Any other attribute whose qualified name starts with "xml" falls through to the last arm.
key if !key.starts_with(b"xml") => {
let attribute_url = self.resolve_attribute_name(key)?;
if attribute_url == *RDF_ID {
// The identifier is stored as "#<value>" and resolved against the base URI after the loop.
let mut id = Vec::with_capacity(attribute.value.len() + 1);
id.push(b'#');
id.extend_from_slice(attribute.unescaped_value()?.as_ref());
id_attr = Some(id);
} else if attribute_url == *RDF_NODE_ID {
// The same rdf:nodeID value always maps to the same blank node.
node_id_attr = Some(
self.bnodes_map
.entry(attribute.unescaped_value()?.to_vec())
.or_insert_with(BlankNode::default)
.clone(),
);
} else if attribute_url == *RDF_ABOUT {
about_attr = Some(attribute.unescaped_value()?.to_vec());
} else if attribute_url == *RDF_RESOURCE {
resource_attr = Some(attribute.unescaped_value()?.to_vec());
} else if attribute_url == *RDF_DATATYPE {
datatype_attr = Some(attribute.unescaped_value()?.to_vec());
} else if attribute_url == *RDF_PARSE_TYPE {
// Compared against the raw attribute bytes (not unescaped).
parse_type = match attribute.value.as_ref() {
b"Collection" => RdfXmlParseType::Collection,
b"Literal" => RdfXmlParseType::Literal,
b"Resource" => RdfXmlParseType::Resource,
_ => RdfXmlParseType::Other,
};
} else if attribute_url == *rdf::TYPE.as_url() {
type_attr = Some(attribute.unescaped_value()?.to_vec());
} else {
// Every remaining attribute is a property attribute with a literal value.
property_attrs.push((
NamedNode::from(attribute_url),
attribute.unescape_and_decode_value(&self.reader)?,
));
}
}
_ => (), //We do not fail for unknown tags in the XML namespace
}
}
//Parsing with the base URI
// Done after the loop so that an xml:base on this same element applies whatever the attribute order.
let id_attr = match id_attr {
Some(uri) => Some(NamedNode::from(self.resolve_uri(&uri, &base_uri)?)),
None => None,
};
let about_attr = match about_attr {
Some(uri) => Some(NamedNode::from(self.resolve_uri(&uri, &base_uri)?)),
None => None,
};
let resource_attr = match resource_attr {
Some(uri) => Some(NamedNode::from(self.resolve_uri(&uri, &base_uri)?)),
None => None,
};
let datatype_attr = match datatype_attr {
Some(uri) => Some(NamedNode::from(self.resolve_uri(&uri, &base_uri)?)),
None => None,
};
let type_attr = match type_attr {
Some(uri) => Some(NamedNode::from(self.resolve_uri(&uri, &base_uri)?)),
None => None,
};
// The enclosing state determines what kind of element is allowed here.
let next_production = match self.state.last() {
Some(RdfXmlState::Doc { .. }) => RdfXmlNextProduction::RDF,
Some(RdfXmlState::RDF { .. }) => RdfXmlNextProduction::NodeElt,
Some(RdfXmlState::NodeElt { subject, .. }) => RdfXmlNextProduction::PropertyElt {
subject: subject.clone(),
},
Some(RdfXmlState::PropertyElt { .. }) => RdfXmlNextProduction::NodeElt {},
Some(RdfXmlState::ParseTypeCollectionPropertyElt { .. }) => {
RdfXmlNextProduction::NodeElt {}
}
None => {
return Err(format_err!(
"No state in the stack: the XML is not balanced"
));
}
};
let new_state = match next_production {
RdfXmlNextProduction::RDF => {
if uri == *RDF_RDF {
RdfXmlState::RDF { base_uri, language }
} else {
// A root element other than rdf:RDF is handled as a node element.
self.build_node_elt(
NamedNode::from(uri),
base_uri,
language,
id_attr,
node_id_attr,
about_attr,
type_attr,
property_attrs,
) )
} }
}
RdfXmlNextProduction::NodeElt => self.build_node_elt(
NamedNode::from(uri),
base_uri,
language,
id_attr,
node_id_attr,
about_attr,
type_attr,
property_attrs,
),
RdfXmlNextProduction::PropertyElt { subject } => match parse_type {
RdfXmlParseType::Default => {
if resource_attr.is_some()
|| node_id_attr.is_some()
|| !property_attrs.is_empty()
{
// The object is fixed by the attributes: rdf:resource first, then rdf:nodeID, else a fresh blank node.
let object: NamedOrBlankNode = match resource_attr {
Some(resource_attr) => resource_attr.into(),
None => match node_id_attr {
Some(node_id_attr) => node_id_attr.into(),
None => BlankNode::default().into(),
},
};
self.emit_property_attrs(&object, property_attrs, &language);
if let Some(type_attr) = type_attr {
self.triples_cache.push(Triple::new(
object.clone(),
rdf::TYPE.clone(),
type_attr,
));
}
RdfXmlState::PropertyElt {
uri,
base_uri,
language,
subject,
object: Some(object),
id_attr,
datatype_attr,
}
} else {
// The object is not known yet; it is taken from the element content when the element ends.
RdfXmlState::PropertyElt {
uri,
base_uri,
language,
subject,
object: None,
id_attr,
datatype_attr,
}
}
}
RdfXmlParseType::Literal => {
return Err(format_err!(
"rdf:parseType=\"Literal\" is not supported yet"
));
}
RdfXmlParseType::Resource => self.build_parse_type_resource_property_elt(
NamedNode::from(uri),
base_uri,
language,
subject,
id_attr,
),
RdfXmlParseType::Collection => {
return Err(format_err!(
"rdf:parseType=\"Collection\" is not supported yet"
));
}
RdfXmlParseType::Other => {
return Err(format_err!("Arbitrary rdf:parseType are not supported yet"));
}
},
};
self.state.push(new_state);
Ok(())
}
/// Handles an element end tag by popping the current state and finalising it.
///
/// An end tag arriving with an empty state stack is silently ignored.
fn parse_end_event(&mut self, _event: &BytesEnd<'_>) -> Result<()> {
    match self.state.pop() {
        Some(closed) => self.end_state(closed),
        None => Ok(()),
    }
}
/// Records the unescaped, decoded text content as the pending object,
/// replacing whatever was stored before.
fn parse_text_event(&mut self, event: &BytesText<'_>) -> Result<()> {
    let text = event.unescape_and_decode(&self.reader)?;
    self.object = Some(NodeOrText::Text(text));
    Ok(())
}
/// Expands an element's qualified name into a full URL using the
/// namespace bindings tracked by the reader.
fn resolve_tag_name(&self, qname: &[u8]) -> Result<Url> {
    let (ns, local) = self.reader.event_namespace(qname, &self.namespace_buffer);
    self.resolve_ns_name(ns, local)
}
/// Expands an attribute's qualified name into a full URL using the
/// namespace bindings tracked by the reader.
fn resolve_attribute_name(&self, qname: &[u8]) -> Result<Url> {
    let (ns, local) = self
        .reader
        .attribute_namespace(qname, &self.namespace_buffer);
    self.resolve_ns_name(ns, local)
}
/// Builds a URL by concatenating the decoded namespace (when there is one)
/// with the decoded local name, then parsing the result.
fn resolve_ns_name(&self, namespace: Option<&[u8]>, local_name: &[u8]) -> Result<Url> {
    let local = self.reader.decode(local_name);
    let full_name = match namespace {
        Some(ns) => self.reader.decode(ns) + local,
        None => local,
    };
    Ok(Url::parse(&full_name)?)
}
/// Decodes `uri` and parses it as a URL, resolving it against `base` when one is given.
fn resolve_uri(&self, uri: &[u8], base: &Option<Url>) -> Result<Url> {
    let reference = self.reader.decode(uri);
    let resolved = Url::options().base_url(base.as_ref()).parse(&reference)?;
    Ok(resolved)
}
fn build_node_elt(
&mut self,
uri: NamedNode,
base_uri: Option<Url>,
language: Option<LanguageTag>,
id_attr: Option<NamedNode>,
node_id_attr: Option<BlankNode>,
about_attr: Option<NamedNode>,
type_attr: Option<NamedNode>,
property_attrs: Vec<(NamedNode, String)>,
) -> RdfXmlState {
self.object = None; //We reset object return: we are in a list of elements
let subject = match id_attr {
Some(id_attr) => id_attr.into(),
None => match about_attr {
Some(about_attr) => about_attr.into(),
None => node_id_attr.unwrap_or_else(BlankNode::default).into(),
},
};
self.emit_property_attrs(&subject, property_attrs, &language);
if let Some(type_attr) = type_attr {
self.triples_cache
.push(Triple::new(subject.clone(), rdf::TYPE.clone(), type_attr));
}
if uri != *RDF_DESCRIPTION {
self.triples_cache
.push(Triple::new(subject.clone(), rdf::TYPE.clone(), uri));
}
self.li_counter.push(0);
RdfXmlState::NodeElt {
base_uri,
language,
subject: subject.clone(),
}
}
/// Opens a property element with `rdf:parseType="Resource"`.
///
/// Queues the triple linking `subject` to a fresh blank node (reified first
/// when `rdf:ID` is present), starts a new `rdf:li` counter, and returns a
/// node element state whose subject is that blank node.
fn build_parse_type_resource_property_elt(
    &mut self,
    uri: NamedNode,
    base_uri: Option<Url>,
    language: Option<LanguageTag>,
    subject: NamedOrBlankNode,
    id_attr: Option<NamedNode>,
) -> RdfXmlState {
    let inner_node = BlankNode::default();
    let link = Triple::new(subject, uri, inner_node.clone());
    if let Some(statement_name) = id_attr {
        self.reify(&link, statement_name.into());
    }
    self.triples_cache.push(link);
    self.li_counter.push(0);

    RdfXmlState::NodeElt {
        base_uri,
        language,
        subject: inner_node.into(),
    }
}
/// Finalises a state that has just been popped from the stack.
///
/// * A node element becomes the pending object and its `rdf:li` counter is dropped.
/// * A property element queues its triple (plus reification when it had an `rdf:ID`).
///   The object comes from the start tag if known, otherwise from the pending
///   node or text, otherwise it is an empty literal.
/// * Other states need no work.
fn end_state(&mut self, state: RdfXmlState) -> Result<()> {
    match state {
        RdfXmlState::NodeElt { subject, .. } => {
            self.object = Some(NodeOrText::Node(subject));
            self.li_counter.pop();
        }
        RdfXmlState::PropertyElt {
            uri,
            language,
            subject,
            object,
            id_attr,
            datatype_attr,
            ..
        } => {
            // rdf:li is rewritten to rdf:_n using the counter of the enclosing node element, if any.
            let numbering = if uri == *RDF_LI {
                self.li_counter.last_mut()
            } else {
                None
            };
            let predicate = match numbering {
                Some(counter) => {
                    *counter += 1;
                    NamedNode::from_str(&format!(
                        "http://www.w3.org/1999/02/22-rdf-syntax-ns#_{}",
                        counter
                    ))?
                }
                None => NamedNode::from(uri),
            };

            // The pending object is consumed here whether or not it ends up being used.
            let pending = self.object.take();
            let value: Term = match (object, pending) {
                (Some(node), _) => node.into(),
                (None, Some(NodeOrText::Node(node))) => node.into(),
                (None, Some(NodeOrText::Text(text))) => {
                    self.new_literal(text, language, datatype_attr).into()
                }
                (None, None) => self
                    .new_literal(String::default(), language, datatype_attr)
                    .into(),
            };

            let statement = Triple::new(subject, predicate, value);
            if let Some(statement_name) = id_attr {
                self.reify(&statement, statement_name.into());
            }
            self.triples_cache.push(statement);
        }
        _ => (),
    }
    Ok(())
}
/// Builds a literal from `text`: typed when a datatype is given, otherwise
/// language-tagged when a language is given, otherwise a simple literal.
fn new_literal(
    &self,
    text: String,
    language: Option<LanguageTag>,
    datatype: Option<NamedNode>,
) -> Literal {
    match (datatype, language) {
        // A datatype takes precedence over any language in scope.
        (Some(datatype), _) => Literal::new_typed_literal(text, datatype),
        (None, Some(language)) => Literal::new_language_tagged_literal(text, language),
        (None, None) => Literal::new_simple_literal(text),
    }
}
/// Queues the four reification triples describing `triple` under `statement_id`.
///
/// They are appended in the order object, predicate, subject, type.
fn reify(&mut self, triple: &Triple, statement_id: NamedOrBlankNode) {
    let reification = vec![
        Triple::new(
            statement_id.clone(),
            rdf::OBJECT.clone(),
            triple.object().clone(),
        ),
        Triple::new(
            statement_id.clone(),
            rdf::PREDICATE.clone(),
            triple.predicate().clone(),
        ),
        Triple::new(
            statement_id.clone(),
            rdf::SUBJECT.clone(),
            triple.subject().clone(),
        ),
        Triple::new(statement_id, rdf::TYPE.clone(), rdf::STATEMENT.clone()),
    ];
    self.triples_cache.extend(reification);
}
/// Queues one triple per property attribute, with `subject` as subject and
/// the attribute value as a literal (language-tagged when a language is in scope).
fn emit_property_attrs(
    &mut self,
    subject: &NamedOrBlankNode,
    literal_attributes: Vec<(NamedNode, String)>,
    language: &Option<LanguageTag>,
) {
    let attribute_triples = literal_attributes.into_iter().map(|(predicate, value)| {
        let literal = match language {
            Some(tag) => Literal::new_language_tagged_literal(value, tag.clone()),
            None => Literal::new_simple_literal(value),
        };
        Triple::new(subject.clone(), predicate, literal)
    });
    self.triples_cache.extend(attribute_triples);
}
}

@ -95,25 +95,16 @@ fn ntriples_w3c_testsuite() {
#[test] #[test]
fn rdf_xml_w3c_testsuite() -> Result<()> { fn rdf_xml_w3c_testsuite() -> Result<()> {
let manifest_url = Url::parse("http://www.w3.org/2013/RDFXMLTests/manifest.ttl")?; let manifest_url = Url::parse("http://www.w3.org/2013/RDFXMLTests/manifest.ttl")?;
//TODO: make blacklist pass
let test_blacklist = vec![
NamedNode::new(manifest_url.join("#xml-canon-test001")?),
NamedNode::new(manifest_url.join("#rdfms-seq-representation-test001")?),
NamedNode::new(manifest_url.join("#rdf-containers-syntax-vs-schema-test004")?),
];
for test_result in TestManifest::new(manifest_url) { for test_result in TestManifest::new(manifest_url) {
let test = test_result?; let test = test_result?;
if test_blacklist.contains(&test.id) {
continue;
}
if test.kind == "TestXMLNegativeSyntax" { if test.kind == "TestXMLNegativeSyntax" {
/*TODO assert!( assert!(
load_rdf_xml(test.action.clone()).is_err(), load_rdf_xml(test.action.clone()).is_err(),
"Failure on {}", "Failure on {}",
test test
);*/ );
} else if test.kind == "TestXMLEval" { } else if test.kind == "TestXMLEval" {
match load_rdf_xml(test.action.clone()) { match load_rdf_xml(test.action.clone()) {
Ok(action_graph) => match load_ntriples(test.result.clone().unwrap()) { Ok(action_graph) => match load_ntriples(test.result.clone().unwrap()) {
@ -150,7 +141,7 @@ fn load_ntriples(url: Url) -> Result<MemoryGraph> {
} }
fn load_rdf_xml(url: Url) -> Result<MemoryGraph> { fn load_rdf_xml(url: Url) -> Result<MemoryGraph> {
read_rdf_xml(read_file(&url)?, Some(url)).collect() read_rdf_xml(read_file(&url)?, Some(url))?.collect()
} }
fn to_relative_path(url: &Url) -> Result<String> { fn to_relative_path(url: &Url) -> Result<String> {

@ -243,7 +243,7 @@ fn load_graph(url: Url) -> Result<MemoryGraph> {
if url.as_str().ends_with(".ttl") { if url.as_str().ends_with(".ttl") {
read_turtle(read_file(&url)?, Some(url))?.collect() read_turtle(read_file(&url)?, Some(url))?.collect()
} else if url.as_str().ends_with(".rdf") { } else if url.as_str().ends_with(".rdf") {
read_rdf_xml(read_file(&url)?, Some(url)).collect() read_rdf_xml(read_file(&url)?, Some(url))?.collect()
} else { } else {
Err(format_err!("Serialization type not found for {}", url)) Err(format_err!("Serialization type not found for {}", url))
} }

Loading…
Cancel
Save