From 4ef13df3172691b978e56ef38b3d70b983cde4b5 Mon Sep 17 00:00:00 2001 From: Tpt Date: Fri, 3 Mar 2023 18:25:47 +0100 Subject: [PATCH] TSV SPARQL results: Properly quote \t and use short Turtle serialization everywhere possible --- lib/sparesults/src/csv.rs | 151 +++++++++++++++++++++++++++++++++----- 1 file changed, 132 insertions(+), 19 deletions(-) diff --git a/lib/sparesults/src/csv.rs b/lib/sparesults/src/csv.rs index 78df0511..aa2cb280 100644 --- a/lib/sparesults/src/csv.rs +++ b/lib/sparesults/src/csv.rs @@ -153,25 +153,28 @@ impl TsvSolutionsWriter { } fn write_tsv_term<'a>(term: impl Into>, sink: &mut impl Write) -> io::Result<()> { - //TODO: full Turtle serialization match term.into() { TermRef::NamedNode(node) => write!(sink, "<{}>", node.as_str()), TermRef::BlankNode(node) => write!(sink, "_:{}", node.as_str()), - TermRef::Literal(literal) => match literal.datatype() { - xsd::BOOLEAN => match literal.value() { - "true" | "1" => sink.write_all(b"true"), - "false" | "0" => sink.write_all(b"false"), - _ => sink.write_all(literal.to_string().as_bytes()), - }, - xsd::INTEGER => { - if literal.value().bytes().all(|c| c.is_ascii_digit()) { - sink.write_all(literal.value().as_bytes()) - } else { - sink.write_all(literal.to_string().as_bytes()) + TermRef::Literal(literal) => { + let value = literal.value(); + if let Some(language) = literal.language() { + write_tsv_quoted_str(value, sink)?; + write!(sink, "@{}", language) + } else { + match literal.datatype() { + xsd::BOOLEAN if is_turtle_boolean(value) => sink.write_all(value.as_bytes()), + xsd::INTEGER if is_turtle_integer(value) => sink.write_all(value.as_bytes()), + xsd::DECIMAL if is_turtle_decimal(value) => sink.write_all(value.as_bytes()), + xsd::DOUBLE if is_turtle_double(value) => sink.write_all(value.as_bytes()), + xsd::STRING => write_tsv_quoted_str(value, sink), + datatype => { + write_tsv_quoted_str(value, sink)?; + write!(sink, "^^<{}>", datatype.as_str()) + } } } - _ => sink.write_all(literal.to_string().as_bytes()), - }, + } #[cfg(feature = "rdf-star")] TermRef::Triple(triple) => { sink.write_all(b"<<")?; @@ -186,6 +189,92 @@ fn write_tsv_term<'a>(term: impl Into>, sink: &mut impl Write) -> io } } +fn write_tsv_quoted_str(string: &str, f: &mut impl Write) -> io::Result<()> { + f.write_all(b"\"")?; + for c in string.bytes() { + match c { + b'\t' => f.write_all(b"\\t"), + b'\n' => f.write_all(b"\\n"), + b'\r' => f.write_all(b"\\r"), + b'"' => f.write_all(b"\\\""), + b'\\' => f.write_all(b"\\\\"), + c => f.write_all(&[c]), + }?; + } + f.write_all(b"\"") +} + +fn is_turtle_boolean(value: &str) -> bool { + matches!(value, "true" | "false") +} + +fn is_turtle_integer(value: &str) -> bool { + // [19] INTEGER ::= [+-]? [0-9]+ + let mut value = value.as_bytes(); + if let Some(v) = value.strip_prefix(b"+") { + value = v; + } else if let Some(v) = value.strip_prefix(b"-") { + value = v; + } + !value.is_empty() && value.iter().all(|c| c.is_ascii_digit()) +} + +fn is_turtle_decimal(value: &str) -> bool { + // [20] DECIMAL ::= [+-]? [0-9]* '.' [0-9]+ + let mut value = value.as_bytes(); + if let Some(v) = value.strip_prefix(b"+") { + value = v; + } else if let Some(v) = value.strip_prefix(b"-") { + value = v; + } + while value.first().map_or(false, |c| c.is_ascii_digit()) { + value = &value[1..]; + } + if let Some(v) = value.strip_prefix(b".") { + value = v; + } else { + return false; + } + !value.is_empty() && value.iter().all(|c| c.is_ascii_digit()) +} + +fn is_turtle_double(value: &str) -> bool { + // [21] DOUBLE ::= [+-]? ([0-9]+ '.' [0-9]* EXPONENT | '.' [0-9]+ EXPONENT | [0-9]+ EXPONENT) + // [154s] EXPONENT ::= [eE] [+-]? [0-9]+ + let mut value = value.as_bytes(); + if let Some(v) = value.strip_prefix(b"+") { + value = v; + } else if let Some(v) = value.strip_prefix(b"-") { + value = v; + } + let mut with_before = false; + while value.first().map_or(false, |c| c.is_ascii_digit()) { + value = &value[1..]; + with_before = true; + } + let mut with_after = false; + if let Some(v) = value.strip_prefix(b".") { + value = v; + while value.first().map_or(false, |c| c.is_ascii_digit()) { + value = &value[1..]; + with_after = true; + } + } + if let Some(v) = value.strip_prefix(b"e") { + value = v; + } else if let Some(v) = value.strip_prefix(b"E") { + value = v; + } else { + return false; + } + if let Some(v) = value.strip_prefix(b"+") { + value = v; + } else if let Some(v) = value.strip_prefix(b"-") { + value = v; + } + (with_before || with_after) && !value.is_empty() && value.iter().all(|c| c.is_ascii_digit()) +} + pub enum TsvQueryResultsReader { Solutions { variables: Vec, @@ -259,6 +348,7 @@ impl TsvSolutionsReader { #[cfg(test)] mod tests { use super::*; + use std::error::Error; use std::io::Cursor; use std::rc::Rc; use std::str; @@ -302,6 +392,10 @@ mod tests { Some(BlankNode::new_unchecked("b1").into()), Some(Literal::new_typed_literal("123", xsd::INTEGER).into()), ], + vec![ + None, + Some(Literal::new_simple_literal("escape,\t\r\n").into()), + ], ], ) } @@ -320,25 +414,44 @@ mod tests { )?; } let result = writer.finish()?; - assert_eq!(str::from_utf8(&result).unwrap(), "x,literal\r\nhttp://example/x,String\r\nhttp://example/x,\"String-with-dquote\"\"\"\r\n_:b0,Blank node\r\n,Missing 'x'\r\n,\r\nhttp://example/x,\r\n_:b1,String-with-lang\r\n_:b1,123"); + assert_eq!(str::from_utf8(&result).unwrap(), "x,literal\r\nhttp://example/x,String\r\nhttp://example/x,\"String-with-dquote\"\"\"\r\n_:b0,Blank node\r\n,Missing 'x'\r\n,\r\nhttp://example/x,\r\n_:b1,String-with-lang\r\n_:b1,123\r\n,\"escape,\t\r\n\""); Ok(()) } #[test] - fn test_tsv_serialization() -> io::Result<()> { + fn test_tsv_roundtrip() -> Result<(), Box> { let (variables, solutions) = build_example(); + + // Write let mut writer = TsvSolutionsWriter::start(Vec::new(), variables.clone())?; let variables = Rc::new(variables); - for solution in solutions { + for solution in &solutions { writer.write( variables .iter() - .zip(&solution) + .zip(solution) .filter_map(|(v, s)| s.as_ref().map(|s| (v.as_ref(), s.as_ref()))), )?; } let result = writer.finish()?; - assert_eq!(str::from_utf8(&result).unwrap(), "?x\t?literal\n\t\"String\"\n\t\"String-with-dquote\\\"\"\n_:b0\t\"Blank node\"\n\t\"Missing 'x'\"\n\t\n\t\n_:b1\t\"String-with-lang\"@en\n_:b1\t123"); + assert_eq!(str::from_utf8(&result).unwrap(), "?x\t?literal\n\t\"String\"\n\t\"String-with-dquote\\\"\"\n_:b0\t\"Blank node\"\n\t\"Missing 'x'\"\n\t\n\t\n_:b1\t\"String-with-lang\"@en\n_:b1\t123\n\t\"escape,\\t\\r\\n\""); + + // Read + if let TsvQueryResultsReader::Solutions { + solutions: mut solutions_iter, + variables: actual_variables, + } = TsvQueryResultsReader::read(Cursor::new(result))? + { + assert_eq!(actual_variables.as_slice(), variables.as_slice()); + let mut rows = Vec::new(); + while let Some(row) = solutions_iter.read_next()? { + rows.push(row); + } + assert_eq!(rows, solutions); + } else { + unreachable!() + } + Ok(()) }