From a7bc31b44651e90b01c5a1a332500380280fc6cb Mon Sep 17 00:00:00 2001 From: Tpt Date: Sat, 7 Jan 2023 15:04:54 +0100 Subject: [PATCH] SPARQL regex: compile ahead of time if possible --- lib/src/sparql/eval.rs | 71 ++++++++++-------- lib/src/sparql/plan.rs | 17 +++-- lib/src/sparql/plan_builder.rs | 132 +++++++++++++++++++++++++++------ 3 files changed, 164 insertions(+), 56 deletions(-) diff --git a/lib/src/sparql/eval.rs b/lib/src/sparql/eval.rs index a343b45a..c4213ae3 100644 --- a/lib/src/sparql/eval.rs +++ b/lib/src/sparql/eval.rs @@ -1325,22 +1325,35 @@ impl SimpleEvaluator { Some((to_string(&dataset, &arg(tuple)?)?.chars().count() as i64).into()) }) } - PlanExpression::Replace(arg, pattern, replacement, flags) => { + PlanExpression::StaticReplace(arg, regex, replacement) => { + let arg = self.expression_evaluator(arg); + let regex = regex.clone(); + let replacement = self.expression_evaluator(replacement); + let dataset = self.dataset.clone(); + Rc::new(move |tuple| { + let (text, language) = to_string_and_language(&dataset, &arg(tuple)?)?; + let replacement = to_simple_string(&dataset, &replacement(tuple)?)?; + Some(build_plain_literal( + &dataset, + &regex.replace_all(&text, replacement.as_str()), + language, + )) + }) + } + PlanExpression::DynamicReplace(arg, pattern, replacement, flags) => { let arg = self.expression_evaluator(arg); let pattern = self.expression_evaluator(pattern); let replacement = self.expression_evaluator(replacement); let flags = flags.as_ref().map(|flags| self.expression_evaluator(flags)); let dataset = self.dataset.clone(); Rc::new(move |tuple| { - let regex = compile_pattern( - &dataset, - &pattern(tuple)?, - if let Some(flags) = &flags { - Some(flags(tuple)?) - } else { - None - }, - )?; + let pattern = to_simple_string(&dataset, &pattern(tuple)?)?; + let options = if let Some(flags) = &flags { + Some(to_simple_string(&dataset, &flags(tuple)?)?) 
+ } else { + None + }; + let regex = compile_pattern(&pattern, options.as_deref())?; let (text, language) = to_string_and_language(&dataset, &arg(tuple)?)?; let replacement = to_simple_string(&dataset, &replacement(tuple)?)?; Some(build_plain_literal( @@ -1704,21 +1717,28 @@ impl SimpleEvaluator { ) }) } - PlanExpression::Regex(text, pattern, flags) => { + PlanExpression::StaticRegex(text, regex) => { + let text = self.expression_evaluator(text); + let dataset = self.dataset.clone(); + let regex = regex.clone(); + Rc::new(move |tuple| { + let text = to_string(&dataset, &text(tuple)?)?; + Some(regex.is_match(&text).into()) + }) + } + PlanExpression::DynamicRegex(text, pattern, flags) => { let text = self.expression_evaluator(text); let pattern = self.expression_evaluator(pattern); let flags = flags.as_ref().map(|flags| self.expression_evaluator(flags)); let dataset = self.dataset.clone(); Rc::new(move |tuple| { - let regex = compile_pattern( - &dataset, - &pattern(tuple)?, - if let Some(flags) = &flags { - Some(flags(tuple)?) - } else { - None - }, - )?; + let pattern = to_simple_string(&dataset, &pattern(tuple)?)?; + let options = if let Some(flags) = &flags { + Some(to_simple_string(&dataset, &flags(tuple)?)?) 
+ } else { + None + }; + let regex = compile_pattern(&pattern, options.as_deref())?; let text = to_string(&dataset, &text(tuple)?)?; Some(regex.is_match(&text).into()) }) @@ -2193,17 +2213,10 @@ fn to_argument_compatible_strings( } } -fn compile_pattern( - dataset: &DatasetView, - pattern: &EncodedTerm, - flags: Option<EncodedTerm>, -) -> Option<Regex> { - // TODO Avoid to compile the regex each time - let pattern = to_simple_string(dataset, pattern)?; - let mut regex_builder = RegexBuilder::new(&pattern); +pub(super) fn compile_pattern(pattern: &str, flags: Option<&str>) -> Option<Regex> { + let mut regex_builder = RegexBuilder::new(pattern); regex_builder.size_limit(REGEX_SIZE_LIMIT); if let Some(flags) = flags { - let flags = to_simple_string(dataset, &flags)?; for flag in flags.chars() { match flag { 's' => { diff --git a/lib/src/sparql/plan.rs b/lib/src/sparql/plan.rs index 2fa1f9f7..fdb0922e 100644 --- a/lib/src/sparql/plan.rs +++ b/lib/src/sparql/plan.rs @@ -1,6 +1,7 @@ use crate::model::NamedNode; use crate::storage::numeric_encoder::EncodedTerm; use oxrdf::Variable; +use regex::Regex; use spargebra::algebra::GraphPattern; use std::cmp::max; use std::collections::btree_map::Entry; @@ -420,7 +421,8 @@ pub enum PlanExpression { Concat(Vec<Self>), SubStr(Box<Self>, Box<Self>, Option<Box<Self>>), StrLen(Box<Self>), - Replace(Box<Self>, Box<Self>, Box<Self>, Option<Box<Self>>), + StaticReplace(Box<Self>, Regex, Box<Self>), + DynamicReplace(Box<Self>, Box<Self>, Box<Self>, Option<Box<Self>>), UCase(Box<Self>), LCase(Box<Self>), EncodeForUri(Box<Self>), @@ -454,7 +456,8 @@ pub enum PlanExpression { IsBlank(Box<Self>), IsLiteral(Box<Self>), IsNumeric(Box<Self>), - Regex(Box<Self>, Box<Self>, Option<Box<Self>>), + StaticRegex(Box<Self>, Regex), + DynamicRegex(Box<Self>, Box<Self>, Option<Box<Self>>), Triple(Box<Self>, Box<Self>, Box<Self>), Subject(Box<Self>), Predicate(Box<Self>), @@ -504,6 +507,7 @@ impl PlanExpression { | Self::LCase(e) | Self::StrLen(e) | Self::EncodeForUri(e) + | Self::StaticRegex(e, _) | Self::Year(e) | Self::Month(e) | Self::Day(e) @@ -550,6 +554,7 @@ impl PlanExpression { | Self::Divide(a, b) | Self::LangMatches(a, b) | Self::Contains(a, b) + | Self::StaticReplace(a, _, b) | 
Self::StrStarts(a, b) | Self::StrEnds(a, b) | Self::StrBefore(a, b) @@ -558,21 +563,21 @@ impl PlanExpression { | Self::StrDt(a, b) | Self::SameTerm(a, b) | Self::SubStr(a, b, None) - | Self::Regex(a, b, None) + | Self::DynamicRegex(a, b, None) | Self::Adjust(a, b) => { a.lookup_used_variables(callback); b.lookup_used_variables(callback); } Self::If(a, b, c) | Self::SubStr(a, b, Some(c)) - | Self::Regex(a, b, Some(c)) - | Self::Replace(a, b, c, None) + | Self::DynamicRegex(a, b, Some(c)) + | Self::DynamicReplace(a, b, c, None) | Self::Triple(a, b, c) => { a.lookup_used_variables(callback); b.lookup_used_variables(callback); c.lookup_used_variables(callback); } - Self::Replace(a, b, c, Some(d)) => { + Self::DynamicReplace(a, b, c, Some(d)) => { a.lookup_used_variables(callback); b.lookup_used_variables(callback); c.lookup_used_variables(callback); diff --git a/lib/src/sparql/plan_builder.rs b/lib/src/sparql/plan_builder.rs index 57ec123b..61cbba58 100644 --- a/lib/src/sparql/plan_builder.rs +++ b/lib/src/sparql/plan_builder.rs @@ -1,11 +1,13 @@ use crate::model::Term as OxTerm; use crate::sparql::dataset::DatasetView; use crate::sparql::error::EvaluationError; +use crate::sparql::eval::compile_pattern; use crate::sparql::plan::*; use crate::storage::numeric_encoder::{EncodedTerm, EncodedTriple}; use oxrdf::vocab::xsd; use oxrdf::TermRef; use rand::random; +use regex::Regex; use spargebra::algebra::*; use spargebra::term::*; use std::collections::{BTreeSet, HashMap, HashSet}; @@ -486,17 +488,49 @@ impl<'a> PlanBuilder<'a> { variables, graph_name, )?)), - Function::Replace => PlanExpression::Replace( - Box::new(self.build_for_expression(&parameters[0], variables, graph_name)?), - Box::new(self.build_for_expression(&parameters[1], variables, graph_name)?), - Box::new(self.build_for_expression(&parameters[2], variables, graph_name)?), - match parameters.get(3) { - Some(flags) => Some(Box::new( - self.build_for_expression(flags, variables, graph_name)?, - )), - None => None, - }, - 
), + Function::Replace => { + if let Some(static_regex) = + compile_static_pattern_if_exists(&parameters[1], parameters.get(3)) + { + PlanExpression::StaticReplace( + Box::new(self.build_for_expression( + &parameters[0], + variables, + graph_name, + )?), + static_regex, + Box::new(self.build_for_expression( + &parameters[2], + variables, + graph_name, + )?), + ) + } else { + PlanExpression::DynamicReplace( + Box::new(self.build_for_expression( + &parameters[0], + variables, + graph_name, + )?), + Box::new(self.build_for_expression( + &parameters[1], + variables, + graph_name, + )?), + Box::new(self.build_for_expression( + &parameters[2], + variables, + graph_name, + )?), + match parameters.get(3) { + Some(flags) => Some(Box::new( + self.build_for_expression(flags, variables, graph_name)?, + )), + None => None, + }, + ) + } + } Function::UCase => PlanExpression::UCase(Box::new(self.build_for_expression( &parameters[0], variables, @@ -620,16 +654,39 @@ impl<'a> PlanBuilder<'a> { Function::IsNumeric => PlanExpression::IsNumeric(Box::new( self.build_for_expression(&parameters[0], variables, graph_name)?, )), - Function::Regex => PlanExpression::Regex( - Box::new(self.build_for_expression(&parameters[0], variables, graph_name)?), - Box::new(self.build_for_expression(&parameters[1], variables, graph_name)?), - match parameters.get(2) { - Some(flags) => Some(Box::new( - self.build_for_expression(flags, variables, graph_name)?, - )), - None => None, - }, - ), + Function::Regex => { + if let Some(static_regex) = + compile_static_pattern_if_exists(&parameters[1], parameters.get(2)) + { + PlanExpression::StaticRegex( + Box::new(self.build_for_expression( + &parameters[0], + variables, + graph_name, + )?), + static_regex, + ) + } else { + PlanExpression::DynamicRegex( + Box::new(self.build_for_expression( + &parameters[0], + variables, + graph_name, + )?), + Box::new(self.build_for_expression( + &parameters[1], + variables, + graph_name, + )?), + match parameters.get(2) { + Some(flags) => Some(Box::new( + 
self.build_for_expression(flags, variables, graph_name)?, + )), + None => None, + }, + ) + } + } Function::Triple => PlanExpression::Triple( Box::new(self.build_for_expression(&parameters[0], variables, graph_name)?), Box::new(self.build_for_expression(&parameters[1], variables, graph_name)?), @@ -1505,3 +1562,36 @@ fn add_pattern_variables<'a>( TermPattern::Triple(t) => add_pattern_variables(t, variables, blank_nodes), } } + +fn compile_static_pattern_if_exists( + pattern: &Expression, + options: Option<&Expression>, +) -> Option<Regex> { + let static_pattern = if let Expression::Literal(pattern) = pattern { + if pattern.datatype() == xsd::STRING { + Some(pattern.value()) + } else { + None + } + } else { + None + }; + let static_options = if let Some(options) = options { + if let Expression::Literal(options) = options { + if options.datatype() == xsd::STRING { + Some(Some(options.value())) + } else { + None + } + } else { + None + } + } else { + Some(None) + }; + if let (Some(static_pattern), Some(static_options)) = (static_pattern, static_options) { + compile_pattern(static_pattern, static_options) + } else { + None + } +}