SPARQL regex: compile ahead of time if possible

pull/354/head
Tpt 2 years ago committed by Thomas Tanon
parent 5c055e0d12
commit a7bc31b446
  1. 63
      lib/src/sparql/eval.rs
  2. 17
      lib/src/sparql/plan.rs
  3. 108
      lib/src/sparql/plan_builder.rs

@ -1325,22 +1325,35 @@ impl SimpleEvaluator {
Some((to_string(&dataset, &arg(tuple)?)?.chars().count() as i64).into()) Some((to_string(&dataset, &arg(tuple)?)?.chars().count() as i64).into())
}) })
} }
PlanExpression::Replace(arg, pattern, replacement, flags) => { PlanExpression::StaticReplace(arg, regex, replacement) => {
let arg = self.expression_evaluator(arg);
let regex = regex.clone();
let replacement = self.expression_evaluator(replacement);
let dataset = self.dataset.clone();
Rc::new(move |tuple| {
let (text, language) = to_string_and_language(&dataset, &arg(tuple)?)?;
let replacement = to_simple_string(&dataset, &replacement(tuple)?)?;
Some(build_plain_literal(
&dataset,
&regex.replace_all(&text, replacement.as_str()),
language,
))
})
}
PlanExpression::DynamicReplace(arg, pattern, replacement, flags) => {
let arg = self.expression_evaluator(arg); let arg = self.expression_evaluator(arg);
let pattern = self.expression_evaluator(pattern); let pattern = self.expression_evaluator(pattern);
let replacement = self.expression_evaluator(replacement); let replacement = self.expression_evaluator(replacement);
let flags = flags.as_ref().map(|flags| self.expression_evaluator(flags)); let flags = flags.as_ref().map(|flags| self.expression_evaluator(flags));
let dataset = self.dataset.clone(); let dataset = self.dataset.clone();
Rc::new(move |tuple| { Rc::new(move |tuple| {
let regex = compile_pattern( let pattern = to_simple_string(&dataset, &pattern(tuple)?)?;
&dataset, let options = if let Some(flags) = &flags {
&pattern(tuple)?, Some(to_simple_string(&dataset, &flags(tuple)?)?)
if let Some(flags) = &flags {
Some(flags(tuple)?)
} else { } else {
None None
}, };
)?; let regex = compile_pattern(&pattern, options.as_deref())?;
let (text, language) = to_string_and_language(&dataset, &arg(tuple)?)?; let (text, language) = to_string_and_language(&dataset, &arg(tuple)?)?;
let replacement = to_simple_string(&dataset, &replacement(tuple)?)?; let replacement = to_simple_string(&dataset, &replacement(tuple)?)?;
Some(build_plain_literal( Some(build_plain_literal(
@ -1704,21 +1717,28 @@ impl SimpleEvaluator {
) )
}) })
} }
PlanExpression::Regex(text, pattern, flags) => { PlanExpression::StaticRegex(text, regex) => {
let text = self.expression_evaluator(text);
let dataset = self.dataset.clone();
let regex = regex.clone();
Rc::new(move |tuple| {
let text = to_string(&dataset, &text(tuple)?)?;
Some(regex.is_match(&text).into())
})
}
PlanExpression::DynamicRegex(text, pattern, flags) => {
let text = self.expression_evaluator(text); let text = self.expression_evaluator(text);
let pattern = self.expression_evaluator(pattern); let pattern = self.expression_evaluator(pattern);
let flags = flags.as_ref().map(|flags| self.expression_evaluator(flags)); let flags = flags.as_ref().map(|flags| self.expression_evaluator(flags));
let dataset = self.dataset.clone(); let dataset = self.dataset.clone();
Rc::new(move |tuple| { Rc::new(move |tuple| {
let regex = compile_pattern( let pattern = to_simple_string(&dataset, &pattern(tuple)?)?;
&dataset, let options = if let Some(flags) = &flags {
&pattern(tuple)?, Some(to_simple_string(&dataset, &flags(tuple)?)?)
if let Some(flags) = &flags {
Some(flags(tuple)?)
} else { } else {
None None
}, };
)?; let regex = compile_pattern(&pattern, options.as_deref())?;
let text = to_string(&dataset, &text(tuple)?)?; let text = to_string(&dataset, &text(tuple)?)?;
Some(regex.is_match(&text).into()) Some(regex.is_match(&text).into())
}) })
@ -2193,17 +2213,10 @@ fn to_argument_compatible_strings(
} }
} }
fn compile_pattern( pub(super) fn compile_pattern(pattern: &str, flags: Option<&str>) -> Option<Regex> {
dataset: &DatasetView, let mut regex_builder = RegexBuilder::new(pattern);
pattern: &EncodedTerm,
flags: Option<EncodedTerm>,
) -> Option<Regex> {
// TODO Avoid to compile the regex each time
let pattern = to_simple_string(dataset, pattern)?;
let mut regex_builder = RegexBuilder::new(&pattern);
regex_builder.size_limit(REGEX_SIZE_LIMIT); regex_builder.size_limit(REGEX_SIZE_LIMIT);
if let Some(flags) = flags { if let Some(flags) = flags {
let flags = to_simple_string(dataset, &flags)?;
for flag in flags.chars() { for flag in flags.chars() {
match flag { match flag {
's' => { 's' => {

@ -1,6 +1,7 @@
use crate::model::NamedNode; use crate::model::NamedNode;
use crate::storage::numeric_encoder::EncodedTerm; use crate::storage::numeric_encoder::EncodedTerm;
use oxrdf::Variable; use oxrdf::Variable;
use regex::Regex;
use spargebra::algebra::GraphPattern; use spargebra::algebra::GraphPattern;
use std::cmp::max; use std::cmp::max;
use std::collections::btree_map::Entry; use std::collections::btree_map::Entry;
@ -420,7 +421,8 @@ pub enum PlanExpression {
Concat(Vec<Self>), Concat(Vec<Self>),
SubStr(Box<Self>, Box<Self>, Option<Box<Self>>), SubStr(Box<Self>, Box<Self>, Option<Box<Self>>),
StrLen(Box<Self>), StrLen(Box<Self>),
Replace(Box<Self>, Box<Self>, Box<Self>, Option<Box<Self>>), StaticReplace(Box<Self>, Regex, Box<Self>),
DynamicReplace(Box<Self>, Box<Self>, Box<Self>, Option<Box<Self>>),
UCase(Box<Self>), UCase(Box<Self>),
LCase(Box<Self>), LCase(Box<Self>),
EncodeForUri(Box<Self>), EncodeForUri(Box<Self>),
@ -454,7 +456,8 @@ pub enum PlanExpression {
IsBlank(Box<Self>), IsBlank(Box<Self>),
IsLiteral(Box<Self>), IsLiteral(Box<Self>),
IsNumeric(Box<Self>), IsNumeric(Box<Self>),
Regex(Box<Self>, Box<Self>, Option<Box<Self>>), StaticRegex(Box<Self>, Regex),
DynamicRegex(Box<Self>, Box<Self>, Option<Box<Self>>),
Triple(Box<Self>, Box<Self>, Box<Self>), Triple(Box<Self>, Box<Self>, Box<Self>),
Subject(Box<Self>), Subject(Box<Self>),
Predicate(Box<Self>), Predicate(Box<Self>),
@ -504,6 +507,7 @@ impl PlanExpression {
| Self::LCase(e) | Self::LCase(e)
| Self::StrLen(e) | Self::StrLen(e)
| Self::EncodeForUri(e) | Self::EncodeForUri(e)
| Self::StaticRegex(e, _)
| Self::Year(e) | Self::Year(e)
| Self::Month(e) | Self::Month(e)
| Self::Day(e) | Self::Day(e)
@ -550,6 +554,7 @@ impl PlanExpression {
| Self::Divide(a, b) | Self::Divide(a, b)
| Self::LangMatches(a, b) | Self::LangMatches(a, b)
| Self::Contains(a, b) | Self::Contains(a, b)
| Self::StaticReplace(a, _, b)
| Self::StrStarts(a, b) | Self::StrStarts(a, b)
| Self::StrEnds(a, b) | Self::StrEnds(a, b)
| Self::StrBefore(a, b) | Self::StrBefore(a, b)
@ -558,21 +563,21 @@ impl PlanExpression {
| Self::StrDt(a, b) | Self::StrDt(a, b)
| Self::SameTerm(a, b) | Self::SameTerm(a, b)
| Self::SubStr(a, b, None) | Self::SubStr(a, b, None)
| Self::Regex(a, b, None) | Self::DynamicRegex(a, b, None)
| Self::Adjust(a, b) => { | Self::Adjust(a, b) => {
a.lookup_used_variables(callback); a.lookup_used_variables(callback);
b.lookup_used_variables(callback); b.lookup_used_variables(callback);
} }
Self::If(a, b, c) Self::If(a, b, c)
| Self::SubStr(a, b, Some(c)) | Self::SubStr(a, b, Some(c))
| Self::Regex(a, b, Some(c)) | Self::DynamicRegex(a, b, Some(c))
| Self::Replace(a, b, c, None) | Self::DynamicReplace(a, b, c, None)
| Self::Triple(a, b, c) => { | Self::Triple(a, b, c) => {
a.lookup_used_variables(callback); a.lookup_used_variables(callback);
b.lookup_used_variables(callback); b.lookup_used_variables(callback);
c.lookup_used_variables(callback); c.lookup_used_variables(callback);
} }
Self::Replace(a, b, c, Some(d)) => { Self::DynamicReplace(a, b, c, Some(d)) => {
a.lookup_used_variables(callback); a.lookup_used_variables(callback);
b.lookup_used_variables(callback); b.lookup_used_variables(callback);
c.lookup_used_variables(callback); c.lookup_used_variables(callback);

@ -1,11 +1,13 @@
use crate::model::Term as OxTerm; use crate::model::Term as OxTerm;
use crate::sparql::dataset::DatasetView; use crate::sparql::dataset::DatasetView;
use crate::sparql::error::EvaluationError; use crate::sparql::error::EvaluationError;
use crate::sparql::eval::compile_pattern;
use crate::sparql::plan::*; use crate::sparql::plan::*;
use crate::storage::numeric_encoder::{EncodedTerm, EncodedTriple}; use crate::storage::numeric_encoder::{EncodedTerm, EncodedTriple};
use oxrdf::vocab::xsd; use oxrdf::vocab::xsd;
use oxrdf::TermRef; use oxrdf::TermRef;
use rand::random; use rand::random;
use regex::Regex;
use spargebra::algebra::*; use spargebra::algebra::*;
use spargebra::term::*; use spargebra::term::*;
use std::collections::{BTreeSet, HashMap, HashSet}; use std::collections::{BTreeSet, HashMap, HashSet};
@ -486,17 +488,49 @@ impl<'a> PlanBuilder<'a> {
variables, variables,
graph_name, graph_name,
)?)), )?)),
Function::Replace => PlanExpression::Replace( Function::Replace => {
Box::new(self.build_for_expression(&parameters[0], variables, graph_name)?), if let Some(static_regex) =
Box::new(self.build_for_expression(&parameters[1], variables, graph_name)?), compile_static_pattern_if_exists(&parameters[1], parameters.get(3))
Box::new(self.build_for_expression(&parameters[2], variables, graph_name)?), {
PlanExpression::StaticReplace(
Box::new(self.build_for_expression(
&parameters[0],
variables,
graph_name,
)?),
static_regex,
Box::new(self.build_for_expression(
&parameters[2],
variables,
graph_name,
)?),
)
} else {
PlanExpression::DynamicReplace(
Box::new(self.build_for_expression(
&parameters[0],
variables,
graph_name,
)?),
Box::new(self.build_for_expression(
&parameters[1],
variables,
graph_name,
)?),
Box::new(self.build_for_expression(
&parameters[2],
variables,
graph_name,
)?),
match parameters.get(3) { match parameters.get(3) {
Some(flags) => Some(Box::new( Some(flags) => Some(Box::new(
self.build_for_expression(flags, variables, graph_name)?, self.build_for_expression(flags, variables, graph_name)?,
)), )),
None => None, None => None,
}, },
), )
}
}
Function::UCase => PlanExpression::UCase(Box::new(self.build_for_expression( Function::UCase => PlanExpression::UCase(Box::new(self.build_for_expression(
&parameters[0], &parameters[0],
variables, variables,
@ -620,16 +654,39 @@ impl<'a> PlanBuilder<'a> {
Function::IsNumeric => PlanExpression::IsNumeric(Box::new( Function::IsNumeric => PlanExpression::IsNumeric(Box::new(
self.build_for_expression(&parameters[0], variables, graph_name)?, self.build_for_expression(&parameters[0], variables, graph_name)?,
)), )),
Function::Regex => PlanExpression::Regex( Function::Regex => {
Box::new(self.build_for_expression(&parameters[0], variables, graph_name)?), if let Some(static_regex) =
Box::new(self.build_for_expression(&parameters[1], variables, graph_name)?), compile_static_pattern_if_exists(&parameters[1], parameters.get(2))
{
PlanExpression::StaticRegex(
Box::new(self.build_for_expression(
&parameters[0],
variables,
graph_name,
)?),
static_regex,
)
} else {
PlanExpression::DynamicRegex(
Box::new(self.build_for_expression(
&parameters[0],
variables,
graph_name,
)?),
Box::new(self.build_for_expression(
&parameters[1],
variables,
graph_name,
)?),
match parameters.get(2) { match parameters.get(2) {
Some(flags) => Some(Box::new( Some(flags) => Some(Box::new(
self.build_for_expression(flags, variables, graph_name)?, self.build_for_expression(flags, variables, graph_name)?,
)), )),
None => None, None => None,
}, },
), )
}
}
Function::Triple => PlanExpression::Triple( Function::Triple => PlanExpression::Triple(
Box::new(self.build_for_expression(&parameters[0], variables, graph_name)?), Box::new(self.build_for_expression(&parameters[0], variables, graph_name)?),
Box::new(self.build_for_expression(&parameters[1], variables, graph_name)?), Box::new(self.build_for_expression(&parameters[1], variables, graph_name)?),
@ -1505,3 +1562,36 @@ fn add_pattern_variables<'a>(
TermPattern::Triple(t) => add_pattern_variables(t, variables, blank_nodes), TermPattern::Triple(t) => add_pattern_variables(t, variables, blank_nodes),
} }
} }
/// Attempts to compile a regular expression while the plan is being built.
///
/// A regex is returned only if `pattern` is a constant `xsd:string` literal and
/// `options` is either absent or also a constant `xsd:string` literal, and
/// `compile_pattern` accepts them. In every other case `None` is returned.
fn compile_static_pattern_if_exists(
    pattern: &Expression,
    options: Option<&Expression>,
) -> Option<Regex> {
    // The pattern has to be a literal whose datatype is xsd:string.
    let pattern_text = match pattern {
        Expression::Literal(literal) if literal.datatype() == xsd::STRING => literal.value(),
        _ => return None,
    };
    // Missing flags are fine; flags that are present must also be an
    // xsd:string literal, otherwise nothing can be precompiled.
    let flags_text = match options {
        None => None,
        Some(Expression::Literal(literal)) if literal.datatype() == xsd::STRING => {
            Some(literal.value())
        }
        Some(_) => return None,
    };
    compile_pattern(pattern_text, flags_text)
}

Loading…
Cancel
Save