SPARQL regex: compile ahead of time if possible

pull/354/head
Tpt 2 years ago committed by Thomas Tanon
parent 5c055e0d12
commit a7bc31b446
  1. 71
      lib/src/sparql/eval.rs
  2. 17
      lib/src/sparql/plan.rs
  3. 132
      lib/src/sparql/plan_builder.rs

@ -1325,22 +1325,35 @@ impl SimpleEvaluator {
Some((to_string(&dataset, &arg(tuple)?)?.chars().count() as i64).into())
})
}
PlanExpression::Replace(arg, pattern, replacement, flags) => {
PlanExpression::StaticReplace(arg, regex, replacement) => {
let arg = self.expression_evaluator(arg);
let regex = regex.clone();
let replacement = self.expression_evaluator(replacement);
let dataset = self.dataset.clone();
Rc::new(move |tuple| {
let (text, language) = to_string_and_language(&dataset, &arg(tuple)?)?;
let replacement = to_simple_string(&dataset, &replacement(tuple)?)?;
Some(build_plain_literal(
&dataset,
&regex.replace_all(&text, replacement.as_str()),
language,
))
})
}
PlanExpression::DynamicReplace(arg, pattern, replacement, flags) => {
let arg = self.expression_evaluator(arg);
let pattern = self.expression_evaluator(pattern);
let replacement = self.expression_evaluator(replacement);
let flags = flags.as_ref().map(|flags| self.expression_evaluator(flags));
let dataset = self.dataset.clone();
Rc::new(move |tuple| {
let regex = compile_pattern(
&dataset,
&pattern(tuple)?,
if let Some(flags) = &flags {
Some(flags(tuple)?)
} else {
None
},
)?;
let pattern = to_simple_string(&dataset, &pattern(tuple)?)?;
let options = if let Some(flags) = &flags {
Some(to_simple_string(&dataset, &flags(tuple)?)?)
} else {
None
};
let regex = compile_pattern(&pattern, options.as_deref())?;
let (text, language) = to_string_and_language(&dataset, &arg(tuple)?)?;
let replacement = to_simple_string(&dataset, &replacement(tuple)?)?;
Some(build_plain_literal(
@ -1704,21 +1717,28 @@ impl SimpleEvaluator {
)
})
}
PlanExpression::Regex(text, pattern, flags) => {
PlanExpression::StaticRegex(text, regex) => {
let text = self.expression_evaluator(text);
let dataset = self.dataset.clone();
let regex = regex.clone();
Rc::new(move |tuple| {
let text = to_string(&dataset, &text(tuple)?)?;
Some(regex.is_match(&text).into())
})
}
PlanExpression::DynamicRegex(text, pattern, flags) => {
let text = self.expression_evaluator(text);
let pattern = self.expression_evaluator(pattern);
let flags = flags.as_ref().map(|flags| self.expression_evaluator(flags));
let dataset = self.dataset.clone();
Rc::new(move |tuple| {
let regex = compile_pattern(
&dataset,
&pattern(tuple)?,
if let Some(flags) = &flags {
Some(flags(tuple)?)
} else {
None
},
)?;
let pattern = to_simple_string(&dataset, &pattern(tuple)?)?;
let options = if let Some(flags) = &flags {
Some(to_simple_string(&dataset, &flags(tuple)?)?)
} else {
None
};
let regex = compile_pattern(&pattern, options.as_deref())?;
let text = to_string(&dataset, &text(tuple)?)?;
Some(regex.is_match(&text).into())
})
@ -2193,17 +2213,10 @@ fn to_argument_compatible_strings(
}
}
fn compile_pattern(
dataset: &DatasetView,
pattern: &EncodedTerm,
flags: Option<EncodedTerm>,
) -> Option<Regex> {
// TODO Avoid to compile the regex each time
let pattern = to_simple_string(dataset, pattern)?;
let mut regex_builder = RegexBuilder::new(&pattern);
pub(super) fn compile_pattern(pattern: &str, flags: Option<&str>) -> Option<Regex> {
let mut regex_builder = RegexBuilder::new(pattern);
regex_builder.size_limit(REGEX_SIZE_LIMIT);
if let Some(flags) = flags {
let flags = to_simple_string(dataset, &flags)?;
for flag in flags.chars() {
match flag {
's' => {

@ -1,6 +1,7 @@
use crate::model::NamedNode;
use crate::storage::numeric_encoder::EncodedTerm;
use oxrdf::Variable;
use regex::Regex;
use spargebra::algebra::GraphPattern;
use std::cmp::max;
use std::collections::btree_map::Entry;
@ -420,7 +421,8 @@ pub enum PlanExpression {
Concat(Vec<Self>),
SubStr(Box<Self>, Box<Self>, Option<Box<Self>>),
StrLen(Box<Self>),
Replace(Box<Self>, Box<Self>, Box<Self>, Option<Box<Self>>),
StaticReplace(Box<Self>, Regex, Box<Self>),
DynamicReplace(Box<Self>, Box<Self>, Box<Self>, Option<Box<Self>>),
UCase(Box<Self>),
LCase(Box<Self>),
EncodeForUri(Box<Self>),
@ -454,7 +456,8 @@ pub enum PlanExpression {
IsBlank(Box<Self>),
IsLiteral(Box<Self>),
IsNumeric(Box<Self>),
Regex(Box<Self>, Box<Self>, Option<Box<Self>>),
StaticRegex(Box<Self>, Regex),
DynamicRegex(Box<Self>, Box<Self>, Option<Box<Self>>),
Triple(Box<Self>, Box<Self>, Box<Self>),
Subject(Box<Self>),
Predicate(Box<Self>),
@ -504,6 +507,7 @@ impl PlanExpression {
| Self::LCase(e)
| Self::StrLen(e)
| Self::EncodeForUri(e)
| Self::StaticRegex(e, _)
| Self::Year(e)
| Self::Month(e)
| Self::Day(e)
@ -550,6 +554,7 @@ impl PlanExpression {
| Self::Divide(a, b)
| Self::LangMatches(a, b)
| Self::Contains(a, b)
| Self::StaticReplace(a, _, b)
| Self::StrStarts(a, b)
| Self::StrEnds(a, b)
| Self::StrBefore(a, b)
@ -558,21 +563,21 @@ impl PlanExpression {
| Self::StrDt(a, b)
| Self::SameTerm(a, b)
| Self::SubStr(a, b, None)
| Self::Regex(a, b, None)
| Self::DynamicRegex(a, b, None)
| Self::Adjust(a, b) => {
a.lookup_used_variables(callback);
b.lookup_used_variables(callback);
}
Self::If(a, b, c)
| Self::SubStr(a, b, Some(c))
| Self::Regex(a, b, Some(c))
| Self::Replace(a, b, c, None)
| Self::DynamicRegex(a, b, Some(c))
| Self::DynamicReplace(a, b, c, None)
| Self::Triple(a, b, c) => {
a.lookup_used_variables(callback);
b.lookup_used_variables(callback);
c.lookup_used_variables(callback);
}
Self::Replace(a, b, c, Some(d)) => {
Self::DynamicReplace(a, b, c, Some(d)) => {
a.lookup_used_variables(callback);
b.lookup_used_variables(callback);
c.lookup_used_variables(callback);

@ -1,11 +1,13 @@
use crate::model::Term as OxTerm;
use crate::sparql::dataset::DatasetView;
use crate::sparql::error::EvaluationError;
use crate::sparql::eval::compile_pattern;
use crate::sparql::plan::*;
use crate::storage::numeric_encoder::{EncodedTerm, EncodedTriple};
use oxrdf::vocab::xsd;
use oxrdf::TermRef;
use rand::random;
use regex::Regex;
use spargebra::algebra::*;
use spargebra::term::*;
use std::collections::{BTreeSet, HashMap, HashSet};
@ -486,17 +488,49 @@ impl<'a> PlanBuilder<'a> {
variables,
graph_name,
)?)),
Function::Replace => PlanExpression::Replace(
Box::new(self.build_for_expression(&parameters[0], variables, graph_name)?),
Box::new(self.build_for_expression(&parameters[1], variables, graph_name)?),
Box::new(self.build_for_expression(&parameters[2], variables, graph_name)?),
match parameters.get(3) {
Some(flags) => Some(Box::new(
self.build_for_expression(flags, variables, graph_name)?,
)),
None => None,
},
),
Function::Replace => {
if let Some(static_regex) =
compile_static_pattern_if_exists(&parameters[1], parameters.get(3))
{
PlanExpression::StaticReplace(
Box::new(self.build_for_expression(
&parameters[0],
variables,
graph_name,
)?),
static_regex,
Box::new(self.build_for_expression(
&parameters[2],
variables,
graph_name,
)?),
)
} else {
PlanExpression::DynamicReplace(
Box::new(self.build_for_expression(
&parameters[0],
variables,
graph_name,
)?),
Box::new(self.build_for_expression(
&parameters[1],
variables,
graph_name,
)?),
Box::new(self.build_for_expression(
&parameters[2],
variables,
graph_name,
)?),
match parameters.get(3) {
Some(flags) => Some(Box::new(
self.build_for_expression(flags, variables, graph_name)?,
)),
None => None,
},
)
}
}
Function::UCase => PlanExpression::UCase(Box::new(self.build_for_expression(
&parameters[0],
variables,
@ -620,16 +654,39 @@ impl<'a> PlanBuilder<'a> {
Function::IsNumeric => PlanExpression::IsNumeric(Box::new(
self.build_for_expression(&parameters[0], variables, graph_name)?,
)),
Function::Regex => PlanExpression::Regex(
Box::new(self.build_for_expression(&parameters[0], variables, graph_name)?),
Box::new(self.build_for_expression(&parameters[1], variables, graph_name)?),
match parameters.get(2) {
Some(flags) => Some(Box::new(
self.build_for_expression(flags, variables, graph_name)?,
)),
None => None,
},
),
Function::Regex => {
if let Some(static_regex) =
compile_static_pattern_if_exists(&parameters[1], parameters.get(2))
{
PlanExpression::StaticRegex(
Box::new(self.build_for_expression(
&parameters[0],
variables,
graph_name,
)?),
static_regex,
)
} else {
PlanExpression::DynamicRegex(
Box::new(self.build_for_expression(
&parameters[0],
variables,
graph_name,
)?),
Box::new(self.build_for_expression(
&parameters[1],
variables,
graph_name,
)?),
match parameters.get(2) {
Some(flags) => Some(Box::new(
self.build_for_expression(flags, variables, graph_name)?,
)),
None => None,
},
)
}
}
Function::Triple => PlanExpression::Triple(
Box::new(self.build_for_expression(&parameters[0], variables, graph_name)?),
Box::new(self.build_for_expression(&parameters[1], variables, graph_name)?),
@ -1505,3 +1562,36 @@ fn add_pattern_variables<'a>(
TermPattern::Triple(t) => add_pattern_variables(t, variables, blank_nodes),
}
}
/// Attempts to build the regex at plan-building time, which is only possible
/// when its inputs are constants of the query.
///
/// * `pattern` - the expression given as the regex pattern argument.
/// * `options` - the expression given as the flags argument, if the call has one.
///
/// Returns `None` when `pattern` is not a literal whose datatype is `xsd:string`,
/// when `options` is present but is not such a literal, or when
/// `compile_pattern` itself returns `None` for the extracted strings.
/// Callers use a `None` result as the signal to fall back to the dynamic
/// (per-evaluation) plan expression.
fn compile_static_pattern_if_exists(
    pattern: &Expression,
    options: Option<&Expression>,
) -> Option<Regex> {
    // The pattern must be a plain string constant to be usable ahead of time.
    let pattern_text = match pattern {
        Expression::Literal(literal) if literal.datatype() == xsd::STRING => literal.value(),
        _ => return None,
    };
    // Missing flags are fine (no options); flags that are present must also be
    // a plain string constant, otherwise nothing can be precompiled.
    let options_text = match options {
        None => None,
        Some(Expression::Literal(literal)) if literal.datatype() == xsd::STRING => {
            Some(literal.value())
        }
        Some(_) => return None,
    };
    compile_pattern(pattern_text, options_text)
}

Loading…
Cancel
Save