From ee1ab7b82f2ba738a03f6bc9684ff48953a570ab Mon Sep 17 00:00:00 2001
From: Tpt <thomaspt@hotmail.fr>
Date: Sat, 7 Jan 2023 15:04:54 +0100
Subject: [PATCH] SPARQL regex: compile ahead of time if possible

---
 lib/src/sparql/eval.rs         |  71 ++++++++++--------
 lib/src/sparql/plan.rs         |  17 +++--
 lib/src/sparql/plan_builder.rs | 132 +++++++++++++++++++++++++++------
 3 files changed, 164 insertions(+), 56 deletions(-)

diff --git a/lib/src/sparql/eval.rs b/lib/src/sparql/eval.rs
index a343b45a..c4213ae3 100644
--- a/lib/src/sparql/eval.rs
+++ b/lib/src/sparql/eval.rs
@@ -1325,22 +1325,35 @@ impl SimpleEvaluator {
                     Some((to_string(&dataset, &arg(tuple)?)?.chars().count() as i64).into())
                 })
             }
-            PlanExpression::Replace(arg, pattern, replacement, flags) => {
+            PlanExpression::StaticReplace(arg, regex, replacement) => {
+                let arg = self.expression_evaluator(arg);
+                let regex = regex.clone();
+                let replacement = self.expression_evaluator(replacement);
+                let dataset = self.dataset.clone();
+                Rc::new(move |tuple| {
+                    let (text, language) = to_string_and_language(&dataset, &arg(tuple)?)?;
+                    let replacement = to_simple_string(&dataset, &replacement(tuple)?)?;
+                    Some(build_plain_literal(
+                        &dataset,
+                        &regex.replace_all(&text, replacement.as_str()),
+                        language,
+                    ))
+                })
+            }
+            PlanExpression::DynamicReplace(arg, pattern, replacement, flags) => {
                 let arg = self.expression_evaluator(arg);
                 let pattern = self.expression_evaluator(pattern);
                 let replacement = self.expression_evaluator(replacement);
                 let flags = flags.as_ref().map(|flags| self.expression_evaluator(flags));
                 let dataset = self.dataset.clone();
                 Rc::new(move |tuple| {
-                    let regex = compile_pattern(
-                        &dataset,
-                        &pattern(tuple)?,
-                        if let Some(flags) = &flags {
-                            Some(flags(tuple)?)
-                        } else {
-                            None
-                        },
-                    )?;
+                    let pattern = to_simple_string(&dataset, &pattern(tuple)?)?;
+                    let options = if let Some(flags) = &flags {
+                        Some(to_simple_string(&dataset, &flags(tuple)?)?)
+                    } else {
+                        None
+                    };
+                    let regex = compile_pattern(&pattern, options.as_deref())?;
                     let (text, language) = to_string_and_language(&dataset, &arg(tuple)?)?;
                     let replacement = to_simple_string(&dataset, &replacement(tuple)?)?;
                     Some(build_plain_literal(
@@ -1704,21 +1717,28 @@ impl SimpleEvaluator {
                     )
                 })
             }
-            PlanExpression::Regex(text, pattern, flags) => {
+            PlanExpression::StaticRegex(text, regex) => {
+                let text = self.expression_evaluator(text);
+                let dataset = self.dataset.clone();
+                let regex = regex.clone();
+                Rc::new(move |tuple| {
+                    let text = to_string(&dataset, &text(tuple)?)?;
+                    Some(regex.is_match(&text).into())
+                })
+            }
+            PlanExpression::DynamicRegex(text, pattern, flags) => {
                 let text = self.expression_evaluator(text);
                 let pattern = self.expression_evaluator(pattern);
                 let flags = flags.as_ref().map(|flags| self.expression_evaluator(flags));
                 let dataset = self.dataset.clone();
                 Rc::new(move |tuple| {
-                    let regex = compile_pattern(
-                        &dataset,
-                        &pattern(tuple)?,
-                        if let Some(flags) = &flags {
-                            Some(flags(tuple)?)
-                        } else {
-                            None
-                        },
-                    )?;
+                    let pattern = to_simple_string(&dataset, &pattern(tuple)?)?;
+                    let options = if let Some(flags) = &flags {
+                        Some(to_simple_string(&dataset, &flags(tuple)?)?)
+                    } else {
+                        None
+                    };
+                    let regex = compile_pattern(&pattern, options.as_deref())?;
                     let text = to_string(&dataset, &text(tuple)?)?;
                     Some(regex.is_match(&text).into())
                 })
@@ -2193,17 +2213,10 @@ fn to_argument_compatible_strings(
     }
 }
 
-fn compile_pattern(
-    dataset: &DatasetView,
-    pattern: &EncodedTerm,
-    flags: Option<EncodedTerm>,
-) -> Option<Regex> {
-    // TODO Avoid to compile the regex each time
-    let pattern = to_simple_string(dataset, pattern)?;
-    let mut regex_builder = RegexBuilder::new(&pattern);
+pub(super) fn compile_pattern(pattern: &str, flags: Option<&str>) -> Option<Regex> {
+    let mut regex_builder = RegexBuilder::new(pattern);
     regex_builder.size_limit(REGEX_SIZE_LIMIT);
     if let Some(flags) = flags {
-        let flags = to_simple_string(dataset, &flags)?;
         for flag in flags.chars() {
             match flag {
                 's' => {
diff --git a/lib/src/sparql/plan.rs b/lib/src/sparql/plan.rs
index 2fa1f9f7..fdb0922e 100644
--- a/lib/src/sparql/plan.rs
+++ b/lib/src/sparql/plan.rs
@@ -1,6 +1,7 @@
 use crate::model::NamedNode;
 use crate::storage::numeric_encoder::EncodedTerm;
 use oxrdf::Variable;
+use regex::Regex;
 use spargebra::algebra::GraphPattern;
 use std::cmp::max;
 use std::collections::btree_map::Entry;
@@ -420,7 +421,8 @@ pub enum PlanExpression {
     Concat(Vec<Self>),
     SubStr(Box<Self>, Box<Self>, Option<Box<Self>>),
     StrLen(Box<Self>),
-    Replace(Box<Self>, Box<Self>, Box<Self>, Option<Box<Self>>),
+    StaticReplace(Box<Self>, Regex, Box<Self>),
+    DynamicReplace(Box<Self>, Box<Self>, Box<Self>, Option<Box<Self>>),
     UCase(Box<Self>),
     LCase(Box<Self>),
     EncodeForUri(Box<Self>),
@@ -454,7 +456,8 @@ pub enum PlanExpression {
     IsBlank(Box<Self>),
     IsLiteral(Box<Self>),
     IsNumeric(Box<Self>),
-    Regex(Box<Self>, Box<Self>, Option<Box<Self>>),
+    StaticRegex(Box<Self>, Regex),
+    DynamicRegex(Box<Self>, Box<Self>, Option<Box<Self>>),
     Triple(Box<Self>, Box<Self>, Box<Self>),
     Subject(Box<Self>),
     Predicate(Box<Self>),
@@ -504,6 +507,7 @@ impl PlanExpression {
             | Self::LCase(e)
             | Self::StrLen(e)
             | Self::EncodeForUri(e)
+            | Self::StaticRegex(e, _)
             | Self::Year(e)
             | Self::Month(e)
             | Self::Day(e)
@@ -550,6 +554,7 @@ impl PlanExpression {
             | Self::Divide(a, b)
             | Self::LangMatches(a, b)
             | Self::Contains(a, b)
+            | Self::StaticReplace(a, _, b)
             | Self::StrStarts(a, b)
             | Self::StrEnds(a, b)
             | Self::StrBefore(a, b)
@@ -558,21 +563,21 @@ impl PlanExpression {
             | Self::StrDt(a, b)
             | Self::SameTerm(a, b)
             | Self::SubStr(a, b, None)
-            | Self::Regex(a, b, None)
+            | Self::DynamicRegex(a, b, None)
             | Self::Adjust(a, b) => {
                 a.lookup_used_variables(callback);
                 b.lookup_used_variables(callback);
             }
             Self::If(a, b, c)
             | Self::SubStr(a, b, Some(c))
-            | Self::Regex(a, b, Some(c))
-            | Self::Replace(a, b, c, None)
+            | Self::DynamicRegex(a, b, Some(c))
+            | Self::DynamicReplace(a, b, c, None)
             | Self::Triple(a, b, c) => {
                 a.lookup_used_variables(callback);
                 b.lookup_used_variables(callback);
                 c.lookup_used_variables(callback);
             }
-            Self::Replace(a, b, c, Some(d)) => {
+            Self::DynamicReplace(a, b, c, Some(d)) => {
                 a.lookup_used_variables(callback);
                 b.lookup_used_variables(callback);
                 c.lookup_used_variables(callback);
diff --git a/lib/src/sparql/plan_builder.rs b/lib/src/sparql/plan_builder.rs
index 57ec123b..61cbba58 100644
--- a/lib/src/sparql/plan_builder.rs
+++ b/lib/src/sparql/plan_builder.rs
@@ -1,11 +1,13 @@
 use crate::model::Term as OxTerm;
 use crate::sparql::dataset::DatasetView;
 use crate::sparql::error::EvaluationError;
+use crate::sparql::eval::compile_pattern;
 use crate::sparql::plan::*;
 use crate::storage::numeric_encoder::{EncodedTerm, EncodedTriple};
 use oxrdf::vocab::xsd;
 use oxrdf::TermRef;
 use rand::random;
+use regex::Regex;
 use spargebra::algebra::*;
 use spargebra::term::*;
 use std::collections::{BTreeSet, HashMap, HashSet};
@@ -486,17 +488,49 @@ impl<'a> PlanBuilder<'a> {
                     variables,
                     graph_name,
                 )?)),
-                Function::Replace => PlanExpression::Replace(
-                    Box::new(self.build_for_expression(&parameters[0], variables, graph_name)?),
-                    Box::new(self.build_for_expression(&parameters[1], variables, graph_name)?),
-                    Box::new(self.build_for_expression(&parameters[2], variables, graph_name)?),
-                    match parameters.get(3) {
-                        Some(flags) => Some(Box::new(
-                            self.build_for_expression(flags, variables, graph_name)?,
-                        )),
-                        None => None,
-                    },
-                ),
+                Function::Replace => {
+                    if let Some(static_regex) =
+                        compile_static_pattern_if_exists(&parameters[1], parameters.get(3))
+                    {
+                        PlanExpression::StaticReplace(
+                            Box::new(self.build_for_expression(
+                                &parameters[0],
+                                variables,
+                                graph_name,
+                            )?),
+                            static_regex,
+                            Box::new(self.build_for_expression(
+                                &parameters[2],
+                                variables,
+                                graph_name,
+                            )?),
+                        )
+                    } else {
+                        PlanExpression::DynamicReplace(
+                            Box::new(self.build_for_expression(
+                                &parameters[0],
+                                variables,
+                                graph_name,
+                            )?),
+                            Box::new(self.build_for_expression(
+                                &parameters[1],
+                                variables,
+                                graph_name,
+                            )?),
+                            Box::new(self.build_for_expression(
+                                &parameters[2],
+                                variables,
+                                graph_name,
+                            )?),
+                            match parameters.get(3) {
+                                Some(flags) => Some(Box::new(
+                                    self.build_for_expression(flags, variables, graph_name)?,
+                                )),
+                                None => None,
+                            },
+                        )
+                    }
+                }
                 Function::UCase => PlanExpression::UCase(Box::new(self.build_for_expression(
                     &parameters[0],
                     variables,
@@ -620,16 +654,39 @@ impl<'a> PlanBuilder<'a> {
                 Function::IsNumeric => PlanExpression::IsNumeric(Box::new(
                     self.build_for_expression(&parameters[0], variables, graph_name)?,
                 )),
-                Function::Regex => PlanExpression::Regex(
-                    Box::new(self.build_for_expression(&parameters[0], variables, graph_name)?),
-                    Box::new(self.build_for_expression(&parameters[1], variables, graph_name)?),
-                    match parameters.get(2) {
-                        Some(flags) => Some(Box::new(
-                            self.build_for_expression(flags, variables, graph_name)?,
-                        )),
-                        None => None,
-                    },
-                ),
+                Function::Regex => {
+                    if let Some(static_regex) =
+                        compile_static_pattern_if_exists(&parameters[1], parameters.get(2))
+                    {
+                        PlanExpression::StaticRegex(
+                            Box::new(self.build_for_expression(
+                                &parameters[0],
+                                variables,
+                                graph_name,
+                            )?),
+                            static_regex,
+                        )
+                    } else {
+                        PlanExpression::DynamicRegex(
+                            Box::new(self.build_for_expression(
+                                &parameters[0],
+                                variables,
+                                graph_name,
+                            )?),
+                            Box::new(self.build_for_expression(
+                                &parameters[1],
+                                variables,
+                                graph_name,
+                            )?),
+                            match parameters.get(2) {
+                                Some(flags) => Some(Box::new(
+                                    self.build_for_expression(flags, variables, graph_name)?,
+                                )),
+                                None => None,
+                            },
+                        )
+                    }
+                }
                 Function::Triple => PlanExpression::Triple(
                     Box::new(self.build_for_expression(&parameters[0], variables, graph_name)?),
                     Box::new(self.build_for_expression(&parameters[1], variables, graph_name)?),
@@ -1505,3 +1562,36 @@ fn add_pattern_variables<'a>(
         TermPattern::Triple(t) => add_pattern_variables(t, variables, blank_nodes),
     }
 }
+
+/// Tries to compile the regular expression while the query plan is being built.
+///
+/// This only succeeds when `pattern` is an `xsd:string` literal and `options` is
+/// either absent or also an `xsd:string` literal.
+///
+/// `None` is returned when one of them is any other kind of expression (including
+/// a literal with a different datatype) or when `compile_pattern` returns `None`.
+/// Callers are then expected to compile the regex during the evaluation instead.
+fn compile_static_pattern_if_exists(
+    pattern: &Expression,
+    options: Option<&Expression>,
+) -> Option<Regex> {
+    // Lexical value of an expression that is an `xsd:string` literal, if it is one.
+    fn static_string(expression: &Expression) -> Option<&str> {
+        match expression {
+            Expression::Literal(literal) if literal.datatype() == xsd::STRING => {
+                Some(literal.value())
+            }
+            _ => None,
+        }
+    }
+
+    // Without a constant pattern there is nothing to precompile.
+    let pattern_text = static_string(pattern)?;
+    // Missing flags are fine, but flags that are given and not constant force
+    // the compilation to happen at evaluation time.
+    let flags_text = match options {
+        Some(flags) => Some(static_string(flags)?),
+        None => None,
+    };
+    compile_pattern(pattern_text, flags_text)
+}