Makes sparopt figure out good join keys

pull/572/head
Thomas 1 year ago committed by Thomas Tanon
parent cdabe52847
commit c31ba0e823
  1. 62
      lib/sparopt/src/algebra.rs
  2. 171
      lib/sparopt/src/optimizer.rs
  3. 11
      lib/sparopt/src/type_inference.rs
  4. 22
      lib/src/sparql/eval.rs
  5. 139
      lib/src/sparql/plan.rs
  6. 28
      lib/src/sparql/plan_builder.rs

@ -660,6 +660,7 @@ pub enum GraphPattern {
left: Box<Self>,
right: Box<Self>,
expression: Expression,
algorithm: LeftJoinAlgorithm,
},
/// Lateral join, i.e. evaluate right for each result row of left
#[cfg(feature = "sep-0006")]
@ -678,7 +679,11 @@ pub enum GraphPattern {
expression: Expression,
},
/// [Minus](https://www.w3.org/TR/sparql11-query/#defn_algMinus).
Minus { left: Box<Self>, right: Box<Self> },
Minus {
left: Box<Self>,
right: Box<Self>,
algorithm: MinusAlgorithm,
},
/// A table used to provide inline values
Values {
variables: Vec<Variable>,
@ -784,7 +789,12 @@ impl GraphPattern {
}
}
pub fn left_join(left: Self, right: Self, expression: Expression) -> Self {
pub fn left_join(
left: Self,
right: Self,
expression: Expression,
algorithm: LeftJoinAlgorithm,
) -> Self {
let expression_ebv = expression.effective_boolean_value();
if left.is_empty()
|| right.is_empty()
@ -801,10 +811,11 @@ impl GraphPattern {
} else {
expression
},
algorithm,
}
}
pub fn minus(left: Self, right: Self) -> Self {
pub fn minus(left: Self, right: Self, algorithm: MinusAlgorithm) -> Self {
if left.is_empty() {
return Self::empty();
}
@ -814,6 +825,7 @@ impl GraphPattern {
Self::Minus {
left: Box::new(left),
right: Box::new(right),
algorithm,
}
}
@ -1046,7 +1058,7 @@ impl GraphPattern {
child.lookup_used_variables(callback);
}
}
Self::Join { left, right, .. } | Self::Minus { left, right } => {
Self::Join { left, right, .. } | Self::Minus { left, right, .. } => {
left.lookup_used_variables(callback);
right.lookup_used_variables(callback);
}
@ -1059,6 +1071,7 @@ impl GraphPattern {
left,
right,
expression,
..
} => {
expression.lookup_used_variables(callback);
left.lookup_used_variables(callback);
@ -1148,6 +1161,7 @@ impl GraphPattern {
|| true.into(),
|e| Expression::from_sparql_algebra(e, graph_name),
),
algorithm: LeftJoinAlgorithm::default(),
},
#[cfg(feature = "sep-0006")]
AlGraphPattern::Lateral { left, right } => Self::Lateral {
@ -1179,6 +1193,7 @@ impl GraphPattern {
AlGraphPattern::Minus { left, right } => Self::Minus {
left: Box::new(Self::from_sparql_algebra(left, graph_name, blank_nodes)),
right: Box::new(Self::from_sparql_algebra(right, graph_name, blank_nodes)),
algorithm: MinusAlgorithm::default(),
},
AlGraphPattern::Values {
variables,
@ -1365,6 +1380,7 @@ impl From<&GraphPattern> for AlGraphPattern {
left,
right,
expression,
..
} => {
let empty_expr = if let Expression::Literal(l) = expression {
l.datatype() == xsd::BOOLEAN && l.value() == "true"
@ -1418,7 +1434,7 @@ impl From<&GraphPattern> for AlGraphPattern {
expression: expression.into(),
variable: variable.clone(),
},
GraphPattern::Minus { left, right } => Self::Minus {
GraphPattern::Minus { left, right, .. } => Self::Minus {
left: Box::new(left.as_ref().into()),
right: Box::new(right.as_ref().into()),
},
@ -1478,14 +1494,44 @@ impl From<&GraphPattern> for AlGraphPattern {
}
/// The join algorithm used (c.f. [`GraphPattern::Join`]).
#[derive(Eq, PartialEq, Debug, Clone, Copy, Hash)]
#[derive(Eq, PartialEq, Debug, Clone, Hash)]
pub enum JoinAlgorithm {
HashBuildLeftProbeRight,
HashBuildLeftProbeRight { keys: Vec<Variable> },
}
impl Default for JoinAlgorithm {
fn default() -> Self {
Self::HashBuildLeftProbeRight
Self::HashBuildLeftProbeRight {
keys: Vec::default(),
}
}
}
/// The left join algorithm used (c.f. [`GraphPattern::LeftJoin`]).
///
/// The algorithm carries the join keys so that they are chosen once, when the
/// pattern is optimized, instead of being recomputed when the plan is evaluated.
#[derive(Eq, PartialEq, Debug, Clone, Hash)]
pub enum LeftJoinAlgorithm {
/// Hash-based left join.
///
/// `keys` lists the variables used as hash join keys. The list is empty by
/// default and is filled in by the optimizer when it reorders joins.
HashBuildRightProbeLeft { keys: Vec<Variable> },
}
impl Default for LeftJoinAlgorithm {
    /// Returns the `HashBuildRightProbeLeft` variant with an empty list of join keys.
    fn default() -> Self {
        // No key is known before the optimizer has looked at both operands.
        let keys = Vec::new();
        Self::HashBuildRightProbeLeft { keys }
    }
}
/// The minus algorithm used (c.f. [`GraphPattern::Minus`]).
///
/// The algorithm carries the join keys so that they are chosen once, when the
/// pattern is optimized, instead of being recomputed when the plan is evaluated.
#[derive(Eq, PartialEq, Debug, Clone, Hash)]
pub enum MinusAlgorithm {
/// Hash-based minus (anti join).
///
/// `keys` lists the variables used as hash join keys. The list is empty by
/// default and is filled in by the optimizer when it reorders joins.
HashBuildRightProbeLeft { keys: Vec<Variable> },
}
impl Default for MinusAlgorithm {
    /// Returns the `HashBuildRightProbeLeft` variant with an empty list of join keys.
    fn default() -> Self {
        // No key is known before the optimizer has looked at both operands.
        let keys = Vec::new();
        Self::HashBuildRightProbeLeft { keys }
    }
}

@ -1,7 +1,10 @@
use crate::algebra::{Expression, GraphPattern, JoinAlgorithm, OrderExpression};
use crate::algebra::{
Expression, GraphPattern, JoinAlgorithm, LeftJoinAlgorithm, MinusAlgorithm, OrderExpression,
};
use crate::type_inference::{
infer_expression_type, infer_graph_pattern_types, VariableType, VariableTypes,
};
use oxrdf::Variable;
use spargebra::algebra::PropertyPathExpression;
use spargebra::term::{GroundTermPattern, NamedNodePattern};
use std::cmp::{max, min};
@ -53,6 +56,7 @@ impl Optimizer {
left,
right,
expression,
algorithm,
} => {
let left = Self::normalize_pattern(*left, input_types);
let right = Self::normalize_pattern(*right, input_types);
@ -62,6 +66,7 @@ impl Optimizer {
left,
right,
Self::normalize_expression(expression, &inner_types),
algorithm,
)
}
#[cfg(feature = "sep-0006")]
@ -103,9 +108,14 @@ impl Optimizer {
GraphPattern::extend(inner, variable, expression)
}
}
GraphPattern::Minus { left, right } => GraphPattern::minus(
GraphPattern::Minus {
left,
right,
algorithm,
} => GraphPattern::minus(
Self::normalize_pattern(*left, input_types),
Self::normalize_pattern(*right, input_types),
algorithm,
),
GraphPattern::Values {
variables,
@ -336,6 +346,7 @@ impl Optimizer {
left,
right,
expression,
algorithm,
} => {
let left_types = infer_graph_pattern_types(&left, input_types.clone());
let right_types = infer_graph_pattern_types(&right, input_types.clone());
@ -364,13 +375,19 @@ impl Optimizer {
Self::push_filters(*left, left_filters, input_types),
Self::push_filters(*right, right_filters, input_types),
expression,
algorithm,
),
Expression::and_all(final_filters),
)
}
GraphPattern::Minus { left, right } => GraphPattern::minus(
GraphPattern::Minus {
left,
right,
algorithm,
} => GraphPattern::minus(
Self::push_filters(*left, filters, input_types),
Self::push_filters(*right, Vec::new(), input_types),
algorithm,
),
GraphPattern::Extend {
inner,
@ -503,11 +520,7 @@ impl Optimizer {
.enumerate()
.filter_map(|(i, v)| v.then(|| i))
.filter(|i| {
count_common_variables(
&output_types,
&to_reorder_types[*i],
input_types,
) > 0
has_common_variables(&output_types, &to_reorder_types[*i], input_types)
})
.min_by_key(|i| {
// Estimation of the join cost
@ -527,10 +540,14 @@ impl Optimizer {
} else {
estimate_join_cost(
&output,
&output_types,
&to_reorder[*i],
&JoinAlgorithm::HashBuildLeftProbeRight {
keys: join_key_variables(
&output_types,
&to_reorder_types[*i],
JoinAlgorithm::HashBuildLeftProbeRight,
input_types,
),
},
input_types,
)
}
@ -547,7 +564,13 @@ impl Optimizer {
GraphPattern::join(
output,
next,
JoinAlgorithm::HashBuildLeftProbeRight,
JoinAlgorithm::HashBuildLeftProbeRight {
keys: join_key_variables(
&output_types,
&to_reorder_types[next_id],
input_types,
),
},
)
};
}
@ -556,7 +579,13 @@ impl Optimizer {
output = GraphPattern::join(
output,
next,
JoinAlgorithm::HashBuildLeftProbeRight,
JoinAlgorithm::HashBuildLeftProbeRight {
keys: join_key_variables(
&output_types,
&to_reorder_types[next_id],
input_types,
),
},
);
}
output_types.intersect_with(to_reorder_types[next_id].clone());
@ -566,12 +595,25 @@ impl Optimizer {
output_cartesian_product_joins
.into_iter()
.reduce(|left, right| {
let keys = join_key_variables(
&infer_graph_pattern_types(&left, input_types.clone()),
&infer_graph_pattern_types(&right, input_types.clone()),
input_types,
);
if estimate_graph_pattern_size(&left, input_types)
<= estimate_graph_pattern_size(&right, input_types)
{
GraphPattern::join(left, right, JoinAlgorithm::HashBuildLeftProbeRight)
GraphPattern::join(
left,
right,
JoinAlgorithm::HashBuildLeftProbeRight { keys },
)
} else {
GraphPattern::join(right, left, JoinAlgorithm::HashBuildLeftProbeRight)
GraphPattern::join(
right,
left,
JoinAlgorithm::HashBuildLeftProbeRight { keys },
)
}
})
.unwrap()
@ -588,15 +630,16 @@ impl Optimizer {
left,
right,
expression,
..
} => {
let left = Self::reorder_joins(*left, input_types);
let left_types = infer_graph_pattern_types(&left, input_types.clone());
let right = Self::reorder_joins(*right, input_types);
let right_types = infer_graph_pattern_types(&right, input_types.clone());
#[cfg(feature = "sep-0006")]
{
let left_types = infer_graph_pattern_types(&left, input_types.clone());
let right_types = infer_graph_pattern_types(&right, input_types.clone());
if is_fit_for_for_loop_join(&right, input_types, &left_types)
&& count_common_variables(&left_types, &right_types, input_types) > 0
&& has_common_variables(&left_types, &right_types, input_types)
{
return GraphPattern::lateral(
left,
@ -604,16 +647,33 @@ impl Optimizer {
GraphPattern::empty_singleton(),
right,
expression,
LeftJoinAlgorithm::HashBuildRightProbeLeft { keys: Vec::new() },
),
);
}
}
GraphPattern::left_join(left, right, expression)
GraphPattern::left_join(
left,
right,
expression,
LeftJoinAlgorithm::HashBuildRightProbeLeft {
keys: join_key_variables(&left_types, &right_types, input_types),
},
)
}
GraphPattern::Minus { left, right, .. } => {
let left = Self::reorder_joins(*left, input_types);
let left_types = infer_graph_pattern_types(&left, input_types.clone());
let right = Self::reorder_joins(*right, input_types);
let right_types = infer_graph_pattern_types(&right, input_types.clone());
GraphPattern::minus(
left,
right,
MinusAlgorithm::HashBuildRightProbeLeft {
keys: join_key_variables(&left_types, &right_types, input_types),
},
)
}
GraphPattern::Minus { left, right } => GraphPattern::minus(
Self::reorder_joins(*left, input_types),
Self::reorder_joins(*right, input_types),
),
GraphPattern::Extend {
inner,
expression,
@ -685,6 +745,7 @@ fn is_fit_for_for_loop_join(
left,
right,
expression,
..
} => {
if !is_fit_for_for_loop_join(left, global_input_types, entry_types) {
return false;
@ -802,17 +863,28 @@ fn is_expression_fit_for_for_loop_join(
}
}
fn count_common_variables(
fn has_common_variables(
left: &VariableTypes,
right: &VariableTypes,
input_types: &VariableTypes,
) -> usize {
) -> bool {
// TODO: we should be smart and count as shared variables FILTER(?a = ?b)
left.iter().any(|(variable, left_type)| {
!left_type.undef && !right.get(variable).undef && input_types.get(variable).undef
})
}
fn join_key_variables(
left: &VariableTypes,
right: &VariableTypes,
input_types: &VariableTypes,
) -> Vec<Variable> {
left.iter()
.filter(|(variable, left_type)| {
!left_type.undef && !right.get(variable).undef && input_types.get(variable).undef
})
.count()
.map(|(variable, _)| variable.clone())
.collect()
}
fn estimate_graph_pattern_size(pattern: &GraphPattern, input_types: &VariableTypes) -> usize {
@ -842,35 +914,26 @@ fn estimate_graph_pattern_size(pattern: &GraphPattern, input_types: &VariableTyp
left,
right,
algorithm,
} => {
let left_types = infer_graph_pattern_types(left, input_types.clone());
let right_types = infer_graph_pattern_types(right, input_types.clone());
estimate_join_cost(
} => estimate_join_cost(left, right, algorithm, input_types),
GraphPattern::LeftJoin {
left,
&left_types,
right,
&right_types,
*algorithm,
input_types,
)
}
GraphPattern::LeftJoin { left, right, .. } => {
algorithm,
..
} => match algorithm {
LeftJoinAlgorithm::HashBuildRightProbeLeft { keys } => {
let left_size = estimate_graph_pattern_size(left, input_types);
let left_types = infer_graph_pattern_types(left, input_types.clone());
let right_types = infer_graph_pattern_types(right, input_types.clone());
max(
left_size,
left_size
.saturating_mul(estimate_graph_pattern_size(right, &right_types))
.saturating_div(
1_000_usize.saturating_pow(
count_common_variables(&left_types, &right_types, input_types)
.try_into()
.unwrap(),
),
),
.saturating_mul(estimate_graph_pattern_size(
right,
&infer_graph_pattern_types(right, input_types.clone()),
))
.saturating_div(1_000_usize.saturating_pow(keys.len().try_into().unwrap())),
)
}
},
#[cfg(feature = "sep-0006")]
GraphPattern::Lateral { left, right } => estimate_lateral_cost(
left,
@ -908,22 +971,16 @@ fn estimate_graph_pattern_size(pattern: &GraphPattern, input_types: &VariableTyp
fn estimate_join_cost(
left: &GraphPattern,
left_types: &VariableTypes,
right: &GraphPattern,
right_types: &VariableTypes,
algorithm: JoinAlgorithm,
algorithm: &JoinAlgorithm,
input_types: &VariableTypes,
) -> usize {
match algorithm {
JoinAlgorithm::HashBuildLeftProbeRight => estimate_graph_pattern_size(left, input_types)
JoinAlgorithm::HashBuildLeftProbeRight { keys } => {
estimate_graph_pattern_size(left, input_types)
.saturating_mul(estimate_graph_pattern_size(right, input_types))
.saturating_div(
1_000_usize.saturating_pow(
count_common_variables(left_types, right_types, input_types)
.try_into()
.unwrap(),
),
),
.saturating_div(1_000_usize.saturating_pow(keys.len().try_into().unwrap()))
}
}
}
fn estimate_lateral_cost(

@ -124,11 +124,20 @@ pub fn infer_graph_pattern_types(
}
types
}
GraphPattern::Service { name, inner, .. } => {
GraphPattern::Service {
name,
inner,
silent,
} => {
let parent_types = types.clone();
let mut types = infer_graph_pattern_types(inner, types);
if let NamedNodePattern::Variable(v) = name {
types.intersect_variable_with(v.clone(), VariableType::NAMED_NODE)
}
if *silent {
// On failure, single empty solution
types.union_with(parent_types);
}
types
}
}

@ -380,12 +380,9 @@ impl SimpleEvaluator {
PlanNode::HashJoin {
probe_child,
build_child,
keys,
} => {
let join_keys: Vec<_> = probe_child
.always_bound_variables()
.intersection(&build_child.always_bound_variables())
.copied()
.collect();
let join_keys = keys.iter().map(|v| v.encoded).collect::<Vec<_>>();
let (probe, probe_stats) = self.plan_evaluator(probe_child);
stat_children.push(probe_stats);
let (build, build_stats) = self.plan_evaluator(build_child);
@ -444,12 +441,8 @@ impl SimpleEvaluator {
}))
})
}
PlanNode::AntiJoin { left, right } => {
let join_keys: Vec<_> = left
.always_bound_variables()
.intersection(&right.always_bound_variables())
.copied()
.collect();
PlanNode::AntiJoin { left, right, keys } => {
let join_keys = keys.iter().map(|v| v.encoded).collect::<Vec<_>>();
let (left, left_stats) = self.plan_evaluator(left);
stat_children.push(left_stats);
let (right, right_stats) = self.plan_evaluator(right);
@ -487,12 +480,9 @@ impl SimpleEvaluator {
left,
right,
expression,
keys,
} => {
let join_keys: Vec<_> = left
.always_bound_variables()
.intersection(&right.always_bound_variables())
.copied()
.collect();
let join_keys = keys.iter().map(|v| v.encoded).collect::<Vec<_>>();
let (left, left_stats) = self.plan_evaluator(left);
stat_children.push(left_stats);
let (right, right_stats) = self.plan_evaluator(right);

@ -4,9 +4,7 @@ use crate::storage::numeric_encoder::EncodedTerm;
use regex::Regex;
use spargebra::algebra::GraphPattern;
use spargebra::term::GroundTerm;
use std::cmp::max;
use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, BTreeSet};
use std::collections::BTreeSet;
use std::fmt;
use std::rc::Rc;
@ -40,6 +38,7 @@ pub enum PlanNode {
HashJoin {
probe_child: Rc<Self>,
build_child: Rc<Self>,
keys: Vec<PlanVariable>,
},
/// Right nested in left loop
ForLoopJoin {
@ -50,6 +49,7 @@ pub enum PlanNode {
AntiJoin {
left: Rc<Self>,
right: Rc<Self>,
keys: Vec<PlanVariable>,
},
Filter {
child: Rc<Self>,
@ -63,6 +63,7 @@ pub enum PlanNode {
left: Rc<Self>,
right: Rc<Self>,
expression: Box<PlanExpression>,
keys: Vec<PlanVariable>,
},
/// right nested in left loop
ForLoopLeftJoin {
@ -159,9 +160,10 @@ impl PlanNode {
Self::HashJoin {
probe_child: left,
build_child: right,
..
}
| Self::ForLoopJoin { left, right, .. }
| Self::AntiJoin { left, right }
| Self::AntiJoin { left, right, .. }
| Self::ForLoopLeftJoin { left, right, .. } => {
left.lookup_used_variables(callback);
right.lookup_used_variables(callback);
@ -170,6 +172,7 @@ impl PlanNode {
left,
right,
expression,
..
} => {
left.lookup_used_variables(callback);
right.lookup_used_variables(callback);
@ -219,134 +222,6 @@ impl PlanNode {
}
}
}
/// Returns subset of the set of variables that are always bound in the result set
///
/// (subset because this function is not perfect yet)
pub fn always_bound_variables(&self) -> BTreeSet<usize> {
let mut set = BTreeSet::default();
self.lookup_always_bound_variables(&mut |v| {
set.insert(v);
});
set
}
pub fn lookup_always_bound_variables(&self, callback: &mut impl FnMut(usize)) {
match self {
Self::StaticBindings { encoded_tuples, .. } => {
let mut variables = BTreeMap::default(); // value true iff always bound
let max_tuple_length = encoded_tuples
.iter()
.map(EncodedTuple::capacity)
.fold(0, max);
for tuple in encoded_tuples {
for key in 0..max_tuple_length {
match variables.entry(key) {
Entry::Vacant(e) => {
e.insert(tuple.contains(key));
}
Entry::Occupied(mut e) => {
if !tuple.contains(key) {
e.insert(false);
}
}
}
}
}
for (k, v) in variables {
if v {
callback(k);
}
}
}
Self::QuadPattern {
subject,
predicate,
object,
graph_name,
} => {
subject.lookup_variables(callback);
predicate.lookup_variables(callback);
object.lookup_variables(callback);
graph_name.lookup_variables(callback);
}
Self::PathPattern {
subject,
object,
graph_name,
..
} => {
subject.lookup_variables(callback);
object.lookup_variables(callback);
graph_name.lookup_variables(callback);
}
Self::Filter { child, .. } => {
//TODO: have a look at the expression to know if it filters out unbound variables
child.lookup_always_bound_variables(callback);
}
Self::Union { children } => {
if let Some(vars) = children
.iter()
.map(|c| c.always_bound_variables())
.reduce(|a, b| a.intersection(&b).copied().collect())
{
for v in vars {
callback(v);
}
}
}
Self::HashJoin {
probe_child: left,
build_child: right,
}
| Self::ForLoopJoin { left, right, .. } => {
left.lookup_always_bound_variables(callback);
right.lookup_always_bound_variables(callback);
}
Self::AntiJoin { left, .. }
| Self::HashLeftJoin { left, .. }
| Self::ForLoopLeftJoin { left, .. } => {
left.lookup_always_bound_variables(callback);
}
Self::Extend {
child,
variable,
expression,
} => {
if matches!(
expression.as_ref(),
PlanExpression::NamedNode(_) | PlanExpression::Literal(_)
) {
// TODO: more cases?
callback(variable.encoded);
}
child.lookup_always_bound_variables(callback);
}
Self::Sort { child, .. }
| Self::HashDeduplicate { child }
| Self::Reduced { child }
| Self::Skip { child, .. }
| Self::Limit { child, .. } => child.lookup_always_bound_variables(callback),
Self::Service { child, silent, .. } => {
if *silent {
// none, might return a null tuple
} else {
child.lookup_always_bound_variables(callback)
}
}
Self::Project { mapping, child } => {
let child_bound = child.always_bound_variables();
for (child_i, output_i) in mapping.iter() {
if child_bound.contains(&child_i.encoded) {
callback(output_i.encoded);
}
}
}
Self::Aggregate { .. } => {
//TODO
}
}
}
}
#[derive(Debug, Clone)]

@ -106,25 +106,37 @@ impl<'a> PlanBuilder<'a> {
right,
algorithm,
} => match algorithm {
JoinAlgorithm::HashBuildLeftProbeRight => PlanNode::HashJoin {
JoinAlgorithm::HashBuildLeftProbeRight { keys } => PlanNode::HashJoin {
build_child: Rc::new(self.build_for_graph_pattern(left, variables)?),
probe_child: Rc::new(self.build_for_graph_pattern(right, variables)?),
keys: keys
.iter()
.map(|v| build_plan_variable(variables, v))
.collect(),
},
},
GraphPattern::LeftJoin {
left,
right,
expression,
} => PlanNode::HashLeftJoin {
algorithm,
} => match algorithm {
LeftJoinAlgorithm::HashBuildRightProbeLeft { keys } => PlanNode::HashLeftJoin {
left: Rc::new(self.build_for_graph_pattern(left, variables)?),
right: Rc::new(self.build_for_graph_pattern(right, variables)?),
expression: Box::new(self.build_for_expression(expression, variables)?),
keys: keys
.iter()
.map(|v| build_plan_variable(variables, v))
.collect(),
},
},
GraphPattern::Lateral { left, right } => {
if let GraphPattern::LeftJoin {
left: nested_left,
right: nested_right,
expression,
..
} = right.as_ref()
{
if nested_left.is_empty_singleton() {
@ -167,9 +179,19 @@ impl<'a> PlanBuilder<'a> {
variable: build_plan_variable(variables, variable),
expression: Box::new(self.build_for_expression(expression, variables)?),
},
GraphPattern::Minus { left, right } => PlanNode::AntiJoin {
GraphPattern::Minus {
left,
right,
algorithm,
} => match algorithm {
MinusAlgorithm::HashBuildRightProbeLeft { keys } => PlanNode::AntiJoin {
left: Rc::new(self.build_for_graph_pattern(left, variables)?),
right: Rc::new(self.build_for_graph_pattern(right, variables)?),
keys: keys
.iter()
.map(|v| build_plan_variable(variables, v))
.collect(),
},
},
GraphPattern::Service {
name,

Loading…
Cancel
Save