apache · xiedeyantu · May 19, 2026 · Dandandan · May 19, 2026 · xiedeyantu
diff --git a/datafusion/core/src/optimizer_rule_reference.md b/datafusion/core/src/optimizer_rule_reference.md
@@ -47,21 +47,22 @@ Rule order matters. The default pipeline may change between releases.
 | 8     | `scalar_subquery_to_join`                 | Rewrites eligible scalar subqueries into joins and adds schema-preserving projections.                                      |
 | 9     | `decorrelate_lateral_join`                | Rewrites eligible lateral joins into regular joins.                                                                         |
 | 10    | `extract_equijoin_predicate`              | Splits join filters into equijoin keys and residual predicates.                                                             |
-| 11    | `eliminate_duplicated_expr`               | Removes duplicate expressions from projections, aggregates, and similar operators.                                          |
-| 12    | `eliminate_filter`                        | Drops always-true filters and replaces always-false or NULL filters with empty relations.                                   |
-| 13    | `eliminate_cross_join`                    | Uses filter predicates to replace cross joins with inner joins when join keys can be found.                                 |
-| 14    | `eliminate_limit`                         | Removes no-op limits and simplifies trivial limit shapes.                                                                   |
-| 15    | `propagate_empty_relation`                | Pushes empty-relation knowledge upward so operators fed by no rows collapse early.                                          |
-| 16    | `filter_null_join_keys`                   | Adds `IS NOT NULL` filters to nullable equijoin keys that can never match.                                                  |
-| 17    | `eliminate_outer_join`                    | Rewrites outer joins to inner joins when later filters reject the NULL-extended rows.                                       |
-| 18    | `push_down_limit`                         | Moves literal limits closer to scans and unions and merges adjacent limits.                                                 |
-| 19    | `push_down_filter`                        | Moves filters as early as possible through filter-commutative operators.                                                    |
-| 20    | `single_distinct_aggregation_to_group_by` | Rewrites single-column `DISTINCT` aggregations into two-stage `GROUP BY` plans.                                             |
-| 21    | `eliminate_group_by_constant`             | Removes constant or functionally redundant expressions from `GROUP BY`.                                                     |
-| 22    | `common_sub_expression_eliminate`         | Computes repeated subexpressions once and reuses the result.                                                                |
-| 23    | `extract_leaf_expressions`                | Pulls cheap leaf expressions closer to data sources so later pruning and filter rules can act earlier.                      |
-| 24    | `push_down_leaf_projections`              | Pushes the helper projections created by leaf extraction toward leaf inputs.                                                |
-| 25    | `optimize_projections`                    | Prunes unused columns and removes unnecessary logical projections.                                                          |
+| 11    | `expand_join_or_predicate`               | Rewrites inner joins whose OR branches are all hashjoin-capable equijoin predicates into a `UNION ALL` of inner joins.     |
+| 12    | `eliminate_duplicated_expr`               | Removes duplicate expressions from projections, aggregates, and similar operators.                                          |
+| 13    | `eliminate_filter`                        | Drops always-true filters and replaces always-false or NULL filters with empty relations.                                   |
+| 14    | `eliminate_cross_join`                    | Uses filter predicates to replace cross joins with inner joins when join keys can be found.                                 |
+| 15    | `eliminate_limit`                         | Removes no-op limits and simplifies trivial limit shapes.                                                                   |
+| 16    | `propagate_empty_relation`                | Pushes empty-relation knowledge upward so operators fed by no rows collapse early.                                          |
+| 17    | `filter_null_join_keys`                   | Adds `IS NOT NULL` filters to nullable equijoin keys that can never match.                                                  |
+| 18    | `eliminate_outer_join`                    | Rewrites outer joins to inner joins when later filters reject the NULL-extended rows.                                       |
+| 19    | `push_down_limit`                         | Moves literal limits closer to scans and unions and merges adjacent limits.                                                 |
+| 20    | `push_down_filter`                        | Moves filters as early as possible through filter-commutative operators.                                                    |
+| 21    | `single_distinct_aggregation_to_group_by` | Rewrites single-column `DISTINCT` aggregations into two-stage `GROUP BY` plans.                                             |
+| 22    | `eliminate_group_by_constant`             | Removes constant or functionally redundant expressions from `GROUP BY`.                                                     |
+| 23    | `common_sub_expression_eliminate`         | Computes repeated subexpressions once and reuses the result.                                                                |
+| 24    | `extract_leaf_expressions`                | Pulls cheap leaf expressions closer to data sources so later pruning and filter rules can act earlier.                      |
+| 25    | `push_down_leaf_projections`              | Pushes the helper projections created by leaf extraction toward leaf inputs.                                                |
+| 26    | `optimize_projections`                    | Prunes unused columns and removes unnecessary logical projections.                                                          |
 
 ### Physical Optimizer Rules
 

diff --git a/datafusion/optimizer/src/eliminate_cross_join.rs b/datafusion/optimizer/src/eliminate_cross_join.rs
@@ -310,6 +310,11 @@ fn find_inner_join(
 ) -> Result<LogicalPlan> {
     for (i, right_input) in rights.iter().enumerate() {
         let mut join_keys = vec![];
+        let candidate_join_schema = Arc::new(build_join_schema(
+            left_input.schema(),
+            right_input.schema(),
+            &JoinType::Inner,
+        )?);
 
         for (l, r) in possible_join_keys.iter() {
             let key_pair = find_valid_equijoin_key_pair(
@@ -321,7 +326,7 @@ fn find_inner_join(
 
             // Save join keys
             if let Some((valid_l, valid_r)) = key_pair
-                && can_hash(&valid_l.get_type(left_input.schema())?)
+                && can_hash(&valid_l.get_type(candidate_join_schema.as_ref())?)
             {
                 join_keys.push((valid_l, valid_r));
             }
@@ -331,11 +336,7 @@ fn find_inner_join(
         if !join_keys.is_empty() {
             all_join_keys.insert_all(join_keys.iter());
             let right_input = rights.remove(i);
-            let join_schema = Arc::new(build_join_schema(
-                left_input.schema(),
-                right_input.schema(),
-                &JoinType::Inner,
-            )?);
+            let join_schema = candidate_join_schema;
 
             return Ok(LogicalPlan::Join(Join {
                 left: Arc::new(left_input),

diff --git a/datafusion/optimizer/src/expand_join_or_predicate.rs b/datafusion/optimizer/src/expand_join_or_predicate.rs
@@ -0,0 +1,220 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! [`ExpandJoinOrPredicate`] rewrites inner joins with OR filters into a UNION ALL
+//! of mutually exclusive hashjoin-capable inner joins.
+
+use crate::optimizer::ApplyOrder;
+use crate::{OptimizerConfig, OptimizerRule};
+use std::sync::Arc;
+
+use datafusion_common::tree_node::Transformed;
+use datafusion_common::{DFSchema, NullEquality, Result};
+use datafusion_expr::logical_plan::{Join, LogicalPlan, Projection, Union};
+use datafusion_expr::utils::{
+    can_hash, find_valid_equijoin_key_pair, split_binary_owned, split_conjunction,
+};
+use datafusion_expr::{Expr, ExprSchemable, JoinType, Operator};
+
+/// Optimizer rule that expands an inner join whose filter is an `OR` of equijoin
+/// predicates into a `UNION ALL` of inner joins, one per `OR` branch.
+///
+/// For `l INNER JOIN r ON l.a = r.a OR l.b = r.b` the rewritten plan is shaped like:
+///
+/// ```text
+/// Union
+///   Projection
+///     Inner Join on l.a = r.a
+///   Projection
+///     Inner Join on l.b = r.b, filter (l.a = r.a) IS NOT TRUE
+/// ```
+///
+/// Each branch after the first is filtered by `IS NOT TRUE` on every earlier
+/// disjunct, so a row pair that satisfies several disjuncts is produced by exactly
+/// one branch and the `UNION ALL` does not introduce duplicates.
+#[derive(Default, Debug)]
+pub struct ExpandJoinOrPredicate;
+
+impl ExpandJoinOrPredicate {
+    #[expect(missing_docs)]
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+impl OptimizerRule for ExpandJoinOrPredicate {
+    fn name(&self) -> &str {
+        "expand_join_or_predicate"
+    }
+
+    fn supports_rewrite(&self) -> bool {
+        true
+    }
+
+    fn apply_order(&self) -> Option<ApplyOrder> {
+        Some(ApplyOrder::BottomUp)
+    }
+
+    // Only `Join` nodes are candidates; every other plan node is returned untouched.
+    fn rewrite(
+        &self,
+        plan: LogicalPlan,
+        _config: &dyn OptimizerConfig,
+    ) -> Result<Transformed<LogicalPlan>> {
+        match plan {
+            LogicalPlan::Join(join) => rewrite_join(join),
+            _ => Ok(Transformed::no(plan)),
+        }
+    }
+}
+
+/// Expands `join` into a `UNION ALL` of inner joins, one per `OR` branch of its
+/// filter, adding each branch's equijoin predicates to that branch's `on` keys.
+///
+/// The join is returned unchanged (`Transformed::no`) when:
+/// - it is not an inner join, or it is null-aware;
+/// - its `on` keys compare with [`NullEquality::NullEqualsNull`];
+/// - it has no filter, the filter is volatile, or the filter is not an `OR`;
+/// - any disjunct is not made up solely of hashable equijoin predicates.
+fn rewrite_join(join: Join) -> Result<Transformed<LogicalPlan>> {
+    if join.join_type != JoinType::Inner || join.null_aware {
+        return Ok(Transformed::no(LogicalPlan::Join(join)));
+    }
+
+    // `null_equality` governs every pair in `on`. The disjuncts are plain `=`
+    // comparisons, which never match on NULL, so moving them into the keys of a
+    // `NullEqualsNull` join would make NULL keys match and produce extra rows.
+    if matches!(join.null_equality, NullEquality::NullEqualsNull) {
+        return Ok(Transformed::no(LogicalPlan::Join(join)));
+    }
+
+    let Some(filter) = join.filter.clone() else {
+        return Ok(Transformed::no(LogicalPlan::Join(join)));
+    };
+
+    // Each disjunct is used by more than one branch (as keys, then as a guard),
+    // so a volatile filter could evaluate differently between branches.
+    if filter.is_volatile() {
+        return Ok(Transformed::no(LogicalPlan::Join(join)));
+    }
+
+    let disjuncts = split_binary_owned(filter, Operator::Or);
+    if disjuncts.len() < 2 {
+        return Ok(Transformed::no(LogicalPlan::Join(join)));
+    }
+
+    let left_schema = join.left.schema();
+    let right_schema = join.right.schema();
+
+    // A `None` for any disjunct aborts the rewrite: every branch needs join keys.
+    let Some(branch_keys) = disjuncts
+        .iter()
+        .map(|expr| extract_hashjoin_keys(expr, left_schema, right_schema))
+        .collect::<Result<Option<Vec<Vec<(Expr, Expr)>>>>>()?
+    else {
+        return Ok(Transformed::no(LogicalPlan::Join(join)));
+    };
+
+    let original_schema = Arc::clone(&join.schema);
+    // Holds `d IS NOT TRUE` for every disjunct `d` already turned into a branch.
+    let mut guards = Vec::with_capacity(disjuncts.len().saturating_sub(1));
+    let mut branches = Vec::with_capacity(disjuncts.len());
+
+    for (disjunct, keys) in disjuncts.into_iter().zip(branch_keys) {
+        // `None` for the first branch, otherwise the conjunction of all guards so far.
+        let branch_filter = guards.iter().cloned().reduce(Expr::and);
+
+        let mut on = join.on.clone();
+        on.extend(keys);
+
+        let branch = LogicalPlan::Join(Join::try_new(
+            Arc::clone(&join.left),
+            Arc::clone(&join.right),
+            on,
+            branch_filter,
+            join.join_type,
+            join.join_constraint,
+            join.null_equality,
+            join.null_aware,
+        )?);
+
+        // Project each branch onto the original join schema, which is also the
+        // schema declared on the `Union` below.
+        let branch = LogicalPlan::Projection(Projection::new_from_schema(
+            Arc::new(branch),
+            Arc::clone(&original_schema),
+        ));
+
+        branches.push(Arc::new(branch));
+
+        // `IS NOT TRUE` rather than `NOT`: a pair for which this disjunct is NULL
+        // must stay eligible for the later branches.
+        guards.push(disjunct.is_not_true());
+    }
+
+    let rewritten = LogicalPlan::Union(Union {
+        inputs: branches,
+        schema: original_schema,
+    });
+
+    Ok(Transformed::yes(rewritten))
+}
+
+/// Interprets `expr` as a conjunction of equijoin predicates between the two join
+/// inputs and returns the `(left_key, right_key)` pairs it contains.
+///
+/// Returns `Ok(None)` if any conjunct is not an `=` comparison, is not a valid
+/// equijoin key pair for `left_schema` / `right_schema` according to
+/// [`find_valid_equijoin_key_pair`], or has a key type that [`can_hash`] rejects.
+fn extract_hashjoin_keys(
+    expr: &Expr,
+    left_schema: &DFSchema,
+    right_schema: &DFSchema,
+) -> Result<Option<Vec<(Expr, Expr)>>> {
+    let conjuncts = split_conjunction(expr);
+    let mut keys = Vec::with_capacity(conjuncts.len());
+
+    for conjunct in conjuncts {
+        let Expr::BinaryExpr(binary) = conjunct else {
+            return Ok(None);
+        };
+
+        if binary.op != Operator::Eq {
+            return Ok(None);
+        }
+
+        let Some((left, right)) = find_valid_equijoin_key_pair(
+            &binary.left,
+            &binary.right,
+            left_schema,
+            right_schema,
+        )? else {
+            return Ok(None);
+        };
+
+        // Both sides must be hashable for the pair to serve as a hash-join key.
+        let left_type = left.get_type(left_schema)?;
+        let right_type = right.get_type(right_schema)?;
+        if !can_hash(&left_type) || !can_hash(&right_type) {
+            return Ok(None);
+        }
+
+        keys.push((left, right));
+    }
+
+    Ok(Some(keys))
+}
diff --git a/datafusion/optimizer/src/lib.rs b/datafusion/optimizer/src/lib.rs
@@ -56,6 +56,7 @@ pub mod eliminate_nested_union {
     pub type EliminateNestedUnion = OptimizeUnions;
 }
 pub mod eliminate_outer_join;
+pub mod expand_join_or_predicate;
 pub mod extract_equijoin_predicate;
 pub mod extract_leaf_expressions;
 pub mod filter_null_join_keys;

diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs
@@ -42,6 +42,7 @@ use crate::eliminate_group_by_constant::EliminateGroupByConstant;
 use crate::eliminate_join::EliminateJoin;
 use crate::eliminate_limit::EliminateLimit;
 use crate::eliminate_outer_join::EliminateOuterJoin;
+use crate::expand_join_or_predicate::ExpandJoinOrPredicate;
 use crate::extract_equijoin_predicate::ExtractEquijoinPredicate;
 use crate::extract_leaf_expressions::{ExtractLeafExpressions, PushDownLeafProjections};
 use crate::filter_null_join_keys::FilterNullJoinKeys;
@@ -289,6 +290,7 @@ impl Optimizer {
             Arc::new(ScalarSubqueryToJoin::new()),
             Arc::new(DecorrelateLateralJoin::new()),
             Arc::new(ExtractEquijoinPredicate::new()),
+            Arc::new(ExpandJoinOrPredicate::new()),
             Arc::new(EliminateDuplicatedExpr::new()),
             Arc::new(EliminateFilter::new()),
             Arc::new(EliminateCrossJoin::new()),

diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt
@@ -186,6 +186,7 @@ logical_plan after decorrelate_predicate_subquery SAME TEXT AS ABOVE
 logical_plan after scalar_subquery_to_join SAME TEXT AS ABOVE
 logical_plan after decorrelate_lateral_join SAME TEXT AS ABOVE
 logical_plan after extract_equijoin_predicate SAME TEXT AS ABOVE
+logical_plan after expand_join_or_predicate SAME TEXT AS ABOVE
 logical_plan after eliminate_duplicated_expr SAME TEXT AS ABOVE
 logical_plan after eliminate_filter SAME TEXT AS ABOVE
 logical_plan after eliminate_cross_join SAME TEXT AS ABOVE
@@ -211,6 +212,7 @@ logical_plan after decorrelate_predicate_subquery SAME TEXT AS ABOVE
 logical_plan after scalar_subquery_to_join SAME TEXT AS ABOVE
 logical_plan after decorrelate_lateral_join SAME TEXT AS ABOVE
 logical_plan after extract_equijoin_predicate SAME TEXT AS ABOVE
+logical_plan after expand_join_or_predicate SAME TEXT AS ABOVE
 logical_plan after eliminate_duplicated_expr SAME TEXT AS ABOVE
 logical_plan after eliminate_filter SAME TEXT AS ABOVE
 logical_plan after eliminate_cross_join SAME TEXT AS ABOVE
@@ -563,6 +565,7 @@ logical_plan after decorrelate_predicate_subquery SAME TEXT AS ABOVE
 logical_plan after scalar_subquery_to_join SAME TEXT AS ABOVE
 logical_plan after decorrelate_lateral_join SAME TEXT AS ABOVE
 logical_plan after extract_equijoin_predicate SAME TEXT AS ABOVE
+logical_plan after expand_join_or_predicate SAME TEXT AS ABOVE
 logical_plan after eliminate_duplicated_expr SAME TEXT AS ABOVE
 logical_plan after eliminate_filter SAME TEXT AS ABOVE
 logical_plan after eliminate_cross_join SAME TEXT AS ABOVE
@@ -588,6 +591,7 @@ logical_plan after decorrelate_predicate_subquery SAME TEXT AS ABOVE
 logical_plan after scalar_subquery_to_join SAME TEXT AS ABOVE
 logical_plan after decorrelate_lateral_join SAME TEXT AS ABOVE
 logical_plan after extract_equijoin_predicate SAME TEXT AS ABOVE
+logical_plan after expand_join_or_predicate SAME TEXT AS ABOVE
 logical_plan after eliminate_duplicated_expr SAME TEXT AS ABOVE
 logical_plan after eliminate_filter SAME TEXT AS ABOVE
 logical_plan after eliminate_cross_join SAME TEXT AS ABOVE

diff --git a/datafusion/sqllogictest/test_files/join_only.slt b/datafusion/sqllogictest/test_files/join_only.slt
@@ -20,3 +20,56 @@ include ./join.slt.part
 # Config reset
 statement ok
 RESET datafusion.optimizer.repartition_joins;
+
+# expand_join_or_predicate: a row pair matching more than one OR branch must be returned once
+statement ok
+CREATE TABLE IF NOT EXISTS join_or_left (lid INT, lalt INT) AS VALUES
+  (1, 10),
+  (2, 20),
+  (3, NULL);
+
+statement ok
+CREATE TABLE IF NOT EXISTS join_or_right (rid INT, ralt INT) AS VALUES
+  (1, 10),
+  (4, 20),
+  (3, 30);
+
+query IIII rowsort
+SELECT l.lid, l.lalt, r.rid, r.ralt
+FROM join_or_left AS l
+INNER JOIN join_or_right AS r
+  ON l.lid = r.rid OR l.lalt = r.ralt;
+----
+1 10 1 10
+2 20 4 20
+3 NULL 3 30
+
+# expand_join_or_predicate: a pair whose earlier equality evaluates to NULL must still
+# match through a later branch that is true
+statement ok
+CREATE TABLE IF NOT EXISTS join_or_null_left (la INT, lb INT) AS VALUES
+  (NULL, 1);
+
+statement ok
+CREATE TABLE IF NOT EXISTS join_or_null_right (ra INT, rb INT) AS VALUES
+  (NULL, 1);
+
+query IIII rowsort
+SELECT l.la, l.lb, r.ra, r.rb
+FROM join_or_null_left AS l
+INNER JOIN join_or_null_right AS r
+  ON l.la = r.ra OR l.lb = r.rb;
+----
+NULL 1 NULL 1
+
+statement ok
+DROP TABLE IF EXISTS join_or_left;
+
+statement ok
+DROP TABLE IF EXISTS join_or_right;
+
+statement ok
+DROP TABLE IF EXISTS join_or_null_left;
+
+statement ok
+DROP TABLE IF EXISTS join_or_null_right;