Optimize away DISTINCT and ORDER BY clauses for aggregation without GROUP BY.

For a query that has aggregation but no GROUP BY clause, the DISTINCT/DISTINCT ON/ORDER BY clause can be removed, because at most one row is returned. There is no need to de-duplicate or sort a single row. This simplifies the plan and lets the planner process fewer Aggref nodes. Before this commit: select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c); QUERY PLAN -------------------------------------------------------------------- Unique Output: (count(a)), (sum(b)), (count(c)), (count(b)) Group Key: (count(c)), (count(b)) -> Sort Output: (count(a)), (sum(b)), (count(c)), (count(b)) Sort Key: (count(t_distinct_sort.c)), (count(t_distinct_sort.b)) -> Finalize Aggregate Output: count(a), sum(b), count(c), count(b) -> Gather Motion 3:1 (slice1; segments: 3) Output: (PARTIAL count(a)), (PARTIAL sum(b)), (PARTIAL count(c)), (PARTIAL count(b)) -> Partial Aggregate Output: PARTIAL count(a), PARTIAL sum(b), PARTIAL count(c), PARTIAL count(b) -> Seq Scan on public.t_distinct_sort Output: a, b, c After this commit: select distinct on(count(b), count(c)) count(a), sum(b) from t_distinct_sort order by count(c); QUERY PLAN -------------------------------------------------------- Finalize Aggregate Output: count(a), sum(b) -> Gather Motion 3:1 (slice1; segments: 3) Output: (PARTIAL count(a)), (PARTIAL sum(b)) -> Partial Aggregate Output: PARTIAL count(a), PARTIAL sum(b) -> Seq Scan on public.t_distinct_sort Output: a, b, c Optimizer: Postgres query optimizer Authored-by: Zhang Mingli [email protected]
apache · Nov 4, 2024 · 1d31349 · 1d31349
1 parent 5633fbb
commit 1d31349
Show file tree

Hide file tree

Showing 14 changed files with 521 additions and 150 deletions.
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
@@ -1339,6 +1339,8 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
 	if (hasResultRTEs)
 		remove_useless_result_rtes(root);
 
+	parse = remove_distinct_sort_clause(parse);
+
 	/*
 	 * Do the main planning.
 	 */

diff --git a/src/backend/optimizer/plan/transform.c b/src/backend/optimizer/plan/transform.c
@@ -20,6 +20,7 @@
 #include "nodes/makefuncs.h"
 #include "optimizer/clauses.h"
 #include "optimizer/optimizer.h"
+#include "optimizer/tlist.h"
 #include "optimizer/transform.h"
 #include "utils/lsyscache.h"
 #include "catalog/pg_proc.h"
@@ -520,3 +521,134 @@ replace_sirvf_rte(Query *query, RangeTblEntry *rte)
 
 	return rte;
 }
+
+/*
+ * query_has_srf
+ *	  Report whether a query involves set-returning functions (SRFs) or
+ *	  WITH ORDINALITY.
+ *
+ * Returns true when any of the following holds:
+ *	- the query's hasTargetSRFs flag is set, or its target list contains an
+ *	  expression that returns a set;
+ *	- a function RTE uses WITH ORDINALITY, has a function expression that is
+ *	  not a plain FuncExpr, or calls a function marked as returning a set;
+ *	- a subquery RTE satisfies any of the above (checked recursively).
+ *
+ * Only the target list and the range table are examined.
+ */
+bool
+query_has_srf(Query *query)
+{
+	ListCell   *rte_cell;
+
+	/* Trust the parser's flag first, then re-check the target list itself. */
+	if (query->hasTargetSRFs ||
+		expression_returns_set((Node *) query->targetList))
+		return true;
+
+	foreach(rte_cell, query->rtable)
+	{
+		RangeTblEntry *rte = (RangeTblEntry *) lfirst(rte_cell);
+
+		if (rte->rtekind == RTE_SUBQUERY)
+		{
+			/* Look for SRFs inside the subquery as well. */
+			if (query_has_srf((Query *) rte->subquery))
+				return true;
+		}
+		else if (rte->rtekind == RTE_FUNCTION)
+		{
+			ListCell   *func_cell;
+
+			/* WITH ORDINALITY */
+			if (rte->funcordinality)
+				return true;
+
+			foreach(func_cell, rte->functions)
+			{
+				RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(func_cell);
+				Node	   *expr = rtfunc->funcexpr;
+
+				/*
+				 * Anything that is not a plain function call is treated as
+				 * an SRF; a plain call counts when it returns a set (SRF in
+				 * the FROM clause).
+				 */
+				if (!IsA(expr, FuncExpr) || ((FuncExpr *) expr)->funcretset)
+					return true;
+			}
+		}
+		/* Other RTE kinds are not inspected. */
+	}
+
+	return false;
+}
+
+/*
+ * remove_distinct_sort_clause
+ *	  Drop DISTINCT / DISTINCT ON / ORDER BY from a query that can return
+ *	  at most one row.
+ *
+ * A query with aggregates and no grouping returns at most one row, so
+ * de-duplicating or sorting its output is pointless.  For example
+ *		select DISTINCT count(a) from t;
+ * is treated as
+ *		select count(a) from t;
+ * The same applies to DISTINCT ON and to ORDER BY.
+ *
+ * The rewrite is skipped when:
+ *	- the query has no aggregates or has a GROUP BY clause;
+ *	- the query has grouping sets: groupClause is NIL for
+ *	  GROUP BY GROUPING SETS ((), ()), yet one row is produced per empty
+ *	  set, so DISTINCT still matters;
+ *	- the query has window clauses: a junk target entry used by ORDER BY or
+ *	  DISTINCT ON may share its ressortgroupref with a window clause and
+ *	  must not be removed from under it;
+ *	- the query contains mutable functions or set-returning functions.
+ *
+ * Junk target entries that existed only to feed the removed clauses are
+ * deleted from the target list.  The query is modified in place and the
+ * same pointer is returned.
+ */
+Query *
+remove_distinct_sort_clause(Query *parse)
+{
+	List	   *useless_tlist = NIL;
+	List	   *tles;
+	List	   *sortops;
+	List	   *eqops;
+	ListCell   *lc;
+
+	/* Cheap structural checks first: is this a single-row aggregate query? */
+	if (!parse->hasAggs ||
+		parse->groupClause != NIL ||
+		parse->groupingSets != NIL ||
+		parse->windowClause != NIL)
+		return parse;
+
+	/* Then the tree walks, in the same order as before. */
+	if (contain_mutable_functions((Node *) parse) ||
+		query_has_srf(parse))
+		return parse;
+
+	if (parse->distinctClause != NIL)
+	{
+		get_sortgroupclauses_tles(parse->distinctClause, parse->targetList,
+								  &tles, &sortops, &eqops);
+		foreach(lc, tles)
+		{
+			TargetEntry *tle = lfirst(lc);
+
+			/* Only junk entries exist solely for the clause being removed. */
+			if (tle->resjunk)
+				useless_tlist = lappend(useless_tlist, tle);
+		}
+		parse->distinctClause = NIL;
+		parse->hasDistinctOn = false;
+	}
+
+	if (parse->sortClause != NIL)
+	{
+		get_sortgroupclauses_tles(parse->sortClause, parse->targetList,
+								  &tles, &sortops, &eqops);
+		foreach(lc, tles)
+		{
+			TargetEntry *tle = lfirst(lc);
+
+			/*
+			 * An ORDER BY entry may also have been collected from the
+			 * DISTINCT clause above, so avoid adding it twice.
+			 */
+			if (tle->resjunk)
+				useless_tlist = list_append_unique(useless_tlist, tle);
+		}
+		parse->sortClause = NIL;
+	}
+
+	/*
+	 * The query now has no GROUP BY, grouping sets, window, DISTINCT or
+	 * ORDER BY clause, so nothing refers to the collected junk entries any
+	 * more and they can be removed from the target list.
+	 */
+	if (useless_tlist != NIL)
+		parse->targetList = list_difference(parse->targetList, useless_tlist);
+
+	return parse;
+}
diff --git a/src/include/optimizer/transform.h b/src/include/optimizer/transform.h
@@ -21,4 +21,8 @@
 
 extern Query *normalize_query(Query *query);
 
+extern Query *remove_distinct_sort_clause(Query *query);
+
+extern bool query_has_srf(Query *query);
+
 #endif /* TRANSFORM_H */
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
@@ -1066,17 +1066,15 @@ explain (costs off)
   select distinct max(unique2) from tenk1;
                                 QUERY PLAN                                 
 ---------------------------------------------------------------------------
- HashAggregate
-   Group Key: $0
+ Result
    InitPlan 1 (returns $0)  (slice1)
      ->  Limit
            ->  Gather Motion 3:1  (slice2; segments: 3)
                  Merge Key: tenk1.unique2
                  ->  Index Only Scan Backward using tenk1_unique2 on tenk1
                        Index Cond: (unique2 IS NOT NULL)
-   ->  Result
  Optimizer: Postgres query optimizer
-(10 rows)
+(8 rows)
 
 select distinct max(unique2) from tenk1;
  max  
@@ -1088,17 +1086,15 @@ explain (costs off)
   select max(unique2) from tenk1 order by 1;
                                 QUERY PLAN                                 
 ---------------------------------------------------------------------------
- Sort
-   Sort Key: ($0)
+ Result
    InitPlan 1 (returns $0)  (slice1)
      ->  Limit
            ->  Gather Motion 3:1  (slice2; segments: 3)
                  Merge Key: tenk1.unique2
                  ->  Index Only Scan Backward using tenk1_unique2 on tenk1
                        Index Cond: (unique2 IS NOT NULL)
-   ->  Result
  Optimizer: Postgres query optimizer
-(10 rows)
+(8 rows)
 
 select max(unique2) from tenk1 order by 1;
  max  
@@ -1110,17 +1106,15 @@ explain (costs off)
   select max(unique2) from tenk1 order by max(unique2);
                                 QUERY PLAN                                 
 ---------------------------------------------------------------------------
- Sort
-   Sort Key: ($0)
+ Result
    InitPlan 1 (returns $0)  (slice1)
      ->  Limit
            ->  Gather Motion 3:1  (slice2; segments: 3)
                  Merge Key: tenk1.unique2
                  ->  Index Only Scan Backward using tenk1_unique2 on tenk1
                        Index Cond: (unique2 IS NOT NULL)
-   ->  Result
  Optimizer: Postgres query optimizer
-(10 rows)
+(8 rows)
 
 select max(unique2) from tenk1 order by max(unique2);
  max  
@@ -1132,17 +1126,15 @@ explain (costs off)
   select max(unique2) from tenk1 order by max(unique2)+1;
                                 QUERY PLAN                                 
 ---------------------------------------------------------------------------
- Sort
-   Sort Key: (($0 + 1))
+ Result
    InitPlan 1 (returns $0)  (slice1)
      ->  Limit
            ->  Gather Motion 3:1  (slice2; segments: 3)
                  Merge Key: tenk1.unique2
                  ->  Index Only Scan Backward using tenk1_unique2 on tenk1
                        Index Cond: (unique2 IS NOT NULL)
-   ->  Result
  Optimizer: Postgres query optimizer
-(10 rows)
+(8 rows)
 
 select max(unique2) from tenk1 order by max(unique2)+1;
  max  
@@ -1260,20 +1252,16 @@ explain (costs off)
   select distinct min(f1), max(f1) from minmaxtest;
                           QUERY PLAN                          
 --------------------------------------------------------------
- Unique
-   Group Key: (min(minmaxtest.f1)), (max(minmaxtest.f1))
-   ->  Sort
-         Sort Key: (min(minmaxtest.f1)), (max(minmaxtest.f1))
-         ->  Finalize Aggregate
-               ->  Gather Motion 3:1  (slice1; segments: 3)
-                     ->  Partial Aggregate
-                           ->  Append
-                                 ->  Seq Scan on minmaxtest minmaxtest_1
-                                 ->  Seq Scan on minmaxtest1 minmaxtest_2
-                                 ->  Seq Scan on minmaxtest2 minmaxtest_3
-                                 ->  Seq Scan on minmaxtest3 minmaxtest_4
+ Finalize Aggregate
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         ->  Partial Aggregate
+               ->  Append
+                     ->  Seq Scan on minmaxtest minmaxtest_1
+                     ->  Seq Scan on minmaxtest1 minmaxtest_2
+                     ->  Seq Scan on minmaxtest2 minmaxtest_3
+                     ->  Seq Scan on minmaxtest3 minmaxtest_4
  Optimizer: Postgres query optimizer
-(13 rows)
+(9 rows)
 
 select distinct min(f1), max(f1) from minmaxtest;
  min | max 

diff --git a/src/test/regress/expected/aggregates_optimizer.out b/src/test/regress/expected/aggregates_optimizer.out
@@ -1267,22 +1267,18 @@ explain (costs off)
   select distinct min(f1), max(f1) from minmaxtest;
 INFO:  GPORCA failed to produce a plan, falling back to planner
 DETAIL:  Feature not supported: Inherited tables
-                                QUERY PLAN                                
---------------------------------------------------------------------------
- Unique
-   Group Key: (min(minmaxtest.f1)), (max(minmaxtest.f1))
-   ->  Sort
-         Sort Key: (min(minmaxtest.f1)), (max(minmaxtest.f1))
-         ->  Finalize Aggregate
-               ->  Gather Motion 3:1  (slice1; segments: 3)
-                     ->  Partial Aggregate
-                           ->  Append
-                                 ->  Seq Scan on minmaxtest minmaxtest_1
-                                 ->  Seq Scan on minmaxtest1 minmaxtest_2
-                                 ->  Seq Scan on minmaxtest2 minmaxtest_3
-                                 ->  Seq Scan on minmaxtest3 minmaxtest_4
+                          QUERY PLAN                          
+--------------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         ->  Partial Aggregate
+               ->  Append
+                     ->  Seq Scan on minmaxtest minmaxtest_1
+                     ->  Seq Scan on minmaxtest1 minmaxtest_2
+                     ->  Seq Scan on minmaxtest2 minmaxtest_3
+                     ->  Seq Scan on minmaxtest3 minmaxtest_4
  Optimizer: Postgres query optimizer
-(13 rows)
+(9 rows)
 
 select distinct min(f1), max(f1) from minmaxtest;
 INFO:  GPORCA failed to produce a plan, falling back to planner

diff --git a/src/test/regress/expected/cbdb_parallel.out b/src/test/regress/expected/cbdb_parallel.out
@@ -2407,22 +2407,16 @@ create table t1(c1 int) distributed by (c1);
 insert into t1 values(11), (12);
 analyze t1;
 explain(costs off, locus) select distinct min(c1), max(c1) from t1;
-                         QUERY PLAN                         
-------------------------------------------------------------
- Unique
+                   QUERY PLAN                   
+------------------------------------------------
+ Aggregate
    Locus: Entry
-   Group Key: (min(c1)), (max(c1))
-   ->  Sort
+   ->  Gather Motion 3:1  (slice1; segments: 3)
          Locus: Entry
-         Sort Key: (min(c1)), (max(c1))
-         ->  Aggregate
-               Locus: Entry
-               ->  Gather Motion 3:1  (slice1; segments: 3)
-                     Locus: Entry
-                     ->  Seq Scan on t1
-                           Locus: Hashed
+         ->  Seq Scan on t1
+               Locus: Hashed
  Optimizer: Postgres query optimizer
-(13 rows)
+(7 rows)
 
 abort;
 begin;

diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out
@@ -1705,26 +1705,24 @@ from tenk1 t1
 join tenk1 t2 on t1.unique1 = t2.unique2
 join tenk1 t3 on t2.unique1 = t3.unique1
 order by count(*);
-                                           QUERY PLAN                                            
--------------------------------------------------------------------------------------------------
- Sort
-   Sort Key: (count(*))
-   ->  Finalize Aggregate
-         ->  Gather Motion 3:1  (slice1; segments: 3)
-               ->  Partial Aggregate
-                     ->  Hash Join
-                           Hash Cond: (t2.unique2 = t1.unique1)
-                           ->  Redistribute Motion 3:3  (slice2; segments: 3)
-                                 Hash Key: t2.unique2
-                                 ->  Hash Join
-                                       Hash Cond: (t2.unique1 = t3.unique1)
-                                       ->  Index Scan using tenk1_unique2 on tenk1 t2
-                                       ->  Hash
-                                             ->  Index Only Scan using tenk1_unique1 on tenk1 t3
-                           ->  Hash
-                                 ->  Index Only Scan using tenk1_unique1 on tenk1 t1
+                                        QUERY PLAN                                         
+-------------------------------------------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather Motion 3:1  (slice1; segments: 3)
+         ->  Partial Aggregate
+               ->  Hash Join
+                     Hash Cond: (t2.unique2 = t1.unique1)
+                     ->  Redistribute Motion 3:3  (slice2; segments: 3)
+                           Hash Key: t2.unique2
+                           ->  Hash Join
+                                 Hash Cond: (t2.unique1 = t3.unique1)
+                                 ->  Index Scan using tenk1_unique2 on tenk1 t2
+                                 ->  Hash
+                                       ->  Index Only Scan using tenk1_unique1 on tenk1 t3
+                     ->  Hash
+                           ->  Index Only Scan using tenk1_unique1 on tenk1 t1
  Optimizer: Postgres query optimizer
-(17 rows)
+(15 rows)
 
 -- Parallel sort but with expression (correlated subquery) that
 -- is prohibited in parallel plans.