Skip to content

Commit

Permalink
Optimize DISTINCT, ORDER BY clause when Aggregation without Group By.
Browse files Browse the repository at this point in the history
For a query that has aggregates but no GROUP BY clause, the
DISTINCT/DISTINCT ON/ORDER BY clauses can be removed, because at most
one row is returned.
There is no need to perform a Unique or Sort step in that case.
This simplifies the plan and lets the planner process fewer Aggref nodes.

select distinct on(count(b), count(c)) count(a), sum(b) from
t_distinct_sort order by count(c);
                           QUERY PLAN
--------------------------------------------------------------------
 Unique
   Output: (count(a)), (sum(b)), (count(c)), (count(b))
   Group Key: (count(c)), (count(b))
   ->  Sort
         Output: (count(a)), (sum(b)), (count(c)), (count(b))
         Sort Key: (count(t_distinct_sort.c)),
(count(t_distinct_sort.b))
         ->  Finalize Aggregate
               Output: count(a), sum(b), count(c), count(b)
               ->  Gather Motion 3:1  (slice1; segments: 3)
                     Output: (PARTIAL count(a)), (PARTIAL sum(b)),
(PARTIAL count(c)), (PARTIAL count(b))
                     ->  Partial Aggregate
                           Output: PARTIAL count(a), PARTIAL sum(b),
PARTIAL count(c), PARTIAL count(b)
                           ->  Seq Scan on public.t_distinct_sort
                                 Output: a, b, c

After this commit:

select distinct on(count(b), count(c)) count(a), sum(b) from
t_distinct_sort order by count(c);
                      QUERY PLAN
--------------------------------------------------------
 Finalize Aggregate
   Output: count(a), sum(b)
   ->  Gather Motion 3:1  (slice1; segments: 3)
         Output: (PARTIAL count(a)), (PARTIAL sum(b))
         ->  Partial Aggregate
               Output: PARTIAL count(a), PARTIAL sum(b)
               ->  Seq Scan on public.t_distinct_sort
                     Output: a, b, c
 Optimizer: Postgres query optimizer

Authored-by: Zhang Mingli [email protected]
  • Loading branch information
avamingli committed Nov 4, 2024
1 parent 5633fbb commit 1d31349
Show file tree
Hide file tree
Showing 14 changed files with 521 additions and 150 deletions.
2 changes: 2 additions & 0 deletions src/backend/optimizer/plan/planner.c
Original file line number Diff line number Diff line change
Expand Up @@ -1339,6 +1339,8 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
if (hasResultRTEs)
remove_useless_result_rtes(root);

parse = remove_distinct_sort_clause(parse);

/*
* Do the main planning.
*/
Expand Down
132 changes: 132 additions & 0 deletions src/backend/optimizer/plan/transform.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "nodes/makefuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/optimizer.h"
#include "optimizer/tlist.h"
#include "optimizer/transform.h"
#include "utils/lsyscache.h"
#include "catalog/pg_proc.h"
Expand Down Expand Up @@ -520,3 +521,134 @@ replace_sirvf_rte(Query *query, RangeTblEntry *rte)

return rte;
}

/*
 * query_has_srf
 *	  Does the query use set-returning functions or WITH ORDINALITY?
 *
 * Returns true as soon as any of the following is found:
 *	- the query is flagged as having SRFs in its target list, or its
 *	  target list expressions return a set;
 *	- a function RTE uses WITH ORDINALITY;
 *	- a function RTE holds an expression that is not a plain FuncExpr, or
 *	  a FuncExpr marked as returning a set;
 *	- a subquery RTE for which this same check (applied recursively)
 *	  reports true.
 * Otherwise returns false.
 */
bool query_has_srf(Query *query)
{
	ListCell   *rte_cell;

	/* Trust the flag first, then inspect the target list itself as well. */
	if (query->hasTargetSRFs ||
		expression_returns_set((Node *) query->targetList))
		return true;

	foreach(rte_cell, query->rtable)
	{
		RangeTblEntry *entry = (RangeTblEntry *) lfirst(rte_cell);

		if (entry->rtekind == RTE_SUBQUERY)
		{
			/* Look inside the subquery with the same rules. */
			if (query_has_srf((Query *) entry->subquery))
				return true;
		}
		else if (entry->rtekind == RTE_FUNCTION)
		{
			ListCell   *func_cell;

			/* WITH ORDINALITY */
			if (entry->funcordinality)
				return true;

			foreach(func_cell, entry->functions)
			{
				RangeTblFunction *rtf = (RangeTblFunction *) lfirst(func_cell);

				/*
				 * Anything that is not a plain FuncExpr is reported as an
				 * SRF; a FuncExpr counts only when it returns a set (SRF in
				 * FROM clause).
				 */
				if (!IsA(rtf->funcexpr, FuncExpr) ||
					((FuncExpr *) rtf->funcexpr)->funcretset)
					return true;
			}
		}
		/* Other RTE kinds are not examined. */
	}

	return false;
}

/*
 * remove_distinct_sort_clause
 *	  DISTINCT / DISTINCT ON / ORDER BY optimization.
 *
 * A query that has aggregates but neither GROUP BY nor grouping sets
 * returns at most one row, so de-duplicating or sorting its output is
 * pointless.  For example
 *		select DISTINCT count(a) from t;
 * is turned into
 *		select count(a) from t;
 * and the same applies to DISTINCT ON and ORDER BY.
 *
 * The query is left alone when it contains mutable functions or
 * set-returning functions (see query_has_srf()).
 *
 * 'parse' is modified in place: distinctClause, hasDistinctOn and sortClause
 * are cleared, and resjunk target entries that existed only to feed those
 * clauses are dropped from the target list.  Returns 'parse'.
 */
Query *remove_distinct_sort_clause(Query *parse)
{
	List	   *useless_tlist = NIL;
	List	   *tles;
	List	   *sortops;	/* required by get_sortgroupclauses_tles, unused */
	List	   *eqops;		/* required by get_sortgroupclauses_tles, unused */
	ListCell   *lc;

	/* Nothing to strip: skip the (tree-walking) checks below entirely. */
	if (parse->distinctClause == NIL && parse->sortClause == NIL)
		return parse;

	/*
	 * Only a plain aggregate query is known to yield at most one row.
	 * groupClause alone is not enough to decide that: with
	 * GROUP BY GROUPING SETS ((), ()) the groupClause is empty, yet one row
	 * is produced per empty grouping set, so DISTINCT / ORDER BY still
	 * matter.  Be conservative and skip any query with grouping sets.
	 */
	if (!parse->hasAggs ||
		parse->groupClause != NIL ||
		parse->groupingSets != NIL ||
		contain_mutable_functions((Node *) parse) ||
		query_has_srf(parse))
		return parse;

	if (parse->distinctClause != NIL)
	{
		get_sortgroupclauses_tles(parse->distinctClause, parse->targetList,
								  &tles, &sortops, &eqops);
		foreach(lc, tles)
		{
			TargetEntry *tle = lfirst(lc);

			/* Junk entries exist only to feed the clause (DISTINCT ON). */
			if (tle->resjunk)
				useless_tlist = lappend(useless_tlist, tle);
		}
		parse->distinctClause = NIL;
		parse->hasDistinctOn = false;
	}

	if (parse->sortClause != NIL)
	{
		get_sortgroupclauses_tles(parse->sortClause, parse->targetList,
								  &tles, &sortops, &eqops);
		foreach(lc, tles)
		{
			TargetEntry *tle = lfirst(lc);

			/*
			 * For SELECT DISTINCT, ORDER BY expressions must appear in the
			 * select list, so some entries may already have been collected
			 * above; do not add them twice.
			 */
			if (tle->resjunk)
				useless_tlist = list_append_unique(useless_tlist, tle);
		}
		parse->sortClause = NIL;
	}

	/*
	 * There is no groupClause, sortClause or distinctClause any more, so the
	 * junk entries collected above are normally unreferenced and can go.
	 * Window PARTITION BY / ORDER BY clauses, however, find their
	 * expressions through ressortgroupref as well and may share a junk entry
	 * with the ORDER BY we just removed.  Keep the entries in that case;
	 * an unreferenced junk entry is harmless.
	 */
	if (useless_tlist != NIL && parse->windowClause == NIL)
		parse->targetList = list_difference(parse->targetList, useless_tlist);

	return parse;
}
4 changes: 4 additions & 0 deletions src/include/optimizer/transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,8 @@

extern Query *normalize_query(Query *query);

/*
 * Strip DISTINCT / DISTINCT ON / ORDER BY from an aggregate query without
 * GROUP BY when they cannot affect the result; returns the given query.
 */
extern Query *remove_distinct_sort_clause(Query *query);

/*
 * Report whether the query (including subqueries in its range table) uses
 * set-returning functions or WITH ORDINALITY.
 */
extern bool query_has_srf(Query *query);

#endif /* TRANSFORM_H */
46 changes: 17 additions & 29 deletions src/test/regress/expected/aggregates.out
Original file line number Diff line number Diff line change
Expand Up @@ -1066,17 +1066,15 @@ explain (costs off)
select distinct max(unique2) from tenk1;
QUERY PLAN
---------------------------------------------------------------------------
HashAggregate
Group Key: $0
Result
InitPlan 1 (returns $0) (slice1)
-> Limit
-> Gather Motion 3:1 (slice2; segments: 3)
Merge Key: tenk1.unique2
-> Index Only Scan Backward using tenk1_unique2 on tenk1
Index Cond: (unique2 IS NOT NULL)
-> Result
Optimizer: Postgres query optimizer
(10 rows)
(8 rows)

select distinct max(unique2) from tenk1;
max
Expand All @@ -1088,17 +1086,15 @@ explain (costs off)
select max(unique2) from tenk1 order by 1;
QUERY PLAN
---------------------------------------------------------------------------
Sort
Sort Key: ($0)
Result
InitPlan 1 (returns $0) (slice1)
-> Limit
-> Gather Motion 3:1 (slice2; segments: 3)
Merge Key: tenk1.unique2
-> Index Only Scan Backward using tenk1_unique2 on tenk1
Index Cond: (unique2 IS NOT NULL)
-> Result
Optimizer: Postgres query optimizer
(10 rows)
(8 rows)

select max(unique2) from tenk1 order by 1;
max
Expand All @@ -1110,17 +1106,15 @@ explain (costs off)
select max(unique2) from tenk1 order by max(unique2);
QUERY PLAN
---------------------------------------------------------------------------
Sort
Sort Key: ($0)
Result
InitPlan 1 (returns $0) (slice1)
-> Limit
-> Gather Motion 3:1 (slice2; segments: 3)
Merge Key: tenk1.unique2
-> Index Only Scan Backward using tenk1_unique2 on tenk1
Index Cond: (unique2 IS NOT NULL)
-> Result
Optimizer: Postgres query optimizer
(10 rows)
(8 rows)

select max(unique2) from tenk1 order by max(unique2);
max
Expand All @@ -1132,17 +1126,15 @@ explain (costs off)
select max(unique2) from tenk1 order by max(unique2)+1;
QUERY PLAN
---------------------------------------------------------------------------
Sort
Sort Key: (($0 + 1))
Result
InitPlan 1 (returns $0) (slice1)
-> Limit
-> Gather Motion 3:1 (slice2; segments: 3)
Merge Key: tenk1.unique2
-> Index Only Scan Backward using tenk1_unique2 on tenk1
Index Cond: (unique2 IS NOT NULL)
-> Result
Optimizer: Postgres query optimizer
(10 rows)
(8 rows)

select max(unique2) from tenk1 order by max(unique2)+1;
max
Expand Down Expand Up @@ -1260,20 +1252,16 @@ explain (costs off)
select distinct min(f1), max(f1) from minmaxtest;
QUERY PLAN
--------------------------------------------------------------
Unique
Group Key: (min(minmaxtest.f1)), (max(minmaxtest.f1))
-> Sort
Sort Key: (min(minmaxtest.f1)), (max(minmaxtest.f1))
-> Finalize Aggregate
-> Gather Motion 3:1 (slice1; segments: 3)
-> Partial Aggregate
-> Append
-> Seq Scan on minmaxtest minmaxtest_1
-> Seq Scan on minmaxtest1 minmaxtest_2
-> Seq Scan on minmaxtest2 minmaxtest_3
-> Seq Scan on minmaxtest3 minmaxtest_4
Finalize Aggregate
-> Gather Motion 3:1 (slice1; segments: 3)
-> Partial Aggregate
-> Append
-> Seq Scan on minmaxtest minmaxtest_1
-> Seq Scan on minmaxtest1 minmaxtest_2
-> Seq Scan on minmaxtest2 minmaxtest_3
-> Seq Scan on minmaxtest3 minmaxtest_4
Optimizer: Postgres query optimizer
(13 rows)
(9 rows)

select distinct min(f1), max(f1) from minmaxtest;
min | max
Expand Down
26 changes: 11 additions & 15 deletions src/test/regress/expected/aggregates_optimizer.out
Original file line number Diff line number Diff line change
Expand Up @@ -1267,22 +1267,18 @@ explain (costs off)
select distinct min(f1), max(f1) from minmaxtest;
INFO: GPORCA failed to produce a plan, falling back to planner
DETAIL: Feature not supported: Inherited tables
QUERY PLAN
--------------------------------------------------------------------------
Unique
Group Key: (min(minmaxtest.f1)), (max(minmaxtest.f1))
-> Sort
Sort Key: (min(minmaxtest.f1)), (max(minmaxtest.f1))
-> Finalize Aggregate
-> Gather Motion 3:1 (slice1; segments: 3)
-> Partial Aggregate
-> Append
-> Seq Scan on minmaxtest minmaxtest_1
-> Seq Scan on minmaxtest1 minmaxtest_2
-> Seq Scan on minmaxtest2 minmaxtest_3
-> Seq Scan on minmaxtest3 minmaxtest_4
QUERY PLAN
--------------------------------------------------------------
Finalize Aggregate
-> Gather Motion 3:1 (slice1; segments: 3)
-> Partial Aggregate
-> Append
-> Seq Scan on minmaxtest minmaxtest_1
-> Seq Scan on minmaxtest1 minmaxtest_2
-> Seq Scan on minmaxtest2 minmaxtest_3
-> Seq Scan on minmaxtest3 minmaxtest_4
Optimizer: Postgres query optimizer
(13 rows)
(9 rows)

select distinct min(f1), max(f1) from minmaxtest;
INFO: GPORCA failed to produce a plan, falling back to planner
Expand Down
20 changes: 7 additions & 13 deletions src/test/regress/expected/cbdb_parallel.out
Original file line number Diff line number Diff line change
Expand Up @@ -2407,22 +2407,16 @@ create table t1(c1 int) distributed by (c1);
insert into t1 values(11), (12);
analyze t1;
explain(costs off, locus) select distinct min(c1), max(c1) from t1;
QUERY PLAN
------------------------------------------------------------
Unique
QUERY PLAN
------------------------------------------------
Aggregate
Locus: Entry
Group Key: (min(c1)), (max(c1))
-> Sort
-> Gather Motion 3:1 (slice1; segments: 3)
Locus: Entry
Sort Key: (min(c1)), (max(c1))
-> Aggregate
Locus: Entry
-> Gather Motion 3:1 (slice1; segments: 3)
Locus: Entry
-> Seq Scan on t1
Locus: Hashed
-> Seq Scan on t1
Locus: Hashed
Optimizer: Postgres query optimizer
(13 rows)
(7 rows)

abort;
begin;
Expand Down
36 changes: 17 additions & 19 deletions src/test/regress/expected/incremental_sort.out
Original file line number Diff line number Diff line change
Expand Up @@ -1705,26 +1705,24 @@ from tenk1 t1
join tenk1 t2 on t1.unique1 = t2.unique2
join tenk1 t3 on t2.unique1 = t3.unique1
order by count(*);
QUERY PLAN
-------------------------------------------------------------------------------------------------
Sort
Sort Key: (count(*))
-> Finalize Aggregate
-> Gather Motion 3:1 (slice1; segments: 3)
-> Partial Aggregate
-> Hash Join
Hash Cond: (t2.unique2 = t1.unique1)
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: t2.unique2
-> Hash Join
Hash Cond: (t2.unique1 = t3.unique1)
-> Index Scan using tenk1_unique2 on tenk1 t2
-> Hash
-> Index Only Scan using tenk1_unique1 on tenk1 t3
-> Hash
-> Index Only Scan using tenk1_unique1 on tenk1 t1
QUERY PLAN
-------------------------------------------------------------------------------------------
Finalize Aggregate
-> Gather Motion 3:1 (slice1; segments: 3)
-> Partial Aggregate
-> Hash Join
Hash Cond: (t2.unique2 = t1.unique1)
-> Redistribute Motion 3:3 (slice2; segments: 3)
Hash Key: t2.unique2
-> Hash Join
Hash Cond: (t2.unique1 = t3.unique1)
-> Index Scan using tenk1_unique2 on tenk1 t2
-> Hash
-> Index Only Scan using tenk1_unique1 on tenk1 t3
-> Hash
-> Index Only Scan using tenk1_unique1 on tenk1 t1
Optimizer: Postgres query optimizer
(17 rows)
(15 rows)

-- Parallel sort but with expression (correlated subquery) that
-- is prohibited in parallel plans.
Expand Down
Loading

0 comments on commit 1d31349

Please sign in to comment.