diff --git a/src/backend/optimizer/plan/orca.c b/src/backend/optimizer/plan/orca.c index 97f63f7a334..514385cc2e9 100644 --- a/src/backend/optimizer/plan/orca.c +++ b/src/backend/optimizer/plan/orca.c @@ -545,6 +545,10 @@ push_down_expr_mutator(Node *node, List *child_tlist) { ((Const *) child_tle->expr)->consttypmod = ((Var *) node)->vartypmod; } + else if (IsA(child_tle->expr, Var)) + { + ((Var *) child_tle->expr)->vartypmod = ((Var *) node)->vartypmod; + } return (Node *) child_tle->expr; } diff --git a/src/test/regress/expected/union_gp.out b/src/test/regress/expected/union_gp.out index 5bdae3e887c..d134f223502 100644 --- a/src/test/regress/expected/union_gp.out +++ b/src/test/regress/expected/union_gp.out @@ -2342,6 +2342,36 @@ with result as (update r_1240 set a = a +1 where a < 5 returning *) select * fro drop table r_1240; drop table p1_1240; -- +-- Test CTAS with UNION ALL when branches have different typmods (issue #1431). +-- ORCA should resolve the output column type to character varying (no length), +-- same as the Postgres planner, instead of picking the first branch's typmod. +-- +create table union_ctas_t1(id int, name varchar(1)); +create table union_ctas_t2(id int, name varchar(2)); +insert into union_ctas_t1 values (1, 'a'); +insert into union_ctas_t2 values (1, 'ab'); +create table union_ctas_result as + (select id, name from union_ctas_t1) + union all + (select id, name from union_ctas_t2); +-- name column should be "character varying" without length, not varchar(1) +select atttypmod from pg_attribute +where attrelid = 'union_ctas_result'::regclass and attname = 'name'; + atttypmod +----------- + -1 +(1 row) + +-- data should not be truncated +select * from union_ctas_result order by name; + id | name +----+------ + 1 | a + 1 | ab +(2 rows) + +drop table union_ctas_t1, union_ctas_t2, union_ctas_result; +-- -- Clean up -- DROP TABLE IF EXISTS T_a1 CASCADE; diff --git a/src/test/regress/expected/union_gp_optimizer.out b/src/test/regress/expected/union_gp_optimizer.out index 8ff8655591d..8704f0fe7a7 100644 --- a/src/test/regress/expected/union_gp_optimizer.out +++ b/src/test/regress/expected/union_gp_optimizer.out @@ -1,7 +1,7 @@ -- Additional GPDB-added tests for UNION SET optimizer_trace_fallback=on; create temp table t_union1 (a int, b int); -NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. select distinct a, null::integer as c from t_union1 union select a, b from t_union1; a | c @@ -44,8 +44,8 @@ LINE 1: select 1 intersect (select 1, 2 union all select 3, 4); select 1 a, row_number() over (partition by 'a') union all (select 1 a , 2 b); a | row_number ---+------------ - 1 | 2 1 | 1 + 1 | 2 (2 rows) -- This should preserve domain types @@ -104,8 +104,7 @@ DETAIL: Falling back to Postgres-based planner because GPORCA does not support (1 row) CREATE TABLE union_ctas (a, b) AS SELECT 1, 2 UNION SELECT 1, 1 UNION SELECT 1, 1; -NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'a' as the Greenplum Database data distribution key for this table. -HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry. SELECT * FROM union_ctas; a | b ---+--- @@ -116,11 +115,9 @@ SELECT * FROM union_ctas; DROP TABLE union_ctas; -- MPP-21075: push quals below union CREATE TABLE union_quals1 (a, b) AS SELECT i, i%2 from generate_series(1,10) i; -NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'a' as the Greenplum Database data distribution key for this table. -HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry. CREATE TABLE union_quals2 (a, b) AS SELECT i%2, i from generate_series(1,10) i; -NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'a' as the Greenplum Database data distribution key for this table. -HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry. SELECT * FROM (SELECT a, b from union_quals1 UNION SELECT b, a from union_quals2) as foo(a,b) where a > b order by a; a | b ----+--- @@ -225,7 +222,7 @@ select distinct a from (select distinct 'A' from (select 'C' from (select disti -- on a single QE. -- CREATE TABLE test1 (id int); -NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Greenplum Database data distribution key for this table. +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Apache Cloudberry data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. insert into test1 values (1); CREATE EXTERNAL WEB TABLE test2 (id int) EXECUTE 'echo 2' ON COORDINATOR FORMAT 'csv'; @@ -234,8 +231,8 @@ union (SELECT 'test2' as branch, id FROM test2); branch | id --------+---- - test1 | 1 test2 | 2 + test1 | 1 (2 rows) explain (SELECT 'test1' as branch, id FROM test1 LIMIT 1) @@ -243,10 +240,10 @@ union (SELECT 'test2' as branch, id FROM test2); QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------ - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..984.78 rows=1125 width=12) - -> HashAggregate (cost=0.00..984.73 rows=375 width=12) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..985.86 rows=1125 width=12) + -> HashAggregate (cost=0.00..985.81 rows=375 width=12) Group Key: ('test1'::text), test1.id - -> Append (cost=0.00..984.65 rows=334 width=12) + -> Append (cost=0.00..985.73 rows=334 width=12) -> Redistribute Motion 1:3 (slice2) (cost=0.00..431.00 rows=1 width=12) Hash Key: ('test1'::text), test1.id -> GroupAggregate (cost=0.00..431.00 rows=1 width=12) @@ -257,16 +254,16 @@ union -> Result (cost=0.00..431.00 rows=1 width=12) -> Gather Motion 3:1 (slice3; segments: 3) (cost=0.00..431.00 rows=1 width=4) -> Seq Scan on test1 (cost=0.00..431.00 rows=1 width=4) - -> HashAggregate (cost=0.00..553.64 rows=334 width=12) - Group Key: ('test2'::text), test2.id - -> Redistribute Motion 3:3 (slice4; segments: 3) (cost=0.00..553.56 rows=334 width=12) - Hash Key: ('test2'::text), test2.id - -> Streaming HashAggregate (cost=0.00..553.55 rows=334 width=12) - Group Key: 'test2'::text, test2.id - -> Result (cost=0.00..471.53 rows=333334 width=12) - -> Redistribute Motion 1:3 (slice5) (cost=0.00..467.53 rows=333334 width=4) - -> Foreign Scan on test2 (cost=0.00..449.70 rows=1000000 width=4) - Optimizer: Pivotal Optimizer (GPORCA) + -> HashAggregate (cost=0.00..554.73 rows=334 width=12) + Group Key: ('test2'::text), id + -> Redistribute Motion 3:3 (slice4; segments: 3) (cost=0.00..554.64 rows=334 width=12) + Hash Key: ('test2'::text), id + -> Streaming HashAggregate (cost=0.00..554.63 rows=334 width=12) + Group Key: 'test2'::text, id + -> Result (cost=0.00..473.73 rows=333334 width=12) + -> Redistribute Motion 1:3 (slice5) (cost=0.00..469.73 rows=333334 width=4) + -> Foreign Scan on test2 (cost=0.00..451.90 rows=1000000 width=4) + Optimizer: GPORCA (24 rows) -- @@ -320,8 +317,8 @@ INFO: GPORCA failed to produce a plan, falling back to Postgres-based planner DETAIL: Unknown error: Partially Distributed Data QUERY PLAN --------------------------------------------------------------------------------------- - Gather Motion 1:1 (slice1; segments: 1) (cost=1922.00..1922.00 rows=172200 width=8) - -> Append (cost=0.00..1922.00 rows=172200 width=8) + Gather Motion 1:1 (slice1; segments: 1) (cost=2783.00..2783.00 rows=172200 width=8) + -> Append (cost=0.00..2783.00 rows=172200 width=8) -> Seq Scan on rep2 (cost=0.00..961.00 rows=86100 width=8) -> Seq Scan on rep3 (cost=0.00..961.00 rows=86100 width=8) Optimizer: Postgres query optimizer @@ -353,7 +350,7 @@ INSERT INTO T_a1 SELECT i, i%5 from generate_series(1,10) i; CREATE TABLE T_b2 (b1 int, b2 int) DISTRIBUTED BY(b2); INSERT INTO T_b2 SELECT i, i%5 from generate_series(1,20) i; CREATE TABLE T_random (c1 int, c2 int); -NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Greenplum Database data distribution key for this table. +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Apache Cloudberry data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. INSERT INTO T_random SELECT i, i%5 from generate_series(1,30) i; --start_ignore @@ -2079,14 +2076,18 @@ insert into t1_ncols values (1, 11, 'one', '2001-01-01'); insert into t2_ncols values (2, 22, 'two', '2002-02-02'); insert into t2_ncols values (4, 44, 'four','2004-04-04'); select b from t1_ncols union all select a from t2_ncols; +NOTICE: One or more columns in the following table(s) do not have statistics: t2_ncols +HINT: For non-partitioned tables, run analyze (). For partitioned tables, run analyze rootpartition (). See log for columns missing statistics. b ---- - 4 - 2 11 + 2 + 4 (3 rows) select a+100, b, d from t1_ncols union select b, a+200, d from t2_ncols order by 1; +NOTICE: One or more columns in the following table(s) do not have statistics: t2_ncols +HINT: For non-partitioned tables, run analyze (). For partitioned tables, run analyze rootpartition (). See log for columns missing statistics. ?column? | b | d ----------+-----+------------ 22 | 202 | 02-02-2002 @@ -2095,15 +2096,19 @@ select a+100, b, d from t1_ncols union select b, a+200, d from t2_ncols order by (3 rows) select c, a from v1_ncols; +NOTICE: One or more columns in the following table(s) do not have statistics: t2_ncols +HINT: For non-partitioned tables, run analyze (). For partitioned tables, run analyze rootpartition (). See log for columns missing statistics. c | a ------+--- one | 1 - four | 4 two | 2 + four | 4 (3 rows) with cte1(aa, b, c, d) as (select a*100, b, c, d from t1_ncols union select * from t2_ncols) select x.aa/100 aaa, x.c, y.c from cte1 x join cte1 y on x.aa=y.aa; +NOTICE: One or more columns in the following table(s) do not have statistics: t2_ncols +HINT: For non-partitioned tables, run analyze (). For partitioned tables, run analyze rootpartition (). See log for columns missing statistics. aaa | c | c -----+------+------ 0 | two | two @@ -2122,13 +2127,13 @@ NOTICE: schema "union_schema" does not exist, skipping -- end_ignore create schema union_schema; create table union_schema.t1(a int, b int); -NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. create table union_schema.t2(a int, b int); -NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. create table union_schema.t3(a int, b int); -NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table. +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. set allow_system_table_mods = on; update gp_distribution_policy set numsegments = 1 @@ -2188,8 +2193,8 @@ INFO: GPORCA failed to produce a plan, falling back to Postgres-based planner DETAIL: Unknown error: Partially Distributed Data QUERY PLAN ----------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=1.23..1472.30 rows=86130 width=8) - -> Append (cost=1.23..323.90 rows=28710 width=8) + Gather Motion 3:1 (slice1; segments: 3) (cost=1.23..1615.85 rows=86130 width=8) + -> Append (cost=1.23..467.45 rows=28710 width=8) -> Hash Join (cost=1.23..2.80 rows=10 width=8) Hash Cond: (t2.b = t1.a) -> Redistribute Motion 2:3 (slice2; segments: 2) (cost=0.00..1.40 rows=20 width=4) @@ -2208,6 +2213,8 @@ INFO: GPORCA failed to produce a plan, falling back to Postgres-based planner DETAIL: Unknown error: Partially Distributed Data a | b | a | b ----+----+----+---- + 1 | 1 | 1 | 1 + 5 | 5 | 5 | 5 2 | 2 | 2 | 2 3 | 3 | 3 | 3 4 | 4 | 4 | 4 @@ -2216,8 +2223,6 @@ DETAIL: Unknown error: Partially Distributed Data 8 | 8 | 8 | 8 9 | 9 | 9 | 9 10 | 10 | 10 | 10 - 1 | 1 | 1 | 1 - 5 | 5 | 5 | 5 (10 rows) select union_schema.t1.a, union_schema.t2.b @@ -2229,6 +2234,8 @@ INFO: GPORCA failed to produce a plan, falling back to Postgres-based planner DETAIL: Unknown error: Partially Distributed Data a | b ----+---- + 1 | 1 + 5 | 5 2 | 2 3 | 3 4 | 4 @@ -2237,8 +2244,6 @@ DETAIL: Unknown error: Partially Distributed Data 8 | 8 9 | 9 10 | 10 - 1 | 1 - 5 | 5 (10 rows) truncate union_schema.t1, union_schema.t2; @@ -2276,8 +2281,8 @@ INFO: GPORCA failed to produce a plan, falling back to Postgres-based planner DETAIL: Unknown error: Partially Distributed Data QUERY PLAN ----------------------------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=1.32..1472.20 rows=86130 width=8) - -> Append (cost=1.32..323.80 rows=28710 width=8) + Gather Motion 3:1 (slice1; segments: 3) (cost=1.32..1615.75 rows=86130 width=8) + -> Append (cost=1.32..467.35 rows=28710 width=8) -> Hash Join (cost=1.32..2.70 rows=10 width=8) Hash Cond: (t1.a = t2.b) -> Seq Scan on t1 (cost=0.00..1.20 rows=20 width=4) @@ -2340,6 +2345,8 @@ reset allow_system_table_mods; create table rep (a int) distributed replicated; insert into rep select i from generate_series (1, 10) i; create table dist (a int); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. insert into dist select i from generate_series (1, 1000) i; analyze dist; analyze rep; @@ -2352,7 +2359,7 @@ explain select a from rep union all select a from dist; Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..862.03 rows=1010 width=4) -> Append (cost=0.00..862.01 rows=337 width=4) -> Result (cost=0.00..431.00 rows=4 width=4) - One-Time Filter: (gp_execution_segment() = 2) + One-Time Filter: (gp_execution_segment() = 0) -> Seq Scan on rep (cost=0.00..431.00 rows=10 width=4) -> Seq Scan on dist (cost=0.00..431.01 rows=334 width=4) Optimizer: GPORCA @@ -2368,12 +2375,12 @@ analyze rand; explain select i from generate_series(1,1000) i union all select a from rand; QUERY PLAN ---------------------------------------------------------------------------------------- - Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.28 rows=11000 width=4) + Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..431.29 rows=11000 width=4) -> Append (cost=0.00..431.12 rows=3667 width=4) -> Result (cost=0.00..0.01 rows=334 width=4) - One-Time Filter: (gp_execution_segment() = 2) + One-Time Filter: (gp_execution_segment() = 0) -> Function Scan on generate_series (cost=0.00..0.00 rows=334 width=4) - -> Seq Scan on rand (cost=0.00..431.06 rows=3334 width=4) + -> Seq Scan on rand (cost=0.00..431.07 rows=3334 width=4) Optimizer: GPORCA (7 rows) @@ -2460,7 +2467,7 @@ DETAIL: Falling back to Postgres-based planner because GPORCA does not support -> Gather Motion 3:1 (slice2; segments: 3) -> Subquery Scan on "*SELECT* 2" -> Seq Scan on p1_1240 - Optimizer: Postgres-based planner + Optimizer: Postgres query optimizer (11 rows) with result as (update r_1240 set a = a +1 where a < 5 returning *) select * from result except select * from p1_1240; @@ -2475,6 +2482,43 @@ DETAIL: Falling back to Postgres-based planner because GPORCA does not support drop table r_1240; drop table p1_1240; -- +-- Test CTAS with UNION ALL when branches have different typmods (issue #1431). +-- ORCA should resolve the output column type to character varying (no length), +-- same as the Postgres planner, instead of picking the first branch's typmod. +-- +create table union_ctas_t1(id int, name varchar(1)); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +create table union_ctas_t2(id int, name varchar(2)); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into union_ctas_t1 values (1, 'a'); +insert into union_ctas_t2 values (1, 'ab'); +create table union_ctas_result as + (select id, name from union_ctas_t1) + union all + (select id, name from union_ctas_t2); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause. Creating a NULL policy entry. +-- name column should be "character varying" without length, not varchar(1) +select atttypmod from pg_attribute +where attrelid = 'union_ctas_result'::regclass and attname = 'name'; +INFO: GPORCA failed to produce a plan, falling back to Postgres-based planner +DETAIL: Falling back to Postgres-based planner because GPORCA does not support the following feature: Queries on master-only tables + atttypmod +----------- + -1 +(1 row) + +-- data should not be truncated +select * from union_ctas_result order by name; + id | name +----+------ + 1 | a + 1 | ab +(2 rows) + +drop table union_ctas_t1, union_ctas_t2, union_ctas_result; +-- -- Clean up -- DROP TABLE IF EXISTS T_a1 CASCADE; diff --git a/src/test/regress/sql/union_gp.sql b/src/test/regress/sql/union_gp.sql index e7cac952704..9e9e5c3a815 100644 --- a/src/test/regress/sql/union_gp.sql +++ b/src/test/regress/sql/union_gp.sql @@ -721,9 +721,32 @@ drop table r_1240; drop table p1_1240; -- --- Clean up +-- Test CTAS with UNION ALL when branches have different typmods (issue #1431). +-- ORCA should resolve the output column type to character varying (no length), +-- same as the Postgres planner, instead of picking the first branch's typmod. -- +create table union_ctas_t1(id int, name varchar(1)); +create table union_ctas_t2(id int, name varchar(2)); +insert into union_ctas_t1 values (1, 'a'); +insert into union_ctas_t2 values (1, 'ab'); + +create table union_ctas_result as + (select id, name from union_ctas_t1) + union all + (select id, name from union_ctas_t2); + +-- name column should be "character varying" without length, not varchar(1) +select atttypmod from pg_attribute +where attrelid = 'union_ctas_result'::regclass and attname = 'name'; + +-- data should not be truncated +select * from union_ctas_result order by name; +drop table union_ctas_t1, union_ctas_t2, union_ctas_result; + +-- +-- Clean up +-- DROP TABLE IF EXISTS T_a1 CASCADE; DROP TABLE IF EXISTS T_b2 CASCADE; DROP TABLE IF EXISTS T_random CASCADE;