Skip to content

Commit ade1c9b

Browse files
authored
Merge branch 'main' into mergeYagpHooks
2 parents f26c287 + cb61a95 commit ade1c9b

File tree

22 files changed

+1138
-301
lines changed

22 files changed

+1138
-301
lines changed

contrib/pax_storage/expected/cbdb_parallel.out

Lines changed: 122 additions & 61 deletions
Large diffs are not rendered by default.

doc/src/sgml/protocol.sgml

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1526,10 +1526,10 @@ SELCT 1/0;<!-- this typo is intentional -->
15261526

15271527
<para>
15281528
The frontend should also be prepared to handle an ErrorMessage
1529-
response to SSLRequest from the server. This would only occur if
1530-
the server predates the addition of <acronym>SSL</acronym> support
1531-
to <productname>PostgreSQL</productname>. (Such servers are now very ancient,
1532-
and likely do not exist in the wild anymore.)
1529+
response to SSLRequest from the server. The frontend should not display
1530+
this error message to the user/application, since the server has not been
1531+
authenticated
1532+
(<ulink url="https://www.postgresql.org/support/security/CVE-2024-10977/">CVE-2024-10977</ulink>).
15331533
In this case the connection must
15341534
be closed, but the frontend might choose to open a fresh connection
15351535
and proceed without requesting <acronym>SSL</acronym>.
@@ -1603,12 +1603,13 @@ SELCT 1/0;<!-- this typo is intentional -->
16031603

16041604
<para>
16051605
The frontend should also be prepared to handle an ErrorMessage
1606-
response to GSSENCRequest from the server. This would only occur if
1607-
the server predates the addition of <acronym>GSSAPI</acronym> encryption
1608-
support to <productname>PostgreSQL</productname>. In this case the
1609-
connection must be closed, but the frontend might choose to open a fresh
1610-
connection and proceed without requesting <acronym>GSSAPI</acronym>
1611-
encryption.
1606+
response to GSSENCRequest from the server. The frontend should not display
1607+
this error message to the user/application, since the server has not been
1608+
authenticated
1609+
(<ulink url="https://www.postgresql.org/support/security/CVE-2024-10977/">CVE-2024-10977</ulink>).
1610+
In this case the connection must be closed, but the frontend might choose
1611+
to open a fresh connection and proceed without requesting
1612+
<acronym>GSSAPI</acronym> encryption.
16121613
</para>
16131614

16141615
<para>

src/backend/cdb/cdbpath.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3112,8 +3112,9 @@ cdbpath_motion_for_parallel_join(PlannerInfo *root,
31123112
case JOIN_UNIQUE_INNER:
31133113
case JOIN_RIGHT:
31143114
case JOIN_FULL:
3115-
/* Join types are not supported in parallel yet. */
3116-
goto fail;
3115+
outer.ok_to_replicate = false;
3116+
inner.ok_to_replicate = false;
3117+
break;
31173118
case JOIN_DEDUP_SEMI:
31183119
if (!enable_parallel_dedup_semi_join)
31193120
goto fail;

src/backend/cdb/cdbpathlocus.c

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ cdbpathlocus_equal(CdbPathLocus a, CdbPathLocus b)
119119
list_length(a.distkey) != list_length(b.distkey))
120120
return false;
121121

122+
/*
123+
* CBDB_PARALLEL: What if both a and b are HashedOJ with parallel workers > 0 ?
124+
* Are they equal in practice?
125+
*/
126+
122127
if ((CdbPathLocus_IsHashed(a) || CdbPathLocus_IsHashedOJ(a)) &&
123128
(CdbPathLocus_IsHashed(b) || CdbPathLocus_IsHashedOJ(b)))
124129
return cdbpath_distkey_equal(a.distkey, b.distkey);
@@ -544,7 +549,7 @@ cdbpathlocus_from_subquery(struct PlannerInfo *root,
544549
else
545550
{
546551
Assert(CdbPathLocus_IsHashedOJ(subpath->locus));
547-
CdbPathLocus_MakeHashedOJ(&locus, distkeys, numsegments);
552+
CdbPathLocus_MakeHashedOJ(&locus, distkeys, numsegments, subpath->locus.parallel_workers);
548553
}
549554
}
550555
else
@@ -711,7 +716,7 @@ cdbpathlocus_pull_above_projection(struct PlannerInfo *root,
711716
CdbPathLocus_MakeHashedWorkers(&newlocus, newdistkeys, numsegments, locus.parallel_workers);
712717
}
713718
else
714-
CdbPathLocus_MakeHashedOJ(&newlocus, newdistkeys, numsegments);
719+
CdbPathLocus_MakeHashedOJ(&newlocus, newdistkeys, numsegments, locus.parallel_workers);
715720
return newlocus;
716721
}
717722
else
@@ -880,7 +885,7 @@ cdbpathlocus_join(JoinType jointype, CdbPathLocus a, CdbPathLocus b)
880885

881886
newdistkeys = lappend(newdistkeys, newdistkey);
882887
}
883-
CdbPathLocus_MakeHashedOJ(&resultlocus, newdistkeys, numsegments);
888+
CdbPathLocus_MakeHashedOJ(&resultlocus, newdistkeys, numsegments, 0 /* Both are 0 parallel here*/);
884889
}
885890
Assert(cdbpathlocus_is_valid(resultlocus));
886891
return resultlocus;
@@ -1236,8 +1241,14 @@ cdbpathlocus_parallel_join(JoinType jointype, CdbPathLocus a, CdbPathLocus b, bo
12361241
Assert(cdbpathlocus_is_valid(a));
12371242
Assert(cdbpathlocus_is_valid(b));
12381243

1239-
/* Do both input rels have same locus? */
1240-
if (cdbpathlocus_equal(a, b))
1244+
/*
1245+
* Do both input rels have same locus?
1246+
* CBDB_PARALLEL: for FULL JOIN, it could be different even if both
1247+
* have the same locus, because the NULL values could be on any segments
1248+
* after join.
1249+
*/
1250+
1251+
if (jointype != JOIN_FULL && cdbpathlocus_equal(a, b))
12411252
return a;
12421253

12431254
/*
@@ -1412,8 +1423,9 @@ cdbpathlocus_parallel_join(JoinType jointype, CdbPathLocus a, CdbPathLocus b, bo
14121423
* If inner is hashed workers, and outer is hashed. Join locus will be hashed.
14131424
* If outer is hashed workers, and inner is hashed. Join locus will be hashed workers.
14141425
* Seems we should just return outer locus anyway.
1426+
* Things changed since we have parallel full join now.
14151427
*/
1416-
if (parallel_aware)
1428+
if (parallel_aware && jointype != JOIN_FULL)
14171429
return a;
14181430

14191431
numsegments = CdbPathLocus_NumSegments(a);
@@ -1469,7 +1481,9 @@ cdbpathlocus_parallel_join(JoinType jointype, CdbPathLocus a, CdbPathLocus b, bo
14691481
newdistkeys = lappend(newdistkeys, newdistkey);
14701482
}
14711483

1472-
CdbPathLocus_MakeHashedOJ(&resultlocus, newdistkeys, numsegments);
1484+
Assert(CdbPathLocus_NumParallelWorkers(a) == CdbPathLocus_NumParallelWorkers(b));
1485+
1486+
CdbPathLocus_MakeHashedOJ(&resultlocus, newdistkeys, numsegments, CdbPathLocus_NumParallelWorkers(a));
14731487
}
14741488
Assert(cdbpathlocus_is_valid(resultlocus));
14751489
return resultlocus;

src/backend/cdb/motion/cdbmotion.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,8 @@ RemoveMotionLayer(MotionLayerState *mlStates)
133133
/* Emit statistics to log */
134134
if (gp_log_interconnect >= GPVARS_VERBOSITY_VERBOSE)
135135
elog(LOG, "RemoveMotionLayer(): dumping stats\n"
136-
" Sent: %9u chunks %9u total bytes %9u tuple bytes\n"
137-
" Received: %9u chunks %9u total bytes %9u tuple bytes; "
136+
" Sent: %9" INT64_MODIFIER "u chunks %9" INT64_MODIFIER "u total bytes %9" INT64_MODIFIER "u tuple bytes\n"
137+
" Received: %9" INT64_MODIFIER "u chunks %9" INT64_MODIFIER "u total bytes %9" INT64_MODIFIER "u tuple bytes; "
138138
"%9u chunkproc calls\n",
139139
mlStates->stat_total_chunks_sent,
140140
mlStates->stat_total_bytes_sent,

src/backend/executor/nodeHash.c

Lines changed: 175 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2004,6 +2004,7 @@ ExecParallelHashTableInsert(HashJoinTable hashtable,
20042004
/* Store the hash value in the HashJoinTuple header. */
20052005
hashTuple->hashvalue = hashvalue;
20062006
memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
2007+
HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
20072008

20082009
/* Push it onto the front of the bucket's list */
20092010
ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
@@ -2388,6 +2389,69 @@ ExecPrepHashTableForUnmatched(HashJoinState *hjstate)
23882389
hjstate->hj_CurTuple = NULL;
23892390
}
23902391

2392+
/*
2393+
* Decide if this process is allowed to run the unmatched scan. If so, the
2394+
* batch barrier is advanced to PHJ_BATCH_SCAN and true is returned.
2395+
* Otherwise the batch is detached and false is returned.
2396+
*/
2397+
bool
2398+
ExecParallelPrepHashTableForUnmatched(HashJoinState *hjstate)
2399+
{
2400+
HashJoinTable hashtable = hjstate->hj_HashTable;
2401+
int curbatch = hashtable->curbatch;
2402+
ParallelHashJoinBatch *batch = hashtable->batches[curbatch].shared;
2403+
2404+
Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBING);
2405+
2406+
/*
2407+
* It would not be deadlock-free to wait on the batch barrier, because it
2408+
* is in PHJ_BATCH_PROBING phase, and thus processes attached to it have
2409+
* already emitted tuples. Therefore, we'll hold a wait-free election:
2410+
* only one process can continue to the next phase, and all others detach
2411+
* from this batch. They can still go any work on other batches, if there
2412+
* are any.
2413+
*/
2414+
if (!BarrierArriveAndDetachExceptLast(&batch->batch_barrier))
2415+
{
2416+
/* This process considers the batch to be done. */
2417+
hashtable->batches[hashtable->curbatch].done = true;
2418+
2419+
/* Make sure any temporary files are closed. */
2420+
sts_end_parallel_scan(hashtable->batches[curbatch].inner_tuples);
2421+
sts_end_parallel_scan(hashtable->batches[curbatch].outer_tuples);
2422+
2423+
/*
2424+
* Track largest batch we've seen, which would normally happen in
2425+
* ExecHashTableDetachBatch().
2426+
*/
2427+
hashtable->spacePeak =
2428+
Max(hashtable->spacePeak,
2429+
batch->size + sizeof(dsa_pointer_atomic) * hashtable->nbuckets);
2430+
hashtable->curbatch = -1;
2431+
return false;
2432+
}
2433+
2434+
/* Now we are alone with this batch. */
2435+
Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_SCAN);
2436+
Assert(BarrierParticipants(&batch->batch_barrier) == 1);
2437+
2438+
/*
2439+
* Has another process decided to give up early and command all processes
2440+
* to skip the unmatched scan?
2441+
*/
2442+
if (batch->skip_unmatched)
2443+
{
2444+
hashtable->batches[hashtable->curbatch].done = true;
2445+
ExecHashTableDetachBatch(hashtable);
2446+
return false;
2447+
}
2448+
2449+
/* Now prepare the process local state, just as for non-parallel join. */
2450+
ExecPrepHashTableForUnmatched(hjstate);
2451+
2452+
return true;
2453+
}
2454+
23912455
/*
23922456
* ExecScanHashTableForUnmatched
23932457
* scan the hash table for unmatched inner tuples
@@ -2462,6 +2526,72 @@ ExecScanHashTableForUnmatched(HashJoinState *hjstate, ExprContext *econtext)
24622526
return false;
24632527
}
24642528

2529+
/*
2530+
* ExecParallelScanHashTableForUnmatched
2531+
* scan the hash table for unmatched inner tuples, in parallel join
2532+
*
2533+
* On success, the inner tuple is stored into hjstate->hj_CurTuple and
2534+
* econtext->ecxt_innertuple, using hjstate->hj_HashTupleSlot as the slot
2535+
* for the latter.
2536+
*/
2537+
bool
2538+
ExecParallelScanHashTableForUnmatched(HashJoinState *hjstate,
2539+
ExprContext *econtext)
2540+
{
2541+
HashJoinTable hashtable = hjstate->hj_HashTable;
2542+
HashJoinTuple hashTuple = hjstate->hj_CurTuple;
2543+
2544+
for (;;)
2545+
{
2546+
/*
2547+
* hj_CurTuple is the address of the tuple last returned from the
2548+
* current bucket, or NULL if it's time to start scanning a new
2549+
* bucket.
2550+
*/
2551+
if (hashTuple != NULL)
2552+
hashTuple = ExecParallelHashNextTuple(hashtable, hashTuple);
2553+
else if (hjstate->hj_CurBucketNo < hashtable->nbuckets)
2554+
hashTuple = ExecParallelHashFirstTuple(hashtable,
2555+
hjstate->hj_CurBucketNo++);
2556+
else
2557+
break; /* finished all buckets */
2558+
2559+
while (hashTuple != NULL)
2560+
{
2561+
if (!HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(hashTuple)))
2562+
{
2563+
TupleTableSlot *inntuple;
2564+
2565+
/* insert hashtable's tuple into exec slot */
2566+
inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
2567+
hjstate->hj_HashTupleSlot,
2568+
false); /* do not pfree */
2569+
econtext->ecxt_innertuple = inntuple;
2570+
2571+
/*
2572+
* Reset temp memory each time; although this function doesn't
2573+
* do any qual eval, the caller will, so let's keep it
2574+
* parallel to ExecScanHashBucket.
2575+
*/
2576+
ResetExprContext(econtext);
2577+
2578+
hjstate->hj_CurTuple = hashTuple;
2579+
return true;
2580+
}
2581+
2582+
hashTuple = ExecParallelHashNextTuple(hashtable, hashTuple);
2583+
}
2584+
2585+
/* allow this loop to be cancellable */
2586+
CHECK_FOR_INTERRUPTS();
2587+
}
2588+
2589+
/*
2590+
* no more unmatched tuples
2591+
*/
2592+
return false;
2593+
}
2594+
24652595
/*
24662596
* ExecHashTableReset
24672597
*
@@ -3793,6 +3923,7 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable)
37933923
accessor->shared = shared;
37943924
accessor->preallocated = 0;
37953925
accessor->done = false;
3926+
accessor->outer_eof = false;
37963927
accessor->inner_tuples =
37973928
sts_attach(ParallelHashJoinBatchInner(shared),
37983929
hashtable->hjstate->worker_id,
@@ -3838,25 +3969,63 @@ ExecHashTableDetachBatch(HashJoinTable hashtable)
38383969
{
38393970
int curbatch = hashtable->curbatch;
38403971
ParallelHashJoinBatch *batch = hashtable->batches[curbatch].shared;
3972+
bool attached = true;
38413973

38423974
/* Make sure any temporary files are closed. */
38433975
sts_end_parallel_scan(hashtable->batches[curbatch].inner_tuples);
38443976
sts_end_parallel_scan(hashtable->batches[curbatch].outer_tuples);
38453977

3846-
/* Detach from the batch we were last working on. */
3978+
/* After attaching we always get at least to PHJ_BATCH_PROBING. */
3979+
Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBING ||
3980+
BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_SCAN);
3981+
3982+
/*
3983+
* If we're abandoning the PHJ_BATCH_PROBING phase early without having
3984+
* reached the end of it, it means the plan doesn't want any more
3985+
* tuples, and it is happy to abandon any tuples buffered in this
3986+
* process's subplans. For correctness, we can't allow any process to
3987+
* execute the PHJ_BATCH_SCAN phase, because we will never have the
3988+
* complete set of match bits. Therefore we skip emitting unmatched
3989+
* tuples in all backends (if this is a full/right join), as if those
3990+
* tuples were all due to be emitted by this process and it has
3991+
* abandoned them too.
3992+
*/
38473993
/*
38483994
* CBDB_PARALLEL: Parallel Hash Left Anti Semi (Not-In) Join(parallel-aware)
38493995
* If phs_lasj_has_null is true, that means we have found null when building hash table,
38503996
* there were no batches to detach.
38513997
*/
3852-
if (!hashtable->parallel_state->phs_lasj_has_null && BarrierArriveAndDetach(&batch->batch_barrier))
3998+
if (BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBING &&
3999+
!hashtable->parallel_state->phs_lasj_has_null && /* CBDB_PARALLEL */
4000+
!hashtable->batches[curbatch].outer_eof)
4001+
{
4002+
/*
4003+
* This flag may be written to by multiple backends during
4004+
* PHJ_BATCH_PROBING phase, but will only be read in PHJ_BATCH_SCAN
4005+
* phase so requires no extra locking.
4006+
*/
4007+
batch->skip_unmatched = true;
4008+
}
4009+
4010+
/*
4011+
* Even if we aren't doing a full/right outer join, we'll step through
4012+
* the PHJ_BATCH_SCAN phase just to maintain the invariant that
4013+
* freeing happens in PHJ_BATCH_FREE, but that'll be wait-free.
4014+
*/
4015+
if (BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBING &&
4016+
!hashtable->parallel_state->phs_lasj_has_null /* CBDB_PARALLEL */)
4017+
attached = BarrierArriveAndDetachExceptLast(&batch->batch_barrier);
4018+
if (attached && !hashtable->parallel_state->phs_lasj_has_null /* CBDB_PARALLEL */ &&
4019+
BarrierArriveAndDetach(&batch->batch_barrier))
38534020
{
38544021
/*
3855-
* Technically we shouldn't access the barrier because we're no
3856-
* longer attached, but since there is no way it's moving after
3857-
* this point it seems safe to make the following assertion.
4022+
* We are no longer attached to the batch barrier, but we're the
4023+
* process that was chosen to free resources and it's safe to
4024+
* assert the current phase. The ParallelHashJoinBatch can't go
4025+
* away underneath us while we are attached to the build barrier,
4026+
* making this access safe.
38584027
*/
3859-
Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_DONE);
4028+
Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_FREE);
38604029

38614030
/* Free shared chunks and buckets. */
38624031
while (DsaPointerIsValid(batch->chunks))

0 commit comments

Comments
 (0)