From a812ed25162b8bd0685c1d8137c7abcfade4824f Mon Sep 17 00:00:00 2001 From: ludamad Date: Thu, 19 Feb 2026 19:45:05 +0000 Subject: [PATCH 1/2] type: description From a4fe074d69dc1be147edcf2b0210fa6b943ba65c Mon Sep 17 00:00:00 2001 From: ludamad Date: Thu, 19 Feb 2026 23:16:57 +0000 Subject: [PATCH 2/2] feat: parallelize serial bottlenecks in trace construction pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parallelize six serial bottlenecks identified by BB_BENCH profiling: 1. construct_trace_data (trace_to_polynomials.cpp): Parallelize wire writes and selector writes per-block. Stage copy cycle entries per-thread to avoid races, then merge them sequentially after each block so the original copy-cycle ordering is preserved. 2. compute_permutation_mapping cycle loop (permutation_lib.hpp): Each cycle writes to disjoint (row, col) positions — wrap in parallel_for_heuristic. 3. construct_lookup_read_counts (composer_lib.hpp): Pre-compute table offsets, process tables in parallel via parallel_for. 4. construct_lookup_table_polynomials (composer_lib.hpp): Same offset pre-computation pattern, parallel table data copy. 5. construct_databus_polynomials (prover_instance.cpp): Parallelize all four databus column loops (calldata, secondary_calldata, return_data, databus_id). 6. add_ram_rom_memory_records_to_wire_4 (oink_prover.cpp): Parallelize both read and write record loops — each gate_idx is unique. 
--- .../honk/composer/composer_lib.hpp | 54 ++++++++----- .../honk/composer/permutation_lib.hpp | 79 ++++++++++--------- .../trace_to_polynomials.cpp | 72 +++++++++++------ .../barretenberg/ultra_honk/oink_prover.cpp | 39 +++++---- .../ultra_honk/prover_instance.cpp | 46 ++++++----- 5 files changed, 180 insertions(+), 110 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/honk/composer/composer_lib.hpp b/barretenberg/cpp/src/barretenberg/honk/composer/composer_lib.hpp index 9375c62c33a2..f1ceffa4efae 100644 --- a/barretenberg/cpp/src/barretenberg/honk/composer/composer_lib.hpp +++ b/barretenberg/cpp/src/barretenberg/honk/composer/composer_lib.hpp @@ -6,6 +6,7 @@ #pragma once #include "barretenberg/common/ref_array.hpp" +#include "barretenberg/common/thread.hpp" #include "barretenberg/flavor/flavor.hpp" #include "barretenberg/stdlib_circuit_builders/plookup_tables/types.hpp" @@ -24,16 +25,27 @@ template void construct_lookup_table_polynomials(const RefArray& table_polynomials, const typename Flavor::CircuitBuilder& circuit) { + // Pre-compute cumulative offsets, then process tables in parallel (disjoint offset ranges) + const auto& tables = circuit.get_lookup_tables(); + const size_t num_tables = tables.size(); + + std::vector table_offsets(num_tables); size_t offset = 0; - for (const auto& table : circuit.get_lookup_tables()) { + for (size_t i = 0; i < num_tables; i++) { + table_offsets[i] = offset; + offset += tables[i].size(); + } + + parallel_for(num_tables, [&](size_t table_idx) { + const auto& table = tables[table_idx]; + size_t tbl_offset = table_offsets[table_idx]; for (size_t i = 0; i < table.size(); ++i) { - table_polynomials[0].at(offset) = table.column_1[i]; - table_polynomials[1].at(offset) = table.column_2[i]; - table_polynomials[2].at(offset) = table.column_3[i]; - table_polynomials[3].at(offset) = table.table_index; - offset++; + table_polynomials[0].at(tbl_offset + i) = table.column_1[i]; + table_polynomials[1].at(tbl_offset + i) = 
table.column_2[i]; + table_polynomials[2].at(tbl_offset + i) = table.column_3[i]; + table_polynomials[3].at(tbl_offset + i) = table.table_index; } - } + }); } /** @@ -48,25 +60,31 @@ void construct_lookup_read_counts(typename Flavor::Polynomial& read_counts, typename Flavor::Polynomial& read_tags, typename Flavor::CircuitBuilder& circuit) { - // loop over all tables used in the circuit; each table contains data about the lookups made on it + // Process each table independently in parallel (each table writes to disjoint offset ranges) + auto& tables = circuit.get_lookup_tables(); + const size_t num_tables = tables.size(); + + // Pre-compute cumulative offsets (sequential, trivially fast) + std::vector table_offsets(num_tables); size_t table_offset = 0; - for (auto& table : circuit.get_lookup_tables()) { + for (size_t i = 0; i < num_tables; i++) { + table_offsets[i] = table_offset; + table_offset += tables[i].size(); + } + + // Process each table independently + parallel_for(num_tables, [&](size_t table_idx) { + auto& table = tables[table_idx]; table.initialize_index_map(); for (auto& gate_data : table.lookup_gates) { - // convert lookup gate data to an array of three field elements, one for each of the 3 columns auto table_entry = gate_data.to_table_components(table.use_twin_keys); - - // find the index of the entry in the table auto index_in_table = table.index_map[table_entry]; - - // increment the read count at the corresponding index in the full polynomial - size_t index_in_poly = table_offset + index_in_table; + size_t index_in_poly = table_offsets[table_idx] + index_in_table; read_counts.at(index_in_poly)++; - read_tags.at(index_in_poly) = 1; // tag is 1 if entry has been read 1 or more times + read_tags.at(index_in_poly) = 1; } - table_offset += table.size(); // set the offset of the next table within the polynomials - } + }); } } // namespace bb diff --git a/barretenberg/cpp/src/barretenberg/honk/composer/permutation_lib.hpp 
b/barretenberg/cpp/src/barretenberg/honk/composer/permutation_lib.hpp index 619843b7753a..3aefc9c4789c 100644 --- a/barretenberg/cpp/src/barretenberg/honk/composer/permutation_lib.hpp +++ b/barretenberg/cpp/src/barretenberg/honk/composer/permutation_lib.hpp @@ -15,6 +15,7 @@ #include "barretenberg/common/assert.hpp" #include "barretenberg/common/ref_span.hpp" +#include "barretenberg/common/thread.hpp" #include "barretenberg/flavor/flavor.hpp" #include "barretenberg/polynomials/polynomial.hpp" @@ -131,44 +132,48 @@ PermutationMapping compute_permutation_mapping( // Represents the idx of a variable in circuit_constructor.variables std::span real_variable_tags = circuit_constructor.real_variable_tags; - // Go through each cycle - for (size_t cycle_idx = 0; cycle_idx < wire_copy_cycles.size(); ++cycle_idx) { - // We go through the cycle and fill-out/modify `mapping`. Following the generalized permutation algorithm, we - // take separate care of first/last node handling. - const CyclicPermutation& cycle = wire_copy_cycles[cycle_idx]; - const auto cycle_size = cycle.size(); - if (cycle_size == 0) { - continue; - } - - const cycle_node& first_node = cycle[0]; - const cycle_node& last_node = cycle[cycle_size - 1]; - - const auto first_row = static_cast(first_node.gate_idx); - const auto first_col = first_node.wire_idx; - const auto last_row = static_cast(last_node.gate_idx); - const auto last_col = last_node.wire_idx; - - // First node: id gets tagged with the cycle's variable tag - mapping.ids[first_col].is_tag[first_row] = true; - mapping.ids[first_col].row_idx[first_row] = real_variable_tags[cycle_idx]; - - // Last node: sigma gets tagged and points to tau(tag) instead of wrapping to first node - mapping.sigmas[last_col].is_tag[last_row] = true; - mapping.sigmas[last_col].row_idx[last_row] = circuit_constructor.tau().at(real_variable_tags[cycle_idx]); - - // All nodes except the last: sigma points to the next node in the cycle - for (size_t node_idx = 0; node_idx + 1 < 
cycle_size; ++node_idx) { - const cycle_node& current_node = cycle[node_idx]; - const cycle_node& next_node = cycle[node_idx + 1]; + // Go through each cycle (parallelizable: each cycle writes to disjoint positions) + parallel_for_heuristic( + wire_copy_cycles.size(), + [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) { + for (size_t cycle_idx = start; cycle_idx < end; ++cycle_idx) { + const CyclicPermutation& cycle = wire_copy_cycles[cycle_idx]; + const auto cycle_size = cycle.size(); + if (cycle_size == 0) { + continue; + } - const auto current_row = static_cast(current_node.gate_idx); - const auto current_col = current_node.wire_idx; - // Point current node to next node. - mapping.sigmas[current_col].row_idx[current_row] = next_node.gate_idx; - mapping.sigmas[current_col].col_idx[current_row] = static_cast(next_node.wire_idx); - } - } + const cycle_node& first_node = cycle[0]; + const cycle_node& last_node = cycle[cycle_size - 1]; + + const auto first_row = static_cast(first_node.gate_idx); + const auto first_col = first_node.wire_idx; + const auto last_row = static_cast(last_node.gate_idx); + const auto last_col = last_node.wire_idx; + + // First node: id gets tagged with the cycle's variable tag + mapping.ids[first_col].is_tag[first_row] = true; + mapping.ids[first_col].row_idx[first_row] = real_variable_tags[cycle_idx]; + + // Last node: sigma gets tagged and points to tau(tag) instead of wrapping to first node + mapping.sigmas[last_col].is_tag[last_row] = true; + mapping.sigmas[last_col].row_idx[last_row] = + circuit_constructor.tau().at(real_variable_tags[cycle_idx]); + + // All nodes except the last: sigma points to the next node in the cycle + for (size_t node_idx = 0; node_idx + 1 < cycle_size; ++node_idx) { + const cycle_node& current_node = cycle[node_idx]; + const cycle_node& next_node = cycle[node_idx + 1]; + + const auto current_row = static_cast(current_node.gate_idx); + const auto current_col = current_node.wire_idx; + // Point current 
node to next node. + mapping.sigmas[current_col].row_idx[current_row] = next_node.gate_idx; + mapping.sigmas[current_col].col_idx[current_row] = static_cast(next_node.wire_idx); + } + } + }, + thread_heuristics::FF_COPY_COST * 5); // Add information about public inputs so that the cycles can be altered later; See the construction of the // permutation polynomials for details. This _only_ effects sigma_0, the 0th sigma polynomial, as the structure of diff --git a/barretenberg/cpp/src/barretenberg/trace_to_polynomials/trace_to_polynomials.cpp b/barretenberg/cpp/src/barretenberg/trace_to_polynomials/trace_to_polynomials.cpp index f781aa22f8df..1d7c12266f3a 100644 --- a/barretenberg/cpp/src/barretenberg/trace_to_polynomials/trace_to_polynomials.cpp +++ b/barretenberg/cpp/src/barretenberg/trace_to_polynomials/trace_to_polynomials.cpp @@ -5,6 +5,7 @@ // ===================== #include "trace_to_polynomials.hpp" +#include "barretenberg/common/thread.hpp" #include "barretenberg/constants.hpp" #include "barretenberg/ext/starknet/flavor/ultra_starknet_flavor.hpp" #include "barretenberg/ext/starknet/flavor/ultra_starknet_zk_flavor.hpp" @@ -51,40 +52,67 @@ std::vector TraceToPolynomials::populate_wires_and_se RefArray wires = polynomials.get_wires(); auto selectors = polynomials.get_selectors(); + // Per-thread staging for copy cycle entries to avoid races on copy_cycles[real_var_idx] + using CycleEntry = std::pair; + const size_t num_threads = get_num_cpus(); + std::vector> thread_cycle_entries(num_threads); + // For each block in the trace, populate wire polys, copy cycles and selector polys for (auto& block : builder.blocks.get()) { const uint32_t offset = block.trace_offset(); const uint32_t block_size = static_cast(block.size()); - // Update wire polynomials and copy cycles - // NB: The order of row/column loops is arbitrary but needs to be row/column to match old copy_cycle code + // Parallel wire writes + copy cycle staging + // Wire writes are safe: each (wire_idx, 
trace_row_idx) is unique within a block. + // Copy cycles are staged per-thread and merged per-block to preserve the original + // cross-block ordering (which determines sigma/id polynomials and thus the VK). { BB_BENCH_NAME("populating wires and copy_cycles"); - for (uint32_t block_row_idx = 0; block_row_idx < block_size; ++block_row_idx) { - for (uint32_t wire_idx = 0; wire_idx < NUM_WIRES; ++wire_idx) { - uint32_t var_idx = block.wires[wire_idx][block_row_idx]; // an index into the variables array - // Use .at() for bounds checking - fuzzer found OOB with malformed ACIR - uint32_t real_var_idx = builder.real_variable_index.at(var_idx); - uint32_t trace_row_idx = block_row_idx + offset; - // Insert the real witness values from this block into the wire polys at the correct offset - wires[wire_idx].at(trace_row_idx) = builder.get_variable(var_idx); - // Add the address of the witness value to its corresponding copy cycle - // Note that the copy_cycles are indexed by real_variable_indices. 
- copy_cycles[real_var_idx].emplace_back(cycle_node{ wire_idx, trace_row_idx }); + for (auto& staged : thread_cycle_entries) { + staged.clear(); + } + + parallel_for_heuristic( + block_size, + [&](size_t start, size_t end, size_t chunk_index) { + auto& staged = thread_cycle_entries[chunk_index]; + for (size_t row = start; row < end; ++row) { + uint32_t block_row_idx = static_cast(row); + for (uint32_t wire_idx = 0; wire_idx < NUM_WIRES; ++wire_idx) { + uint32_t var_idx = block.wires[wire_idx][block_row_idx]; + uint32_t real_var_idx = builder.real_variable_index.at(var_idx); + uint32_t trace_row_idx = block_row_idx + offset; + wires[wire_idx].at(trace_row_idx) = builder.get_variable(var_idx); + staged.emplace_back(real_var_idx, cycle_node{ wire_idx, trace_row_idx }); + } + } + }, + thread_heuristics::FF_COPY_COST * NUM_WIRES * 4); + + // Merge this block's staged entries into copy_cycles (preserves block ordering) + for (auto& staged : thread_cycle_entries) { + for (auto& [real_var_idx, node] : staged) { + copy_cycles[real_var_idx].emplace_back(node); } } } - RefVector> block_selectors = block.get_selectors(); - // Insert the selector values for this block into the selector polynomials at the correct offset - // TODO(https://github.com/AztecProtocol/barretenberg/issues/398): implicit arithmetization/flavor consistency - for (size_t selector_idx = 0; selector_idx < block_selectors.size(); selector_idx++) { - auto& selector = block_selectors[selector_idx]; - for (size_t row_idx = 0; row_idx < block_size; ++row_idx) { - size_t trace_row_idx = row_idx + offset; - selectors[selector_idx].set_if_valid_index(trace_row_idx, selector[row_idx]); - } + // Parallel selector writes (each trace_row_idx is unique within a block) + { + RefVector> block_selectors = block.get_selectors(); + parallel_for_heuristic( + block_size, + [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) { + for (size_t row_idx = start; row_idx < end; ++row_idx) { + size_t trace_row_idx = row_idx + 
offset; + for (size_t selector_idx = 0; selector_idx < block_selectors.size(); selector_idx++) { + selectors[selector_idx].set_if_valid_index(trace_row_idx, + block_selectors[selector_idx][row_idx]); + } + } + }, + thread_heuristics::FF_COPY_COST * block_selectors.size()); } } diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp b/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp index 65608330b60b..8f9d107b43d4 100644 --- a/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp +++ b/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp @@ -6,6 +6,7 @@ #include "barretenberg/ultra_honk/oink_prover.hpp" #include "barretenberg/common/bb_bench.hpp" +#include "barretenberg/common/thread.hpp" #include "barretenberg/flavor/mega_avm_flavor.hpp" #include "barretenberg/honk/library/grand_product_delta.hpp" #include "barretenberg/honk/library/grand_product_library.hpp" @@ -219,20 +220,30 @@ template void OinkProver::add_ram_rom_memory_records_t const auto& eta_two = instance.relation_parameters.eta_two; const auto& eta_three = instance.relation_parameters.eta_three; - // Compute read record values - for (const auto& gate_idx : instance.memory_read_records) { - wires[3].at(gate_idx) = wires[2][gate_idx] * eta_three; - wires[3].at(gate_idx) += wires[1][gate_idx] * eta_two; - wires[3].at(gate_idx) += wires[0][gate_idx] * eta; - } - - // Compute write record values - for (const auto& gate_idx : instance.memory_write_records) { - wires[3].at(gate_idx) = wires[2][gate_idx] * eta_three; - wires[3].at(gate_idx) += wires[1][gate_idx] * eta_two; - wires[3].at(gate_idx) += wires[0][gate_idx] * eta; - wires[3].at(gate_idx) += 1; - } + // Compute read record values (each gate_idx is unique — safe to parallelize) + const auto& read_records = instance.memory_read_records; + parallel_for_heuristic( + read_records.size(), + [&](size_t i) { + const auto gate_idx = read_records[i]; + wires[3].at(gate_idx) = wires[2][gate_idx] * eta_three; + 
wires[3].at(gate_idx) += wires[1][gate_idx] * eta_two; + wires[3].at(gate_idx) += wires[0][gate_idx] * eta; + }, + thread_heuristics::FF_MULTIPLICATION_COST * 3); + + // Compute write record values (each gate_idx is unique — safe to parallelize) + const auto& write_records = instance.memory_write_records; + parallel_for_heuristic( + write_records.size(), + [&](size_t i) { + const auto gate_idx = write_records[i]; + wires[3].at(gate_idx) = wires[2][gate_idx] * eta_three; + wires[3].at(gate_idx) += wires[1][gate_idx] * eta_two; + wires[3].at(gate_idx) += wires[0][gate_idx] * eta; + wires[3].at(gate_idx) += 1; + }, + thread_heuristics::FF_MULTIPLICATION_COST * 3); } /** diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp b/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp index bdbefac3fac6..c4e00d65df91 100644 --- a/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp +++ b/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp @@ -8,6 +8,7 @@ #include "barretenberg/common/assert.hpp" #include "barretenberg/common/bb_bench.hpp" #include "barretenberg/common/log.hpp" +#include "barretenberg/common/thread.hpp" #include "barretenberg/common/throw_or_abort.hpp" #include "barretenberg/flavor/mega_avm_flavor.hpp" #include "barretenberg/honk/composer/composer_lib.hpp" @@ -303,28 +304,35 @@ void ProverInstance_::construct_databus_polynomials(Circuit& circuit) // Note: Databus columns start from index 0. If this ever changes, make sure to also update the active range // construction in ExecutionTraceUsageTracker::update(). We do not utilize a zero row for databus columns. - for (size_t idx = 0; idx < calldata.size(); ++idx) { - calldata_poly.at(idx) = circuit.get_variable(calldata[idx]); // calldata values - calldata_read_counts.at(idx) = calldata.get_read_count(idx); // read counts - calldata_read_tags.at(idx) = calldata_read_counts[idx] > 0 ? 
1 : 0; // has row been read or not - } - for (size_t idx = 0; idx < secondary_calldata.size(); ++idx) { - secondary_calldata_poly.at(idx) = circuit.get_variable(secondary_calldata[idx]); // secondary_calldata values - secondary_calldata_read_counts.at(idx) = secondary_calldata.get_read_count(idx); // read counts - secondary_calldata_read_tags.at(idx) = - secondary_calldata_read_counts[idx] > 0 ? 1 : 0; // has row been read or not - } - for (size_t idx = 0; idx < return_data.size(); ++idx) { - return_data_poly.at(idx) = circuit.get_variable(return_data[idx]); // return data values - return_data_read_counts.at(idx) = return_data.get_read_count(idx); // read counts - return_data_read_tags.at(idx) = return_data_read_counts[idx] > 0 ? 1 : 0; // has row been read or not - } + // Each databus column writes to disjoint polynomial ranges; parallelize per-column. + parallel_for_heuristic( + calldata.size(), + [&](size_t idx) { + calldata_poly.at(idx) = circuit.get_variable(calldata[idx]); + calldata_read_counts.at(idx) = calldata.get_read_count(idx); + calldata_read_tags.at(idx) = calldata_read_counts[idx] > 0 ? 1 : 0; + }, + thread_heuristics::FF_COPY_COST * 3); + parallel_for_heuristic( + secondary_calldata.size(), + [&](size_t idx) { + secondary_calldata_poly.at(idx) = circuit.get_variable(secondary_calldata[idx]); + secondary_calldata_read_counts.at(idx) = secondary_calldata.get_read_count(idx); + secondary_calldata_read_tags.at(idx) = secondary_calldata_read_counts[idx] > 0 ? 1 : 0; + }, + thread_heuristics::FF_COPY_COST * 3); + parallel_for_heuristic( + return_data.size(), + [&](size_t idx) { + return_data_poly.at(idx) = circuit.get_variable(return_data[idx]); + return_data_read_counts.at(idx) = return_data.get_read_count(idx); + return_data_read_tags.at(idx) = return_data_read_counts[idx] > 0 ? 
1 : 0; + }, + thread_heuristics::FF_COPY_COST * 3); auto& databus_id = polynomials.databus_id; // Compute a simple identity polynomial for use in the databus lookup argument - for (size_t i = 0; i < databus_id.size(); ++i) { - databus_id.at(i) = i; - } + parallel_for_heuristic(databus_id.size(), [&](size_t i) { databus_id.at(i) = i; }, thread_heuristics::FF_COPY_COST); } /**