From a812ed25162b8bd0685c1d8137c7abcfade4824f Mon Sep 17 00:00:00 2001
From: ludamad <domuradical@gmail.com>
Date: Thu, 19 Feb 2026 19:45:05 +0000
Subject: [PATCH 1/2] type: description


From a4fe074d69dc1be147edcf2b0210fa6b943ba65c Mon Sep 17 00:00:00 2001
From: ludamad <domuradical@gmail.com>
Date: Thu, 19 Feb 2026 23:16:57 +0000
Subject: [PATCH 2/2] feat: parallelize serial bottlenecks in trace
 construction pipeline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Parallelize six serial bottlenecks identified by BB_BENCH profiling:

1. construct_trace_data (trace_to_polynomials.cpp): Parallelize wire writes
   and selector writes per-block. Stage copy cycle entries per-chunk to
   avoid races, then merge them sequentially after each block so the
   original copy-cycle ordering is preserved.

2. compute_permutation_mapping cycle loop (permutation_lib.hpp): Each cycle
   writes to disjoint (row, col) positions — wrap in parallel_for_heuristic.

3. construct_lookup_read_counts (composer_lib.hpp): Pre-compute table offsets,
   process tables in parallel via parallel_for.

4. construct_lookup_table_polynomials (composer_lib.hpp): Same offset
   pre-computation pattern, parallel table data copy.

5. construct_databus_polynomials (prover_instance.cpp): Parallelize all four
   databus column loops (calldata, secondary_calldata, return_data, databus_id).

6. add_ram_rom_memory_records_to_wire_4 (oink_prover.cpp): Parallelize both
   read and write record loops — each gate_idx is unique.
---
 .../honk/composer/composer_lib.hpp            | 54 ++++++++-----
 .../honk/composer/permutation_lib.hpp         | 79 ++++++++++---------
 .../trace_to_polynomials.cpp                  | 72 +++++++++++------
 .../barretenberg/ultra_honk/oink_prover.cpp   | 39 +++++----
 .../ultra_honk/prover_instance.cpp            | 46 ++++++-----
 5 files changed, 180 insertions(+), 110 deletions(-)
diff --git a/barretenberg/cpp/src/barretenberg/honk/composer/composer_lib.hpp b/barretenberg/cpp/src/barretenberg/honk/composer/composer_lib.hpp
index 9375c62c33a2..f1ceffa4efae 100644
--- a/barretenberg/cpp/src/barretenberg/honk/composer/composer_lib.hpp
+++ b/barretenberg/cpp/src/barretenberg/honk/composer/composer_lib.hpp
@@ -6,6 +6,7 @@
 
 #pragma once
 #include "barretenberg/common/ref_array.hpp"
+#include "barretenberg/common/thread.hpp"
 #include "barretenberg/flavor/flavor.hpp"
 #include "barretenberg/stdlib_circuit_builders/plookup_tables/types.hpp"
 
@@ -24,16 +25,27 @@ template <typename Flavor>
 void construct_lookup_table_polynomials(const RefArray<typename Flavor::Polynomial, 4>& table_polynomials,
                                         const typename Flavor::CircuitBuilder& circuit)
 {
+    // Prefix-sum the table sizes into per-table start rows, then fill the tables' disjoint row ranges in parallel
+    const auto& tables = circuit.get_lookup_tables();
+    const size_t num_tables = tables.size();
+
+    std::vector<size_t> table_offsets(num_tables);
     size_t offset = 0;
-    for (const auto& table : circuit.get_lookup_tables()) {
+    for (size_t i = 0; i < num_tables; i++) {
+        table_offsets[i] = offset;
+        offset += tables[i].size();
+    }
+
+    parallel_for(num_tables, [&](size_t table_idx) {
+        const auto& table = tables[table_idx];
+        size_t tbl_offset = table_offsets[table_idx];
         for (size_t i = 0; i < table.size(); ++i) {
-            table_polynomials[0].at(offset) = table.column_1[i];
-            table_polynomials[1].at(offset) = table.column_2[i];
-            table_polynomials[2].at(offset) = table.column_3[i];
-            table_polynomials[3].at(offset) = table.table_index;
-            offset++;
+            table_polynomials[0].at(tbl_offset + i) = table.column_1[i];
+            table_polynomials[1].at(tbl_offset + i) = table.column_2[i];
+            table_polynomials[2].at(tbl_offset + i) = table.column_3[i];
+            table_polynomials[3].at(tbl_offset + i) = table.table_index;
         }
-    }
+    });
 }
 
 /**
@@ -48,25 +60,31 @@ void construct_lookup_read_counts(typename Flavor::Polynomial& read_counts,
                                   typename Flavor::Polynomial& read_tags,
                                   typename Flavor::CircuitBuilder& circuit)
 {
-    // loop over all tables used in the circuit; each table contains data about the lookups made on it
+    // Count lookup reads table by table in parallel; table t writes at table_offsets[t] + index_in_table only
+    auto& tables = circuit.get_lookup_tables();
+    const size_t num_tables = tables.size();
+
+    // Each table's start row is the running sum of the sizes of the tables before it (cheap serial pass)
+    std::vector<size_t> table_offsets(num_tables);
     size_t table_offset = 0;
-    for (auto& table : circuit.get_lookup_tables()) {
+    for (size_t i = 0; i < num_tables; i++) {
+        table_offsets[i] = table_offset;
+        table_offset += tables[i].size();
+    }
+
+    // NOTE(review): tables are race-free only if index_in_table < table.size() for every gate - confirm
+    parallel_for(num_tables, [&](size_t table_idx) {
+        auto& table = tables[table_idx];
         table.initialize_index_map();
 
         for (auto& gate_data : table.lookup_gates) {
-            // convert lookup gate data to an array of three field elements, one for each of the 3 columns
             auto table_entry = gate_data.to_table_components(table.use_twin_keys);
-
-            // find the index of the entry in the table
             auto index_in_table = table.index_map[table_entry];
-
-            // increment the read count at the corresponding index in the full polynomial
-            size_t index_in_poly = table_offset + index_in_table;
+            size_t index_in_poly = table_offsets[table_idx] + index_in_table;
             read_counts.at(index_in_poly)++;
-            read_tags.at(index_in_poly) = 1; // tag is 1 if entry has been read 1 or more times
+            read_tags.at(index_in_poly) = 1; // tag is 1 if entry has been read 1 or more times
         }
-        table_offset += table.size(); // set the offset of the next table within the polynomials
-    }
+    });
 }
 
 } // namespace bb
diff --git a/barretenberg/cpp/src/barretenberg/honk/composer/permutation_lib.hpp b/barretenberg/cpp/src/barretenberg/honk/composer/permutation_lib.hpp
index 619843b7753a..3aefc9c4789c 100644
--- a/barretenberg/cpp/src/barretenberg/honk/composer/permutation_lib.hpp
+++ b/barretenberg/cpp/src/barretenberg/honk/composer/permutation_lib.hpp
@@ -15,6 +15,7 @@
 
 #include "barretenberg/common/assert.hpp"
 #include "barretenberg/common/ref_span.hpp"
+#include "barretenberg/common/thread.hpp"
 #include "barretenberg/flavor/flavor.hpp"
 #include "barretenberg/polynomials/polynomial.hpp"
 
@@ -131,44 +132,48 @@ PermutationMapping<Flavor::NUM_WIRES> compute_permutation_mapping(
     // Represents the idx of a variable in circuit_constructor.variables
     std::span<const uint32_t> real_variable_tags = circuit_constructor.real_variable_tags;
 
-    // Go through each cycle
-    for (size_t cycle_idx = 0; cycle_idx < wire_copy_cycles.size(); ++cycle_idx) {
-        // We go through the cycle and fill-out/modify `mapping`. Following the generalized permutation algorithm, we
-        // take separate care of first/last node handling.
-        const CyclicPermutation& cycle = wire_copy_cycles[cycle_idx];
-        const auto cycle_size = cycle.size();
-        if (cycle_size == 0) {
-            continue;
-        }
-
-        const cycle_node& first_node = cycle[0];
-        const cycle_node& last_node = cycle[cycle_size - 1];
-
-        const auto first_row = static_cast<ptrdiff_t>(first_node.gate_idx);
-        const auto first_col = first_node.wire_idx;
-        const auto last_row = static_cast<ptrdiff_t>(last_node.gate_idx);
-        const auto last_col = last_node.wire_idx;
-
-        // First node: id gets tagged with the cycle's variable tag
-        mapping.ids[first_col].is_tag[first_row] = true;
-        mapping.ids[first_col].row_idx[first_row] = real_variable_tags[cycle_idx];
-
-        // Last node: sigma gets tagged and points to tau(tag) instead of wrapping to first node
-        mapping.sigmas[last_col].is_tag[last_row] = true;
-        mapping.sigmas[last_col].row_idx[last_row] = circuit_constructor.tau().at(real_variable_tags[cycle_idx]);
-
-        // All nodes except the last: sigma points to the next node in the cycle
-        for (size_t node_idx = 0; node_idx + 1 < cycle_size; ++node_idx) {
-            const cycle_node& current_node = cycle[node_idx];
-            const cycle_node& next_node = cycle[node_idx + 1];
+    // Go through each cycle in parallel chunks; safe provided no (row, col) cell belongs to two cycles.
+    parallel_for_heuristic(
+        wire_copy_cycles.size(),
+        [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
+            for (size_t cycle_idx = start; cycle_idx < end; ++cycle_idx) {
+                const CyclicPermutation& cycle = wire_copy_cycles[cycle_idx];
+                const auto cycle_size = cycle.size();
+                if (cycle_size == 0) {
+                    continue;
+                }
 
-            const auto current_row = static_cast<ptrdiff_t>(current_node.gate_idx);
-            const auto current_col = current_node.wire_idx;
-            // Point current node to next node.
-            mapping.sigmas[current_col].row_idx[current_row] = next_node.gate_idx;
-            mapping.sigmas[current_col].col_idx[current_row] = static_cast<uint8_t>(next_node.wire_idx);
-        }
-    }
+                const cycle_node& first_node = cycle[0];
+                const cycle_node& last_node = cycle[cycle_size - 1];
+
+                const auto first_row = static_cast<ptrdiff_t>(first_node.gate_idx);
+                const auto first_col = first_node.wire_idx;
+                const auto last_row = static_cast<ptrdiff_t>(last_node.gate_idx);
+                const auto last_col = last_node.wire_idx;
+
+                // First node: id gets tagged with the cycle's variable tag
+                mapping.ids[first_col].is_tag[first_row] = true;
+                mapping.ids[first_col].row_idx[first_row] = real_variable_tags[cycle_idx];
+
+                // Last node: sigma gets tagged and points to tau(tag) instead of wrapping to first node
+                mapping.sigmas[last_col].is_tag[last_row] = true;
+                mapping.sigmas[last_col].row_idx[last_row] =
+                    circuit_constructor.tau().at(real_variable_tags[cycle_idx]);
+
+                // All nodes except the last: sigma points to the next node in the cycle
+                for (size_t node_idx = 0; node_idx + 1 < cycle_size; ++node_idx) {
+                    const cycle_node& current_node = cycle[node_idx];
+                    const cycle_node& next_node = cycle[node_idx + 1];
+
+                    const auto current_row = static_cast<ptrdiff_t>(current_node.gate_idx);
+                    const auto current_col = current_node.wire_idx;
+                    // Point current node to next node.
+                    mapping.sigmas[current_col].row_idx[current_row] = next_node.gate_idx;
+                    mapping.sigmas[current_col].col_idx[current_row] = static_cast<uint8_t>(next_node.wire_idx);
+                }
+            }
+        },
+        thread_heuristics::FF_COPY_COST * 5);
 
     // Add information about public inputs so that the cycles can be altered later; See the construction of the
     // permutation polynomials for details. This _only_ effects sigma_0, the 0th sigma polynomial, as the structure of
diff --git a/barretenberg/cpp/src/barretenberg/trace_to_polynomials/trace_to_polynomials.cpp b/barretenberg/cpp/src/barretenberg/trace_to_polynomials/trace_to_polynomials.cpp
index f781aa22f8df..1d7c12266f3a 100644
--- a/barretenberg/cpp/src/barretenberg/trace_to_polynomials/trace_to_polynomials.cpp
+++ b/barretenberg/cpp/src/barretenberg/trace_to_polynomials/trace_to_polynomials.cpp
@@ -5,6 +5,7 @@
 // =====================
 
 #include "trace_to_polynomials.hpp"
+#include "barretenberg/common/thread.hpp"
 #include "barretenberg/constants.hpp"
 #include "barretenberg/ext/starknet/flavor/ultra_starknet_flavor.hpp"
 #include "barretenberg/ext/starknet/flavor/ultra_starknet_zk_flavor.hpp"
@@ -51,40 +52,67 @@ std::vector<CyclicPermutation> TraceToPolynomials<Flavor>::populate_wires_and_se
     RefArray<Polynomial, NUM_WIRES> wires = polynomials.get_wires();
     auto selectors = polynomials.get_selectors();
 
+    // Per-chunk staging (indexed by chunk_index) of copy cycle entries; avoids races on copy_cycles[real_var_idx]
+    using CycleEntry = std::pair<uint32_t, cycle_node>;
+    const size_t num_threads = get_num_cpus();
+    std::vector<std::vector<CycleEntry>> thread_cycle_entries(num_threads);
+
     // For each block in the trace, populate wire polys, copy cycles and selector polys
     for (auto& block : builder.blocks.get()) {
         const uint32_t offset = block.trace_offset();
         const uint32_t block_size = static_cast<uint32_t>(block.size());
 
-        // Update wire polynomials and copy cycles
-        // NB: The order of row/column loops is arbitrary but needs to be row/column to match old copy_cycle code
+        // Parallel wire writes + copy cycle staging (row-major: row first, then wire, as in the serial loop).
+        // Wire writes are safe: each (wire_idx, trace_row_idx) is unique within a block.
+        // Copy cycles are staged per-chunk and merged per-block to preserve the original
+        // cross-block ordering (which determines sigma/id polynomials and thus the VK).
         {
             BB_BENCH_NAME("populating wires and copy_cycles");
 
-            for (uint32_t block_row_idx = 0; block_row_idx < block_size; ++block_row_idx) {
-                for (uint32_t wire_idx = 0; wire_idx < NUM_WIRES; ++wire_idx) {
-                    uint32_t var_idx = block.wires[wire_idx][block_row_idx]; // an index into the variables array
-                    // Use .at() for bounds checking - fuzzer found OOB with malformed ACIR
-                    uint32_t real_var_idx = builder.real_variable_index.at(var_idx);
-                    uint32_t trace_row_idx = block_row_idx + offset;
-                    // Insert the real witness values from this block into the wire polys at the correct offset
-                    wires[wire_idx].at(trace_row_idx) = builder.get_variable(var_idx);
-                    // Add the address of the witness value to its corresponding copy cycle
-                    // Note that the copy_cycles are indexed by real_variable_indices.
-                    copy_cycles[real_var_idx].emplace_back(cycle_node{ wire_idx, trace_row_idx });
+            for (auto& staged : thread_cycle_entries) {
+                staged.clear();
+            }
+
+            parallel_for_heuristic(
+                block_size,
+                [&](size_t start, size_t end, size_t chunk_index) {
+                    auto& staged = thread_cycle_entries[chunk_index];
+                    for (size_t row = start; row < end; ++row) {
+                        uint32_t block_row_idx = static_cast<uint32_t>(row);
+                        for (uint32_t wire_idx = 0; wire_idx < NUM_WIRES; ++wire_idx) {
+                            uint32_t var_idx = block.wires[wire_idx][block_row_idx];
+                            uint32_t real_var_idx = builder.real_variable_index.at(var_idx); // bounds-checked lookup
+                            uint32_t trace_row_idx = block_row_idx + offset;
+                            wires[wire_idx].at(trace_row_idx) = builder.get_variable(var_idx);
+                            staged.emplace_back(real_var_idx, cycle_node{ wire_idx, trace_row_idx });
+                        }
+                    }
+                },
+                thread_heuristics::FF_COPY_COST * NUM_WIRES * 4);
+
+            // Merge in chunk_index order. NOTE(review): assumes chunk i covers rows before chunk i+1 - confirm
+            for (auto& staged : thread_cycle_entries) {
+                for (auto& [real_var_idx, node] : staged) {
+                    copy_cycles[real_var_idx].emplace_back(node);
                 }
             }
         }
 
-        RefVector<Selector<FF>> block_selectors = block.get_selectors();
-        // Insert the selector values for this block into the selector polynomials at the correct offset
-        // TODO(https://github.com/AztecProtocol/barretenberg/issues/398): implicit arithmetization/flavor consistency
-        for (size_t selector_idx = 0; selector_idx < block_selectors.size(); selector_idx++) {
-            auto& selector = block_selectors[selector_idx];
-            for (size_t row_idx = 0; row_idx < block_size; ++row_idx) {
-                size_t trace_row_idx = row_idx + offset;
-                selectors[selector_idx].set_if_valid_index(trace_row_idx, selector[row_idx]);
-            }
+        // Parallel selector writes (each trace_row_idx is unique within a block)
+        {
+            RefVector<Selector<FF>> block_selectors = block.get_selectors();
+            parallel_for_heuristic(
+                block_size,
+                [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
+                    for (size_t row_idx = start; row_idx < end; ++row_idx) {
+                        size_t trace_row_idx = row_idx + offset;
+                        for (size_t selector_idx = 0; selector_idx < block_selectors.size(); selector_idx++) {
+                            selectors[selector_idx].set_if_valid_index(trace_row_idx,
+                                                                       block_selectors[selector_idx][row_idx]);
+                        }
+                    }
+                },
+                thread_heuristics::FF_COPY_COST * block_selectors.size());
         }
     }
 
diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp b/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp
index 65608330b60b..8f9d107b43d4 100644
--- a/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp
+++ b/barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp
@@ -6,6 +6,7 @@
 
 #include "barretenberg/ultra_honk/oink_prover.hpp"
 #include "barretenberg/common/bb_bench.hpp"
+#include "barretenberg/common/thread.hpp"
 #include "barretenberg/flavor/mega_avm_flavor.hpp"
 #include "barretenberg/honk/library/grand_product_delta.hpp"
 #include "barretenberg/honk/library/grand_product_library.hpp"
@@ -219,20 +220,30 @@ template <typename Flavor> void OinkProver<Flavor>::add_ram_rom_memory_records_t
     const auto& eta_two = instance.relation_parameters.eta_two;
     const auto& eta_three = instance.relation_parameters.eta_three;
 
-    // Compute read record values
-    for (const auto& gate_idx : instance.memory_read_records) {
-        wires[3].at(gate_idx) = wires[2][gate_idx] * eta_three;
-        wires[3].at(gate_idx) += wires[1][gate_idx] * eta_two;
-        wires[3].at(gate_idx) += wires[0][gate_idx] * eta;
-    }
-
-    // Compute write record values
-    for (const auto& gate_idx : instance.memory_write_records) {
-        wires[3].at(gate_idx) = wires[2][gate_idx] * eta_three;
-        wires[3].at(gate_idx) += wires[1][gate_idx] * eta_two;
-        wires[3].at(gate_idx) += wires[0][gate_idx] * eta;
-        wires[3].at(gate_idx) += 1;
-    }
+    // Read records: wires[3] = wires[2]*eta_three + wires[1]*eta_two + wires[0]*eta (assumes unique gate_idx)
+    const auto& read_records = instance.memory_read_records;
+    parallel_for_heuristic(
+        read_records.size(),
+        [&](size_t i) {
+            const auto gate_idx = read_records[i];
+            wires[3].at(gate_idx) = wires[2][gate_idx] * eta_three;
+            wires[3].at(gate_idx) += wires[1][gate_idx] * eta_two;
+            wires[3].at(gate_idx) += wires[0][gate_idx] * eta;
+        },
+        thread_heuristics::FF_MULTIPLICATION_COST * 3);
+
+    // Write records: same combination plus 1; parallel writes assume each gate_idx occurs once
+    const auto& write_records = instance.memory_write_records;
+    parallel_for_heuristic(
+        write_records.size(),
+        [&](size_t i) {
+            const auto gate_idx = write_records[i];
+            wires[3].at(gate_idx) = wires[2][gate_idx] * eta_three;
+            wires[3].at(gate_idx) += wires[1][gate_idx] * eta_two;
+            wires[3].at(gate_idx) += wires[0][gate_idx] * eta;
+            wires[3].at(gate_idx) += 1;
+        },
+        thread_heuristics::FF_MULTIPLICATION_COST * 3);
 }
 
 /**
diff --git a/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp b/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp
index bdbefac3fac6..c4e00d65df91 100644
--- a/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp
+++ b/barretenberg/cpp/src/barretenberg/ultra_honk/prover_instance.cpp
@@ -8,6 +8,7 @@
 #include "barretenberg/common/assert.hpp"
 #include "barretenberg/common/bb_bench.hpp"
 #include "barretenberg/common/log.hpp"
+#include "barretenberg/common/thread.hpp"
 #include "barretenberg/common/throw_or_abort.hpp"
 #include "barretenberg/flavor/mega_avm_flavor.hpp"
 #include "barretenberg/honk/composer/composer_lib.hpp"
@@ -303,28 +304,35 @@ void ProverInstance_<Flavor>::construct_databus_polynomials(Circuit& circuit)
 
     // Note: Databus columns start from index 0. If this ever changes, make sure to also update the active range
     // construction in ExecutionTraceUsageTracker::update(). We do not utilize a zero row for databus columns.
-    for (size_t idx = 0; idx < calldata.size(); ++idx) {
-        calldata_poly.at(idx) = circuit.get_variable(calldata[idx]);        // calldata values
-        calldata_read_counts.at(idx) = calldata.get_read_count(idx);        // read counts
-        calldata_read_tags.at(idx) = calldata_read_counts[idx] > 0 ? 1 : 0; // has row been read or not
-    }
-    for (size_t idx = 0; idx < secondary_calldata.size(); ++idx) {
-        secondary_calldata_poly.at(idx) = circuit.get_variable(secondary_calldata[idx]); // secondary_calldata values
-        secondary_calldata_read_counts.at(idx) = secondary_calldata.get_read_count(idx); // read counts
-        secondary_calldata_read_tags.at(idx) =
-            secondary_calldata_read_counts[idx] > 0 ? 1 : 0; // has row been read or not
-    }
-    for (size_t idx = 0; idx < return_data.size(); ++idx) {
-        return_data_poly.at(idx) = circuit.get_variable(return_data[idx]);        // return data values
-        return_data_read_counts.at(idx) = return_data.get_read_count(idx);        // read counts
-        return_data_read_tags.at(idx) = return_data_read_counts[idx] > 0 ? 1 : 0; // has row been read or not
-    }
+    // Columns are filled one after another; within a column each row idx is written via parallel_for_heuristic.
+    parallel_for_heuristic(
+        calldata.size(),
+        [&](size_t idx) {
+            calldata_poly.at(idx) = circuit.get_variable(calldata[idx]);
+            calldata_read_counts.at(idx) = calldata.get_read_count(idx);
+            calldata_read_tags.at(idx) = calldata_read_counts[idx] > 0 ? 1 : 0;
+        },
+        thread_heuristics::FF_COPY_COST * 3);
+    parallel_for_heuristic(
+        secondary_calldata.size(),
+        [&](size_t idx) {
+            secondary_calldata_poly.at(idx) = circuit.get_variable(secondary_calldata[idx]);
+            secondary_calldata_read_counts.at(idx) = secondary_calldata.get_read_count(idx);
+            secondary_calldata_read_tags.at(idx) = secondary_calldata_read_counts[idx] > 0 ? 1 : 0;
+        },
+        thread_heuristics::FF_COPY_COST * 3);
+    parallel_for_heuristic(
+        return_data.size(),
+        [&](size_t idx) {
+            return_data_poly.at(idx) = circuit.get_variable(return_data[idx]);
+            return_data_read_counts.at(idx) = return_data.get_read_count(idx);
+            return_data_read_tags.at(idx) = return_data_read_counts[idx] > 0 ? 1 : 0;
+        },
+        thread_heuristics::FF_COPY_COST * 3);
 
     auto& databus_id = polynomials.databus_id;
     // Compute a simple identity polynomial for use in the databus lookup argument
-    for (size_t i = 0; i < databus_id.size(); ++i) {
-        databus_id.at(i) = i;
-    }
+    parallel_for_heuristic(databus_id.size(), [&](size_t i) { databus_id.at(i) = i; }, thread_heuristics::FF_COPY_COST);
 }
 
 /**