Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 36 additions & 18 deletions barretenberg/cpp/src/barretenberg/honk/composer/composer_lib.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#pragma once
#include "barretenberg/common/ref_array.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/flavor/flavor.hpp"
#include "barretenberg/stdlib_circuit_builders/plookup_tables/types.hpp"

Expand All @@ -24,16 +25,27 @@ template <typename Flavor>
void construct_lookup_table_polynomials(const RefArray<typename Flavor::Polynomial, 4>& table_polynomials,
const typename Flavor::CircuitBuilder& circuit)
{
// Pre-compute cumulative offsets, then process tables in parallel (disjoint offset ranges)
const auto& tables = circuit.get_lookup_tables();
const size_t num_tables = tables.size();

std::vector<size_t> table_offsets(num_tables);
size_t offset = 0;
for (const auto& table : circuit.get_lookup_tables()) {
for (size_t i = 0; i < num_tables; i++) {
table_offsets[i] = offset;
offset += tables[i].size();
}

parallel_for(num_tables, [&](size_t table_idx) {
const auto& table = tables[table_idx];
size_t tbl_offset = table_offsets[table_idx];
for (size_t i = 0; i < table.size(); ++i) {
table_polynomials[0].at(offset) = table.column_1[i];
table_polynomials[1].at(offset) = table.column_2[i];
table_polynomials[2].at(offset) = table.column_3[i];
table_polynomials[3].at(offset) = table.table_index;
offset++;
table_polynomials[0].at(tbl_offset + i) = table.column_1[i];
table_polynomials[1].at(tbl_offset + i) = table.column_2[i];
table_polynomials[2].at(tbl_offset + i) = table.column_3[i];
table_polynomials[3].at(tbl_offset + i) = table.table_index;
}
}
});
}

/**
Expand All @@ -48,25 +60,31 @@ void construct_lookup_read_counts(typename Flavor::Polynomial& read_counts,
typename Flavor::Polynomial& read_tags,
typename Flavor::CircuitBuilder& circuit)
{
// loop over all tables used in the circuit; each table contains data about the lookups made on it
// Process each table independently in parallel (each table writes to disjoint offset ranges)
auto& tables = circuit.get_lookup_tables();
const size_t num_tables = tables.size();

// Pre-compute cumulative offsets (sequential, trivially fast)
std::vector<size_t> table_offsets(num_tables);
size_t table_offset = 0;
for (auto& table : circuit.get_lookup_tables()) {
for (size_t i = 0; i < num_tables; i++) {
table_offsets[i] = table_offset;
table_offset += tables[i].size();
}

// Process each table independently
parallel_for(num_tables, [&](size_t table_idx) {
auto& table = tables[table_idx];
table.initialize_index_map();

for (auto& gate_data : table.lookup_gates) {
// convert lookup gate data to an array of three field elements, one for each of the 3 columns
auto table_entry = gate_data.to_table_components(table.use_twin_keys);

// find the index of the entry in the table
auto index_in_table = table.index_map[table_entry];

// increment the read count at the corresponding index in the full polynomial
size_t index_in_poly = table_offset + index_in_table;
size_t index_in_poly = table_offsets[table_idx] + index_in_table;
read_counts.at(index_in_poly)++;
read_tags.at(index_in_poly) = 1; // tag is 1 if entry has been read 1 or more times
read_tags.at(index_in_poly) = 1;
}
table_offset += table.size(); // set the offset of the next table within the polynomials
}
});
}

} // namespace bb
79 changes: 42 additions & 37 deletions barretenberg/cpp/src/barretenberg/honk/composer/permutation_lib.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include "barretenberg/common/assert.hpp"
#include "barretenberg/common/ref_span.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/flavor/flavor.hpp"
#include "barretenberg/polynomials/polynomial.hpp"

Expand Down Expand Up @@ -131,44 +132,48 @@ PermutationMapping<Flavor::NUM_WIRES> compute_permutation_mapping(
// Represents the idx of a variable in circuit_constructor.variables
std::span<const uint32_t> real_variable_tags = circuit_constructor.real_variable_tags;

// Go through each cycle
for (size_t cycle_idx = 0; cycle_idx < wire_copy_cycles.size(); ++cycle_idx) {
// We go through the cycle and fill-out/modify `mapping`. Following the generalized permutation algorithm, we
// take separate care of first/last node handling.
const CyclicPermutation& cycle = wire_copy_cycles[cycle_idx];
const auto cycle_size = cycle.size();
if (cycle_size == 0) {
continue;
}

const cycle_node& first_node = cycle[0];
const cycle_node& last_node = cycle[cycle_size - 1];

const auto first_row = static_cast<ptrdiff_t>(first_node.gate_idx);
const auto first_col = first_node.wire_idx;
const auto last_row = static_cast<ptrdiff_t>(last_node.gate_idx);
const auto last_col = last_node.wire_idx;

// First node: id gets tagged with the cycle's variable tag
mapping.ids[first_col].is_tag[first_row] = true;
mapping.ids[first_col].row_idx[first_row] = real_variable_tags[cycle_idx];

// Last node: sigma gets tagged and points to tau(tag) instead of wrapping to first node
mapping.sigmas[last_col].is_tag[last_row] = true;
mapping.sigmas[last_col].row_idx[last_row] = circuit_constructor.tau().at(real_variable_tags[cycle_idx]);

// All nodes except the last: sigma points to the next node in the cycle
for (size_t node_idx = 0; node_idx + 1 < cycle_size; ++node_idx) {
const cycle_node& current_node = cycle[node_idx];
const cycle_node& next_node = cycle[node_idx + 1];
// Go through each cycle (parallelizable: each cycle writes to disjoint positions)
parallel_for_heuristic(
wire_copy_cycles.size(),
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
for (size_t cycle_idx = start; cycle_idx < end; ++cycle_idx) {
const CyclicPermutation& cycle = wire_copy_cycles[cycle_idx];
const auto cycle_size = cycle.size();
if (cycle_size == 0) {
continue;
}

const auto current_row = static_cast<ptrdiff_t>(current_node.gate_idx);
const auto current_col = current_node.wire_idx;
// Point current node to next node.
mapping.sigmas[current_col].row_idx[current_row] = next_node.gate_idx;
mapping.sigmas[current_col].col_idx[current_row] = static_cast<uint8_t>(next_node.wire_idx);
}
}
const cycle_node& first_node = cycle[0];
const cycle_node& last_node = cycle[cycle_size - 1];

const auto first_row = static_cast<ptrdiff_t>(first_node.gate_idx);
const auto first_col = first_node.wire_idx;
const auto last_row = static_cast<ptrdiff_t>(last_node.gate_idx);
const auto last_col = last_node.wire_idx;

// First node: id gets tagged with the cycle's variable tag
mapping.ids[first_col].is_tag[first_row] = true;
mapping.ids[first_col].row_idx[first_row] = real_variable_tags[cycle_idx];

// Last node: sigma gets tagged and points to tau(tag) instead of wrapping to first node
mapping.sigmas[last_col].is_tag[last_row] = true;
mapping.sigmas[last_col].row_idx[last_row] =
circuit_constructor.tau().at(real_variable_tags[cycle_idx]);

// All nodes except the last: sigma points to the next node in the cycle
for (size_t node_idx = 0; node_idx + 1 < cycle_size; ++node_idx) {
const cycle_node& current_node = cycle[node_idx];
const cycle_node& next_node = cycle[node_idx + 1];

const auto current_row = static_cast<ptrdiff_t>(current_node.gate_idx);
const auto current_col = current_node.wire_idx;
// Point current node to next node.
mapping.sigmas[current_col].row_idx[current_row] = next_node.gate_idx;
mapping.sigmas[current_col].col_idx[current_row] = static_cast<uint8_t>(next_node.wire_idx);
}
}
},
thread_heuristics::FF_COPY_COST * 5);

// Add information about public inputs so that the cycles can be altered later; See the construction of the
// permutation polynomials for details. This _only_ affects sigma_0, the 0th sigma polynomial, as the structure of
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// =====================

#include "trace_to_polynomials.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/constants.hpp"
#include "barretenberg/ext/starknet/flavor/ultra_starknet_flavor.hpp"
#include "barretenberg/ext/starknet/flavor/ultra_starknet_zk_flavor.hpp"
Expand Down Expand Up @@ -51,40 +52,67 @@ std::vector<CyclicPermutation> TraceToPolynomials<Flavor>::populate_wires_and_se
RefArray<Polynomial, NUM_WIRES> wires = polynomials.get_wires();
auto selectors = polynomials.get_selectors();

// Per-thread staging for copy cycle entries to avoid races on copy_cycles[real_var_idx]
using CycleEntry = std::pair<uint32_t, cycle_node>;
const size_t num_threads = get_num_cpus();
std::vector<std::vector<CycleEntry>> thread_cycle_entries(num_threads);

// For each block in the trace, populate wire polys, copy cycles and selector polys
for (auto& block : builder.blocks.get()) {
const uint32_t offset = block.trace_offset();
const uint32_t block_size = static_cast<uint32_t>(block.size());

// Update wire polynomials and copy cycles
// NB: The order of row/column loops is arbitrary but needs to be row/column to match old copy_cycle code
// Parallel wire writes + copy cycle staging
// Wire writes are safe: each (wire_idx, trace_row_idx) is unique within a block.
// Copy cycles are staged per-thread and merged per-block to preserve the original
// cross-block ordering (which determines sigma/id polynomials and thus the VK).
{
BB_BENCH_NAME("populating wires and copy_cycles");

for (uint32_t block_row_idx = 0; block_row_idx < block_size; ++block_row_idx) {
for (uint32_t wire_idx = 0; wire_idx < NUM_WIRES; ++wire_idx) {
uint32_t var_idx = block.wires[wire_idx][block_row_idx]; // an index into the variables array
// Use .at() for bounds checking - fuzzer found OOB with malformed ACIR
uint32_t real_var_idx = builder.real_variable_index.at(var_idx);
uint32_t trace_row_idx = block_row_idx + offset;
// Insert the real witness values from this block into the wire polys at the correct offset
wires[wire_idx].at(trace_row_idx) = builder.get_variable(var_idx);
// Add the address of the witness value to its corresponding copy cycle
// Note that the copy_cycles are indexed by real_variable_indices.
copy_cycles[real_var_idx].emplace_back(cycle_node{ wire_idx, trace_row_idx });
for (auto& staged : thread_cycle_entries) {
staged.clear();
}

parallel_for_heuristic(
block_size,
[&](size_t start, size_t end, size_t chunk_index) {
auto& staged = thread_cycle_entries[chunk_index];
for (size_t row = start; row < end; ++row) {
uint32_t block_row_idx = static_cast<uint32_t>(row);
for (uint32_t wire_idx = 0; wire_idx < NUM_WIRES; ++wire_idx) {
uint32_t var_idx = block.wires[wire_idx][block_row_idx];
uint32_t real_var_idx = builder.real_variable_index.at(var_idx);
uint32_t trace_row_idx = block_row_idx + offset;
wires[wire_idx].at(trace_row_idx) = builder.get_variable(var_idx);
staged.emplace_back(real_var_idx, cycle_node{ wire_idx, trace_row_idx });
}
}
},
thread_heuristics::FF_COPY_COST * NUM_WIRES * 4);

// Merge this block's staged entries into copy_cycles (preserves block ordering)
for (auto& staged : thread_cycle_entries) {
for (auto& [real_var_idx, node] : staged) {
copy_cycles[real_var_idx].emplace_back(node);
}
}
}

RefVector<Selector<FF>> block_selectors = block.get_selectors();
// Insert the selector values for this block into the selector polynomials at the correct offset
// TODO(https://github.com/AztecProtocol/barretenberg/issues/398): implicit arithmetization/flavor consistency
for (size_t selector_idx = 0; selector_idx < block_selectors.size(); selector_idx++) {
auto& selector = block_selectors[selector_idx];
for (size_t row_idx = 0; row_idx < block_size; ++row_idx) {
size_t trace_row_idx = row_idx + offset;
selectors[selector_idx].set_if_valid_index(trace_row_idx, selector[row_idx]);
}
// Parallel selector writes (each trace_row_idx is unique within a block)
{
RefVector<Selector<FF>> block_selectors = block.get_selectors();
parallel_for_heuristic(
block_size,
[&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
for (size_t row_idx = start; row_idx < end; ++row_idx) {
size_t trace_row_idx = row_idx + offset;
for (size_t selector_idx = 0; selector_idx < block_selectors.size(); selector_idx++) {
selectors[selector_idx].set_if_valid_index(trace_row_idx,
block_selectors[selector_idx][row_idx]);
}
}
},
thread_heuristics::FF_COPY_COST * block_selectors.size());
}
}

Expand Down
39 changes: 25 additions & 14 deletions barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "barretenberg/ultra_honk/oink_prover.hpp"
#include "barretenberg/common/bb_bench.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/flavor/mega_avm_flavor.hpp"
#include "barretenberg/honk/library/grand_product_delta.hpp"
#include "barretenberg/honk/library/grand_product_library.hpp"
Expand Down Expand Up @@ -219,20 +220,30 @@ template <typename Flavor> void OinkProver<Flavor>::add_ram_rom_memory_records_t
const auto& eta_two = instance.relation_parameters.eta_two;
const auto& eta_three = instance.relation_parameters.eta_three;

// Compute read record values
for (const auto& gate_idx : instance.memory_read_records) {
wires[3].at(gate_idx) = wires[2][gate_idx] * eta_three;
wires[3].at(gate_idx) += wires[1][gate_idx] * eta_two;
wires[3].at(gate_idx) += wires[0][gate_idx] * eta;
}

// Compute write record values
for (const auto& gate_idx : instance.memory_write_records) {
wires[3].at(gate_idx) = wires[2][gate_idx] * eta_three;
wires[3].at(gate_idx) += wires[1][gate_idx] * eta_two;
wires[3].at(gate_idx) += wires[0][gate_idx] * eta;
wires[3].at(gate_idx) += 1;
}
// Compute read record values (each gate_idx is unique — safe to parallelize)
const auto& read_records = instance.memory_read_records;
parallel_for_heuristic(
read_records.size(),
[&](size_t i) {
const auto gate_idx = read_records[i];
wires[3].at(gate_idx) = wires[2][gate_idx] * eta_three;
wires[3].at(gate_idx) += wires[1][gate_idx] * eta_two;
wires[3].at(gate_idx) += wires[0][gate_idx] * eta;
},
thread_heuristics::FF_MULTIPLICATION_COST * 3);

// Compute write record values (each gate_idx is unique — safe to parallelize)
const auto& write_records = instance.memory_write_records;
parallel_for_heuristic(
write_records.size(),
[&](size_t i) {
const auto gate_idx = write_records[i];
wires[3].at(gate_idx) = wires[2][gate_idx] * eta_three;
wires[3].at(gate_idx) += wires[1][gate_idx] * eta_two;
wires[3].at(gate_idx) += wires[0][gate_idx] * eta;
wires[3].at(gate_idx) += 1;
},
thread_heuristics::FF_MULTIPLICATION_COST * 3);
}

/**
Expand Down
Loading