Skip to content

Commit 495fa23

Browse files
committed
[SW] WIP: Update dotp algorithm for better performance in multi-tile configuration
1 parent e3f0def commit 495fa23

5 files changed

Lines changed: 62 additions & 35 deletions

File tree

sim/sim.mk

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ VSIM_FLAGS += -sv_lib $(SIM_DIR)/${DPI_LIB}/cachepool_dpi
2828
VSIM_FLAGS += -t 1ps
2929
VSIM_FLAGS += -voptargs=+acc
3030
VSIM_FLAGS += -suppress vsim-3999
31+
# Suppress the division-by-zero warning generated by the performance counter
32+
VSIM_FLAGS += -suppress vsim-8630
3133

3234
VLOG_FLAGS += -svinputport=compat
3335
VLOG_FLAGS += -override_timescale 1ns/1ps

software/tests/fdotp-32b/data/data_32768.h

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

software/tests/fdotp-32b/data/data_8192.h

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

software/tests/fdotp-32b/main.c

Lines changed: 51 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -25,35 +25,59 @@
2525

2626
int main() {
2727
const uint32_t num_cores = snrt_cluster_core_num();
28+
const uint32_t num_tiles = snrt_cluster_tile_num();
2829
const uint32_t cid = snrt_cluster_core_idx();
30+
const uint32_t tid = snrt_cluster_tile_idx();
31+
// core id within a tile (0-3)
32+
const uint32_t cid_tile = cid - tid * num_tiles;
2933

30-
const int measure_iter = 3;
31-
32-
// Byte-level interleaving for DRAM
33-
// Default setting is 1024b (128 Byte)
34-
// This is used to ensure we can utilze all four channels to DRAM
35-
const uint32_t Interleave = 512;
36-
const uint32_t max_vlen = 512;
37-
// Calculate the best lmul setting for current configuration
38-
const uint32_t lmul = Interleave * 8 / max_vlen;
39-
34+
const uint32_t num_cores_per_tile = num_cores / num_tiles;
4035

41-
// Each round we can calculate Interleave/4 32b-elements
42-
const uint32_t elem_per_round = Interleave * num_cores / 4;
43-
// how many rounds do we need to finish executing?
44-
const uint32_t rounds = (dotp_l.M > elem_per_round) ? ((dotp_l.M + elem_per_round - 1) / elem_per_round) : 1;
45-
46-
const uint32_t dim = elem_per_round / num_cores;
47-
48-
uint32_t offset = 31 - __builtin_clz(dim * sizeof(float));
36+
const int measure_iter = 3;
4937

38+
// Here we target to reduce the remote access.
39+
// We want to keep the data fully interleaved on L1
40+
// Therefore, give it a small value so that it goes with the default minimum
41+
// => interleave with cacheline width
5042
if (cid == 0) {
5143
// Set xbar policy
52-
l1d_xbar_config(offset);
44+
l1d_xbar_config(1);
5345
// Initialize the cache
5446
l1d_init(0);
47+
}
48+
49+
// For now, hardcode the cacheline width and the number of caches per tile
50+
// TODO: correctly pass in the info from hardware configuration
51+
const uint32_t cacheline = 512;
52+
const uint32_t num_cache_per_tile = 4;
53+
54+
// This is the max length each core can work on continuously without a break (in bits)
55+
const uint32_t data_len_per_tile = cacheline * num_cache_per_tile;
56+
// This is the max length each core can work on continuously without a break (in elem)
57+
const uint32_t dim = data_len_per_tile / 32;
58+
// This is the distance each core within a tile needs to jump after one iteration (in elem)
59+
// Also the dimension each core will work on in one large iteration
60+
const uint32_t tile_offset = num_tiles * dim;
61+
// This is the distance each core needs to jump after one iteration (in elem)
62+
const uint32_t offset = tile_offset * num_cores_per_tile;
63+
// Max hardware vlen the core supports
64+
const uint32_t max_vlen = 512;
65+
// Which lmul setting can we use for the kernel?
66+
const uint32_t lmul = data_len_per_tile / max_vlen;
67+
// This is the number of large iterations needed for execution
68+
const uint32_t rounds = dotp_l.M / offset;
69+
70+
if (cid == 0) {
71+
if (rounds < 1) {
72+
// The problem size is far too small; it does not fit this algorithm
73+
printf ("FATAL: Number of elements too small!\n");
74+
} else {
75+
printf ("round:%u, lmul:%u, dim:%u\n", rounds, lmul, dim);
76+
}
5577

56-
printf ("round:%u, lmul:%u, dim:%u\n", rounds, lmul, dim);
78+
if (lmul > 8) {
79+
printf ("FATAL: Not yet support for long case!\n");
80+
}
5781
}
5882

5983
snrt_cluster_hw_barrier();
@@ -62,9 +86,10 @@ int main() {
6286
uint32_t timer = (uint32_t)-1;
6387
uint32_t timer_tmp, timer_iter1;
6488

89+
// Calculate the starting points for each core
90+
float *a_int = dotp_A_dram + cid_tile * tile_offset + tid * dim;
91+
float *b_int = dotp_B_dram + cid_tile * tile_offset + tid * dim;
6592

66-
float *a_int = dotp_A_dram + dim * cid;
67-
float *b_int = dotp_B_dram + dim * cid;
6893

6994
for (int iter = 0; iter < measure_iter; iter ++) {
7095
// Start dump
@@ -80,13 +105,13 @@ int main() {
80105
float acc;
81106

82107
if (lmul >= 8)
83-
acc = fdotp_v32b_lmul8(a_int, b_int, elem_per_round, dim, rounds);
108+
acc = fdotp_v32b_lmul8(a_int, b_int, offset, dim, rounds);
84109
else if (lmul >= 4)
85-
acc = fdotp_v32b_lmul4(a_int, b_int, elem_per_round, dim, rounds);
110+
acc = fdotp_v32b_lmul4(a_int, b_int, offset, dim, rounds);
86111
else if (lmul >= 2)
87-
acc = fdotp_v32b_lmul2(a_int, b_int, elem_per_round, dim, rounds);
112+
acc = fdotp_v32b_lmul2(a_int, b_int, offset, dim, rounds);
88113
else if (lmul >= 1)
89-
acc = fdotp_v32b_lmul1(a_int, b_int, elem_per_round, dim, rounds);
114+
acc = fdotp_v32b_lmul1(a_int, b_int, offset, dim, rounds);
90115
else
91116
return 0;
92117

software/tests/fdotp-32b/script/gen_data.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def emit_dotp_layer(name="dotp", **kwargs):
6464

6565
layer_str = ""
6666
layer_str += '#include "layer.h"\n\n'
67-
layer_str += f"dotp_layer {name}_l __attribute__((section(\".pdcp_src\"))) = {{\n"
67+
layer_str += f"dotp_layer {name}_l __attribute__((section(\".data\"))) = {{\n"
6868
layer_str += f"\t.M = {m},\n"
6969
layer_str += f'\t.dtype = FP{kwargs["prec"]},\n'
7070
layer_str += "};\n\n\n"
@@ -74,12 +74,12 @@ def emit_dotp_layer(name="dotp", **kwargs):
7474
dtype = ctypes[str(kwargs["prec"])]
7575
if dtype != "char":
7676
layer_str += (
77-
f'static {dtype} {name}_A_dram [{m}] __attribute__((section(".data"))) = '
77+
f'static {dtype} {name}_A_dram [{m}] __attribute__((section(".pdcp_src"), used)) = '
7878
+ array_to_cstr(vec_A)
7979
+ ";\n\n\n"
8080
)
8181
layer_str += (
82-
f'static {dtype} {name}_B_dram [{m}] __attribute__((section(".data"))) = '
82+
f'static {dtype} {name}_B_dram [{m}] __attribute__((section(".pdcp_src"), used))= '
8383
+ array_to_cstr(vec_B)
8484
+ ";\n\n\n"
8585
)

0 commit comments

Comments
 (0)