2525
2626int main () {
2727 const uint32_t num_cores = snrt_cluster_core_num ();
28+ const uint32_t num_tiles = snrt_cluster_tile_num ();
2829 const uint32_t cid = snrt_cluster_core_idx ();
30+ const uint32_t tid = snrt_cluster_tile_idx ();
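31+ // core id within a tile (0-3) -- NOTE(review): the expression below only yields this when num_tiles == num_cores_per_tile; confirm whether it should subtract tid * num_cores_per_tile instead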
32+ const uint32_t cid_tile = cid - tid * num_tiles ;
2933
30- const int measure_iter = 3 ;
31-
32- // Byte-level interleaving for DRAM
33- // Default setting is 1024b (128 Byte)
34- // This is used to ensure we can utilze all four channels to DRAM
35- const uint32_t Interleave = 512 ;
36- const uint32_t max_vlen = 512 ;
37- // Calculate the best lmul setting for current configuration
38- const uint32_t lmul = Interleave * 8 / max_vlen ;
39-
34+ const uint32_t num_cores_per_tile = num_cores / num_tiles ;
4035
41- // Each round we can calculate Interleave/4 32b-elements
42- const uint32_t elem_per_round = Interleave * num_cores / 4 ;
43- // how many rounds do we need to finish executing?
44- const uint32_t rounds = (dotp_l .M > elem_per_round ) ? ((dotp_l .M + elem_per_round - 1 ) / elem_per_round ) : 1 ;
45-
46- const uint32_t dim = elem_per_round / num_cores ;
47-
48- uint32_t offset = 31 - __builtin_clz (dim * sizeof (float ));
36+ const int measure_iter = 3 ;
4937
38+ // Here we target to reduce the remote access.
39+ // We want to keep the data fully interleaved on L1
40+ // Therefore, give it a small value so that it goes with the default minimum
41+ // => interleave with cacheline width
5042 if (cid == 0 ) {
5143 // Set xbar policy
52- l1d_xbar_config (offset );
44+ l1d_xbar_config (1 );
5345 // Initialize the cache
5446 l1d_init (0 );
47+ }
48+
49+ // hardcode for now the cacheline width and number of cache per tile
50+ // TODO: correctly pass in the info from hardware configuration
51+ const uint32_t cacheline = 512 ;
52+ const uint32_t num_cache_per_tile = 4 ;
53+
54+ // This is the max length each core can work continuously without a break (in bits)
55+ const uint32_t data_len_per_tile = cacheline * num_cache_per_tile ;
56+ // This is the max length each core can work continuously without a break (in elem)
57+ const uint32_t dim = data_len_per_tile / 32 ;
58+ // This is the distance each core within a tile needs to jump after one iteration (in elem)
59+ // Also the dimension each core will work on in one large iteration
60+ const uint32_t tile_offset = num_tiles * dim ;
61+ // This is the distance each core needs to jump after one iteration (in elem)
62+ const uint32_t offset = tile_offset * num_cores_per_tile ;
63+ // Max hardware vlen the core support
64+ const uint32_t max_vlen = 512 ;
65+ // Which lmul setting can we use for the kernel?
66+ const uint32_t lmul = data_len_per_tile / max_vlen ;
67+ // This is the number of large iterations needed for execution
68+ const uint32_t rounds = dotp_l .M / offset ;
69+
70+ if (cid == 0 ) {
71+ if (rounds < 1 ) {
72+ // The problem size is far too small; it does not fit this algorithm
73+ printf ("FATAL: Number of elements too small!\n" );
74+ } else {
75+ printf ("round:%u, lmul:%u, dim:%u\n" , rounds , lmul , dim );
76+ }
5577
56- printf ("round:%u, lmul:%u, dim:%u\n" , rounds , lmul , dim );
78+ if (lmul > 8 ) {
79+ printf ("FATAL: Not yet support for long case!\n" );
80+ }
5781 }
5882
5983 snrt_cluster_hw_barrier ();
@@ -62,9 +86,10 @@ int main() {
6286 uint32_t timer = (uint32_t )-1 ;
6387 uint32_t timer_tmp , timer_iter1 ;
6488
89+ // Calculate the starting points for each core
90+ float * a_int = dotp_A_dram + cid_tile * tile_offset + tid * dim ;
91+ float * b_int = dotp_B_dram + cid_tile * tile_offset + tid * dim ;
6592
66- float * a_int = dotp_A_dram + dim * cid ;
67- float * b_int = dotp_B_dram + dim * cid ;
6893
6994 for (int iter = 0 ; iter < measure_iter ; iter ++ ) {
7095 // Start dump
@@ -80,13 +105,13 @@ int main() {
80105 float acc ;
81106
82107 if (lmul >= 8 )
83- acc = fdotp_v32b_lmul8 (a_int , b_int , elem_per_round , dim , rounds );
108+ acc = fdotp_v32b_lmul8 (a_int , b_int , offset , dim , rounds );
84109 else if (lmul >= 4 )
85- acc = fdotp_v32b_lmul4 (a_int , b_int , elem_per_round , dim , rounds );
110+ acc = fdotp_v32b_lmul4 (a_int , b_int , offset , dim , rounds );
86111 else if (lmul >= 2 )
87- acc = fdotp_v32b_lmul2 (a_int , b_int , elem_per_round , dim , rounds );
112+ acc = fdotp_v32b_lmul2 (a_int , b_int , offset , dim , rounds );
88113 else if (lmul >= 1 )
89- acc = fdotp_v32b_lmul1 (a_int , b_int , elem_per_round , dim , rounds );
114+ acc = fdotp_v32b_lmul1 (a_int , b_int , offset , dim , rounds );
90115 else
91116 return 0 ;
92117
0 commit comments