From d7aeae89334eceb730ce2c3d19b557eae4d88877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gra=C5=BEvydas=20Ignotas?= Date: Tue, 5 May 2026 22:21:00 +0300 Subject: [PATCH 1/4] convert labels to local labels for arm/sgemm Non-local labels interfere with profiling. Same thing was done for arm64 in commit a0128aa489720ac2fd883dbeebfecffd4812ff99. --- kernel/arm/sgemm_kernel_4x4_vfpv3.S | 260 ++++++++++++++-------------- 1 file changed, 130 insertions(+), 130 deletions(-) diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 789643f566..407120e0e6 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -886,9 +886,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr J, N asrs J, J, #2 // J = J / 4 - ble sgemm_kernel_L2_BEGIN + ble .Lsgemm_kernel_L2_BEGIN -sgemm_kernel_L4_BEGIN: +.Lsgemm_kernel_L4_BEGIN: ldr CO1, C // CO1 = C ldr r4 , LDC @@ -902,19 +902,19 @@ sgemm_kernel_L4_BEGIN: -sgemm_kernel_L4_M4_BEGIN: +.Lsgemm_kernel_L4_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble sgemm_kernel_L4_M2_BEGIN + ble .Lsgemm_kernel_L4_M2_BEGIN -sgemm_kernel_L4_M4_20: +.Lsgemm_kernel_L4_M4_20: mov BO, BC asrs L , K1, #1 // L = L / 8 cmp L , #2 - blt sgemm_kernel_L4_M4_32 + blt .Lsgemm_kernel_L4_M4_32 @@ -922,81 +922,81 @@ sgemm_kernel_L4_M4_20: KERNEL4x4_M2 subs L, L, #2 - ble sgemm_kernel_L4_M4_22a + ble .Lsgemm_kernel_L4_M4_22a .align 5 -sgemm_kernel_L4_M4_22: +.Lsgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs L, L, #1 - bgt sgemm_kernel_L4_M4_22 + bgt .Lsgemm_kernel_L4_M4_22 -sgemm_kernel_L4_M4_22a: +.Lsgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_32: +.Lsgemm_kernel_L4_M4_32: tst L, #1 - ble sgemm_kernel_L4_M4_40 + ble .Lsgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_40: +.Lsgemm_kernel_L4_M4_40: INIT4x4 -sgemm_kernel_L4_M4_44: +.Lsgemm_kernel_L4_M4_44: ands L , K1, #1 // L = L % 8 - ble sgemm_kernel_L4_M4_100 + ble .Lsgemm_kernel_L4_M4_100 -sgemm_kernel_L4_M4_46: +.Lsgemm_kernel_L4_M4_46: KERNEL4x4_SUB subs L, L, #1 - bne sgemm_kernel_L4_M4_46 + bne .Lsgemm_kernel_L4_M4_46 -sgemm_kernel_L4_M4_100: +.Lsgemm_kernel_L4_M4_100: SAVE4x4 -sgemm_kernel_L4_M4_END: +.Lsgemm_kernel_L4_M4_END: subs I, I, #1 - bne sgemm_kernel_L4_M4_20 + bne .Lsgemm_kernel_L4_M4_20 -sgemm_kernel_L4_M2_BEGIN: +.Lsgemm_kernel_L4_M2_BEGIN: ldr I, M tst I , #3 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst I, #2 // I = I / 2 - ble sgemm_kernel_L4_M1_BEGIN + ble .Lsgemm_kernel_L4_M1_BEGIN -sgemm_kernel_L4_M2_20: +.Lsgemm_kernel_L4_M2_20: INIT2x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble sgemm_kernel_L4_M2_40 + ble .Lsgemm_kernel_L4_M2_40 -sgemm_kernel_L4_M2_22: +.Lsgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1009,42 +1009,42 @@ sgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs L, L, #1 - bgt sgemm_kernel_L4_M2_22 + bgt .Lsgemm_kernel_L4_M2_22 -sgemm_kernel_L4_M2_40: +.Lsgemm_kernel_L4_M2_40: ands L , K1, #7 // L = L % 8 - ble sgemm_kernel_L4_M2_100 + ble .Lsgemm_kernel_L4_M2_100 -sgemm_kernel_L4_M2_42: +.Lsgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs L, L, #1 - bgt sgemm_kernel_L4_M2_42 + bgt .Lsgemm_kernel_L4_M2_42 -sgemm_kernel_L4_M2_100: +.Lsgemm_kernel_L4_M2_100: SAVE2x4 -sgemm_kernel_L4_M2_END: +.Lsgemm_kernel_L4_M2_END: -sgemm_kernel_L4_M1_BEGIN: +.Lsgemm_kernel_L4_M1_BEGIN: tst I, #1 // I = I % 2 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END 
-sgemm_kernel_L4_M1_20: +.Lsgemm_kernel_L4_M1_20: INIT1x4 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble sgemm_kernel_L4_M1_40 + ble .Lsgemm_kernel_L4_M1_40 -sgemm_kernel_L4_M1_22: +.Lsgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1056,27 +1056,27 @@ sgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs L, L, #1 - bgt sgemm_kernel_L4_M1_22 + bgt .Lsgemm_kernel_L4_M1_22 -sgemm_kernel_L4_M1_40: +.Lsgemm_kernel_L4_M1_40: ands L , K1, #7 // L = L % 8 - ble sgemm_kernel_L4_M1_100 + ble .Lsgemm_kernel_L4_M1_100 -sgemm_kernel_L4_M1_42: +.Lsgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs L, L, #1 - bgt sgemm_kernel_L4_M1_42 + bgt .Lsgemm_kernel_L4_M1_42 -sgemm_kernel_L4_M1_100: +.Lsgemm_kernel_L4_M1_100: SAVE1x4 -sgemm_kernel_L4_END: +.Lsgemm_kernel_L4_END: mov r3, BC mov r4, K1 @@ -1085,20 +1085,20 @@ sgemm_kernel_L4_END: mov BC, r3 subs J , #1 // j-- - bgt sgemm_kernel_L4_BEGIN + bgt .Lsgemm_kernel_L4_BEGIN /*********************************************************************************************/ -sgemm_kernel_L2_BEGIN: +.Lsgemm_kernel_L2_BEGIN: ldr J , N tst J , #3 - ble sgemm_kernel_L999 + ble .Lsgemm_kernel_L999 tst J , #2 - ble sgemm_kernel_L1_BEGIN + ble .Lsgemm_kernel_L1_BEGIN ldr CO1, C // CO1 = C ldr r4 , LDC @@ -1113,22 +1113,22 @@ sgemm_kernel_L2_BEGIN: -sgemm_kernel_L2_M4_BEGIN: +.Lsgemm_kernel_L2_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble sgemm_kernel_L2_M2_BEGIN + ble .Lsgemm_kernel_L2_M2_BEGIN -sgemm_kernel_L2_M4_20: +.Lsgemm_kernel_L2_M4_20: INIT4x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble sgemm_kernel_L2_M4_40 + ble .Lsgemm_kernel_L2_M4_40 .align 5 -sgemm_kernel_L2_M4_22: +.Lsgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1140,49 +1140,49 @@ sgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs L, L, #1 - bgt sgemm_kernel_L2_M4_22 + bgt .Lsgemm_kernel_L2_M4_22 -sgemm_kernel_L2_M4_40: +.Lsgemm_kernel_L2_M4_40: ands L , K1, #7 // L = L % 8 - ble sgemm_kernel_L2_M4_100 + ble .Lsgemm_kernel_L2_M4_100 -sgemm_kernel_L2_M4_42: +.Lsgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs L, L, #1 - bgt sgemm_kernel_L2_M4_42 + bgt .Lsgemm_kernel_L2_M4_42 -sgemm_kernel_L2_M4_100: +.Lsgemm_kernel_L2_M4_100: SAVE4x2 -sgemm_kernel_L2_M4_END: +.Lsgemm_kernel_L2_M4_END: subs I, I, #1 - bgt sgemm_kernel_L2_M4_20 + bgt .Lsgemm_kernel_L2_M4_20 -sgemm_kernel_L2_M2_BEGIN: +.Lsgemm_kernel_L2_M2_BEGIN: ldr I, M tst I , #3 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst I, #2 // I = I / 2 - ble sgemm_kernel_L2_M1_BEGIN + ble .Lsgemm_kernel_L2_M1_BEGIN -sgemm_kernel_L2_M2_20: +.Lsgemm_kernel_L2_M2_20: INIT2x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble sgemm_kernel_L2_M2_40 + ble .Lsgemm_kernel_L2_M2_40 -sgemm_kernel_L2_M2_22: +.Lsgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1195,42 +1195,42 @@ sgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs L, L, #1 - bgt sgemm_kernel_L2_M2_22 + bgt .Lsgemm_kernel_L2_M2_22 -sgemm_kernel_L2_M2_40: +.Lsgemm_kernel_L2_M2_40: ands L , K1, #7 // L = L % 8 - ble sgemm_kernel_L2_M2_100 + ble .Lsgemm_kernel_L2_M2_100 -sgemm_kernel_L2_M2_42: +.Lsgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs L, L, #1 - bgt sgemm_kernel_L2_M2_42 + bgt .Lsgemm_kernel_L2_M2_42 -sgemm_kernel_L2_M2_100: +.Lsgemm_kernel_L2_M2_100: SAVE2x2 -sgemm_kernel_L2_M2_END: +.Lsgemm_kernel_L2_M2_END: -sgemm_kernel_L2_M1_BEGIN: +.Lsgemm_kernel_L2_M1_BEGIN: tst I, #1 // I = I % 2 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END -sgemm_kernel_L2_M1_20: +.Lsgemm_kernel_L2_M1_20: INIT1x2 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble sgemm_kernel_L2_M1_40 + ble .Lsgemm_kernel_L2_M1_40 
-sgemm_kernel_L2_M1_22: +.Lsgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1242,27 +1242,27 @@ sgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs L, L, #1 - bgt sgemm_kernel_L2_M1_22 + bgt .Lsgemm_kernel_L2_M1_22 -sgemm_kernel_L2_M1_40: +.Lsgemm_kernel_L2_M1_40: ands L , K1, #7 // L = L % 8 - ble sgemm_kernel_L2_M1_100 + ble .Lsgemm_kernel_L2_M1_100 -sgemm_kernel_L2_M1_42: +.Lsgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs L, L, #1 - bgt sgemm_kernel_L2_M1_42 + bgt .Lsgemm_kernel_L2_M1_42 -sgemm_kernel_L2_M1_100: +.Lsgemm_kernel_L2_M1_100: SAVE1x2 -sgemm_kernel_L2_END: +.Lsgemm_kernel_L2_END: mov r3, BC mov r4, K1 @@ -1272,11 +1272,11 @@ sgemm_kernel_L2_END: /*********************************************************************************************/ -sgemm_kernel_L1_BEGIN: +.Lsgemm_kernel_L1_BEGIN: ldr J , N tst J , #1 - ble sgemm_kernel_L999 + ble .Lsgemm_kernel_L999 ldr CO1, C // CO1 = C @@ -1291,22 +1291,22 @@ sgemm_kernel_L1_BEGIN: -sgemm_kernel_L1_M4_BEGIN: +.Lsgemm_kernel_L1_M4_BEGIN: ldr I, M asrs I, I, #2 // I = I / 4 - ble sgemm_kernel_L1_M2_BEGIN + ble .Lsgemm_kernel_L1_M2_BEGIN -sgemm_kernel_L1_M4_20: +.Lsgemm_kernel_L1_M4_20: INIT4x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble sgemm_kernel_L1_M4_40 + ble .Lsgemm_kernel_L1_M4_40 .align 5 -sgemm_kernel_L1_M4_22: +.Lsgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1318,49 +1318,49 @@ sgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs L, L, #1 - bgt sgemm_kernel_L1_M4_22 + bgt .Lsgemm_kernel_L1_M4_22 -sgemm_kernel_L1_M4_40: +.Lsgemm_kernel_L1_M4_40: ands L , K1, #7 // L = L % 8 - ble sgemm_kernel_L1_M4_100 + ble .Lsgemm_kernel_L1_M4_100 -sgemm_kernel_L1_M4_42: +.Lsgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs L, L, #1 - bgt sgemm_kernel_L1_M4_42 + bgt .Lsgemm_kernel_L1_M4_42 -sgemm_kernel_L1_M4_100: +.Lsgemm_kernel_L1_M4_100: SAVE4x1 -sgemm_kernel_L1_M4_END: +.Lsgemm_kernel_L1_M4_END: subs I, I, #1 - bgt sgemm_kernel_L1_M4_20 + bgt .Lsgemm_kernel_L1_M4_20 -sgemm_kernel_L1_M2_BEGIN: +.Lsgemm_kernel_L1_M2_BEGIN: ldr I, M tst I , #3 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst I, #2 // I = I / 2 - ble sgemm_kernel_L1_M1_BEGIN + ble .Lsgemm_kernel_L1_M1_BEGIN -sgemm_kernel_L1_M2_20: +.Lsgemm_kernel_L1_M2_20: INIT2x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble sgemm_kernel_L1_M2_40 + ble .Lsgemm_kernel_L1_M2_40 -sgemm_kernel_L1_M2_22: +.Lsgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1373,42 +1373,42 @@ sgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs L, L, #1 - bgt sgemm_kernel_L1_M2_22 + bgt .Lsgemm_kernel_L1_M2_22 -sgemm_kernel_L1_M2_40: +.Lsgemm_kernel_L1_M2_40: ands L , K1, #7 // L = L % 8 - ble sgemm_kernel_L1_M2_100 + ble .Lsgemm_kernel_L1_M2_100 -sgemm_kernel_L1_M2_42: +.Lsgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs L, L, #1 - bgt sgemm_kernel_L1_M2_42 + bgt .Lsgemm_kernel_L1_M2_42 -sgemm_kernel_L1_M2_100: +.Lsgemm_kernel_L1_M2_100: SAVE2x1 -sgemm_kernel_L1_M2_END: +.Lsgemm_kernel_L1_M2_END: -sgemm_kernel_L1_M1_BEGIN: +.Lsgemm_kernel_L1_M1_BEGIN: tst I, #1 // I = I % 2 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END -sgemm_kernel_L1_M1_20: +.Lsgemm_kernel_L1_M1_20: INIT1x1 mov BO, BC asrs L , K1, #3 // L = L / 8 - ble sgemm_kernel_L1_M1_40 + ble .Lsgemm_kernel_L1_M1_40 -sgemm_kernel_L1_M1_22: +.Lsgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1420,30 +1420,30 @@ sgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs L, L, #1 - bgt sgemm_kernel_L1_M1_22 + bgt .Lsgemm_kernel_L1_M1_22 -sgemm_kernel_L1_M1_40: +.Lsgemm_kernel_L1_M1_40: ands L , K1, #7 // L = L % 8 - ble 
sgemm_kernel_L1_M1_100 + ble .Lsgemm_kernel_L1_M1_100 -sgemm_kernel_L1_M1_42: +.Lsgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs L, L, #1 - bgt sgemm_kernel_L1_M1_42 + bgt .Lsgemm_kernel_L1_M1_42 -sgemm_kernel_L1_M1_100: +.Lsgemm_kernel_L1_M1_100: SAVE1x1 -sgemm_kernel_L1_END: +.Lsgemm_kernel_L1_END: -sgemm_kernel_L999: +.Lsgemm_kernel_L999: sub r3, fp, #128 vldm r3, { s8 - s31} // restore floating point registers From cd276c2c099a312e2374bc385cd6cbd37a97b55c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gra=C5=BEvydas=20Ignotas?= Date: Tue, 5 May 2026 22:30:42 +0300 Subject: [PATCH 2/4] only save the required registers for arm/sgemm According to ARM AAPCS (Procedure Call Standard) 5.1.2.1, only registers s16-s31 must be preserved across subroutine calls; registers s0-s15 do not need to be preserved. --- kernel/arm/sgemm_kernel_4x4_vfpv3.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 407120e0e6..33345da38d 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -871,7 +871,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vstr OLD_ALPHA, ALPHA sub r3, fp, #128 - vstm r3, { s8 - s31} // store floating point registers + vstm r3, { s16 - s31 } // store floating point registers movs r4, #0 str r4, FP_ZERO @@ -1446,7 +1446,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .Lsgemm_kernel_L999: sub r3, fp, #128 - vldm r3, { s8 - s31} // restore floating point registers + vldm r3, { s16 - s31 } // restore floating point registers movs r0, #0 // set return value sub sp, fp, #24 From 9d58b8d64e610e926a70577668f2dc0482a4847e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gra=C5=BEvydas=20Ignotas?= Date: Tue, 5 May 2026 22:27:01 +0300 Subject: [PATCH 3/4] provide a NEON version of arm/sgemm benchmark/sgemm.goto before: M= 200, N= 200, K= 200 : 9262.97 MFlops 0.001727 sec after: M= 200, N= 200, K= 200 : 30223.64 MFlops 0.000529 sec Conveniently the registers are already allocated suitably for vector operation, so the conversion from vfpv3 was rather straightforward. Prefetching was left out because it doesn't help Cortex-A76, only hurts it slightly. --- kernel/arm/sgemm_kernel_4x4_vfpv3.S | 227 +++++++++++++++++++++++++++- 1 file changed, 225 insertions(+), 2 deletions(-) diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 33345da38d..a36f3a1433 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -114,6 +114,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x4 +#ifdef __ARM_NEON__ + vmov.f32 q4, #0.0 + vmov.f32 q5, #0.0 + vmov.f32 q6, #0.0 + vmov.f32 q7, #0.0 +#else flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 @@ -130,11 +136,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmov.f32 s29, s16 vmov.f32 s30, s16 vmov.f32 s31, s16 +#endif .endm .macro KERNEL4x4_I +#ifdef __ARM_NEON__ + vld1.32 {q0,q1}, [AO]! + vld1.32 {q2,q3}, [BO]! + vmul.f32 q4, q0, d4[0] + vmul.f32 q5, q0, d4[1] + vmul.f32 q6, q0, d5[0] + vmul.f32 q7, q0, d5[1] +#else pld [ AO , #A_PRE ] vldmia.f32 AO!, { s0 - s1 } pld [ BO , #B_PRE ] @@ -165,12 +180,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmuls s29 , s1, s11 fmuls s30 , s2, s11 fmuls s31 , s3, s11 +#endif .endm .macro KERNEL4x4_M2 +#ifdef __ARM_NEON__ + vld1.32 {q0}, [AO]! + vld1.32 {q2}, [BO]! 
+// pld [ AO , #A_PRE ] +// pld [ BO , #B_PRE ] + vmla.f32 q4, q1, d6[0] + vmla.f32 q5, q1, d6[1] + vmla.f32 q6, q1, d7[0] + vmla.f32 q7, q1, d7[1] +#else pld [ AO , #A_PRE ] fmacs s16 , s4, s12 fmacs s17 , s5, s12 @@ -196,12 +222,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s29 , s5, s15 fmacs s30 , s6, s15 fmacs s31 , s7, s15 +#endif .endm .macro KERNEL4x4_M1 +#ifdef __ARM_NEON__ + vld1.32 {q1}, [AO]! + vld1.32 {q3}, [BO]! + + vmla.f32 q4, q0, d4[0] + vmla.f32 q5, q0, d4[1] + vmla.f32 q6, q0, d5[0] + vmla.f32 q7, q0, d5[1] +#else fmacs s16 , s0, s8 vldmia.f32 AO!, { s4 - s7 } fmacs s17 , s1, s8 @@ -225,6 +261,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s29 , s1, s11 fmacs s30 , s2, s11 fmacs s31 , s3, s11 +#endif .endm @@ -232,6 +269,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_E +#ifdef __ARM_NEON__ + vmla.f32 q4, q1, d6[0] + vmla.f32 q5, q1, d6[1] + vmla.f32 q6, q1, d7[0] + vmla.f32 q7, q1, d7[1] +#else fmacs s16 , s4, s12 fmacs s17 , s5, s12 fmacs s18 , s6, s12 @@ -251,6 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s29 , s5, s15 fmacs s30 , s6, s15 fmacs s31 , s7, s15 +#endif .endm @@ -259,6 +303,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_SUB +#ifdef __ARM_NEON__ + vld1.32 {q2}, [BO]! + vld1.32 {q0}, [AO]! + vmla.f32 q4, q0, d4[0] + vmla.f32 q5, q0, d4[1] + vmla.f32 q6, q0, d5[0] + vmla.f32 q7, q0, d5[1] +#else flds s8 , [ BO ] flds s0 , [ AO ] @@ -290,11 +342,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s30 , s2, s11 add BO , BO, #16 fmacs s31 , s3, s11 +#endif .endm .macro SAVE4x4 +#ifdef __ARM_NEON__ + vldr s0, ALPHA + ldr r3, LDC + vld1.32 {q2}, [CO1] + add CO2, CO1, r3 + add r4, CO2, r3 + + vld1.32 {q3}, [CO2] + vmla.f32 q2, q4, d0[0] + vmla.f32 q3, q5, d0[0] + pld [ CO1, #C_PRE ] + vst1.32 {q2}, [CO1]! + + vld1.32 {q2}, [r4] + vst1.32 {q3}, [CO2] + vmla.f32 q2, q6, d0[0] + pld [ CO2, #C_PRE ] + add CO2, r4 , r3 + + vld1.32 {q3}, [CO2] + vst1.32 {q2}, [r4] + vmla.f32 q3, q7, d0[0] + pld [ r4, #C_PRE ] + pld [ CO2, #C_PRE ] + vst1.32 {q3}, [CO2] +#else ldr r3 , LDC add CO2 , CO1, r3 flds s0, ALPHA @@ -354,6 +433,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ CO2 , #C_PRE ] add CO1, CO1, #16 +#endif .endm @@ -361,6 +441,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x4 +#ifdef __ARM_NEON__ + vmov.f32 d8, #0.0 + vmov.f32 d10, #0.0 + vmov.f32 d12, #0.0 + vmov.f32 d14, #0.0 +#else flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s20, s16 @@ -369,6 +455,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmov.f32 s25, s16 vmov.f32 s28, s16 vmov.f32 s29, s16 +#endif .endm @@ -376,6 +463,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x4_SUB +#ifdef __ARM_NEON__ + vld1.32 {q2}, [BO]! + vld1.32 {d0}, [AO]! + vmla.f32 d8, d0, d4[0] + vmla.f32 d10, d0, d4[1] + vmla.f32 d12, d0, d5[0] + vmla.f32 d14, d0, d5[1] +#else flds s8 , [ BO ] flds s9 , [ BO, #4 ] flds s10, [ BO, #8 ] @@ -397,6 +492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmacs s29 , s1, s11 add AO , AO, #8 add BO , BO, #16 +#endif .endm @@ -406,6 +502,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add CO2 , CO1, r3 add r4 , CO2, r3 +#ifdef __ARM_NEON__ + vldr s0, ALPHA + vld1.32 {d4}, [CO1] + vld1.32 {d6}, [CO2] + vmla.f32 d4, d8, d0[0] + vmla.f32 d6, d10, d0[0] + vst1.32 {d4}, [CO1]! + + vld1.32 {d4}, [r4] + vst1.32 {d6}, [CO2] + vmla.f32 d4, d12, d0[0] + add CO2, r4, r3 + + vld1.32 {d6}, [CO2] + vst1.32 {d4}, [r4] + vmla.f32 d6, d14, d0[0] + vst1.32 {d6}, [CO2] +#else flds s0, ALPHA flds s8 , [CO1] @@ -447,6 +561,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fsts s13, [CO2, #4 ] add CO1, CO1, #8 +#endif .endm @@ -466,6 +581,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL1x4_SUB +#ifdef __ARM_NEON__ + vld1.32 {q2}, [BO]! + vld1.32 {d0[0]}, [AO]! + vmla.f32 s16, s0, s8 + vmla.f32 s20, s0, s9 + vmla.f32 s24, s0, s10 + vmla.f32 s28, s0, s11 +#else flds s8 , [ BO ] flds s9 , [ BO, #4 ] flds s10, [ BO, #8 ] @@ -480,6 +603,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add AO , AO, #4 add BO , BO, #16 +#endif .endm @@ -518,6 +642,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x2 +#ifdef __ARM_NEON__ + vmov.f32 q4, #0.0 + vmov.f32 q5, #0.0 +#else flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 @@ -526,6 +654,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmov.f32 s21, s16 vmov.f32 s22, s16 vmov.f32 s23, s16 +#endif .endm @@ -533,6 +662,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_SUB +#ifdef __ARM_NEON__ + vld1.32 {d4}, [BO]! + vld1.32 {q0}, [AO]! + vmla.f32 q4, q0, d4[0] + vmla.f32 q5, q0, d4[1] +#else flds s8 , [ BO ] flds s9 , [ BO, #4 ] @@ -553,6 +688,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add AO , AO, #16 add BO , BO, #8 +#endif .endm @@ -561,6 +697,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr r3 , LDC add CO2 , CO1, r3 +#ifdef __ARM_NEON__ + vldr s0, ALPHA + vld1.32 {q2}, [CO1] + vld1.32 {q3}, [CO2] + vmla.f32 q2, q4, d0[0] + vmla.f32 q3, q5, d0[0] + vst1.32 {q2}, [CO1]! + vst1.32 {q3}, [CO2] +#else flds s0, ALPHA flds s8 , [CO1] @@ -594,6 +739,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fsts s15, [CO2, #12 ] add CO1, CO1, #16 +#endif .endm @@ -602,10 +748,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 +#ifdef __ARM_NEON__ + vmov.f32 d8, #0.0 + vmov.f32 d10, #0.0 +#else flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s20, s16 vmov.f32 s21, s16 +#endif .endm @@ -613,6 +764,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_SUB +#ifdef __ARM_NEON__ + vld1.32 {d4}, [BO]! + vld1.32 {d0}, [AO]! + vmla.f32 d8, d0, d4[0] + vmla.f32 d10, d0, d4[1] +#else flds s8 , [ BO ] flds s9 , [ BO, #4 ] @@ -627,6 +784,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add AO , AO, #8 add BO , BO, #8 +#endif .endm @@ -635,6 +793,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr r3 , LDC add CO2 , CO1, r3 +#ifdef __ARM_NEON__ + vldr s0, ALPHA + vld1.32 {d4}, [CO1] + vld1.32 {d6}, [CO2] + vmla.f32 d4, d8, d0[0] + vmla.f32 d6, d10, d0[0] + vst1.32 {d4}, [CO1]! + vst1.32 {d6}, [CO2] +#else flds s0, ALPHA flds s8 , [CO1] @@ -656,6 +823,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fsts s13, [CO2, #4 ] add CO1, CO1, #8 +#endif .endm @@ -672,6 +840,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL1x2_SUB +#ifdef __ARM_NEON__ + vld1.32 {d4}, [BO]! + vld1.32 {d0[0]}, [AO]! + vmla.f32 s16, s0, s8 + vmla.f32 s20, s0, s9 +#else flds s8 , [ BO ] flds s9 , [ BO, #4 ] @@ -681,6 +855,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add AO , AO, #4 add BO , BO, #8 +#endif .endm @@ -689,6 +864,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr r3 , LDC add CO2 , CO1, r3 +#ifdef __ARM_NEON__ + vldr s0, ALPHA + vld1.32 {d4[0]}, [CO1] + vld1.32 {d6[0]}, [CO2] + vmla.f32 s8, s0, s16 + vmla.f32 s12, s0, s20 + vst1.32 {d4[0]}, [CO1]! + vst1.32 {d6[0]}, [CO2] +#else flds s0, ALPHA flds s8 , [CO1] @@ -700,6 +884,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fsts s12, [CO2] add CO1, CO1, #4 +#endif .endm @@ -708,10 +893,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x1 +#ifdef __ARM_NEON__ + vmov.f32 q4, #0.0 +#else flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 +#endif .endm @@ -719,6 +908,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x1_SUB +#ifdef __ARM_NEON__ + vld1.32 {d4[0]}, [BO]! + vld1.32 {q0}, [AO]! + vmla.f32 q4, q0, d4[0] +#else flds s8 , [ BO ] flds s0 , [ AO ] @@ -733,12 +927,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add AO , AO, #16 add BO , BO, #4 +#endif .endm .macro SAVE4x1 - +#ifdef __ARM_NEON__ + vldr s0, ALPHA + vld1.32 {q2}, [CO1] + vmla.f32 q2, q4, d0[0] + vst1.32 {q2}, [CO1]! +#else flds s0, ALPHA flds s8 , [CO1] @@ -757,6 +957,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fsts s11, [CO1, #12 ] add CO1, CO1, #16 +#endif .endm @@ -767,8 +968,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 +#ifdef __ARM_NEON__ + vmov.f32 d8, #0.0 +#else flds s16, FP_ZERO vmov.f32 s17, s16 +#endif .endm @@ -776,6 +981,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x1_SUB +#ifdef __ARM_NEON__ + vld1.32 {d4[0]}, [BO]! + vld1.32 {d0}, [AO]! + vmla.f32 d8, d0, d4[0] +#else flds s8 , [ BO ] flds s0 , [ AO ] @@ -786,12 +996,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add AO , AO, #8 add BO , BO, #4 +#endif .endm .macro SAVE2x1 - +#ifdef __ARM_NEON__ + vldr s0, ALPHA + vld1.32 {d4}, [CO1] + vmla.f32 d4, d8, d0[0] + vst1.32 {d4}, [CO1]! +#else flds s0, ALPHA flds s8 , [CO1] @@ -804,6 +1020,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fsts s9 , [CO1, #4 ] add CO1, CO1, #8 +#endif .endm @@ -819,6 +1036,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL1x1_SUB +#ifdef __ARM_NEON__ + vld1.32 {d4[0]}, [BO]! + vld1.32 {d0[0]}, [AO]! + vmla.f32 s16, s0, s8 +#else flds s8 , [ BO ] flds s0 , [ AO ] @@ -827,6 +1049,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 add AO , AO, #4
 add BO , BO, #4
+#endif

 .endm

From fc9d7c7fe3db87bea9390eb62e25f281967e322e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gra=C5=BEvydas=20Ignotas?=
Date: Tue, 5 May 2026 22:37:44 +0300
Subject: [PATCH 4/4] rename arm32 sgemm_kernel to indicate neon support

---
 kernel/arm/KERNEL.ARMV7 | 2 +-
 .../{sgemm_kernel_4x4_vfpv3.S => sgemm_kernel_4x4_vfpv3_neon.S} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename kernel/arm/{sgemm_kernel_4x4_vfpv3.S => sgemm_kernel_4x4_vfpv3_neon.S} (100%)

diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7
index 5e0b4cfb81..82f06fe79a 100644
--- a/kernel/arm/KERNEL.ARMV7
+++ b/kernel/arm/KERNEL.ARMV7
@@ -8,7 +8,7 @@ ZNRM2KERNEL = nrm2_vfpv3.S
 SGEMVNKERNEL = gemv_n_vfpv3.S
 DGEMVNKERNEL = gemv_n_vfpv3.S
 
-SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
+SGEMMKERNEL = sgemm_kernel_4x4_vfpv3_neon.S
 SGEMMONCOPY = sgemm_ncopy_4_vfp.S
 SGEMMOTCOPY = sgemm_tcopy_4_vfp.S
 SGEMMONCOPYOBJ = sgemm_oncopy.o
diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3_neon.S
similarity index 100%
rename from kernel/arm/sgemm_kernel_4x4_vfpv3.S
rename to kernel/arm/sgemm_kernel_4x4_vfpv3_neon.S
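
Note on PATCH 1/4 (illustration only, not part of the patch): in the GNU assembler any label beginning with .L is an assembler-local label and is not emitted into the object file's symbol table. Profilers such as perf resolve samples against that table, so with ordinary labels every branch target shows up as its own tiny symbol and the kernel's time gets scattered across dozens of entries; with .L labels all samples stay attributed to the enclosing global symbol. A minimal sketch, with illustrative names:

        .text
        .global sgemm_example          @ global symbol, visible to profilers
sgemm_example:
        mov     r1, #0
.Lloop:                                @ .L prefix: kept out of the symbol table,
        add     r1, r1, #1             @ so perf reports the whole loop under
        cmp     r1, r0                 @ sgemm_example rather than under .Lloop
        blt     .Lloop
        mov     r0, r1
        bx      lr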
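
Note on PATCH 2/4 (illustration only): the AAPCS splits the VFP/NEON register file at s16: s0-s15 (aliased as d0-d7 / q0-q3) are caller-saved scratch registers, while s16-s31 (d8-d15 / q4-q7) must be preserved by the callee. The kernel therefore only has to spill the upper half that it clobbers; whatever it does to s0-s15 the caller must already assume is lost. The usual prologue/epilogue pattern looks roughly like this (this kernel stores to a slot below fp with vstm/vldm instead, but the register set is the same):

        vpush   {d8-d15}               @ save callee-saved s16-s31 (q4-q7)
        @ ... body may clobber s0-s15 / q0-q3 freely ...
        vpop    {d8-d15}               @ restore before returning
        bx      lr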
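
Note on PATCH 3/4 (commentary on the new code path): the roughly 3x speedup in the benchmark comes from NEON multiply-accumulate by scalar, which multiplies all four lanes of a q register by a single lane of a d register. One KERNEL4x4_SUB step is then a complete 4x4 rank-1 update in four instructions instead of sixteen fmacs, with the accumulators held in q4-q7 as in the patch:

        vld1.32 {q0}, [AO]!            @ a0..a3: a 4-element column of A
        vld1.32 {q2}, [BO]!            @ b0..b3: a 4-element row of B (d4 = b0,b1; d5 = b2,b3)
        vmla.f32 q4, q0, d4[0]         @ C column 0 += a * b0
        vmla.f32 q5, q0, d4[1]         @ C column 1 += a * b1
        vmla.f32 q6, q0, d5[0]         @ C column 2 += a * b2
        vmla.f32 q7, q0, d5[1]         @ C column 3 += a * b3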
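
Note on the KERNEL4x4_I/M1/M2/E split (pre-existing structure, unchanged by these patches; the comments below are added): the unrolled K loop is software-pipelined over two register sets, q0/q2 and q1/q3, so each macro issues the loads for the next K step while the multiply-accumulates of the current step are still in flight. Schematically:

        KERNEL4x4_I                    @ preload q0,q1 (A) and q2,q3 (B); first vmul into q4-q7
        KERNEL4x4_M2                   @ accumulate from q1/q3, reload q0/q2
.Lsgemm_kernel_L4_M4_22:
        KERNEL4x4_M1                   @ accumulate from q0/q2, reload q1/q3
        KERNEL4x4_M2                   @ accumulate from q1/q3, reload q0/q2
        subs    L, L, #1
        bgt     .Lsgemm_kernel_L4_M4_22
        KERNEL4x4_M1                   @ accumulate from q0/q2, reload q1/q3
        KERNEL4x4_E                    @ drain: accumulate from q1/q3, no further loads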