Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

i#5036 A64 scatter/gather, part 10: Non-fault loads #6756

Merged
merged 4 commits into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,34 @@ Hello, world!
Basic counts tool results:
Total counts:
#ifdef __ARM_FEATURE_SVE2
724 total \(fetched\) instructions
270 total unique \(fetched\) instructions
772 total \(fetched\) instructions
286 total unique \(fetched\) instructions
#else
685 total \(fetched\) instructions
255 total unique \(fetched\) instructions
733 total \(fetched\) instructions
271 total unique \(fetched\) instructions
#endif
0 total non-fetched instructions
0 total prefetches
#ifdef __ARM_FEATURE_SVE2
#if (__ARM_FEATURE_SVE_BITS == 128)
1158 total data loads
1248 total data loads
873 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2070 total data loads
2234 total data loads
1615 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3894 total data loads
4206 total data loads
3099 total data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#else
#if (__ARM_FEATURE_SVE_BITS == 128)
1137 total data loads
1227 total data loads
861 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2035 total data loads
2199 total data loads
1595 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3831 total data loads
4143 total data loads
3063 total data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#endif /* __ARM_FEATURE_SVE2 */
Expand All @@ -41,34 +41,34 @@ Total counts:
.*
Thread .* counts:
#ifdef __ARM_FEATURE_SVE2
724 \(fetched\) instructions
270 unique \(fetched\) instructions
772 \(fetched\) instructions
286 unique \(fetched\) instructions
#else
685 \(fetched\) instructions
255 unique \(fetched\) instructions
733 \(fetched\) instructions
271 unique \(fetched\) instructions
#endif
0 non-fetched instructions
0 prefetches
#ifdef __ARM_FEATURE_SVE2
#if (__ARM_FEATURE_SVE_BITS == 128)
1158 data loads
1248 data loads
873 data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2070 data loads
2234 data loads
1615 data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3894 data loads
4206 data loads
3099 data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#else
#if (__ARM_FEATURE_SVE_BITS == 128)
1137 data loads
1227 data loads
861 data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2035 data loads
2199 data loads
1595 data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3831 data loads
4143 data loads
3063 data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#endif /* __ARM_FEATURE_SVE2 */
Expand Down
59 changes: 38 additions & 21 deletions clients/drcachesim/tests/allasm_scattergather_aarch64.asm
Original file line number Diff line number Diff line change
Expand Up @@ -296,23 +296,39 @@ test_scalar_plus_immediate:
ld1b DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ld1b DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ld1b DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1b DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 16
ldnf1b DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ldnf1b DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ldnf1b DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnt1b DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 16
ld1sb DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ld1sb DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ld1sb DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1sb DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ldnf1sb DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ldnf1sb DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ld1h DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ld1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ld1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1h DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ldnf1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ldnf1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnt1h DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 8
ld1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ld1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ldnf1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ld1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ld1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ldnf1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnt1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 4
ld1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ld1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnf1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
ldnt1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, #1, mul vl] // 2
// Total: 104
// Total: 178

ld2b { DEST_REG1.b, DEST_REG2.b }, B_MASK_REG/z, [BUFFER_REG, #2, mul vl] // 32
ld2h { DEST_REG1.h, DEST_REG2.h }, H_MASK_REG/z, [BUFFER_REG, #2, mul vl] // 16
Expand All @@ -331,7 +347,8 @@ test_scalar_plus_immediate:
ld4w { DEST_REG1.s, DEST_REG2.s, DEST_REG3.s, DEST_REG4.s }, S_MASK_REG/z, [BUFFER_REG, #4, mul vl] // 16
ld4d { DEST_REG1.d, DEST_REG2.d, DEST_REG3.d, DEST_REG4.d }, D_MASK_REG/z, [BUFFER_REG, #4, mul vl] // 8
// Total: 120
// Total loads: 104 + 60 + 90 + 120 = 374

// Total loads: 178 + 60 + 90 + 120 = 448

st1b SRC_REG1.b, B_MASK_REG, [BUFFER_REG, #1, mul vl] // 16
st1b SRC_REG1.h, H_MASK_REG, [BUFFER_REG, #1, mul vl] // 8
Expand Down Expand Up @@ -440,7 +457,7 @@ _start:
bl test_scalar_plus_scalar // +(374 * vl_bytes/16) loads
// +(322 * vl_bytes/16) stores

bl test_scalar_plus_immediate // +(374 * vl_bytes/16) loads
bl test_scalar_plus_immediate // +(448 * vl_bytes/16) loads
// +(322 * vl_bytes/16) stores
bl test_replicating_loads // +60 loads
// +0 stores
Expand All @@ -450,11 +467,11 @@ _start:
#endif
// Running total:
// SVE only:
// Loads: (136 + 14 + 374 + 374) * vl_bytes/16 + 60 = 898 * vl_bytes/16 + 60
// Loads: (136 + 14 + 374 + 448) * vl_bytes/16 + 60 = 972 * vl_bytes/16 + 60
// Stores: (82 + 8 + 322 + 322) * vl_bytes/16 = 734 * vl_bytes/16

// Including SVE2:
// Loads: ((898 + 14) * vl_bytes/16) + 60 = (912 * vl_bytes/16) + 60
// Loads: ((972 + 14) * vl_bytes/16) + 60 = (986 * vl_bytes/16) + 60
// Stores: (734 + 8) * vl_bytes/16 = 742 * vl_bytes/16

/* Run all the instructions with no active elements */
Expand All @@ -475,11 +492,11 @@ _start:

// Running total (unchanged from above):
// SVE only:
// Loads: (898 * vl_bytes/16) + 60
// Loads: (972 * vl_bytes/16) + 60
// Stores: 734 * vl_bytes/16

// Including SVE2:
// Loads: (912 * vl_bytes/16) + 60
// Loads: (986 * vl_bytes/16) + 60
// Stores: 742 * vl_bytes/16

/* Run all instructions with one active element */
Expand All @@ -491,47 +508,47 @@ _start:
bl test_scalar_plus_vector // +52 loads, +31 stores
bl test_vector_plus_immediate // +7 loads, +4 stores
bl test_scalar_plus_scalar // +56 loads, +46 stores
bl test_scalar_plus_immediate // +56 loads, +46 stores
bl test_scalar_plus_immediate // +72 loads, +46 stores
bl test_replicating_loads // +8 loads, +0 stores
#ifdef __ARM_FEATURE_SVE2
bl test_vector_plus_scalar // +7 loads, +4 stores
#endif

// Running total:
// SVE only:
// Loads: (898 * vl_bytes/16) + 60 + 52 + 7 + 56 + 56 + 8 = (898 * vl_bytes/16) + 239
// Loads: (972 * vl_bytes/16) + 60 + 52 + 7 + 56 + 72 + 8 = (972 * vl_bytes/16) + 255
// Stores: (734 * vl_bytes/16) + 31 + 4 + 46 + 46 = (734 * vl_bytes/16) + 127

// Including SVE2:
// Loads: (912 * vl_bytes/16) + 239 + 7 = (912 * vl_bytes/16) + 246
// Loads: (986 * vl_bytes/16) + 255 + 7 = (986 * vl_bytes/16) + 262
// Stores: (742 * vl_bytes/16) + 127 + 4 = (742 * vl_bytes/16) + 131

// The functions in this file have the following instructions counts:
// _start 40 (+3 SVE2)
// test_scalar_plus_vector 84
// test_vector_plus_immediate 12
// test_scalar_plus_scalar 55
// test_scalar_plus_immediate 55
// test_scalar_plus_immediate 71
// test_replicating_loads 9
// test_vector_plus_scalar 12
// So there are 40 + 84 + 12 + 55 + 55 + 9 = 255 unique instructions
// (or 255 + 12 + 3 = 270 including SVE2)
// So there are 40 + 84 + 12 + 55 + 71 + 9 = 271 unique instructions
// (or 271 + 12 + 3 = 286 including SVE2)
// We run the test_* functions 3 times each so the total instruction executed is
// ((84 + 12 + 55 + 55 + 9) * 3) + 40 = (215 * 3) + 40 = 685
// (or 685 + 3 + (12 * 3) = 724 including SVE2)
// ((84 + 12 + 55 + 71 + 9) * 3) + 40 = (231 * 3) + 40 = 733
// (or 733 + 3 + (12 * 3) = 772 including SVE2)

// Totals:
// SVE only:
// Loads: (898 * vl_bytes/16) + 239
// Loads: (972 * vl_bytes/16) + 255
// Stores: (734 * vl_bytes/16) + 127
// Instructions: 685
// Unique instructions: 255
// Instructions: 733
// Unique instructions: 271

// Including SVE2:
// Loads: (912 * vl_bytes/16) + 246
// Loads: (986 * vl_bytes/16) + 262
// Stores: (742 * vl_bytes/16) + 131
// Instructions: 724
// Unique instructions: 270
// Instructions: 772
// Unique instructions: 286

// Exit.
mov w0, #1 // stdout
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,34 +2,34 @@ Hello, world!
Basic counts tool results:
Total counts:
#ifdef __ARM_FEATURE_SVE2
724 total \(fetched\) instructions
270 total unique \(fetched\) instructions
772 total \(fetched\) instructions
286 total unique \(fetched\) instructions
#else
685 total \(fetched\) instructions
255 total unique \(fetched\) instructions
733 total \(fetched\) instructions
271 total unique \(fetched\) instructions
#endif
0 total non-fetched instructions
0 total prefetches
#ifdef __ARM_FEATURE_SVE2
#if (__ARM_FEATURE_SVE_BITS == 128)
1158 total data loads
1248 total data loads
873 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2070 total data loads
2234 total data loads
1615 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3894 total data loads
4206 total data loads
3099 total data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#else
#if (__ARM_FEATURE_SVE_BITS == 128)
1137 total data loads
1227 total data loads
861 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2035 total data loads
2199 total data loads
1595 total data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3831 total data loads
4143 total data loads
3063 total data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#endif /* __ARM_FEATURE_SVE2 */
Expand All @@ -40,35 +40,35 @@ Total counts:
.*
Thread .* counts:
#ifdef __ARM_FEATURE_SVE2
724 \(fetched\) instructions
270 unique \(fetched\) instructions
772 \(fetched\) instructions
286 unique \(fetched\) instructions
#else
685 \(fetched\) instructions
255 unique \(fetched\) instructions
733 \(fetched\) instructions
271 unique \(fetched\) instructions
#endif

0 non-fetched instructions
0 prefetches
#ifdef __ARM_FEATURE_SVE2
#if (__ARM_FEATURE_SVE_BITS == 128)
1158 data loads
1248 data loads
873 data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2070 data loads
2234 data loads
1615 data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3894 data loads
4206 data loads
3099 data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#else
#if (__ARM_FEATURE_SVE_BITS == 128)
1137 data loads
1227 data loads
861 data stores
#elif (__ARM_FEATURE_SVE_BITS == 256)
2035 data loads
2199 data loads
1595 data stores
#elif (__ARM_FEATURE_SVE_BITS == 512)
3831 data loads
4143 data loads
3063 data stores
#endif /* __ARM_FEATURE_SVE_BITS */
#endif /* __ARM_FEATURE_SVE2 */
Expand Down
16 changes: 16 additions & 0 deletions clients/drcachesim/tests/scattergather-aarch64.templatex
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,22 @@ ld1rqw scalar\+immediate: PASS
ld1rqd scalar\+immediate: PASS
ld1rqd scalar\+immediate \(min index\): PASS
ld1rqd scalar\+immediate \(max index\): PASS
ldnf1b scalar\+immediate 8bit element: PASS
ldnf1b scalar\+immediate 16bit element: PASS
ldnf1b scalar\+immediate 32bit element: PASS
ldnf1b scalar\+immediate 64bit element: PASS
ldnf1sb scalar\+immediate 16bit element: PASS
ldnf1sb scalar\+immediate 32bit element: PASS
ldnf1sb scalar\+immediate 64bit element: PASS
ldnf1h scalar\+immediate 16bit element: PASS
ldnf1h scalar\+immediate 32bit element: PASS
ldnf1h scalar\+immediate 64bit element: PASS
ldnf1sh scalar\+immediate 32bit element: PASS
ldnf1sh scalar\+immediate 64bit element: PASS
ldnf1w scalar\+immediate 32bit element: PASS
ldnf1w scalar\+immediate 64bit element: PASS
ldnf1sw scalar\+immediate 64bit element: PASS
ldnf1d scalar\+immediate 64bit element: PASS
ld2b scalar\+immediate: PASS
ld2h scalar\+immediate: PASS
ld2w scalar\+immediate: PASS
Expand Down
Loading
Loading