Add casts to ARM NEON code #409

Merged
1 commit merged on Oct 15, 2024
src/SDL_ttf.c: 103 changes, 52 additions and 51 deletions
@@ -761,33 +761,33 @@ static void BG_Blended_Opaque_NEON(const TTF_Image *image, Uint32 *destination,

     uint32x4_t s, d0, d1, d2, d3, r0, r1, r2, r3;
     uint8x16x2_t sx, sx01, sx23;
-    uint32x4_t zero = vmovq_n_u32(0);
+    const uint8x16_t zero = vdupq_n_u8(0);

     while (height--) {
         /* *INDENT-OFF* */
         DUFFS_LOOP4(
             /* Read 4 Uint32 and put 16 Uint8 into uint32x4x2_t (uint8x16x2_t)
              * takes advantage of vzipq_u8 which produces two lanes */

-            s = vld1q_u32(src); // load
-            d0 = vld1q_u32(dst); // load
-            d1 = vld1q_u32(dst + 4); // load
-            d2 = vld1q_u32(dst + 8); // load
-            d3 = vld1q_u32(dst + 12); // load
-
-            sx = vzipq_u8(zero, s); // interleave
-            sx01 = vzipq_u8(zero, sx.val[0]); // interleave
-            sx23 = vzipq_u8(zero, sx.val[1]); // interleave
-            // already shifted by 24
-            r0 = vorrq_u32(d0, sx01.val[0]); // or
-            r1 = vorrq_u32(d1, sx01.val[1]); // or
-            r2 = vorrq_u32(d2, sx23.val[0]); // or
-            r3 = vorrq_u32(d3, sx23.val[1]); // or
-
-            vst1q_u32(dst, r0); // store
-            vst1q_u32(dst + 4, r1); // store
-            vst1q_u32(dst + 8, r2); // store
-            vst1q_u32(dst + 12, r3); // store
+            s = vld1q_u32(src); // load
+            d0 = vld1q_u32(dst); // load
+            d1 = vld1q_u32(dst + 4); // load
+            d2 = vld1q_u32(dst + 8); // load
+            d3 = vld1q_u32(dst + 12); // load
+
+            sx = vzipq_u8(zero, (uint8x16_t)s); // interleave
+            sx01 = vzipq_u8(zero, sx.val[0]); // interleave
+            sx23 = vzipq_u8(zero, sx.val[1]); // interleave
+            // already shifted by 24
+            r0 = vorrq_u32(d0, (uint32x4_t)sx01.val[0]); // or
+            r1 = vorrq_u32(d1, (uint32x4_t)sx01.val[1]); // or
+            r2 = vorrq_u32(d2, (uint32x4_t)sx23.val[0]); // or
+            r3 = vorrq_u32(d3, (uint32x4_t)sx23.val[1]); // or
+
+            vst1q_u32(dst, r0); // store
+            vst1q_u32(dst + 4, r1); // store
+            vst1q_u32(dst + 8, r2); // store
+            vst1q_u32(dst + 12, r3); // store

             dst += 16;
             src += 4;
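
Note (not part of the patch): vld1q_u32 yields a uint32x4_t, while vzipq_u8 takes uint8x16_t operands and vorrq_u32 takes uint32x4_t ones, so the casts added above reinterpret the same 128-bit register between element types. Below is a minimal sketch of the two common spellings of that reinterpretation and of the zip-then-OR shape used in this hunk, assuming an ARM target with NEON; the helper names are illustrative and not from SDL_ttf.

    #include <arm_neon.h>

    /* Illustrative helpers (not SDL_ttf code): the same bit-for-bit
     * reinterpretation of a 128-bit NEON register, written two ways. */
    static inline uint8x16_t pixels_as_bytes_cast(uint32x4_t v)
    {
        return (uint8x16_t)v;            /* plain cast, as in the patch; relies on GCC/Clang vector types */
    }

    static inline uint8x16_t pixels_as_bytes_reinterpret(uint32x4_t v)
    {
        return vreinterpretq_u8_u32(v);  /* intrinsic spelling from the NEON reference */
    }

    /* One step in the same shape as the hunk above: zero-extend 8-bit coverage
     * values so each lands in the top byte of a little-endian 32-bit pixel,
     * then OR the first four expanded pixels into a destination vector. */
    static inline uint32x4_t or_alpha_into_dst(uint32x4_t d0, uint32x4_t s)
    {
        const uint8x16_t zero = vdupq_n_u8(0);
        uint8x16x2_t sx   = vzipq_u8(zero, vreinterpretq_u8_u32(s)); /* [0, a0, 0, a1, ...] */
        uint8x16x2_t sx01 = vzipq_u8(zero, sx.val[0]);               /* [0, 0, 0, a0, ...]  */
        return vorrq_u32(d0, vreinterpretq_u32_u8(sx01.val[0]));     /* alpha already at << 24 */
    }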
@@ -809,60 +809,61 @@ static void BG_Blended_NEON(const TTF_Image *image, Uint32 *destination, Sint32
     uint32x4_t s, d0, d1, d2, d3, r0, r1, r2, r3;
     uint16x8_t Ls8, Hs8;
     uint8x16x2_t sx, sx01, sx23;
+    uint16x8x2_t sxm;

     const uint16x8_t alpha = vmovq_n_u16(fg_alpha);
     const uint16x8_t one = vmovq_n_u16(1);
-    const uint32x4_t zero = vmovq_n_u32(0);
+    const uint8x16_t zero = vdupq_n_u8(0);

     while (height--) {
         /* *INDENT-OFF* */
         DUFFS_LOOP4(
             /* Read 4 Uint32 and put 16 Uint8 into uint32x4x2_t (uint8x16x2_t)
              * takes advantage of vzipq_u8 which produces two lanes */

-            s = vld1q_u32(src); // load
-            d0 = vld1q_u32(dst); // load
-            d1 = vld1q_u32(dst + 4); // load
-            d2 = vld1q_u32(dst + 8); // load
-            d3 = vld1q_u32(dst + 12); // load
+            s = vld1q_u32(src); // load
+            d0 = vld1q_u32(dst); // load
+            d1 = vld1q_u32(dst + 4); // load
+            d2 = vld1q_u32(dst + 8); // load
+            d3 = vld1q_u32(dst + 12); // load

-            sx = vzipq_u8(s, zero); // interleave, no shifting
-            // enough room to multiply
+            sx = vzipq_u8((uint8x16_t)s, zero); // interleave, no shifting
+            // enough room to multiply

             /* Apply: alpha_table[i] = ((i * fg.a / 255) << 24; */
             // Divide by 255 is done as: (x + 1 + (x >> 8)) >> 8

-            sx.val[0] = vmulq_u16(sx.val[0], alpha); // x := i * fg.a
-            sx.val[1] = vmulq_u16(sx.val[1], alpha);
+            sxm.val[0] = vmulq_u16((uint16x8_t)sx.val[0], alpha); // x := i * fg.a
+            sxm.val[1] = vmulq_u16((uint16x8_t)sx.val[1], alpha);

-            Ls8 = vshrq_n_u16(sx.val[0], 8); // x >> 8
-            Hs8 = vshrq_n_u16(sx.val[1], 8);
+            Ls8 = vshrq_n_u16(sxm.val[0], 8); // x >> 8
+            Hs8 = vshrq_n_u16(sxm.val[1], 8);

-            sx.val[0] = vaddq_u16(sx.val[0], one); // x + 1
-            sx.val[1] = vaddq_u16(sx.val[1], one);
+            sxm.val[0] = vaddq_u16(sxm.val[0], one); // x + 1
+            sxm.val[1] = vaddq_u16(sxm.val[1], one);

-            sx.val[0] = vaddq_u16(sx.val[0], Ls8); // x + 1 + (x >> 8)
-            sx.val[1] = vaddq_u16(sx.val[1], Hs8);
+            sxm.val[0] = vaddq_u16(sxm.val[0], Ls8); // x + 1 + (x >> 8)
+            sxm.val[1] = vaddq_u16(sxm.val[1], Hs8);

-            sx.val[0] = vshrq_n_u16(sx.val[0], 8); // ((x + 1 + (x >> 8)) >> 8
-            sx.val[1] = vshrq_n_u16(sx.val[1], 8);
+            sxm.val[0] = vshrq_n_u16(sxm.val[0], 8); // ((x + 1 + (x >> 8)) >> 8
+            sxm.val[1] = vshrq_n_u16(sxm.val[1], 8);

-            sx.val[0] = vshlq_n_u16(sx.val[0], 8); // shift << 8, so we're prepared
-            sx.val[1] = vshlq_n_u16(sx.val[1], 8); // to have final format << 24
+            sxm.val[0] = vshlq_n_u16(sxm.val[0], 8); // shift << 8, so we're prepared
+            sxm.val[1] = vshlq_n_u16(sxm.val[1], 8); // to have final format << 24

-            sx01 = vzipq_u8(zero, sx.val[0]); // interleave
-            sx23 = vzipq_u8(zero, sx.val[1]); // interleave
-            // already shifted by 24
+            sx01 = vzipq_u8(zero, (uint8x16_t)sxm.val[0]); // interleave
+            sx23 = vzipq_u8(zero, (uint8x16_t)sxm.val[1]); // interleave
+            // already shifted by 24

-            r0 = vorrq_u32(d0, sx01.val[0]); // or
-            r1 = vorrq_u32(d1, sx01.val[1]); // or
-            r2 = vorrq_u32(d2, sx23.val[0]); // or
-            r3 = vorrq_u32(d3, sx23.val[1]); // or
+            r0 = vorrq_u32(d0, (uint32x4_t)sx01.val[0]); // or
+            r1 = vorrq_u32(d1, (uint32x4_t)sx01.val[1]); // or
+            r2 = vorrq_u32(d2, (uint32x4_t)sx23.val[0]); // or
+            r3 = vorrq_u32(d3, (uint32x4_t)sx23.val[1]); // or

-            vst1q_u32(dst, r0); // store
-            vst1q_u32(dst + 4, r1); // store
-            vst1q_u32(dst + 8, r2); // store
-            vst1q_u32(dst + 12, r3); // store
+            vst1q_u32(dst, r0); // store
+            vst1q_u32(dst + 4, r1); // store
+            vst1q_u32(dst + 8, r2); // store
+            vst1q_u32(dst + 12, r3); // store

             dst += 16;
             src += 4;
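
For reference, here is a scalar model of the arithmetic this second hunk vectorizes: each coverage byte i is multiplied by the foreground alpha and divided by 255, with the division replaced by the shift-and-add form noted in the comments, (x + 1 + (x >> 8)) >> 8; the final << 24 placement is handled by the interleaves. The sketch below uses illustrative names, is not SDL_ttf code, and checks the identity exhaustively for 8-bit inputs.

    #include <assert.h>
    #include <stdint.h>

    /* Scalar model of the vector math above (names are illustrative). */
    static uint32_t scale_alpha_approx(uint32_t i, uint32_t fg_a)
    {
        uint32_t x = i * fg_a;           /* x := i * fg.a */
        return (x + 1 + (x >> 8)) >> 8;  /* stands in for x / 255 */
    }

    int main(void)
    {
        /* The shift-and-add form matches truncating division by 255
         * for every pair of 8-bit inputs. */
        for (uint32_t i = 0; i < 256; i++) {
            for (uint32_t a = 0; a < 256; a++) {
                assert(scale_alpha_approx(i, a) == (i * a) / 255);
            }
        }
        return 0;
    }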