diff --git a/sim/aarch64/ChangeLog b/sim/aarch64/ChangeLog index eff0a93bf0..814365d266 100644 --- a/sim/aarch64/ChangeLog +++ b/sim/aarch64/ChangeLog @@ -1,3 +1,7 @@ +2017-01-09 Jim Wilson + + * simulator.c (do_vec_UZP): Rewrite. + 2017-01-04 Jim Wilson * cpustate.c: Include math.h. diff --git a/sim/aarch64/simulator.c b/sim/aarch64/simulator.c index 7b75c6e2d4..36129e5308 100644 --- a/sim/aarch64/simulator.c +++ b/sim/aarch64/simulator.c @@ -2958,12 +2958,10 @@ do_vec_UZP (sim_cpu *cpu) uint64_t val_n1 = aarch64_get_vec_u64 (cpu, vn, 0); uint64_t val_n2 = aarch64_get_vec_u64 (cpu, vn, 1); - uint64_t val1 = 0; - uint64_t val2 = 0; + uint64_t val1; + uint64_t val2; - uint64_t input1 = upper ? val_n1 : val_m1; - uint64_t input2 = upper ? val_n2 : val_m2; - unsigned i; + uint64_t input2 = full ? val_n2 : val_m1; NYI_assert (29, 24, 0x0E); NYI_assert (21, 21, 0); @@ -2971,32 +2969,68 @@ do_vec_UZP (sim_cpu *cpu) NYI_assert (13, 10, 6); TRACE_DECODE (cpu, "emulated at line %d", __LINE__); - switch (INSTR (23, 23)) + switch (INSTR (23, 22)) { case 0: - for (i = 0; i < 8; i++) + val1 = (val_n1 >> (upper * 8)) & 0xFFULL; + val1 |= (val_n1 >> ((upper * 8) + 8)) & 0xFF00ULL; + val1 |= (val_n1 >> ((upper * 8) + 16)) & 0xFF0000ULL; + val1 |= (val_n1 >> ((upper * 8) + 24)) & 0xFF000000ULL; + + val1 |= (input2 << (32 - (upper * 8))) & 0xFF00000000ULL; + val1 |= (input2 << (24 - (upper * 8))) & 0xFF0000000000ULL; + val1 |= (input2 << (16 - (upper * 8))) & 0xFF000000000000ULL; + val1 |= (input2 << (8 - (upper * 8))) & 0xFF00000000000000ULL; + + if (full) { - val1 |= (input1 >> (i * 8)) & (0xFFULL << (i * 8)); - val2 |= (input2 >> (i * 8)) & (0xFFULL << (i * 8)); + val2 = (val_m1 >> (upper * 8)) & 0xFFULL; + val2 |= (val_m1 >> ((upper * 8) + 8)) & 0xFF00ULL; + val2 |= (val_m1 >> ((upper * 8) + 16)) & 0xFF0000ULL; + val2 |= (val_m1 >> ((upper * 8) + 24)) & 0xFF000000ULL; + + val2 |= (val_m2 << (32 - (upper * 8))) & 0xFF00000000ULL; + val2 |= (val_m2 << (24 - (upper * 8))) & 
0xFF0000000000ULL; + val2 |= (val_m2 << (16 - (upper * 8))) & 0xFF000000000000ULL; + val2 |= (val_m2 << (8 - (upper * 8))) & 0xFF00000000000000ULL; } break; case 1: - for (i = 0; i < 4; i++) + val1 = (val_n1 >> (upper * 16)) & 0xFFFFULL; + val1 |= (val_n1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL; + + val1 |= (input2 << (32 - (upper * 16))) & 0xFFFF00000000ULL;; + val1 |= (input2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL; + + if (full) { - val1 |= (input1 >> (i * 16)) & (0xFFFFULL << (i * 16)); - val2 |= (input2 >> (i * 16)) & (0xFFFFULL << (i * 16)); + val2 = (val_m1 >> (upper * 16)) & 0xFFFFULL; + val2 |= (val_m1 >> ((upper * 16) + 16)) & 0xFFFF0000ULL; + + val2 |= (val_m2 << (32 - (upper * 16))) & 0xFFFF00000000ULL; + val2 |= (val_m2 << (16 - (upper * 16))) & 0xFFFF000000000000ULL; } break; case 2: - val1 = ((input1 & 0xFFFFFFFF) | ((input1 >> 32) & 0xFFFFFFFF00000000ULL)); - val2 = ((input2 & 0xFFFFFFFF) | ((input2 >> 32) & 0xFFFFFFFF00000000ULL)); + val1 = (val_n1 >> (upper * 32)) & 0xFFFFFFFF; + val1 |= (input2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL; + + if (full) + { + val2 = (val_m1 >> (upper * 32)) & 0xFFFFFFFF; + val2 |= (val_m2 << (32 - (upper * 32))) & 0xFFFFFFFF00000000ULL; + } + break; case 3: - val1 = input1; - val2 = input2; - break; + if (! full) + HALT_UNALLOC; + + val1 = upper ? val_n2 : val_n1; + val2 = upper ? val_m2 : val_m1; + break; } aarch64_set_vec_u64 (cpu, vd, 0, val1); diff --git a/sim/testsuite/sim/aarch64/ChangeLog b/sim/testsuite/sim/aarch64/ChangeLog index 63f0d7d9c5..b756603e34 100644 --- a/sim/testsuite/sim/aarch64/ChangeLog +++ b/sim/testsuite/sim/aarch64/ChangeLog @@ -1,3 +1,7 @@ +2017-01-09 Jim Wilson + + * uzp.s: New. + 2017-01-04 Jim Wilson * fcsel.s: New. 
diff --git a/sim/testsuite/sim/aarch64/uzp.s b/sim/testsuite/sim/aarch64/uzp.s
new file mode 100644
index 0000000000..55e2cd7b9b
--- /dev/null
+++ b/sim/testsuite/sim/aarch64/uzp.s
@@ -0,0 +1,214 @@
+# mach: aarch64
+
+# Check the unzip instructions: uzp1, uzp2.
+
+.include "testutils.inc"
+
+input1:  // First source operand, loaded into q0 below; byte values 0x01..0x10.
+	.word 0x04030201
+	.word 0x08070605
+	.word 0x0c0b0a09
+	.word 0x100f0e0d
+input2:  // Second source operand, loaded into q1 below; byte values 0x11..0x20.
+	.word 0x14131211
+	.word 0x18171615
+	.word 0x1c1b1a19
+	.word 0x201f1e1d
+zl8b:  // Expected uzp1 .8b result: bytes 01,03,05,07 of input1 then 11,13,15,17 of input2.
+	.word 0x07050301
+	.word 0x17151311
+zu8b:  // Expected uzp2 .8b result: bytes 02,04,06,08 of input1 then 12,14,16,18 of input2.
+	.word 0x08060402
+	.word 0x18161412
+zl16b:  // Expected uzp1 .16b result (all 16 bytes of each input participate).
+	.word 0x07050301
+	.word 0x0f0d0b09
+	.word 0x17151311
+	.word 0x1f1d1b19
+zu16b:  // Expected uzp2 .16b result.
+	.word 0x08060402
+	.word 0x100e0c0a
+	.word 0x18161412
+	.word 0x201e1c1a
+zl4h:  // Expected uzp1 .4h result: halfwords 0201,0605 of input1 then 1211,1615 of input2.
+	.word 0x06050201
+	.word 0x16151211
+zu4h:  // Expected uzp2 .4h result: halfwords 0403,0807 of input1 then 1413,1817 of input2.
+	.word 0x08070403
+	.word 0x18171413
+zl8h:  // Expected uzp1 .8h result.
+	.word 0x06050201
+	.word 0x0e0d0a09
+	.word 0x16151211
+	.word 0x1e1d1a19
+zu8h:  // Expected uzp2 .8h result.
+	.word 0x08070403
+	.word 0x100f0c0b
+	.word 0x18171413
+	.word 0x201f1c1b
+zl2s:  // Expected uzp1 .2s result: word 0 of input1, word 0 of input2.
+	.word 0x04030201
+	.word 0x14131211
+zu2s:  // Expected uzp2 .2s result: word 1 of input1, word 1 of input2.
+	.word 0x08070605
+	.word 0x18171615
+zl4s:  // Expected uzp1 .4s result: words 0,2 of input1 then words 0,2 of input2.
+	.word 0x04030201
+	.word 0x0c0b0a09
+	.word 0x14131211
+	.word 0x1c1b1a19
+zu4s:  // Expected uzp2 .4s result: words 1,3 of input1 then words 1,3 of input2.
+	.word 0x08070605
+	.word 0x100f0e0d
+	.word 0x18171615
+	.word 0x201f1e1d
+zl2d:  // Expected uzp1 .2d result: first 8 bytes of input1, then first 8 bytes of input2.
+	.word 0x04030201
+	.word 0x08070605
+	.word 0x14131211
+	.word 0x18171615
+zu2d:  // Expected uzp2 .2d result: last 8 bytes of input1, then last 8 bytes of input2.
+	.word 0x0c0b0a09
+	.word 0x100f0e0d
+	.word 0x1c1b1a19
+	.word 0x201f1e1d
+
+	start  // Test prologue; presumably a macro from testutils.inc (not shown here).
+	adrp x0, input1  // Page address of input1; the low 12 bits are added by the ldr below.
+	ldr q0, [x0, #:lo12:input1]  // q0 = input1; v0 is the first source of every uzp below.
+	adrp x0, input2
+	ldr q1, [x0, #:lo12:input2]  // q1 = input2; v1 is the second source of every uzp below.
+
+	uzp1 v2.8b, v0.8b, v1.8b  // 64-bit form, byte elements; expected value is zl8b.
+	mov x1, v2.d[0]  // Only doubleword 0 of the result is compared for the 64-bit forms.
+	adrp x3, zl8b
+	ldr x4, [x3, #:lo12:zl8b]  // x4 = expected 64-bit value.
+	cmp x1, x4
+	bne .Lfailure  // Any mismatch jumps to the common failure exit.
+
+	uzp2 v2.8b, v0.8b, v1.8b  // 64-bit form, byte elements; expected value is zu8b.
+	mov x1, v2.d[0]
+	adrp x3, zu8b
+	ldr x4, [x3, #:lo12:zu8b]
+	cmp x1, x4
+	bne .Lfailure
+
+	uzp1 v2.16b, v0.16b, v1.16b  // 128-bit form, byte elements; expected value is zl16b.
+	mov x1, v2.d[0]  // Doubleword 0 of the result.
+	mov x2, v2.d[1]  // Doubleword 1 of the result; both are compared for the 128-bit forms.
+	adrp x3, zl16b
+	ldr x4, [x3, #:lo12:zl16b]  // First 8 bytes of the expected value.
+	cmp x1, x4
+	bne .Lfailure
+	ldr x5, [x3, #:lo12:zl16b+8]  // Second 8 bytes of the expected value.
+	cmp x2, x5
+	bne .Lfailure
+
+	uzp2 v2.16b, v0.16b, v1.16b  // 128-bit form, byte elements; expected value is zu16b.
+	mov x1, v2.d[0]
+	mov x2, v2.d[1]
+	adrp x3, zu16b
+	ldr x4, [x3, #:lo12:zu16b]
+	cmp x1, x4
+	bne .Lfailure
+	ldr x5, [x3, #:lo12:zu16b+8]
+	cmp x2, x5
+	bne .Lfailure
+
+	uzp1 v2.4h, v0.4h, v1.4h  // 64-bit form, halfword elements; expected value is zl4h.
+	mov x1, v2.d[0]
+	adrp x3, zl4h
+	ldr x4, [x3, #:lo12:zl4h]
+	cmp x1, x4
+	bne .Lfailure
+
+	uzp2 v2.4h, v0.4h, v1.4h  // 64-bit form, halfword elements; expected value is zu4h.
+	mov x1, v2.d[0]
+	adrp x3, zu4h
+	ldr x4, [x3, #:lo12:zu4h]
+	cmp x1, x4
+	bne .Lfailure
+
+	uzp1 v2.8h, v0.8h, v1.8h  // 128-bit form, halfword elements; expected value is zl8h.
+	mov x1, v2.d[0]
+	mov x2, v2.d[1]
+	adrp x3, zl8h
+	ldr x4, [x3, #:lo12:zl8h]
+	cmp x1, x4
+	bne .Lfailure
+	ldr x5, [x3, #:lo12:zl8h+8]
+	cmp x2, x5
+	bne .Lfailure
+
+	uzp2 v2.8h, v0.8h, v1.8h  // 128-bit form, halfword elements; expected value is zu8h.
+	mov x1, v2.d[0]
+	mov x2, v2.d[1]
+	adrp x3, zu8h
+	ldr x4, [x3, #:lo12:zu8h]
+	cmp x1, x4
+	bne .Lfailure
+	ldr x5, [x3, #:lo12:zu8h+8]
+	cmp x2, x5
+	bne .Lfailure
+
+	uzp1 v2.2s, v0.2s, v1.2s  // 64-bit form, word elements; expected value is zl2s.
+	mov x1, v2.d[0]
+	adrp x3, zl2s
+	ldr x4, [x3, #:lo12:zl2s]
+	cmp x1, x4
+	bne .Lfailure
+
+	uzp2 v2.2s, v0.2s, v1.2s  // 64-bit form, word elements; expected value is zu2s.
+	mov x1, v2.d[0]
+	adrp x3, zu2s
+	ldr x4, [x3, #:lo12:zu2s]
+	cmp x1, x4
+	bne .Lfailure
+
+	uzp1 v2.4s, v0.4s, v1.4s  // 128-bit form, word elements; expected value is zl4s.
+	mov x1, v2.d[0]
+	mov x2, v2.d[1]
+	adrp x3, zl4s
+	ldr x4, [x3, #:lo12:zl4s]
+	cmp x1, x4
+	bne .Lfailure
+	ldr x5, [x3, #:lo12:zl4s+8]
+	cmp x2, x5
+	bne .Lfailure
+
+	uzp2 v2.4s, v0.4s, v1.4s  // 128-bit form, word elements; expected value is zu4s.
+	mov x1, v2.d[0]
+	mov x2, v2.d[1]
+	adrp x3, zu4s
+	ldr x4, [x3, #:lo12:zu4s]
+	cmp x1, x4
+	bne .Lfailure
+	ldr x5, [x3, #:lo12:zu4s+8]
+	cmp x2, x5
+	bne .Lfailure
+
+	uzp1 v2.2d, v0.2d, v1.2d  // Doubleword elements (128-bit form only); expected value is zl2d.
+	mov x1, v2.d[0]
+	mov x2, v2.d[1]
+	adrp x3, zl2d
+	ldr x4, [x3, #:lo12:zl2d]
+	cmp x1, x4
+	bne .Lfailure
+	ldr x5, [x3, #:lo12:zl2d+8]
+	cmp x2, x5
+	bne .Lfailure
+
+	uzp2 v2.2d, v0.2d, v1.2d  // Doubleword elements (128-bit form only); expected value is zu2d.
+	mov x1, v2.d[0]
+	mov x2, v2.d[1]
+	adrp x3, zu2d
+	ldr x4, [x3, #:lo12:zu2d]
+	cmp x1, x4
+	bne .Lfailure
+	ldr x5, [x3, #:lo12:zu2d+8]
+	cmp x2, x5
+	bne .Lfailure
+
+	pass  // Reached only if every comparison matched; presumably a testutils.inc macro.
+.Lfailure:  // Common target of every bne above.
+	fail  // Presumably a testutils.inc macro that reports failure - confirm there.