diff --git a/README.OpenSource b/README.OpenSource index 55af8e65..adb4d8cd 100644 --- a/README.OpenSource +++ b/README.OpenSource @@ -3,8 +3,8 @@ "Name": "vixl", "License": "BSD 3-clause", "License File": "LICENCE", - "Version Number": "7.0.0", - "Owner": "huanghuijin@huawei.com", + "Version Number": "8.0.0", + "Owner": "liyiming13@huawei.com", "Upstream URL": "https://github.com/Linaro/vixl", "Description": "vixl is a programmatic assemblers to generate A64, A32 or T32 code at runtime." } diff --git a/README.md b/README.md index f0255eaf..f114ac6a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -VIXL: ARMv8 Runtime Code Generation Library 7.0.0 +VIXL: ARMv8 Runtime Code Generation Library 8.0.0 ================================================= Contents: diff --git a/SConstruct b/SConstruct index 934a81e3..b855d646 100644 --- a/SConstruct +++ b/SConstruct @@ -98,7 +98,9 @@ options = { 'CCFLAGS' : ['-O3'], }, 'simulator:aarch64' : { - 'CCFLAGS' : ['-DVIXL_INCLUDE_SIMULATOR_AARCH64'], + 'CCFLAGS' : ['-DVIXL_INCLUDE_SIMULATOR_AARCH64', + '-pthread'], + 'LINKFLAGS' : ['-pthread'] }, 'symbols:on' : { 'CCFLAGS' : ['-g'], @@ -120,6 +122,9 @@ options = { 'coverage:on' : { 'CCFLAGS': ['-fprofile-instr-generate', '-fcoverage-mapping'], 'LINKFLAGS': ['-fprofile-instr-generate', '-fcoverage-mapping'] + }, + 'implicit_checks:on' : { + 'CCFLAGS' : ['-DVIXL_ENABLE_IMPLICIT_CHECKS'], } } @@ -265,6 +270,10 @@ vars.AddVariables( EnumVariable('negative_testing', 'Enable negative testing (needs exceptions)', 'off', allowed_values=['on', 'off']), + EnumVariable('implicit_checks', + 'Allow signals raised from simulated invalid (e.g: out of' + + ' bounds) memory reads to be handled by the host.', + 'off', allowed_values=['on', 'off']), DefaultVariable('symbols', 'Include debugging symbols in the binaries', ['on', 'off']), DefaultVariable('simulator', 'Simulators to include', ['aarch64', 'none']), diff --git a/doc/range-limits.md b/doc/range-limits.md new file mode 100644 index 
00000000..cd7cf8bb --- /dev/null +++ b/doc/range-limits.md @@ -0,0 +1,148 @@ +Immediate Range Limits in VIXL +============================== + +VIXL's macro assembler tries to increase the range of branches and literal loads +automatically for you, but applications must still be aware of these extended +limits, and stay within them, in order to ensure valid code is generated. + +In debug builds, assertions prevent exceeding these limits at run time. In +release builds, for performance reasons, the application is responsible for +staying within the limits. + +You should decide what corrections should be applied in your application if it +exceeds these limits. + +Terms +----- + +**Bind**: assigning an address to a label such that the instructions that refer +to the label can be assigned PC-relative offsets. + +**Forward**: a forward branch or load literal refers to a location that will +be bound later in code generation, i.e. at a higher address. + +**Backward**: a backward branch or load literal refers to a location that has +already been bound earlier in code generation, i.e. at a lower address. + +**Instruction range**: the range of values that can be encoded in the instruction +to be generated. Outside the instruction range, additional instructions may be +generated to increase the range, branching further than would be possible in +one instruction, for example. + +**Veneer**: a sequence of additional instructions produced to increase the +instruction range. + +**Adjusted PC**: the PC including its architecturally-defined offset. In AArch32 +T32, this is the current PC plus four bytes. In AArch64, there is no adjustment; +Adjusted PC is equal to PC. + +AArch64 +------- + +### Branches + +All instructions and targets must be aligned to the instruction size, four +bytes. + +#### Unconditional immediate branches (`B`) + +* Unconditional immediate branches have an instruction range of -134,217,728 to ++134,217,724 bytes from the current PC.
+* No veneers are applied to unconditional immediate branches to extend their +instruction range. +* Callers can use the function `IsValidImmPCOffset(UncondBranchType, offset)` to +check `offset` (in units of instructions) is within the instruction range. + +#### Conditional branches (`B.cond`) and compare-and-branch (`CBZ`, `CBNZ`) + +* Conditional branch and compare-and-branch instructions have the same +instruction range. +* The instruction range is -1,048,576 to +1,048,572 bytes from the current PC. +* Veneers are applied to extend the range to -134,217,724 to +135,266,296 bytes +from the current PC. + * Unconditional branch range minus one instruction backwards. + * Unconditional branch range plus conditional branch range forwards. +* Callers can use the functions `IsValidImmPCOffset(CondBranchType, offset)` and +`IsValidImmPCOffset(CompareBranchType, offset)` to check `offset` (in units of +instructions) is within the instruction range. + +#### Test-and-branch (`TBZ`, `TBNZ`) + +* Test-and-branch instructions have an instruction range of -32,768 to +32,764 +bytes from the current PC. +* Veneers are applied to extend the range to -134,217,724 to +134,250,488 bytes +from the current PC. + * Unconditional branch range minus one instruction backwards. + * Unconditional branch range plus test-and-branch range forwards. +* Callers can use the function `IsValidImmPCOffset(TestBranchType, offset)` to +check `offset` (in units of instructions) is within the instruction range. + +### Literals + +#### Compute PC-relative address (`ADR`) + +* Compute PC-relative address instructions have an instruction range of +-1,048,576 to +1,048,575 bytes from the current PC. +* No veneers are applied to extend the instruction range. +* Callers can use `IsInt21(offset)` to check `offset` (in bytes) is within the +instruction range.
+ +#### Load from PC-relative address (`LDR`) + +* Load from PC-relative address instructions have an instruction range of +-1,048,576 to +1,048,572 bytes from the current PC. The offset must be four-byte +aligned. +* Automatically-placed literals (e.g. those created by `Ldr(reg, literal_value)`) +will be emitted into code such that they are in range of the instructions that +refer to them. +* Veneers are not applied to manually-placed literals, i.e. those created by +`Literal x(value)` and emitted by `place()`. +* Callers can use `IsInt19(offset)` to check `offset` (in units of instructions) +is within the instruction range. + +AArch32 +------- + +Limits stated in this section relate to the T32 instruction encodings only. + +### Branches + +#### Unconditional immediate branches (`B`) + +* Unconditional immediate branches have an instruction range of -16,777,216 to ++16,777,214 bytes from the current adjusted PC. +* Veneers are applied to forward branches to extend them to an unlimited range. +* No veneers are applied to backward branches. + +#### Conditional immediate branches (`B`) + +* Conditional immediate branches have an instruction range of -1,048,576 to ++1,048,574 bytes from the current adjusted PC. +* Veneers are applied to forward branches to extend them to an unlimited range. +* Veneers are applied to backward branches to extend the range to that of +unconditional immediate branches, -16,777,216 bytes from the current adjusted +PC. + +#### Compare and branch (`CBZ`, `CBNZ`) + +* Compare and branch has an instruction range of 0 to +126 bytes from the +current adjusted PC. +* Veneers are applied to forward branches to extend them to an unlimited range. +* Veneers are applied to backward branches to extend the range to that of +unconditional immediate branches, -16,777,216 bytes from the current adjusted +PC.
+ +### Literals + +#### Compute/load PC-relative address (`ADR`, `LDR`) + +* Compute and load PC-relative address instructions have the same instruction +range. +* The instruction range is -4,095 to +4,095 bytes from the current adjusted PC. +The PC is aligned down to a four-byte boundary before the offset is added. +* Automatically-placed literals (i.e. those created by `Literal x(value)`) +will be emitted into code such that they are in range of the instructions that +refer to them. +* Veneers are not applied to manually-placed literals, i.e. those created by +`Literal x(value, RawLiteral::kManuallyPlaced)` and emitted by `Place()`. + diff --git a/src/aarch32/instructions-aarch32.cc b/src/aarch32/instructions-aarch32.cc index fe5458f1..f3ed0e01 100644 --- a/src/aarch32/instructions-aarch32.cc +++ b/src/aarch32/instructions-aarch32.cc @@ -636,20 +636,15 @@ ImmediateT32::ImmediateT32(uint32_t imm) { } -static inline uint32_t ror(uint32_t x, int i) { - VIXL_ASSERT((0 < i) && (i < 32)); - return (x >> i) | (x << (32 - i)); -} - - bool ImmediateT32::IsImmediateT32(uint32_t imm) { /* abcdefgh abcdefgh abcdefgh abcdefgh */ - if ((imm ^ ror(imm, 8)) == 0) return true; + if (AllBytesMatch(imm)) return true; /* 00000000 abcdefgh 00000000 abcdefgh */ /* abcdefgh 00000000 abcdefgh 00000000 */ - if ((imm ^ ror(imm, 16)) == 0 && - (((imm & 0xff00) == 0) || ((imm & 0xff) == 0))) + if (AllHalfwordsMatch(imm) && + (((imm & 0xff00) == 0) || ((imm & 0xff) == 0))) { return true; + } /* isolate least-significant set bit */ uint32_t lsb = imm & UnsignedNegate(imm); /* if imm is less than lsb*256 then it fits, but instead we test imm/256 to @@ -697,7 +692,7 @@ bool ImmediateA32::IsImmediateA32(uint32_t imm) { if (imm < 256) return true; /* avoid getting confused by wrapped-around bytes (this transform has no * effect on pass/fail results) */ - if (imm & 0xff000000) imm = ror(imm, 16); + if (imm & 0xff000000) imm = static_cast(RotateRight(imm, 16, 32)); /* copy odd-numbered set bits into
even-numbered bits immediately below, so * that the least-significant set bit is always an even bit */ imm = imm | ((imm >> 1) & 0x55555555); diff --git a/src/aarch32/location-aarch32.h b/src/aarch32/location-aarch32.h index 0959a55a..38800046 100644 --- a/src/aarch32/location-aarch32.h +++ b/src/aarch32/location-aarch32.h @@ -80,6 +80,8 @@ class Location : public LocationBase { #endif } + Location(Location&&) = default; // movable + bool IsReferenced() const { return referenced_; } private: diff --git a/src/aarch32/macro-assembler-aarch32.cc b/src/aarch32/macro-assembler-aarch32.cc index 3a837ae8..e04f6905 100644 --- a/src/aarch32/macro-assembler-aarch32.cc +++ b/src/aarch32/macro-assembler-aarch32.cc @@ -1268,6 +1268,57 @@ void MacroAssembler::Delegate(InstructionType type, } +void MacroAssembler::Delegate(InstructionType type, + InstructionCondSizeL instruction, + Condition cond, + EncodingSize size, + Location* location) { + VIXL_ASSERT(type == kB); + + CONTEXT_SCOPE; + + // Apply veneer to increase range of backwards conditional branches. + // This replaces: + // label: + // + // bcond label ; T3 + // With: + // label: + // + // binvcond skip ; T1 + // b label ; T4 + // skip: + Location::Offset offset = location->GetLocation() - + (GetCursorOffset() + GetArchitectureStatePCOffset()); + if (IsUsingT32() && location->IsBound() && ((offset & 0x1) == 0) && + !cond.Is(al) && cond.IsNotNever()) { + // Bound locations must be earlier in the code. + VIXL_ASSERT(offset < 0); + + // The offset must be within range of a T4 branch, accounting for the + // conditional branch (T1) we emit first, in order to jump over it. 
+ offset -= k16BitT32InstructionSizeInBytes; + if (offset >= -16777216) { + CodeBufferCheckScope scope(this, k16BitT32InstructionSizeInBytes + + k32BitT32InstructionSizeInBytes); +#ifndef PANDA_BUILD + Label skip; +#else + Label skip(allocator_); +#endif + b(cond.Negate(), Narrow, &skip); + b(location); + Bind(&skip); + return; + } else { + VIXL_ABORT_WITH_MSG("Conditional branch too far for veneer.\n"); + } + } + + Assembler::Delegate(type, instruction, cond, size, location); +} + + template static inline bool IsI64BitPattern(T imm) { for (T mask = 0xff << ((sizeof(T) - 1) * 8); mask != 0; mask >>= 8) { diff --git a/src/aarch32/macro-assembler-aarch32.h b/src/aarch32/macro-assembler-aarch32.h index 9742b1bc..f1ef2edb 100644 --- a/src/aarch32/macro-assembler-aarch32.h +++ b/src/aarch32/macro-assembler-aarch32.h @@ -1041,6 +1041,12 @@ ITScope(AllocatorWrapper allocator, MacroAssembler* masm, InstructionRL instruction, Register rn, Location* location) VIXL_OVERRIDE; + // B + virtual void Delegate(InstructionType type, + InstructionCondSizeL instruction, + Condition cond, + EncodingSize size, + Location* location) VIXL_OVERRIDE; // VMOV virtual void Delegate(InstructionType type, InstructionCondDtSSop instruction, diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc index c0227177..8e7cee5b 100644 --- a/src/aarch64/assembler-aarch64.cc +++ b/src/aarch64/assembler-aarch64.cc @@ -1918,6 +1918,12 @@ void Assembler::sys(int op, const Register& xt) { } +void Assembler::sysl(int op, const Register& xt) { + VIXL_ASSERT(xt.Is64Bits()); + Emit(SYSL | SysOp(op) | Rt(xt)); +} + + void Assembler::dc(DataCacheOp op, const Register& rt) { if (op == CVAP) VIXL_ASSERT(CPUHas(CPUFeatures::kDCPoP)); if (op == CVADP) VIXL_ASSERT(CPUHas(CPUFeatures::kDCCVADP)); @@ -1930,6 +1936,35 @@ void Assembler::ic(InstructionCacheOp op, const Register& rt) { sys(op, rt); } +void Assembler::gcspushm(const Register& rt) { + VIXL_ASSERT(CPUHas(CPUFeatures::kGCS)); + 
sys(GCSPUSHM, rt); +} + +void Assembler::gcspopm(const Register& rt) { + VIXL_ASSERT(CPUHas(CPUFeatures::kGCS)); + sysl(GCSPOPM, rt); +} + + +void Assembler::gcsss1(const Register& rt) { + VIXL_ASSERT(CPUHas(CPUFeatures::kGCS)); + sys(GCSSS1, rt); +} + + +void Assembler::gcsss2(const Register& rt) { + VIXL_ASSERT(CPUHas(CPUFeatures::kGCS)); + sysl(GCSSS2, rt); +} + + +void Assembler::chkfeat(const Register& rd) { + VIXL_ASSERT(rd.Is(x16)); + USE(rd); + hint(CHKFEAT); +} + void Assembler::hint(SystemHint code) { hint(static_cast(code)); } @@ -2913,6 +2948,25 @@ void Assembler::st1(const VRegister& vt, int lane, const MemOperand& dst) { LoadStoreStructSingle(vt, lane, dst, NEONLoadStoreSingleStructStore1); } +void Assembler::pmull(const VRegister& vd, + const VRegister& vn, + const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(AreSameFormat(vn, vm)); + VIXL_ASSERT((vn.Is8B() && vd.Is8H()) || (vn.Is1D() && vd.Is1Q())); + VIXL_ASSERT(CPUHas(CPUFeatures::kPmull1Q) || vd.Is8H()); + Emit(VFormat(vn) | NEON_PMULL | Rm(vm) | Rn(vn) | Rd(vd)); +} + +void Assembler::pmull2(const VRegister& vd, + const VRegister& vn, + const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(AreSameFormat(vn, vm)); + VIXL_ASSERT((vn.Is16B() && vd.Is8H()) || (vn.Is2D() && vd.Is1Q())); + VIXL_ASSERT(CPUHas(CPUFeatures::kPmull1Q) || vd.Is8H()); + Emit(VFormat(vn) | NEON_PMULL2 | Rm(vm) | Rn(vn) | Rd(vd)); +} void Assembler::NEON3DifferentL(const VRegister& vd, const VRegister& vn, @@ -2960,8 +3014,6 @@ void Assembler::NEON3DifferentHN(const VRegister& vd, // clang-format off #define NEON_3DIFF_LONG_LIST(V) \ - V(pmull, NEON_PMULL, vn.IsVector() && vn.Is8B()) \ - V(pmull2, NEON_PMULL2, vn.IsVector() && vn.Is16B()) \ V(saddl, NEON_SADDL, vn.IsVector() && vn.IsD()) \ V(saddl2, NEON_SADDL2, vn.IsVector() && vn.IsQ()) \ V(sabal, NEON_SABAL, vn.IsVector() && vn.IsD()) \ @@ -4336,7 +4388,7 @@ void Assembler::sqrdmlah(const VRegister& vd, const 
VRegister& vm) { VIXL_ASSERT(CPUHas(CPUFeatures::kNEON, CPUFeatures::kRDM)); VIXL_ASSERT(AreSameFormat(vd, vn, vm)); - VIXL_ASSERT(vd.IsVector() || !vd.IsQ()); + VIXL_ASSERT(vd.IsLaneSizeH() || vd.IsLaneSizeS()); Instr format, op = NEON_SQRDMLAH; if (vd.IsScalar()) { @@ -4355,7 +4407,7 @@ void Assembler::sqrdmlsh(const VRegister& vd, const VRegister& vm) { VIXL_ASSERT(CPUHas(CPUFeatures::kNEON, CPUFeatures::kRDM)); VIXL_ASSERT(AreSameFormat(vd, vn, vm)); - VIXL_ASSERT(vd.IsVector() || !vd.IsQ()); + VIXL_ASSERT(vd.IsLaneSizeH() || vd.IsLaneSizeS()); Instr format, op = NEON_SQRDMLSH; if (vd.IsScalar()) { @@ -5824,6 +5876,263 @@ void Assembler::ummla(const VRegister& vd, const VRegister& vn, const VRegister& Emit(0x6e80a400 | Rd(vd) | Rn(vn) | Rm(vm)); } +void Assembler::bcax(const VRegister& vd, const VRegister& vn, const VRegister& vm, const VRegister& va) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3)); + VIXL_ASSERT(vd.Is16B() && vn.Is16B() && vm.Is16B()); + + Emit(0xce200000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va)); +} + +void Assembler::eor3(const VRegister& vd, const VRegister& vn, const VRegister& vm, const VRegister& va) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3)); + VIXL_ASSERT(vd.Is16B() && vn.Is16B() && vm.Is16B() && va.Is16B()); + + Emit(0xce000000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va)); +} + +void Assembler::xar(const VRegister& vd, const VRegister& vn, const VRegister& vm, int rotate) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3)); + VIXL_ASSERT(vd.Is2D() && vn.Is2D() && vm.Is2D()); + VIXL_ASSERT(IsUint6(rotate)); + + Emit(0xce800000 | Rd(vd) | Rn(vn) | Rm(vm) | rotate << 10); +} + +void Assembler::rax1(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3)); + VIXL_ASSERT(vd.Is2D() && vn.Is2D() && vm.Is2D()); + + Emit(0xce608c00 | Rd(vd) | Rn(vn) 
| Rm(vm)); +} + +void Assembler::sha1c(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA1)); + VIXL_ASSERT(vd.IsQ() && vn.IsS() && vm.Is4S()); + + Emit(0x5e000000 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::sha1h(const VRegister& sd, const VRegister& sn) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA1)); + VIXL_ASSERT(sd.IsS() && sn.IsS()); + + Emit(0x5e280800 | Rd(sd) | Rn(sn)); +} + +void Assembler::sha1m(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA1)); + VIXL_ASSERT(vd.IsQ() && vn.IsS() && vm.Is4S()); + + Emit(0x5e002000 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::sha1p(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA1)); + VIXL_ASSERT(vd.IsQ() && vn.IsS() && vm.Is4S()); + + Emit(0x5e001000 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::sha1su0(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA1)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + + Emit(0x5e003000 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::sha1su1(const VRegister& vd, const VRegister& vn) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA1)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S()); + + Emit(0x5e281800 | Rd(vd) | Rn(vn)); +} + +void Assembler::sha256h(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA2)); + VIXL_ASSERT(vd.IsQ() && vn.IsQ() && vm.Is4S()); + + Emit(0x5e004000 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::sha256h2(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + 
VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA2)); + VIXL_ASSERT(vd.IsQ() && vn.IsQ() && vm.Is4S()); + + Emit(0x5e005000 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::sha256su0(const VRegister& vd, const VRegister& vn) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA2)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S()); + + Emit(0x5e282800 | Rd(vd) | Rn(vn)); +} + +void Assembler::sha256su1(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA2)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + + Emit(0x5e006000 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::sha512h(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA512)); + VIXL_ASSERT(vd.IsQ() && vn.IsQ() && vm.Is2D()); + + Emit(0xce608000 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::sha512h2(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA512)); + VIXL_ASSERT(vd.IsQ() && vn.IsQ() && vm.Is2D()); + + Emit(0xce608400 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::sha512su0(const VRegister& vd, const VRegister& vn) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA512)); + VIXL_ASSERT(vd.Is2D() && vn.Is2D()); + + Emit(0xcec08000 | Rd(vd) | Rn(vn)); +} + +void Assembler::sha512su1(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSHA512)); + VIXL_ASSERT(vd.Is2D() && vn.Is2D() && vm.Is2D()); + + Emit(0xce608800 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::aesd(const VRegister& vd, const VRegister& vn) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kAES)); + VIXL_ASSERT(vd.Is16B() && vn.Is16B()); 
+ + Emit(0x4e285800 | Rd(vd) | Rn(vn)); +} + +void Assembler::aese(const VRegister& vd, const VRegister& vn) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kAES)); + VIXL_ASSERT(vd.Is16B() && vn.Is16B()); + + Emit(0x4e284800 | Rd(vd) | Rn(vn)); +} + +void Assembler::aesimc(const VRegister& vd, const VRegister& vn) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kAES)); + VIXL_ASSERT(vd.Is16B() && vn.Is16B()); + + Emit(0x4e287800 | Rd(vd) | Rn(vn)); +} + +void Assembler::aesmc(const VRegister& vd, const VRegister& vn) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kAES)); + VIXL_ASSERT(vd.Is16B() && vn.Is16B()); + + Emit(0x4e286800 | Rd(vd) | Rn(vn)); +} + +void Assembler::sm3partw1(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + + Emit(0xce60c000 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::sm3partw2(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + + Emit(0xce60c400 | Rd(vd) | Rn(vn) | Rm(vm)); +} + +void Assembler::sm3ss1(const VRegister& vd, const VRegister& vn, const VRegister& vm, const VRegister& va) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S() && va.Is4S()); + + Emit(0xce400000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va)); +} + +void Assembler::sm3tt1a(const VRegister& vd, const VRegister& vn, const VRegister& vm, int index) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + VIXL_ASSERT(IsUint2(index)); + + Instr i = static_cast(index) << 12; + Emit(0xce408000 | Rd(vd) | Rn(vn) | 
Rm(vm) | i); +} + +void Assembler::sm3tt1b(const VRegister& vd, const VRegister& vn, const VRegister& vm, int index) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + VIXL_ASSERT(IsUint2(index)); + + Instr i = static_cast(index) << 12; + Emit(0xce408400 | Rd(vd) | Rn(vn) | Rm(vm) | i); +} + +void Assembler::sm3tt2a(const VRegister& vd, const VRegister& vn, const VRegister& vm, int index) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + VIXL_ASSERT(IsUint2(index)); + + Instr i = static_cast(index) << 12; + Emit(0xce408800 | Rd(vd) | Rn(vn) | Rm(vm) | i); +} + +void Assembler::sm3tt2b(const VRegister& vd, const VRegister& vn, const VRegister& vm, int index) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM3)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + VIXL_ASSERT(IsUint2(index)); + + Instr i = static_cast(index) << 12; + Emit(0xce408c00 | Rd(vd) | Rn(vn) | Rm(vm) | i); +} + +void Assembler::sm4e(const VRegister& vd, const VRegister& vn) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM4)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S()); + + Emit(0xcec08400 | Rd(vd) | Rn(vn)); +} + +void Assembler::sm4ekey(const VRegister& vd, const VRegister& vn, const VRegister& vm) { + VIXL_ASSERT(CPUHas(CPUFeatures::kNEON)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSM4)); + VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S()); + + Emit(0xce60c800 | Rd(vd) | Rn(vn) | Rm(vm)); +} + // Note: // For all ToImm instructions below, a difference in case // for the same letter indicates a negated bit. 
@@ -6868,6 +7177,7 @@ bool Assembler::CPUHas(SystemRegister sysreg) const { return CPUHas(CPUFeatures::kRNG); case FPCR: case NZCV: + case DCZID_EL0: break; } return true; diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h index da0a70c4..b4098c13 100644 --- a/src/aarch64/assembler-aarch64.h +++ b/src/aarch64/assembler-aarch64.h @@ -2183,6 +2183,9 @@ class Assembler : public vixl::internal::AssemblerBase { // System instruction with pre-encoded op (op1:crn:crm:op2). void sys(int op, const Register& xt = xzr); + // System instruction with result. + void sysl(int op, const Register& xt = xzr); + // System data cache operation. void dc(DataCacheOp op, const Register& rt); @@ -3643,6 +3646,123 @@ class Assembler : public vixl::internal::AssemblerBase { // Unsigned 8-bit integer matrix multiply-accumulate (vector). void ummla(const VRegister& vd, const VRegister& vn, const VRegister& vm); + // Bit Clear and exclusive-OR. + void bcax(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + const VRegister& va); + + // Three-way Exclusive-OR. + void eor3(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + const VRegister& va); + + // Exclusive-OR and Rotate. + void xar(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int rotate); + + // Rotate and Exclusive-OR + void rax1(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SHA1 hash update (choose). + void sha1c(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SHA1 fixed rotate. + void sha1h(const VRegister& sd, const VRegister& sn); + + // SHA1 hash update (majority). + void sha1m(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SHA1 hash update (parity). + void sha1p(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SHA1 schedule update 0. + void sha1su0(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SHA1 schedule update 1. 
+ void sha1su1(const VRegister& vd, const VRegister& vn); + + // SHA256 hash update (part 1). + void sha256h(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SHA256 hash update (part 2). + void sha256h2(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SHA256 schedule update 0. + void sha256su0(const VRegister& vd, const VRegister& vn); + + // SHA256 schedule update 1. + void sha256su1(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SHA512 hash update part 1. + void sha512h(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SHA512 hash update part 2. + void sha512h2(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SHA512 schedule Update 0. + void sha512su0(const VRegister& vd, const VRegister& vn); + + // SHA512 schedule Update 1. + void sha512su1(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // AES single round decryption. + void aesd(const VRegister& vd, const VRegister& vn); + + // AES single round encryption. + void aese(const VRegister& vd, const VRegister& vn); + + // AES inverse mix columns. + void aesimc(const VRegister& vd, const VRegister& vn); + + // AES mix columns. + void aesmc(const VRegister& vd, const VRegister& vn); + + // SM3PARTW1. + void sm3partw1(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SM3PARTW2. + void sm3partw2(const VRegister& vd, const VRegister& vn, const VRegister& vm); + + // SM3SS1. + void sm3ss1(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + const VRegister& va); + + // SM3TT1A. + void sm3tt1a(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int index); + + // SM3TT1B. + void sm3tt1b(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int index); + + // SM3TT2A. + void sm3tt2a(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int index); + + // SM3TT2B. 
+ void sm3tt2b(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int index); + + // SM4 Encode. + void sm4e(const VRegister& vd, const VRegister& vn); + + // SM4 Key. + void sm4ekey(const VRegister& vd, const VRegister& vn, const VRegister& vm); + // Scalable Vector Extensions. // Absolute value (predicated). @@ -7097,6 +7217,21 @@ class Assembler : public vixl::internal::AssemblerBase { // Unsigned Minimum. void umin(const Register& rd, const Register& rn, const Operand& op); + // Check feature status. + void chkfeat(const Register& rd); + + // Guarded Control Stack Push. + void gcspushm(const Register& rt); + + // Guarded Control Stack Pop. + void gcspopm(const Register& rt); + + // Guarded Control Stack Switch Stack 1. + void gcsss1(const Register& rt); + + // Guarded Control Stack Switch Stack 2. + void gcsss2(const Register& rt); + // Emit generic instructions. // Emit raw instructions into the instruction stream. @@ -7565,6 +7700,8 @@ class Assembler : public vixl::internal::AssemblerBase { static Instr VFormat(VRegister vd) { if (vd.Is64Bits()) { switch (vd.GetLanes()) { + case 1: + return NEON_1D; case 2: return NEON_2S; case 4: diff --git a/src/aarch64/assembler-sve-aarch64.cc b/src/aarch64/assembler-sve-aarch64.cc index e99cfdcd..0c3c7f88 100644 --- a/src/aarch64/assembler-sve-aarch64.cc +++ b/src/aarch64/assembler-sve-aarch64.cc @@ -7410,13 +7410,13 @@ void Assembler::pmullb(const ZRegister& zd, // size<23:22> | Zm<20:16> | op<12> | U<11> | T<10> | Zn<9:5> | Zd<4:0> VIXL_ASSERT(CPUHas(CPUFeatures::kSVE2)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSVEPmull128) || !zd.IsLaneSizeQ()); VIXL_ASSERT(AreSameLaneSize(zn, zm)); VIXL_ASSERT(!zd.IsLaneSizeB() && !zd.IsLaneSizeS()); VIXL_ASSERT(zd.GetLaneSizeInBytes() == zn.GetLaneSizeInBytes() * 2); - // SVEPmull128 is not supported - VIXL_ASSERT(!zd.IsLaneSizeQ()); + Instr size = zd.IsLaneSizeQ() ? 
0 : SVESize(zd); - Emit(0x45006800 | SVESize(zd) | Rd(zd) | Rn(zn) | Rm(zm)); + Emit(0x45006800 | size | Rd(zd) | Rn(zn) | Rm(zm)); } void Assembler::pmullt(const ZRegister& zd, @@ -7427,13 +7427,13 @@ void Assembler::pmullt(const ZRegister& zd, // size<23:22> | Zm<20:16> | op<12> | U<11> | T<10> | Zn<9:5> | Zd<4:0> VIXL_ASSERT(CPUHas(CPUFeatures::kSVE2)); + VIXL_ASSERT(CPUHas(CPUFeatures::kSVEPmull128) || !zd.IsLaneSizeQ()); VIXL_ASSERT(AreSameLaneSize(zn, zm)); VIXL_ASSERT(!zd.IsLaneSizeB() && !zd.IsLaneSizeS()); VIXL_ASSERT(zd.GetLaneSizeInBytes() == zn.GetLaneSizeInBytes() * 2); - // SVEPmull128 is not supported - VIXL_ASSERT(!zd.IsLaneSizeQ()); + Instr size = zd.IsLaneSizeQ() ? 0 : SVESize(zd); - Emit(0x45006c00 | SVESize(zd) | Rd(zd) | Rn(zn) | Rm(zm)); + Emit(0x45006c00 | size | Rd(zd) | Rn(zn) | Rm(zm)); } void Assembler::raddhnb(const ZRegister& zd, diff --git a/src/aarch64/constants-aarch64.h b/src/aarch64/constants-aarch64.h index 20bd12f9..279587cf 100644 --- a/src/aarch64/constants-aarch64.h +++ b/src/aarch64/constants-aarch64.h @@ -389,7 +389,8 @@ enum SystemHint { BTI = 32, BTI_c = 34, BTI_j = 36, - BTI_jc = 38 + BTI_jc = 38, + CHKFEAT = 40 }; enum BranchTargetIdentifier { @@ -500,7 +501,8 @@ enum SystemRegister { NZCV = SystemRegisterEncoder<3, 3, 4, 2, 0>::value, FPCR = SystemRegisterEncoder<3, 3, 4, 4, 0>::value, RNDR = SystemRegisterEncoder<3, 3, 2, 4, 0>::value, // Random number. - RNDRRS = SystemRegisterEncoder<3, 3, 2, 4, 1>::value // Reseeded random number. + RNDRRS = SystemRegisterEncoder<3, 3, 2, 4, 1>::value, // Reseeded random number. 
+ DCZID_EL0 = SystemRegisterEncoder<3, 3, 0, 0, 7>::value }; template @@ -534,6 +536,13 @@ enum DataCacheOp { CIGDVAC = CacheOpEncoder<3, 7, 14, 5>::value }; +enum GCSOp { + GCSPUSHM = CacheOpEncoder<3, 7, 7, 0>::value, + GCSPOPM = CacheOpEncoder<3, 7, 7, 1>::value, + GCSSS1 = CacheOpEncoder<3, 7, 7, 2>::value, + GCSSS2 = CacheOpEncoder<3, 7, 7, 3>::value +}; + // Some SVE instructions support a predicate constraint pattern. This is // interpreted as a VL-dependent value, and is typically used to initialise // predicates, or to otherwise limit the number of processed elements. @@ -942,7 +951,8 @@ enum SystemSysOp { SystemSysFixed = 0xD5080000, SystemSysFMask = 0xFFF80000, SystemSysMask = 0xFFF80000, - SYS = SystemSysFixed | 0x00000000 + SYS = SystemSysFixed | 0x00000000, + SYSL = SystemSysFixed | 0x00200000 }; // Exception. diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc index 4efdef82..b447cad2 100644 --- a/src/aarch64/cpu-features-auditor-aarch64.cc +++ b/src/aarch64/cpu-features-auditor-aarch64.cc @@ -244,16 +244,47 @@ void CPUFeaturesAuditor::VisitConditionalSelect(const Instruction* instr) { void CPUFeaturesAuditor::VisitCrypto2RegSHA(const Instruction* instr) { RecordInstructionFeaturesScope scope(this); + if (form_hash_ == "sha256su0_vv_cryptosha2"_h) { + scope.Record(CPUFeatures::kNEON, CPUFeatures::kSHA2); + } else { + scope.Record(CPUFeatures::kNEON, CPUFeatures::kSHA1); + } USE(instr); } void CPUFeaturesAuditor::VisitCrypto3RegSHA(const Instruction* instr) { RecordInstructionFeaturesScope scope(this); + switch (form_hash_) { + case "sha1c_qsv_cryptosha3"_h: + case "sha1m_qsv_cryptosha3"_h: + case "sha1p_qsv_cryptosha3"_h: + case "sha1su0_vvv_cryptosha3"_h: + scope.Record(CPUFeatures::kNEON, CPUFeatures::kSHA1); + break; + case "sha256h_qqv_cryptosha3"_h: + case "sha256h2_qqv_cryptosha3"_h: + case "sha256su1_vvv_cryptosha3"_h: + scope.Record(CPUFeatures::kNEON, CPUFeatures::kSHA2); + break; + } 
USE(instr); } void CPUFeaturesAuditor::VisitCryptoAES(const Instruction* instr) { RecordInstructionFeaturesScope scope(this); + scope.Record(CPUFeatures::kNEON, CPUFeatures::kAES); + USE(instr); +} + +void CPUFeaturesAuditor::VisitCryptoSM3(const Instruction* instr) { + RecordInstructionFeaturesScope scope(this); + scope.Record(CPUFeatures::kNEON, CPUFeatures::kSM3); + USE(instr); +} + +void CPUFeaturesAuditor::VisitCryptoSM4(const Instruction* instr) { + RecordInstructionFeaturesScope scope(this); + scope.Record(CPUFeatures::kNEON, CPUFeatures::kSM4); USE(instr); } @@ -733,6 +764,12 @@ void CPUFeaturesAuditor::VisitNEON3Different(const Instruction* instr) { RecordInstructionFeaturesScope scope(this); // All of these instructions require NEON. scope.Record(CPUFeatures::kNEON); + if (form_hash_ == "pmull_asimddiff_l"_h) { + if (instr->GetNEONSize() == 3) { + // Source is 1D or 2D, destination is 1Q. + scope.Record(CPUFeatures::kPmull1Q); + } + } USE(instr); } @@ -1267,91 +1304,93 @@ VIXL_SIMPLE_SVE_VISITOR_LIST(VIXL_DEFINE_SIMPLE_SVE_VISITOR) void CPUFeaturesAuditor::VisitSystem(const Instruction* instr) { RecordInstructionFeaturesScope scope(this); - if (instr->Mask(SystemHintFMask) == SystemHintFixed) { - CPUFeatures required; - switch (instr->GetInstructionBits()) { - case PACIA1716: - case PACIB1716: - case AUTIA1716: - case AUTIB1716: - case PACIAZ: - case PACIASP: - case PACIBZ: - case PACIBSP: - case AUTIAZ: - case AUTIASP: - case AUTIBZ: - case AUTIBSP: - case XPACLRI: - required.Combine(CPUFeatures::kPAuth); - break; - default: - switch (instr->GetImmHint()) { - case ESB: - required.Combine(CPUFeatures::kRAS); - break; - case BTI: - case BTI_j: - case BTI_c: - case BTI_jc: - required.Combine(CPUFeatures::kBTI); - break; - default: - break; - } - break; - } - // These are all HINT instructions, and behave as NOPs if the corresponding - // features are not implemented, so we record the corresponding features - // only if they are available. 
- if (available_.Has(required)) scope.Record(required); - } else if (instr->Mask(SystemSysMask) == SYS) { - switch (instr->GetSysOp()) { - // DC instruction variants. - case CGVAC: - case CGDVAC: - case CGVAP: - case CGDVAP: - case CIGVAC: - case CIGDVAC: - case GVA: - case GZVA: - scope.Record(CPUFeatures::kMTE); - break; - case CVAP: - scope.Record(CPUFeatures::kDCPoP); - break; - case CVADP: - scope.Record(CPUFeatures::kDCCVADP); - break; - case IVAU: - case CVAC: - case CVAU: - case CIVAC: - case ZVA: - // No special CPU features. - break; - } - } else if (instr->Mask(SystemPStateFMask) == SystemPStateFixed) { - switch (instr->Mask(SystemPStateMask)) { - case CFINV: - scope.Record(CPUFeatures::kFlagM); - break; - case AXFLAG: - case XAFLAG: - scope.Record(CPUFeatures::kAXFlag); - break; - } - } else if (instr->Mask(SystemSysRegFMask) == SystemSysRegFixed) { - if (instr->Mask(SystemSysRegMask) == MRS) { + CPUFeatures required; + switch (form_hash_) { + case "pacib1716_hi_hints"_h: + case "pacia1716_hi_hints"_h: + case "pacibsp_hi_hints"_h: + case "paciasp_hi_hints"_h: + case "pacibz_hi_hints"_h: + case "paciaz_hi_hints"_h: + case "autib1716_hi_hints"_h: + case "autia1716_hi_hints"_h: + case "autibsp_hi_hints"_h: + case "autiasp_hi_hints"_h: + case "autibz_hi_hints"_h: + case "autiaz_hi_hints"_h: + case "xpaclri_hi_hints"_h: + required.Combine(CPUFeatures::kPAuth); + break; + case "esb_hi_hints"_h: + required.Combine(CPUFeatures::kRAS); + break; + case "bti_hb_hints"_h: + required.Combine(CPUFeatures::kBTI); + break; + } + + // The instructions above are all HINTs and behave as NOPs if the + // corresponding features are not implemented, so we record the corresponding + // features only if they are available. 
+ if (available_.Has(required)) scope.Record(required); + + switch (form_hash_) { + case "cfinv_m_pstate"_h: + scope.Record(CPUFeatures::kFlagM); + break; + case "axflag_m_pstate"_h: + case "xaflag_m_pstate"_h: + scope.Record(CPUFeatures::kAXFlag); + break; + case "mrs_rs_systemmove"_h: switch (instr->GetImmSystemRegister()) { case RNDR: case RNDRRS: scope.Record(CPUFeatures::kRNG); break; } - } + break; + case "sys_cr_systeminstrs"_h: + switch (instr->GetSysOp()) { + // DC instruction variants. + case CGVAC: + case CGDVAC: + case CGVAP: + case CGDVAP: + case CIGVAC: + case CIGDVAC: + case GVA: + case GZVA: + scope.Record(CPUFeatures::kMTE); + break; + case CVAP: + scope.Record(CPUFeatures::kDCPoP); + break; + case CVADP: + scope.Record(CPUFeatures::kDCCVADP); + break; + case IVAU: + case CVAC: + case CVAU: + case CIVAC: + case ZVA: + // No special CPU features. + break; + case GCSPUSHM: + case GCSSS1: + scope.Record(CPUFeatures::kGCS); + break; + } + break; + case "sysl_rc_systeminstrs"_h: + switch (instr->GetSysOp()) { + case GCSPOPM: + case GCSSS2: + scope.Record(CPUFeatures::kGCS); + break; + } + break; } } @@ -1405,9 +1444,9 @@ void CPUFeaturesAuditor::VisitUnimplemented(const Instruction* instr) { void CPUFeaturesAuditor::Visit(Metadata* metadata, const Instruction* instr) { VIXL_ASSERT(metadata->count("form") > 0); const std::string& form = (*metadata)["form"]; - uint32_t form_hash = Hash(form.c_str()); + form_hash_ = Hash(form.c_str()); const FormToVisitorFnMap* fv = CPUFeaturesAuditor::GetFormToVisitorFnMap(); - FormToVisitorFnMap::const_iterator it = fv->find(form_hash); + FormToVisitorFnMap::const_iterator it = fv->find(form_hash_); if (it == fv->end()) { RecordInstructionFeaturesScope scope(this); std::map features = { @@ -1824,10 +1863,30 @@ void CPUFeaturesAuditor::Visit(Metadata* metadata, const Instruction* instr) { {"umax_64u_minmax_imm"_h, CPUFeatures::kCSSC}, {"umin_32u_minmax_imm"_h, CPUFeatures::kCSSC}, {"umin_64u_minmax_imm"_h, 
CPUFeatures::kCSSC}, + {"bcax_vvv16_crypto4"_h, + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)}, + {"eor3_vvv16_crypto4"_h, + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)}, + {"rax1_vvv2_cryptosha512_3"_h, + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)}, + {"xar_vvv2_crypto3_imm6"_h, + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)}, + {"sha512h_qqv_cryptosha512_3"_h, + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA512)}, + {"sha512h2_qqv_cryptosha512_3"_h, + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA512)}, + {"sha512su0_vv2_cryptosha512_2"_h, + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA512)}, + {"sha512su1_vvv2_cryptosha512_3"_h, + CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA512)}, + {"pmullb_z_zz_q"_h, + CPUFeatures(CPUFeatures::kSVE2, CPUFeatures::kSVEPmull128)}, + {"pmullt_z_zz_q"_h, + CPUFeatures(CPUFeatures::kSVE2, CPUFeatures::kSVEPmull128)}, }; - if (features.count(form_hash) > 0) { - scope.Record(features[form_hash]); + if (features.count(form_hash_) > 0) { + scope.Record(features[form_hash_]); } } else { (it->second)(this, instr); diff --git a/src/aarch64/cpu-features-auditor-aarch64.h b/src/aarch64/cpu-features-auditor-aarch64.h index aa7bd852..d533d06d 100644 --- a/src/aarch64/cpu-features-auditor-aarch64.h +++ b/src/aarch64/cpu-features-auditor-aarch64.h @@ -31,7 +31,7 @@ #include #include -#include "cpu-features.h" +#include "../cpu-features.h" #include "decoder-aarch64.h" #include "decoder-visitor-map-aarch64.h" @@ -113,6 +113,8 @@ class CPUFeaturesAuditor : public DecoderVisitor { #define DECLARE(A) virtual void Visit##A(const Instruction* instr); VISITOR_LIST(DECLARE) #undef DECLARE + void VisitCryptoSM3(const Instruction* instr); + void VisitCryptoSM4(const Instruction* instr); void LoadStoreHelper(const Instruction* instr); void LoadStorePairHelper(const Instruction* instr); @@ -126,6 +128,7 @@ class CPUFeaturesAuditor : public DecoderVisitor { using FormToVisitorFnMap = FormToVisitorFnMapT; static 
const FormToVisitorFnMap* GetFormToVisitorFnMap(); + uint32_t form_hash_; }; } // namespace aarch64 diff --git a/src/aarch64/debugger-aarch64.cc b/src/aarch64/debugger-aarch64.cc index 3c6d2669..1abe7d14 100644 --- a/src/aarch64/debugger-aarch64.cc +++ b/src/aarch64/debugger-aarch64.cc @@ -33,7 +33,6 @@ #include #include #include -#include namespace vixl { namespace aarch64 { @@ -205,7 +204,7 @@ std::optional Debugger::ParseRegString( return std::nullopt; } - return {{reg_prefix, *reg_code}}; + return {{reg_prefix, static_cast(*reg_code)}}; } diff --git a/src/aarch64/debugger-aarch64.h b/src/aarch64/debugger-aarch64.h index 2a96ee52..3eefa803 100644 --- a/src/aarch64/debugger-aarch64.h +++ b/src/aarch64/debugger-aarch64.h @@ -31,9 +31,9 @@ #include #include +#include "../cpu-features.h" #include "../globals-vixl.h" #include "../utils-vixl.h" -#include "cpu-features.h" #include "abi-aarch64.h" #include "cpu-features-auditor-aarch64.h" diff --git a/src/aarch64/decoder-constants-aarch64.h b/src/aarch64/decoder-constants-aarch64.h index 70e01a10..af50a552 100644 --- a/src/aarch64/decoder-constants-aarch64.h +++ b/src/aarch64/decoder-constants-aarch64.h @@ -3764,7 +3764,7 @@ static const DecodeMapping kDecodeMapping[] = { {"001110"_b, "autiaz_hi_hints"}, {"001111"_b, "autibz_hi_hints"}, {"0100xx"_b, "bti_hb_hints"}, - {"010100"_b, "chkfeat_hi_hints"}, + {"010100"_b, "chkfeat_hf_hints"}, {"0101x1"_b, "hint_hm_hints"}, {"01x110"_b, "hint_hm_hints"}, {"10xxxx"_b, "hint_hm_hints"}, diff --git a/src/aarch64/decoder-visitor-map-aarch64.h b/src/aarch64/decoder-visitor-map-aarch64.h index 8ae438c1..bda71ce1 100644 --- a/src/aarch64/decoder-visitor-map-aarch64.h +++ b/src/aarch64/decoder-visitor-map-aarch64.h @@ -2074,7 +2074,6 @@ {"scvtf_asimdmiscfp16_r"_h, &VISITORCLASS::VisitNEON2RegMiscFP16}, \ {"ucvtf_asimdmiscfp16_r"_h, &VISITORCLASS::VisitNEON2RegMiscFP16}, \ {"addhn_asimddiff_n"_h, &VISITORCLASS::VisitNEON3Different}, \ - {"pmull_asimddiff_l"_h, 
&VISITORCLASS::VisitNEON3Different}, \ {"raddhn_asimddiff_n"_h, &VISITORCLASS::VisitNEON3Different}, \ {"rsubhn_asimddiff_n"_h, &VISITORCLASS::VisitNEON3Different}, \ {"sabal_asimddiff_l"_h, &VISITORCLASS::VisitNEON3Different}, \ @@ -2592,6 +2591,7 @@ {"dmb_bo_barriers"_h, &VISITORCLASS::VisitSystem}, \ {"dsb_bo_barriers"_h, &VISITORCLASS::VisitSystem}, \ {"hint_hm_hints"_h, &VISITORCLASS::VisitSystem}, \ + {"chkfeat_hf_hints"_h, &VISITORCLASS::VisitSystem}, \ {"mrs_rs_systemmove"_h, &VISITORCLASS::VisitSystem}, \ {"msr_sr_systemmove"_h, &VISITORCLASS::VisitSystem}, \ {"psb_hc_hints"_h, &VISITORCLASS::VisitSystem}, \ @@ -2638,7 +2638,6 @@ &VISITORCLASS::VisitUnconditionalBranchToRegister}, \ {"ret_64r_branch_reg"_h, \ &VISITORCLASS::VisitUnconditionalBranchToRegister}, \ - {"bcax_vvv16_crypto4"_h, &VISITORCLASS::VisitUnimplemented}, \ {"bfcvtn_asimdmisc_4s"_h, &VISITORCLASS::VisitUnimplemented}, \ {"bfdot_asimdelem_e"_h, &VISITORCLASS::VisitUnimplemented}, \ {"bfdot_asimdsame2_d"_h, &VISITORCLASS::VisitUnimplemented}, \ @@ -2646,7 +2645,6 @@ {"bfmlal_asimdsame2_f"_h, &VISITORCLASS::VisitUnimplemented}, \ {"bfmmla_asimdsame2_e"_h, &VISITORCLASS::VisitUnimplemented}, \ {"dsb_bon_barriers"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"eor3_vvv16_crypto4"_h, &VISITORCLASS::VisitUnimplemented}, \ {"ld64b_64l_memop"_h, &VISITORCLASS::VisitUnimplemented}, \ {"ldgm_64bulk_ldsttags"_h, &VISITORCLASS::VisitUnimplemented}, \ {"ldtrb_32_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \ @@ -2658,20 +2656,15 @@ {"ldtrsw_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \ {"ldtr_32_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \ {"ldtr_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"rax1_vvv2_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sha512h2_qqv_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sha512h_qqv_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sha512su0_vv2_cryptosha512_2"_h, 
&VISITORCLASS::VisitUnimplemented}, \ - {"sha512su1_vvv2_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm3partw1_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm3partw2_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm3ss1_vvv4_crypto4"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm3tt1a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm3tt1b_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm3tt2a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm3tt2b_vvv_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm4ekey_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"sm4e_vv4_cryptosha512_2"_h, &VISITORCLASS::VisitUnimplemented}, \ + {"sm3partw1_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitCryptoSM3}, \ + {"sm3partw2_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitCryptoSM3}, \ + {"sm3ss1_vvv4_crypto4"_h, &VISITORCLASS::VisitCryptoSM3}, \ + {"sm3tt1a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \ + {"sm3tt1b_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \ + {"sm3tt2a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \ + {"sm3tt2b_vvv_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3}, \ + {"sm4ekey_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitCryptoSM4}, \ + {"sm4e_vv4_cryptosha512_2"_h, &VISITORCLASS::VisitCryptoSM4}, \ {"st64b_64l_memop"_h, &VISITORCLASS::VisitUnimplemented}, \ {"st64bv_64_memop"_h, &VISITORCLASS::VisitUnimplemented}, \ {"st64bv0_64_memop"_h, &VISITORCLASS::VisitUnimplemented}, \ @@ -2686,7 +2679,6 @@ {"ttest_br_systemresult"_h, &VISITORCLASS::VisitUnimplemented}, \ {"wfet_only_systeminstrswithreg"_h, &VISITORCLASS::VisitUnimplemented}, \ {"wfit_only_systeminstrswithreg"_h, &VISITORCLASS::VisitUnimplemented}, \ - {"xar_vvv2_crypto3_imm6"_h, &VISITORCLASS::VisitUnimplemented}, \ {"bfcvt_z_p_z_s2bf"_h, &VISITORCLASS::VisitUnimplemented}, \ {"bfcvtnt_z_p_z_s2bf"_h, &VISITORCLASS::VisitUnimplemented}, \ 
{"bfdot_z_zzz"_h, &VISITORCLASS::VisitUnimplemented}, \ @@ -2827,6 +2819,7 @@ {"fmlal_asimdsame_f"_h, &VISITORCLASS::VisitNEON3Same}, \ {"fmlsl2_asimdsame_f"_h, &VISITORCLASS::VisitNEON3Same}, \ {"fmlsl_asimdsame_f"_h, &VISITORCLASS::VisitNEON3Same}, \ + {"pmull_asimddiff_l"_h, &VISITORCLASS::VisitNEON3Different}, \ {"ushll_asimdshf_l"_h, &VISITORCLASS::VisitNEONShiftImmediate}, \ {"sshll_asimdshf_l"_h, &VISITORCLASS::VisitNEONShiftImmediate}, \ {"shrn_asimdshf_n"_h, &VISITORCLASS::VisitNEONShiftImmediate}, \ @@ -2856,22 +2849,6 @@ &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ {"sqdmull_asisdelem_l"_h, \ &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ - {"fmla_asisdelem_rh_h"_h, \ - &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ - {"fmla_asisdelem_r_sd"_h, \ - &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ - {"fmls_asisdelem_rh_h"_h, \ - &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ - {"fmls_asisdelem_r_sd"_h, \ - &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ - {"fmulx_asisdelem_rh_h"_h, \ - &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ - {"fmulx_asisdelem_r_sd"_h, \ - &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ - {"fmul_asisdelem_rh_h"_h, \ - &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ - {"fmul_asisdelem_r_sd"_h, \ - &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ {"fabd_asisdsame_only"_h, &VISITORCLASS::VisitNEONScalar3Same}, \ {"facge_asisdsame_only"_h, &VISITORCLASS::VisitNEONScalar3Same}, \ {"facgt_asisdsame_only"_h, &VISITORCLASS::VisitNEONScalar3Same}, \ @@ -2944,6 +2921,22 @@ {"frecpe_asisdmisc_r"_h, &VISITORCLASS::VisitNEONScalar2RegMisc}, \ {"frecpx_asisdmisc_r"_h, &VISITORCLASS::VisitNEONScalar2RegMisc}, \ {"frsqrte_asisdmisc_r"_h, &VISITORCLASS::VisitNEONScalar2RegMisc}, \ - {"scvtf_asisdmisc_r"_h, &VISITORCLASS::VisitNEONScalar2RegMisc}, { \ - "ucvtf_asisdmisc_r"_h, &VISITORCLASS::VisitNEONScalar2RegMisc \ + {"scvtf_asisdmisc_r"_h, &VISITORCLASS::VisitNEONScalar2RegMisc}, \ + {"ucvtf_asisdmisc_r"_h, 
&VISITORCLASS::VisitNEONScalar2RegMisc}, \ + {"fmla_asisdelem_rh_h"_h, \ + &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ + {"fmla_asisdelem_r_sd"_h, \ + &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ + {"fmls_asisdelem_rh_h"_h, \ + &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ + {"fmls_asisdelem_r_sd"_h, \ + &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ + {"fmulx_asisdelem_rh_h"_h, \ + &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ + {"fmulx_asisdelem_r_sd"_h, \ + &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ + {"fmul_asisdelem_rh_h"_h, \ + &VISITORCLASS::VisitNEONScalarByIndexedElement}, \ + { \ + "fmul_asisdelem_r_sd"_h, &VISITORCLASS::VisitNEONScalarByIndexedElement \ } diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc index 86dd7388..4d78369c 100644 --- a/src/aarch64/disasm-aarch64.cc +++ b/src/aarch64/disasm-aarch64.cc @@ -2024,7 +2024,7 @@ void Disassembler::DisassembleNoArgs(const Instruction *instr) { void Disassembler::VisitSystem(const Instruction *instr) { const char *mnemonic = mnemonic_.c_str(); - const char *form = "(System)"; + const char *form = ""; const char *suffix = NULL; switch (form_hash_) { @@ -2053,6 +2053,10 @@ void Disassembler::VisitSystem(const Instruction *instr) { break; } break; + case "chkfeat_hf_hints"_h: + mnemonic = "chkfeat"; + form = "x16"; + break; case "hint_hm_hints"_h: form = "'IH"; break; @@ -2073,9 +2077,6 @@ void Disassembler::VisitSystem(const Instruction *instr) { break; } case Hash("sys_cr_systeminstrs"): { - mnemonic = "dc"; - suffix = ", 'Xt"; - const std::map dcop = { {IVAU, "ivau"}, {CVAC, "cvac"}, @@ -2098,17 +2099,36 @@ void Disassembler::VisitSystem(const Instruction *instr) { if (dcop.count(sysop)) { if (sysop == IVAU) { mnemonic = "ic"; + } else { + mnemonic = "dc"; } form = dcop.at(sysop); + suffix = ", 'Xt"; + } else if (sysop == GCSSS1) { + mnemonic = "gcsss1"; + form = "'Xt"; + } else if (sysop == GCSPUSHM) { + mnemonic = "gcspushm"; + form = "'Xt"; } 
else { mnemonic = "sys"; form = "'G1, 'Kn, 'Km, 'G2"; - if (instr->GetRt() == 31) { - suffix = NULL; + if (instr->GetRt() < 31) { + suffix = ", 'Xt"; } - break; } + break; } + case "sysl_rc_systeminstrs"_h: + uint32_t sysop = instr->GetSysOp(); + if (sysop == GCSPOPM) { + mnemonic = "gcspopm"; + form = (instr->GetRt() == 31) ? "" : "'Xt"; + } else if (sysop == GCSSS2) { + mnemonic = "gcsss2"; + form = "'Xt"; + } + break; } Format(instr, mnemonic, form, suffix); } @@ -2154,17 +2174,74 @@ void Disassembler::VisitException(const Instruction *instr) { void Disassembler::VisitCrypto2RegSHA(const Instruction *instr) { - VisitUnimplemented(instr); + const char *form = "'Vd.4s, 'Vn.4s"; + if (form_hash_ == "sha1h_ss_cryptosha2"_h) { + form = "'Sd, 'Sn"; + } + FormatWithDecodedMnemonic(instr, form); } void Disassembler::VisitCrypto3RegSHA(const Instruction *instr) { - VisitUnimplemented(instr); + const char *form = "'Qd, 'Sn, 'Vm.4s"; + switch (form_hash_) { + case "sha1su0_vvv_cryptosha3"_h: + case "sha256su1_vvv_cryptosha3"_h: + form = "'Vd.4s, 'Vn.4s, 'Vm.4s"; + break; + case "sha256h_qqv_cryptosha3"_h: + case "sha256h2_qqv_cryptosha3"_h: + form = "'Qd, 'Qn, 'Vm.4s"; + break; + } + FormatWithDecodedMnemonic(instr, form); } void Disassembler::VisitCryptoAES(const Instruction *instr) { - VisitUnimplemented(instr); + FormatWithDecodedMnemonic(instr, "'Vd.16b, 'Vn.16b"); +} + +void Disassembler::VisitCryptoSM3(const Instruction *instr) { + const char *form = "'Vd.4s, 'Vn.4s, 'Vm."; + const char *suffix = "4s"; + + switch (form_hash_) { + case "sm3ss1_vvv4_crypto4"_h: + suffix = "4s, 'Va.4s"; + break; + case "sm3tt1a_vvv4_crypto3_imm2"_h: + case "sm3tt1b_vvv4_crypto3_imm2"_h: + case "sm3tt2a_vvv4_crypto3_imm2"_h: + case "sm3tt2b_vvv_crypto3_imm2"_h: + suffix = "s['u1312]"; + break; + } + + FormatWithDecodedMnemonic(instr, form, suffix); +} + +void Disassembler::VisitCryptoSM4(const Instruction *instr) { + VIXL_ASSERT((form_hash_ == "sm4ekey_vvv4_cryptosha512_3"_h) || + 
(form_hash_ == "sm4e_vv4_cryptosha512_2"_h)); + const char *form = "'Vd.4s, 'Vn.4s"; + const char *suffix = + (form_hash_ == "sm4e_vv4_cryptosha512_2"_h) ? NULL : ", 'Vm.4s"; + + FormatWithDecodedMnemonic(instr, form, suffix); +} + +void Disassembler::DisassembleSHA512(const Instruction *instr) { + const char *form = "'Qd, 'Qn, 'Vm.2d"; + const char *suffix = NULL; + switch (form_hash_) { + case "sha512su1_vvv2_cryptosha512_3"_h: + suffix = ", 'Vm.2d"; + VIXL_FALLTHROUGH(); + case "sha512su0_vv2_cryptosha512_2"_h: + form = "'Vd.2d, 'Vn.2d"; + } + FormatWithDecodedMnemonic(instr, form, suffix); } void Disassembler::DisassembleNEON2RegAddlp(const Instruction *instr) { @@ -2380,13 +2457,19 @@ void Disassembler::VisitNEON3SameFP16(const Instruction *instr) { } void Disassembler::VisitNEON3SameExtra(const Instruction *instr) { - static const NEONFormatMap map_usdot = {{30}, {NF_8B, NF_16B}}; + static const NEONFormatMap map_dot = + {{23, 22, 30}, {NF_UNDEF, NF_UNDEF, NF_UNDEF, NF_UNDEF, NF_2S, NF_4S}}; + static const NEONFormatMap map_fc = + {{23, 22, 30}, + {NF_UNDEF, NF_UNDEF, NF_4H, NF_8H, NF_2S, NF_4S, NF_UNDEF, NF_2D}}; + static const NEONFormatMap map_rdm = + {{23, 22, 30}, {NF_UNDEF, NF_UNDEF, NF_4H, NF_8H, NF_2S, NF_4S}}; const char *mnemonic = mnemonic_.c_str(); const char *form = "'Vd.%s, 'Vn.%s, 'Vm.%s"; const char *suffix = NULL; - NEONFormatDecoder nfd(instr); + NEONFormatDecoder nfd(instr, &map_fc); switch (form_hash_) { case "fcmla_asimdsame2_c"_h: @@ -2399,17 +2482,28 @@ void Disassembler::VisitNEON3SameExtra(const Instruction *instr) { case "sdot_asimdsame2_d"_h: case "udot_asimdsame2_d"_h: case "usdot_asimdsame2_d"_h: - nfd.SetFormatMap(1, &map_usdot); - nfd.SetFormatMap(2, &map_usdot); + nfd.SetFormatMaps(nfd.LogicalFormatMap()); + nfd.SetFormatMap(0, &map_dot); break; default: - // sqrdml[as]h - nothing to do. 
+ nfd.SetFormatMaps(&map_rdm); break; } Format(instr, mnemonic, nfd.Substitute(form), suffix); } +void Disassembler::DisassembleNEON4Same(const Instruction *instr) { + FormatWithDecodedMnemonic(instr, "'Vd.16b, 'Vn.16b, 'Vm.16b, 'Va.16b"); +} + +void Disassembler::DisassembleNEONXar(const Instruction *instr) { + FormatWithDecodedMnemonic(instr, "'Vd.2d, 'Vn.2d, 'Vm.2d, #'u1510"); +} + +void Disassembler::DisassembleNEONRax1(const Instruction *instr) { + FormatWithDecodedMnemonic(instr, "'Vd.2d, 'Vn.2d, 'Vm.2d"); +} void Disassembler::VisitNEON3Different(const Instruction *instr) { const char *mnemonic = mnemonic_.c_str(); @@ -2432,11 +2526,6 @@ void Disassembler::VisitNEON3Different(const Instruction *instr) { nfd.SetFormatMaps(nfd.LongIntegerFormatMap()); nfd.SetFormatMap(0, nfd.IntegerFormatMap()); break; - case "pmull_asimddiff_l"_h: - if (nfd.GetVectorFormat(0) != kFormat8H) { - mnemonic = NULL; - } - break; case "sqdmlal_asimddiff_l"_h: case "sqdmlsl_asimddiff_l"_h: case "sqdmull_asimddiff_l"_h: @@ -2448,6 +2537,22 @@ void Disassembler::VisitNEON3Different(const Instruction *instr) { Format(instr, nfd.Mnemonic(mnemonic), nfd.Substitute(form)); } +void Disassembler::DisassembleNEONPolynomialMul(const Instruction *instr) { + const char *mnemonic = instr->ExtractBit(30) ? "pmull2" : "pmull"; + const char *form = NULL; + int size = instr->ExtractBits(23, 22); + if (size == 0) { + // Bits 30:27 of the instruction are x001, where x is the Q bit. Map + // this to "8" and "16" by adding 7. 
+ form = "'Vd.8h, 'Vn.'u3127+7b, 'Vm.'u3127+7b"; + } else if (size == 3) { + form = "'Vd.1q, 'Vn.'?30:21d, 'Vm.'?30:21d"; + } else { + mnemonic = NULL; + } + Format(instr, mnemonic, form); +} + void Disassembler::DisassembleNEONFPAcrossLanes(const Instruction *instr) { const char *mnemonic = mnemonic_.c_str(); const char *form = "'Sd, 'Vn.4s"; @@ -2624,10 +2729,10 @@ void Disassembler::VisitNEONExtract(const Instruction *instr) { void Disassembler::VisitNEONLoadStoreMultiStruct(const Instruction *instr) { const char *mnemonic = NULL; const char *form = NULL; - const char *form_1v = "{'Vt.%1$s}, ['Xns]"; - const char *form_2v = "{'Vt.%1$s, 'Vt2.%1$s}, ['Xns]"; - const char *form_3v = "{'Vt.%1$s, 'Vt2.%1$s, 'Vt3.%1$s}, ['Xns]"; - const char *form_4v = "{'Vt.%1$s, 'Vt2.%1$s, 'Vt3.%1$s, 'Vt4.%1$s}, ['Xns]"; + const char *form_1v = "{'Vt.%s}, ['Xns]"; + const char *form_2v = "{'Vt.%s, 'Vt2.%s}, ['Xns]"; + const char *form_3v = "{'Vt.%s, 'Vt2.%s, 'Vt3.%s}, ['Xns]"; + const char *form_4v = "{'Vt.%s, 'Vt2.%s, 'Vt3.%s, 'Vt4.%s}, ['Xns]"; NEONFormatDecoder nfd(instr, NEONFormatDecoder::LoadStoreFormatMap()); switch (instr->Mask(NEONLoadStoreMultiStructMask)) { @@ -2722,11 +2827,10 @@ void Disassembler::VisitNEONLoadStoreMultiStructPostIndex( const Instruction *instr) { const char *mnemonic = NULL; const char *form = NULL; - const char *form_1v = "{'Vt.%1$s}, ['Xns], 'Xmr1"; - const char *form_2v = "{'Vt.%1$s, 'Vt2.%1$s}, ['Xns], 'Xmr2"; - const char *form_3v = "{'Vt.%1$s, 'Vt2.%1$s, 'Vt3.%1$s}, ['Xns], 'Xmr3"; - const char *form_4v = - "{'Vt.%1$s, 'Vt2.%1$s, 'Vt3.%1$s, 'Vt4.%1$s}, ['Xns], 'Xmr4"; + const char *form_1v = "{'Vt.%s}, ['Xns], 'Xmr1"; + const char *form_2v = "{'Vt.%s, 'Vt2.%s}, ['Xns], 'Xmr2"; + const char *form_3v = "{'Vt.%s, 'Vt2.%s, 'Vt3.%s}, ['Xns], 'Xmr3"; + const char *form_4v = "{'Vt.%s, 'Vt2.%s, 'Vt3.%s, 'Vt4.%s}, ['Xns], 'Xmr4"; NEONFormatDecoder nfd(instr, NEONFormatDecoder::LoadStoreFormatMap()); switch 
(instr->Mask(NEONLoadStoreMultiStructPostIndexMask)) { @@ -2929,7 +3033,7 @@ void Disassembler::VisitNEONLoadStoreSingleStruct(const Instruction *instr) { break; case NEON_LD4R: mnemonic = "ld4r"; - form = "{'Vt.%1$s, 'Vt2.%1$s, 'Vt3.%1$s, 'Vt4.%1$s}, ['Xns]"; + form = "{'Vt.%s, 'Vt2.%s, 'Vt3.%s, 'Vt4.%s}, ['Xns]"; break; default: break; @@ -3089,7 +3193,7 @@ void Disassembler::VisitNEONLoadStoreSingleStructPostIndex( break; case NEON_LD4R_post: mnemonic = "ld4r"; - form = "{'Vt.%1$s, 'Vt2.%1$s, 'Vt3.%1$s, 'Vt4.%1$s}, ['Xns], 'Xmz4"; + form = "{'Vt.%s, 'Vt2.%s, 'Vt3.%s, 'Vt4.%s}, ['Xns], 'Xmz4"; break; default: break; @@ -3305,6 +3409,8 @@ void Disassembler::VisitNEONScalar3Same(const Instruction *instr) { break; case "sqdmulh_asisdsame_only"_h: case "sqrdmulh_asisdsame_only"_h: + case "sqrdmlah_asisdsame2_only"_h: + case "sqrdmlsh_asisdsame2_only"_h: if ((vform == kFormatB) || (vform == kFormatD)) { mnemonic = NULL; } @@ -3923,8 +4029,7 @@ static bool SVEMoveMaskPreferred(uint64_t value, int lane_bytes_log2) { } // Check 0x0000pq00_0000pq00 or 0xffffpq00_ffffpq00. - uint64_t rotvalue = RotateRight(value, 32, 64); - if (value == rotvalue) { + if (AllWordsMatch(value)) { generic_value &= 0xffffffff; if ((generic_value == 0xffff) || (generic_value == UINT32_MAX)) { return false; @@ -3932,8 +4037,7 @@ static bool SVEMoveMaskPreferred(uint64_t value, int lane_bytes_log2) { } // Check 0xpq00pq00_pq00pq00. - rotvalue = RotateRight(value, 16, 64); - if (value == rotvalue) { + if (AllHalfwordsMatch(value)) { return false; } } else { @@ -3947,8 +4051,7 @@ static bool SVEMoveMaskPreferred(uint64_t value, int lane_bytes_log2) { } // Check 0x000000pq_000000pq or 0xffffffpq_ffffffpq. 
- uint64_t rotvalue = RotateRight(value, 32, 64); - if (value == rotvalue) { + if (AllWordsMatch(value)) { generic_value &= 0xffffffff; if ((generic_value == 0xff) || (generic_value == UINT32_MAX)) { return false; @@ -3956,8 +4059,7 @@ static bool SVEMoveMaskPreferred(uint64_t value, int lane_bytes_log2) { } // Check 0x00pq00pq_00pq00pq or 0xffpqffpq_ffpqffpq. - rotvalue = RotateRight(value, 16, 64); - if (value == rotvalue) { + if (AllHalfwordsMatch(value)) { generic_value &= 0xffff; if ((generic_value == 0xff) || (generic_value == UINT16_MAX)) { return false; @@ -3965,8 +4067,7 @@ static bool SVEMoveMaskPreferred(uint64_t value, int lane_bytes_log2) { } // Check 0xpqpqpqpq_pqpqpqpq. - rotvalue = RotateRight(value, 8, 64); - if (value == rotvalue) { + if (AllBytesMatch(value)) { return false; } } @@ -5748,15 +5849,26 @@ void Disassembler::Disassemble_ZdT_ZnTb(const Instruction *instr) { } } +void Disassembler::DisassembleSVEPmull(const Instruction *instr) { + if (instr->GetSVEVectorFormat() == kFormatVnS) { + VisitUnallocated(instr); + } else { + Disassemble_ZdT_ZnTb_ZmTb(instr); + } +} + +void Disassembler::DisassembleSVEPmull128(const Instruction *instr) { + FormatWithDecodedMnemonic(instr, "'Zd.q, 'Zn.d, 'Zm.d"); +} + void Disassembler::Disassemble_ZdT_ZnTb_ZmTb(const Instruction *instr) { - const char *form = "'Zd.'t, 'Zn.'th, 'Zm.'th"; if (instr->GetSVEVectorFormat() == kFormatVnB) { // TODO: This is correct for saddlbt, ssublbt, subltb, which don't have - // b-lane sized form, and for pmull[b|t] as feature `SVEPmull128` isn't - // supported, but may need changes for other instructions reaching here. + // b-lane sized form, but may need changes for other instructions reaching + // here. 
Format(instr, "unimplemented", "(ZdT_ZnTb_ZmTb)"); } else { - Format(instr, mnemonic_.c_str(), form); + FormatWithDecodedMnemonic(instr, "'Zd.'t, 'Zn.'th, 'Zm.'th"); } } @@ -6908,6 +7020,9 @@ int Disassembler::SubstituteImmediateField(const Instruction *instr, case RNDRRS: AppendToOutput("rndrrs"); break; + case DCZID_EL0: + AppendToOutput("dczid_el0"); + break; default: AppendToOutput("S%d_%d_c%d_c%d_%d", instr->GetSysOp0(), diff --git a/src/aarch64/disasm-aarch64.h b/src/aarch64/disasm-aarch64.h index 55c5047a..57c2a1ab 100644 --- a/src/aarch64/disasm-aarch64.h +++ b/src/aarch64/disasm-aarch64.h @@ -206,6 +206,8 @@ class Disassembler : public DecoderVisitor { void DisassembleSVEBitwiseTernary(const Instruction* instr); void DisassembleSVEFlogb(const Instruction* instr); void DisassembleSVEFPPair(const Instruction* instr); + void DisassembleSVEPmull(const Instruction* instr); + void DisassembleSVEPmull128(const Instruction* instr); void DisassembleNoArgs(const Instruction* instr); @@ -238,6 +240,11 @@ class Disassembler : public DecoderVisitor { void DisassembleNEONScalarShiftRightNarrowImm(const Instruction* instr); void DisassembleNEONScalar2RegMiscOnlyD(const Instruction* instr); void DisassembleNEONFPScalar2RegMisc(const Instruction* instr); + void DisassembleNEONPolynomialMul(const Instruction* instr); + void DisassembleNEON4Same(const Instruction* instr); + void DisassembleNEONXar(const Instruction* instr); + void DisassembleNEONRax1(const Instruction* instr); + void DisassembleSHA512(const Instruction* instr); void DisassembleMTELoadTag(const Instruction* instr); void DisassembleMTEStoreTag(const Instruction* instr); @@ -248,6 +255,9 @@ class Disassembler : public DecoderVisitor { void Disassemble_Xd_XnSP_Xm(const Instruction* instr); void Disassemble_Xd_XnSP_XmSP(const Instruction* instr); + void VisitCryptoSM3(const Instruction* instr); + void VisitCryptoSM4(const Instruction* instr); + void Format(const Instruction* instr, const char* mnemonic, const 
char* format0, diff --git a/src/aarch64/instructions-aarch64.cc b/src/aarch64/instructions-aarch64.cc index 298a7be8..adef87f4 100644 --- a/src/aarch64/instructions-aarch64.cc +++ b/src/aarch64/instructions-aarch64.cc @@ -603,6 +603,28 @@ std::pair Instruction::GetSVEMulLongZmAndIndex() const { return std::make_pair(reg_code, index); } +// Get the register and index for NEON indexed multiplies. +std::pair Instruction::GetNEONMulRmAndIndex() const { + int reg_code = GetRm(); + int index = (GetNEONH() << 2) | (GetNEONL() << 1) | GetNEONM(); + switch (GetNEONSize()) { + case 0: // FP H-sized elements. + case 1: // Integer H-sized elements. + // 4-bit Rm, 3-bit index. + reg_code &= 0xf; + break; + case 2: // S-sized elements. + // 5-bit Rm, 2-bit index. + index >>= 1; + break; + case 3: // FP D-sized elements. + // 5-bit Rm, 1-bit index. + index >>= 2; + break; + } + return std::make_pair(reg_code, index); +} + // Logical immediates can't encode zero, so a return value of zero is used to // indicate a failure case. Specifically, where the constraints on imm_s are // not met. 
@@ -1011,6 +1033,8 @@ VectorFormat VectorFormatHalfWidth(VectorFormat vform) { return kFormat4H; case kFormat2D: return kFormat2S; + case kFormat1Q: + return kFormat1D; case kFormatH: return kFormatB; case kFormatS: @@ -1023,6 +1047,8 @@ VectorFormat VectorFormatHalfWidth(VectorFormat vform) { return kFormatVnH; case kFormatVnD: return kFormatVnS; + case kFormatVnQ: + return kFormatVnD; default: VIXL_UNREACHABLE(); return kFormatUndefined; @@ -1095,6 +1121,8 @@ VectorFormat VectorFormatHalfWidthDoubleLanes(VectorFormat vform) { return kFormat2S; case kFormat2D: return kFormat4S; + case kFormat1Q: + return kFormat2D; case kFormatVnH: return kFormatVnB; case kFormatVnS: @@ -1246,6 +1274,7 @@ unsigned RegisterSizeInBitsFromFormat(VectorFormat vform) { case kFormat8H: case kFormat4S: case kFormat2D: + case kFormat1Q: return kQRegSize; default: VIXL_UNREACHABLE(); @@ -1283,6 +1312,7 @@ unsigned LaneSizeInBitsFromFormat(VectorFormat vform) { case kFormat2D: case kFormatVnD: return 64; + case kFormat1Q: case kFormatVnQ: return 128; case kFormatVnO: @@ -1348,6 +1378,7 @@ int LaneCountFromFormat(VectorFormat vform) { case kFormat2D: return 2; case kFormat1D: + case kFormat1Q: case kFormatB: case kFormatH: case kFormatS: diff --git a/src/aarch64/instructions-aarch64.h b/src/aarch64/instructions-aarch64.h index 38a0d67c..00aeb3cf 100644 --- a/src/aarch64/instructions-aarch64.h +++ b/src/aarch64/instructions-aarch64.h @@ -217,9 +217,10 @@ enum VectorFormat { kFormatVnQ = kFormatSVEQ | kFormatSVE, kFormatVnO = kFormatSVEO | kFormatSVE, - // An artificial value, used by simulator trace tests and a few oddball + // Artificial values, used by simulator trace tests and a few oddball // instructions (such as FMLAL). - kFormat2H = 0xfffffffe + kFormat2H = 0xfffffffe, + kFormat1Q = 0xfffffffd }; // Instructions. 
--------------------------------------------------------------- @@ -372,6 +373,7 @@ class Instruction { std::pair GetSVEPermuteIndexAndLaneSizeLog2() const; + std::pair GetNEONMulRmAndIndex() const; std::pair GetSVEMulZmAndIndex() const; std::pair GetSVEMulLongZmAndIndex() const; @@ -854,11 +856,13 @@ class NEONFormatDecoder { // Set the format mapping for all or individual substitutions. void SetFormatMaps(const NEONFormatMap* format0, const NEONFormatMap* format1 = NULL, - const NEONFormatMap* format2 = NULL) { + const NEONFormatMap* format2 = NULL, + const NEONFormatMap* format3 = NULL) { VIXL_ASSERT(format0 != NULL); formats_[0] = format0; formats_[1] = (format1 == NULL) ? formats_[0] : format1; formats_[2] = (format2 == NULL) ? formats_[1] : format2; + formats_[3] = (format3 == NULL) ? formats_[2] : format3; } void SetFormatMap(unsigned index, const NEONFormatMap* format) { VIXL_ASSERT(index <= ArrayLength(formats_)); @@ -877,12 +881,15 @@ class NEONFormatDecoder { const char* Substitute(const char* string, SubstitutionMode mode0 = kFormat, SubstitutionMode mode1 = kFormat, - SubstitutionMode mode2 = kFormat) { + SubstitutionMode mode2 = kFormat, + SubstitutionMode mode3 = kFormat) { const char* subst0 = GetSubstitute(0, mode0); const char* subst1 = GetSubstitute(1, mode1); const char* subst2 = GetSubstitute(2, mode2); + const char* subst3 = GetSubstitute(3, mode3); - if ((subst0 == NULL) || (subst1 == NULL) || (subst2 == NULL)) { + if ((subst0 == NULL) || (subst1 == NULL) || (subst2 == NULL) || + (subst3 == NULL)) { return NULL; } @@ -891,7 +898,8 @@ class NEONFormatDecoder { string, subst0, subst1, - subst2); + subst2, + subst3); return form_buffer_; } @@ -1129,7 +1137,7 @@ class NEONFormatDecoder { } Instr instrbits_; - const NEONFormatMap* formats_[3]; + const NEONFormatMap* formats_[4]; char form_buffer_[64]; char mne_buffer_[16]; }; diff --git a/src/aarch64/logic-aarch64.cc b/src/aarch64/logic-aarch64.cc index b41db923..ef5b07af 100644 --- 
a/src/aarch64/logic-aarch64.cc +++ b/src/aarch64/logic-aarch64.cc @@ -36,33 +36,33 @@ namespace aarch64 { using vixl::internal::SimFloat16; template -bool IsFloat64() { +constexpr bool IsFloat64() { return false; } template <> -bool IsFloat64() { +constexpr bool IsFloat64() { return true; } template -bool IsFloat32() { +constexpr bool IsFloat32() { return false; } template <> -bool IsFloat32() { +constexpr bool IsFloat32() { return true; } template -bool IsFloat16() { +constexpr bool IsFloat16() { return false; } template <> -bool IsFloat16() { +constexpr bool IsFloat16() { return true; } template <> -bool IsFloat16() { +constexpr bool IsFloat16() { return true; } @@ -168,11 +168,12 @@ SimFloat16 Simulator::UFixedToFloat16(uint64_t src, uint64_t Simulator::GenerateRandomTag(uint16_t exclude) { - uint64_t rtag = nrand48(rand_state_) >> 28; + // Generate a 4 bit integer from a 48bit random number + uint64_t rtag = rand_gen_() >> 44; VIXL_ASSERT(IsUint4(rtag)); if (exclude == 0) { - exclude = nrand48(rand_state_) >> 27; + exclude = static_cast(rand_gen_() >> 44); } // TODO: implement this to better match the specification, which calls for a @@ -182,24 +183,28 @@ uint64_t Simulator::GenerateRandomTag(uint16_t exclude) { } -void Simulator::ld1(VectorFormat vform, LogicVRegister dst, uint64_t addr) { +bool Simulator::ld1(VectorFormat vform, LogicVRegister dst, uint64_t addr) { dst.ClearForWrite(vform); for (int i = 0; i < LaneCountFromFormat(vform); i++) { - LoadLane(dst, vform, i, addr); + if (!LoadLane(dst, vform, i, addr)) { + return false; + } addr += LaneSizeInBytesFromFormat(vform); } + return true; } -void Simulator::ld1(VectorFormat vform, +bool Simulator::ld1(VectorFormat vform, LogicVRegister dst, int index, uint64_t addr) { - LoadLane(dst, vform, index, addr); + dst.ClearForWrite(vform); + return LoadLane(dst, vform, index, addr); } -void Simulator::ld1r(VectorFormat vform, +bool Simulator::ld1r(VectorFormat vform, VectorFormat unpack_vform, LogicVRegister 
dst, uint64_t addr, @@ -208,20 +213,25 @@ void Simulator::ld1r(VectorFormat vform, dst.ClearForWrite(vform); for (int i = 0; i < LaneCountFromFormat(vform); i++) { if (is_signed) { - LoadIntToLane(dst, vform, unpack_size, i, addr); + if (!LoadIntToLane(dst, vform, unpack_size, i, addr)) { + return false; + } } else { - LoadUintToLane(dst, vform, unpack_size, i, addr); + if (!LoadUintToLane(dst, vform, unpack_size, i, addr)) { + return false; + } } } + return true; } -void Simulator::ld1r(VectorFormat vform, LogicVRegister dst, uint64_t addr) { - ld1r(vform, vform, dst, addr); +bool Simulator::ld1r(VectorFormat vform, LogicVRegister dst, uint64_t addr) { + return ld1r(vform, vform, dst, addr); } -void Simulator::ld2(VectorFormat vform, +bool Simulator::ld2(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, uint64_t addr1) { @@ -230,15 +240,17 @@ void Simulator::ld2(VectorFormat vform, int esize = LaneSizeInBytesFromFormat(vform); uint64_t addr2 = addr1 + esize; for (int i = 0; i < LaneCountFromFormat(vform); i++) { - LoadLane(dst1, vform, i, addr1); - LoadLane(dst2, vform, i, addr2); + if (!LoadLane(dst1, vform, i, addr1) || !LoadLane(dst2, vform, i, addr2)) { + return false; + } addr1 += 2 * esize; addr2 += 2 * esize; } + return true; } -void Simulator::ld2(VectorFormat vform, +bool Simulator::ld2(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, int index, @@ -246,12 +258,12 @@ void Simulator::ld2(VectorFormat vform, dst1.ClearForWrite(vform); dst2.ClearForWrite(vform); uint64_t addr2 = addr1 + LaneSizeInBytesFromFormat(vform); - LoadLane(dst1, vform, index, addr1); - LoadLane(dst2, vform, index, addr2); + return (LoadLane(dst1, vform, index, addr1) && + LoadLane(dst2, vform, index, addr2)); } -void Simulator::ld2r(VectorFormat vform, +bool Simulator::ld2r(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, uint64_t addr) { @@ -259,13 +271,15 @@ void Simulator::ld2r(VectorFormat vform, dst2.ClearForWrite(vform); uint64_t addr2 = 
addr + LaneSizeInBytesFromFormat(vform); for (int i = 0; i < LaneCountFromFormat(vform); i++) { - LoadLane(dst1, vform, i, addr); - LoadLane(dst2, vform, i, addr2); + if (!LoadLane(dst1, vform, i, addr) || !LoadLane(dst2, vform, i, addr2)) { + return false; + } } + return true; } -void Simulator::ld3(VectorFormat vform, +bool Simulator::ld3(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, LogicVRegister dst3, @@ -277,17 +291,19 @@ void Simulator::ld3(VectorFormat vform, uint64_t addr2 = addr1 + esize; uint64_t addr3 = addr2 + esize; for (int i = 0; i < LaneCountFromFormat(vform); i++) { - LoadLane(dst1, vform, i, addr1); - LoadLane(dst2, vform, i, addr2); - LoadLane(dst3, vform, i, addr3); + if (!LoadLane(dst1, vform, i, addr1) || !LoadLane(dst2, vform, i, addr2) || + !LoadLane(dst3, vform, i, addr3)) { + return false; + } addr1 += 3 * esize; addr2 += 3 * esize; addr3 += 3 * esize; } + return true; } -void Simulator::ld3(VectorFormat vform, +bool Simulator::ld3(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, LogicVRegister dst3, @@ -298,13 +314,13 @@ void Simulator::ld3(VectorFormat vform, dst3.ClearForWrite(vform); uint64_t addr2 = addr1 + LaneSizeInBytesFromFormat(vform); uint64_t addr3 = addr2 + LaneSizeInBytesFromFormat(vform); - LoadLane(dst1, vform, index, addr1); - LoadLane(dst2, vform, index, addr2); - LoadLane(dst3, vform, index, addr3); + return (LoadLane(dst1, vform, index, addr1) && + LoadLane(dst2, vform, index, addr2) && + LoadLane(dst3, vform, index, addr3)); } -void Simulator::ld3r(VectorFormat vform, +bool Simulator::ld3r(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, LogicVRegister dst3, @@ -315,14 +331,16 @@ void Simulator::ld3r(VectorFormat vform, uint64_t addr2 = addr + LaneSizeInBytesFromFormat(vform); uint64_t addr3 = addr2 + LaneSizeInBytesFromFormat(vform); for (int i = 0; i < LaneCountFromFormat(vform); i++) { - LoadLane(dst1, vform, i, addr); - LoadLane(dst2, vform, i, addr2); - LoadLane(dst3, 
vform, i, addr3); + if (!LoadLane(dst1, vform, i, addr) || !LoadLane(dst2, vform, i, addr2) || + !LoadLane(dst3, vform, i, addr3)) { + return false; + } } + return true; } -void Simulator::ld4(VectorFormat vform, +bool Simulator::ld4(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, LogicVRegister dst3, @@ -337,19 +355,20 @@ void Simulator::ld4(VectorFormat vform, uint64_t addr3 = addr2 + esize; uint64_t addr4 = addr3 + esize; for (int i = 0; i < LaneCountFromFormat(vform); i++) { - LoadLane(dst1, vform, i, addr1); - LoadLane(dst2, vform, i, addr2); - LoadLane(dst3, vform, i, addr3); - LoadLane(dst4, vform, i, addr4); + if (!LoadLane(dst1, vform, i, addr1) || !LoadLane(dst2, vform, i, addr2) || + !LoadLane(dst3, vform, i, addr3) || !LoadLane(dst4, vform, i, addr4)) { + return false; + } addr1 += 4 * esize; addr2 += 4 * esize; addr3 += 4 * esize; addr4 += 4 * esize; } + return true; } -void Simulator::ld4(VectorFormat vform, +bool Simulator::ld4(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, LogicVRegister dst3, @@ -363,14 +382,14 @@ void Simulator::ld4(VectorFormat vform, uint64_t addr2 = addr1 + LaneSizeInBytesFromFormat(vform); uint64_t addr3 = addr2 + LaneSizeInBytesFromFormat(vform); uint64_t addr4 = addr3 + LaneSizeInBytesFromFormat(vform); - LoadLane(dst1, vform, index, addr1); - LoadLane(dst2, vform, index, addr2); - LoadLane(dst3, vform, index, addr3); - LoadLane(dst4, vform, index, addr4); + return (LoadLane(dst1, vform, index, addr1) && + LoadLane(dst2, vform, index, addr2) && + LoadLane(dst3, vform, index, addr3) && + LoadLane(dst4, vform, index, addr4)); } -void Simulator::ld4r(VectorFormat vform, +bool Simulator::ld4r(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, LogicVRegister dst3, @@ -384,57 +403,61 @@ void Simulator::ld4r(VectorFormat vform, uint64_t addr3 = addr2 + LaneSizeInBytesFromFormat(vform); uint64_t addr4 = addr3 + LaneSizeInBytesFromFormat(vform); for (int i = 0; i < LaneCountFromFormat(vform); 
i++) { - LoadLane(dst1, vform, i, addr); - LoadLane(dst2, vform, i, addr2); - LoadLane(dst3, vform, i, addr3); - LoadLane(dst4, vform, i, addr4); + if (!LoadLane(dst1, vform, i, addr) || !LoadLane(dst2, vform, i, addr2) || + !LoadLane(dst3, vform, i, addr3) || !LoadLane(dst4, vform, i, addr4)) { + return false; + } } + return true; } -void Simulator::st1(VectorFormat vform, LogicVRegister src, uint64_t addr) { +bool Simulator::st1(VectorFormat vform, LogicVRegister src, uint64_t addr) { for (int i = 0; i < LaneCountFromFormat(vform); i++) { - StoreLane(src, vform, i, addr); + if (!StoreLane(src, vform, i, addr)) return false; addr += LaneSizeInBytesFromFormat(vform); } + return true; } -void Simulator::st1(VectorFormat vform, +bool Simulator::st1(VectorFormat vform, LogicVRegister src, int index, uint64_t addr) { - StoreLane(src, vform, index, addr); + return StoreLane(src, vform, index, addr); } -void Simulator::st2(VectorFormat vform, +bool Simulator::st2(VectorFormat vform, LogicVRegister src, LogicVRegister src2, uint64_t addr) { int esize = LaneSizeInBytesFromFormat(vform); uint64_t addr2 = addr + esize; for (int i = 0; i < LaneCountFromFormat(vform); i++) { - StoreLane(src, vform, i, addr); - StoreLane(src2, vform, i, addr2); + if (!StoreLane(src, vform, i, addr) || !StoreLane(src2, vform, i, addr2)) { + return false; + } addr += 2 * esize; addr2 += 2 * esize; } + return true; } -void Simulator::st2(VectorFormat vform, +bool Simulator::st2(VectorFormat vform, LogicVRegister src, LogicVRegister src2, int index, uint64_t addr) { int esize = LaneSizeInBytesFromFormat(vform); - StoreLane(src, vform, index, addr); - StoreLane(src2, vform, index, addr + 1 * esize); + return (StoreLane(src, vform, index, addr) && + StoreLane(src2, vform, index, addr + 1 * esize)); } -void Simulator::st3(VectorFormat vform, +bool Simulator::st3(VectorFormat vform, LogicVRegister src, LogicVRegister src2, LogicVRegister src3, @@ -443,30 +466,32 @@ void Simulator::st3(VectorFormat 
vform, uint64_t addr2 = addr + esize; uint64_t addr3 = addr2 + esize; for (int i = 0; i < LaneCountFromFormat(vform); i++) { - StoreLane(src, vform, i, addr); - StoreLane(src2, vform, i, addr2); - StoreLane(src3, vform, i, addr3); + if (!StoreLane(src, vform, i, addr) || !StoreLane(src2, vform, i, addr2) || + !StoreLane(src3, vform, i, addr3)) { + return false; + } addr += 3 * esize; addr2 += 3 * esize; addr3 += 3 * esize; } + return true; } -void Simulator::st3(VectorFormat vform, +bool Simulator::st3(VectorFormat vform, LogicVRegister src, LogicVRegister src2, LogicVRegister src3, int index, uint64_t addr) { int esize = LaneSizeInBytesFromFormat(vform); - StoreLane(src, vform, index, addr); - StoreLane(src2, vform, index, addr + 1 * esize); - StoreLane(src3, vform, index, addr + 2 * esize); + return (StoreLane(src, vform, index, addr) && + StoreLane(src2, vform, index, addr + 1 * esize) && + StoreLane(src3, vform, index, addr + 2 * esize)); } -void Simulator::st4(VectorFormat vform, +bool Simulator::st4(VectorFormat vform, LogicVRegister src, LogicVRegister src2, LogicVRegister src3, @@ -477,19 +502,21 @@ void Simulator::st4(VectorFormat vform, uint64_t addr3 = addr2 + esize; uint64_t addr4 = addr3 + esize; for (int i = 0; i < LaneCountFromFormat(vform); i++) { - StoreLane(src, vform, i, addr); - StoreLane(src2, vform, i, addr2); - StoreLane(src3, vform, i, addr3); - StoreLane(src4, vform, i, addr4); + if (!StoreLane(src, vform, i, addr) || !StoreLane(src2, vform, i, addr2) || + !StoreLane(src3, vform, i, addr3) || + !StoreLane(src4, vform, i, addr4)) { + return false; + } addr += 4 * esize; addr2 += 4 * esize; addr3 += 4 * esize; addr4 += 4 * esize; } + return true; } -void Simulator::st4(VectorFormat vform, +bool Simulator::st4(VectorFormat vform, LogicVRegister src, LogicVRegister src2, LogicVRegister src3, @@ -497,10 +524,10 @@ void Simulator::st4(VectorFormat vform, int index, uint64_t addr) { int esize = LaneSizeInBytesFromFormat(vform); - StoreLane(src, 
vform, index, addr); - StoreLane(src2, vform, index, addr + 1 * esize); - StoreLane(src3, vform, index, addr + 2 * esize); - StoreLane(src4, vform, index, addr + 3 * esize); + return (StoreLane(src, vform, index, addr) && + StoreLane(src2, vform, index, addr + 1 * esize) && + StoreLane(src3, vform, index, addr + 2 * esize) && + StoreLane(src4, vform, index, addr + 3 * esize)); } @@ -895,23 +922,12 @@ LogicVRegister Simulator::sqrdmlsh(VectorFormat vform, return sqrdmlsh(vform, dst, src1, dup_element(indexform, temp, src2, index)); } - uint64_t Simulator::PolynomialMult(uint64_t op1, uint64_t op2, int lane_size_in_bits) const { - VIXL_ASSERT(static_cast(lane_size_in_bits) <= kSRegSize); - VIXL_ASSERT(IsUintN(lane_size_in_bits, op1)); - VIXL_ASSERT(IsUintN(lane_size_in_bits, op2)); - uint64_t result = 0; - for (int i = 0; i < lane_size_in_bits; ++i) { - if ((op1 >> i) & 1) { - result = result ^ (op2 << i); - } - } - return result; + return PolynomialMult128(op1, op2, lane_size_in_bits).second; } - LogicVRegister Simulator::pmul(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src1, @@ -933,14 +949,16 @@ LogicVRegister Simulator::pmull(VectorFormat vform, const LogicVRegister& src1, const LogicVRegister& src2) { dst.ClearForWrite(vform); - VectorFormat vform_src = VectorFormatHalfWidth(vform); - for (int i = 0; i < LaneCountFromFormat(vform); i++) { + + // Process the elements in reverse to avoid problems when the destination + // register is the same as a source. 
+ for (int i = LaneCountFromFormat(vform) - 1; i >= 0; i--) { dst.SetUint(vform, i, - PolynomialMult(src1.Uint(vform_src, i), - src2.Uint(vform_src, i), - LaneSizeInBitsFromFormat(vform_src))); + PolynomialMult128(src1.Uint(vform_src, i), + src2.Uint(vform_src, i), + LaneSizeInBitsFromFormat(vform_src))); } return dst; @@ -951,16 +969,18 @@ LogicVRegister Simulator::pmull2(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src1, const LogicVRegister& src2) { - VectorFormat vform_src = VectorFormatHalfWidthDoubleLanes(vform); dst.ClearForWrite(vform); + VectorFormat vform_src = VectorFormatHalfWidthDoubleLanes(vform); + int lane_count = LaneCountFromFormat(vform); for (int i = 0; i < lane_count; i++) { dst.SetUint(vform, i, - PolynomialMult(src1.Uint(vform_src, lane_count + i), - src2.Uint(vform_src, lane_count + i), - LaneSizeInBitsFromFormat(vform_src))); + PolynomialMult128(src1.Uint(vform_src, lane_count + i), + src2.Uint(vform_src, lane_count + i), + LaneSizeInBitsFromFormat(vform_src))); } + return dst; } @@ -2257,7 +2277,10 @@ LogicVRegister Simulator::extractnarrow(VectorFormat dstform, } } - if (!upperhalf) { + if (upperhalf) { + // Clear any bits beyond a Q register. 
+ dst.ClearForWrite(kFormat16B); + } else { dst.ClearForWrite(dstform); } return dst; @@ -2491,6 +2514,7 @@ LogicVRegister Simulator::ror(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src, int rotation) { + dst.ClearForWrite(vform); int width = LaneSizeInBitsFromFormat(vform); for (int i = 0; i < LaneCountFromFormat(vform); i++) { uint64_t value = src.Uint(vform, i); @@ -2499,6 +2523,14 @@ LogicVRegister Simulator::ror(VectorFormat vform, return dst; } +LogicVRegister Simulator::rol(VectorFormat vform, + LogicVRegister dst, + const LogicVRegister& src, + int rotation) { + int ror_equivalent = LaneSizeInBitsFromFormat(vform) - rotation; + return ror(vform, dst, src, ror_equivalent); +} + LogicVRegister Simulator::ext(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src1, @@ -2507,10 +2539,10 @@ LogicVRegister Simulator::ext(VectorFormat vform, uint8_t result[kZRegMaxSizeInBytes] = {}; int lane_count = LaneCountFromFormat(vform); for (int i = 0; i < lane_count - index; ++i) { - result[i] = src1.Uint(vform, i + index); + result[i] = static_cast(src1.Uint(vform, i + index)); } for (int i = 0; i < index; ++i) { - result[lane_count - index + i] = src2.Uint(vform, i); + result[lane_count - index + i] = static_cast(src2.Uint(vform, i)); } dst.ClearForWrite(vform); for (int i = 0; i < lane_count; ++i) { @@ -2707,7 +2739,7 @@ LogicVRegister Simulator::fcmla(VectorFormat vform, int index, int rot) { if (LaneSizeInBitsFromFormat(vform) == kHRegSize) { - VIXL_UNIMPLEMENTED(); + fcmla(vform, dst, src1, src2, dst, index, rot); } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) { fcmla(vform, dst, src1, src2, dst, index, rot); } else { @@ -4153,7 +4185,7 @@ LogicVRegister Simulator::sqrdmlash_d(VectorFormat vform, // Arithmetic shift the whole value right by `esize - 1` bits. 
accum.second = (accum.first << 1) | (accum.second >> (esize - 1)); - accum.first = -(accum.first >> (esize - 1)); + accum.first = UnsignedNegate(accum.first >> (esize - 1)); // Perform saturation. bool is_pos = (accum.first == 0) ? true : false; @@ -4531,7 +4563,7 @@ T Simulator::FPMulx(T op1, T op2) { if ((IsInf(op1) && (op2 == 0.0)) || (IsInf(op2) && (op1 == 0.0))) { // inf * 0.0 returns +/-2.0. T two = 2.0; - return copysign(1.0, op1) * copysign(1.0, op2) * two; + return copysign(T(1.0), op1) * copysign(T(1.0), op2) * two; } return FPMul(op1, op2); } @@ -4541,8 +4573,8 @@ template T Simulator::FPMulAdd(T a, T op1, T op2) { T result = FPProcessNaNs3(a, op1, op2); - T sign_a = copysign(1.0, a); - T sign_prod = copysign(1.0, op1) * copysign(1.0, op2); + T sign_a = copysign(T(1.0), a); + T sign_prod = copysign(T(1.0), op1) * copysign(T(1.0), op2); bool isinf_prod = IsInf(op1) || IsInf(op2); bool operation_generates_nan = (IsInf(op1) && (op2 == 0.0)) || // inf * 0.0 @@ -4568,7 +4600,7 @@ T Simulator::FPMulAdd(T a, T op1, T op2) { // Work around broken fma implementations for exact zero results: The sign of // exact 0.0 results is positive unless both a and op1 * op2 are negative. if (((op1 == 0.0) || (op2 == 0.0)) && (a == 0.0)) { - return ((sign_a < T(0.0)) && (sign_prod < T(0.0))) ? -0.0 : 0.0; + return ((sign_a < T(0.0)) && (sign_prod < T(0.0))) ? T(-0.0) : T(0.0); } result = FusedMultiplyAdd(op1, op2, a); @@ -4577,7 +4609,7 @@ T Simulator::FPMulAdd(T a, T op1, T op2) { // Work around broken fma implementations for rounded zero results: If a is // 0.0, the sign of the result is the sign of op1 * op2 before rounding. 
if ((a == 0.0) && (result == 0.0)) { - return copysign(0.0, sign_prod); + return copysign(T(0.0), sign_prod); } return result; @@ -4642,9 +4674,9 @@ T Simulator::FPMax(T a, T b) { template T Simulator::FPMaxNM(T a, T b) { if (IsQuietNaN(a) && !IsQuietNaN(b)) { - a = kFP64NegativeInfinity; + a = T(kFP64NegativeInfinity); } else if (!IsQuietNaN(a) && IsQuietNaN(b)) { - b = kFP64NegativeInfinity; + b = T(kFP64NegativeInfinity); } T result = FPProcessNaNs(a, b); @@ -4669,9 +4701,9 @@ T Simulator::FPMin(T a, T b) { template T Simulator::FPMinNM(T a, T b) { if (IsQuietNaN(a) && !IsQuietNaN(b)) { - a = kFP64PositiveInfinity; + a = T(kFP64PositiveInfinity); } else if (!IsQuietNaN(a) && IsQuietNaN(b)) { - b = kFP64PositiveInfinity; + b = T(kFP64PositiveInfinity); } T result = FPProcessNaNs(a, b); @@ -4686,8 +4718,8 @@ T Simulator::FPRecipStepFused(T op1, T op2) { return two; } else if (IsInf(op1) || IsInf(op2)) { // Return +inf if signs match, otherwise -inf. - return ((op1 >= 0.0) == (op2 >= 0.0)) ? kFP64PositiveInfinity - : kFP64NegativeInfinity; + return ((op1 >= 0.0) == (op2 >= 0.0)) ? T(kFP64PositiveInfinity) + : T(kFP64NegativeInfinity); } else { return FusedMultiplyAdd(op1, op2, two); } @@ -4716,8 +4748,8 @@ T Simulator::FPRSqrtStepFused(T op1, T op2) { return one_point_five; } else if (IsInf(op1) || IsInf(op2)) { // Return +inf if signs match, otherwise -inf. - return ((op1 >= 0.0) == (op2 >= 0.0)) ? kFP64PositiveInfinity - : kFP64NegativeInfinity; + return ((op1 >= 0.0) == (op2 >= 0.0)) ? T(kFP64PositiveInfinity) + : T(kFP64NegativeInfinity); } else { // The multiply-add-halve operation must be fully fused, so avoid interim // rounding by checking which operand can be losslessly divided by two @@ -4746,7 +4778,7 @@ int32_t Simulator::FPToFixedJS(double value) { (value == kFP64NegativeInfinity)) { // +/- zero and infinity all return zero, however -0 and +/- Infinity also // unset the Z-flag. 
- result = 0.0; + result = 0; if ((value != 0.0) || std::signbit(value)) { Z = 0; } @@ -5531,38 +5563,40 @@ LogicVRegister Simulator::fsqrt(VectorFormat vform, } -#define DEFINE_NEON_FP_PAIR_OP(FNP, FN, OP) \ - LogicVRegister Simulator::FNP(VectorFormat vform, \ - LogicVRegister dst, \ - const LogicVRegister& src1, \ - const LogicVRegister& src2) { \ - SimVRegister temp1, temp2; \ - uzp1(vform, temp1, src1, src2); \ - uzp2(vform, temp2, src1, src2); \ - FN(vform, dst, temp1, temp2); \ - if (IsSVEFormat(vform)) { \ - interleave_top_bottom(vform, dst, dst); \ - } \ - return dst; \ - } \ - \ - LogicVRegister Simulator::FNP(VectorFormat vform, \ - LogicVRegister dst, \ - const LogicVRegister& src) { \ - if (vform == kFormatH) { \ - SimFloat16 result(OP(SimFloat16(RawbitsToFloat16(src.Uint(vform, 0))), \ - SimFloat16(RawbitsToFloat16(src.Uint(vform, 1))))); \ - dst.SetUint(vform, 0, Float16ToRawbits(result)); \ - } else if (vform == kFormatS) { \ - float result = OP(src.Float(0), src.Float(1)); \ - dst.SetFloat(0, result); \ - } else { \ - VIXL_ASSERT(vform == kFormatD); \ - double result = OP(src.Float(0), src.Float(1)); \ - dst.SetFloat(0, result); \ - } \ - dst.ClearForWrite(vform); \ - return dst; \ +#define DEFINE_NEON_FP_PAIR_OP(FNP, FN, OP) \ + LogicVRegister Simulator::FNP(VectorFormat vform, \ + LogicVRegister dst, \ + const LogicVRegister& src1, \ + const LogicVRegister& src2) { \ + SimVRegister temp1, temp2; \ + uzp1(vform, temp1, src1, src2); \ + uzp2(vform, temp2, src1, src2); \ + FN(vform, dst, temp1, temp2); \ + if (IsSVEFormat(vform)) { \ + interleave_top_bottom(vform, dst, dst); \ + } \ + return dst; \ + } \ + \ + LogicVRegister Simulator::FNP(VectorFormat vform, \ + LogicVRegister dst, \ + const LogicVRegister& src) { \ + if (vform == kFormatH) { \ + SimFloat16 result(OP(SimFloat16(RawbitsToFloat16( \ + static_cast(src.Uint(vform, 0)))), \ + SimFloat16(RawbitsToFloat16( \ + static_cast(src.Uint(vform, 1)))))); \ + dst.SetUint(vform, 0, 
Float16ToRawbits(result)); \ + } else if (vform == kFormatS) { \ + float result = OP(src.Float(0), src.Float(1)); \ + dst.SetFloat(0, result); \ + } else { \ + VIXL_ASSERT(vform == kFormatD); \ + double result = OP(src.Float(0), src.Float(1)); \ + dst.SetFloat(0, result); \ + } \ + dst.ClearForWrite(vform); \ + return dst; \ } NEON_FPPAIRWISE_LIST(DEFINE_NEON_FP_PAIR_OP) #undef DEFINE_NEON_FP_PAIR_OP @@ -5804,7 +5838,8 @@ LogicVRegister Simulator::frint(VectorFormat vform, } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) { for (int i = 0; i < LaneCountFromFormat(vform); i++) { float input = src.Float(i); - float rounded = FPRoundInt(input, rounding_mode, frint_mode); + float rounded = + static_cast(FPRoundInt(input, rounding_mode, frint_mode)); if (inexact_exception && !IsNaN(input) && (input != rounded)) { FPProcessException(); @@ -5966,6 +6001,7 @@ LogicVRegister Simulator::fcvtu(VectorFormat vform, LogicVRegister Simulator::fcvtl(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src) { + dst.ClearForWrite(vform); if (LaneSizeInBitsFromFormat(vform) == kSRegSize) { for (int i = LaneCountFromFormat(vform) - 1; i >= 0; i--) { // TODO: Full support for SimFloat16 in SimRegister(s). 
@@ -5986,6 +6022,7 @@ LogicVRegister Simulator::fcvtl(VectorFormat vform, LogicVRegister Simulator::fcvtl2(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src) { + dst.ClearForWrite(vform); int lane_count = LaneCountFromFormat(vform); if (LaneSizeInBitsFromFormat(vform) == kSRegSize) { for (int i = 0; i < lane_count; i++) { @@ -6031,6 +6068,7 @@ LogicVRegister Simulator::fcvtn(VectorFormat vform, LogicVRegister Simulator::fcvtn2(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src) { + dst.ClearForWrite(vform); int lane_count = LaneCountFromFormat(vform) / 2; if (LaneSizeInBitsFromFormat(vform) == kHRegSize) { for (int i = lane_count - 1; i >= 0; i--) { @@ -6074,6 +6112,7 @@ LogicVRegister Simulator::fcvtxn2(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src) { VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize); + dst.ClearForWrite(vform); int lane_count = LaneCountFromFormat(vform) / 2; for (int i = lane_count - 1; i >= 0; i--) { dst.SetFloat(i + lane_count, @@ -6110,9 +6149,9 @@ T Simulator::FPRecipSqrtEstimate(T op) { return FPProcessNaN(op); } else if (op == 0.0) { if (copysign(1.0, op) < 0.0) { - return kFP64NegativeInfinity; + return T(kFP64NegativeInfinity); } else { - return kFP64PositiveInfinity; + return T(kFP64PositiveInfinity); } } else if (copysign(1.0, op) < 0.0) { FPProcessException(); @@ -6123,11 +6162,11 @@ T Simulator::FPRecipSqrtEstimate(T op) { uint64_t fraction; int exp, result_exp; - if (IsFloat16()) { + if constexpr (IsFloat16()) { exp = Float16Exp(op); fraction = Float16Mantissa(op); fraction <<= 42; - } else if (IsFloat32()) { + } else if constexpr (IsFloat32()) { exp = FloatExp(op); fraction = FloatMantissa(op); fraction <<= 29; @@ -6152,9 +6191,9 @@ T Simulator::FPRecipSqrtEstimate(T op) { scaled = DoublePack(0, 1021, Bits(fraction, 51, 44) << 44); } - if (IsFloat16()) { + if constexpr (IsFloat16()) { result_exp = (44 - exp) / 2; - } else if (IsFloat32()) { + } else if constexpr 
(IsFloat32()) { result_exp = (380 - exp) / 2; } else { VIXL_ASSERT(IsFloat64()); @@ -6163,11 +6202,11 @@ T Simulator::FPRecipSqrtEstimate(T op) { uint64_t estimate = DoubleToRawbits(recip_sqrt_estimate(scaled)); - if (IsFloat16()) { + if constexpr (IsFloat16()) { uint16_t exp_bits = static_cast(Bits(result_exp, 4, 0)); uint16_t est_bits = static_cast(Bits(estimate, 51, 42)); return Float16Pack(0, exp_bits, est_bits); - } else if (IsFloat32()) { + } else if constexpr (IsFloat32()) { uint32_t exp_bits = static_cast(Bits(result_exp, 7, 0)); uint32_t est_bits = static_cast(Bits(estimate, 51, 29)); return FloatPack(0, exp_bits, est_bits); @@ -6207,9 +6246,9 @@ template T Simulator::FPRecipEstimate(T op, FPRounding rounding) { uint32_t sign; - if (IsFloat16()) { + if constexpr (IsFloat16()) { sign = Float16Sign(op); - } else if (IsFloat32()) { + } else if constexpr (IsFloat32()) { sign = FloatSign(op); } else { VIXL_ASSERT(IsFloat64()); @@ -6219,10 +6258,10 @@ T Simulator::FPRecipEstimate(T op, FPRounding rounding) { if (IsNaN(op)) { return FPProcessNaN(op); } else if (IsInf(op)) { - return (sign == 1) ? -0.0 : 0.0; + return (sign == 1) ? T(-0.0) : T(0.0); } else if (op == 0.0) { FPProcessException(); // FPExc_DivideByZero exception. - return (sign == 1) ? kFP64NegativeInfinity : kFP64PositiveInfinity; + return (sign == 1) ? T(kFP64NegativeInfinity) : T(kFP64PositiveInfinity); } else if ((IsFloat16() && (std::fabs(op) < std::pow(2.0, -16.0))) || (IsFloat32() && (std::fabs(op) < std::pow(2.0, -128.0))) || (IsFloat64() && (std::fabs(op) < std::pow(2.0, -1024.0)))) { @@ -6245,12 +6284,12 @@ T Simulator::FPRecipEstimate(T op, FPRounding rounding) { } FPProcessException(); // FPExc_Overflow and FPExc_Inexact. if (overflow_to_inf) { - return (sign == 1) ? kFP64NegativeInfinity : kFP64PositiveInfinity; + return (sign == 1) ? T(kFP64NegativeInfinity) : T(kFP64PositiveInfinity); } else { // Return FPMaxNormal(sign). 
- if (IsFloat16()) { + if constexpr (IsFloat16()) { return Float16Pack(sign, 0x1f, 0x3ff); - } else if (IsFloat32()) { + } else if constexpr (IsFloat32()) { return FloatPack(sign, 0xfe, 0x07fffff); } else { VIXL_ASSERT(IsFloat64()); @@ -6261,12 +6300,12 @@ T Simulator::FPRecipEstimate(T op, FPRounding rounding) { uint64_t fraction; int exp, result_exp; - if (IsFloat16()) { + if constexpr (IsFloat16()) { sign = Float16Sign(op); exp = Float16Exp(op); fraction = Float16Mantissa(op); fraction <<= 42; - } else if (IsFloat32()) { + } else if constexpr (IsFloat32()) { sign = FloatSign(op); exp = FloatExp(op); fraction = FloatMantissa(op); @@ -6289,9 +6328,9 @@ T Simulator::FPRecipEstimate(T op, FPRounding rounding) { double scaled = DoublePack(0, 1022, Bits(fraction, 51, 44) << 44); - if (IsFloat16()) { + if constexpr (IsFloat16()) { result_exp = (29 - exp); // In range 29-30 = -1 to 29+1 = 30. - } else if (IsFloat32()) { + } else if constexpr (IsFloat32()) { result_exp = (253 - exp); // In range 253-254 = -1 to 253+1 = 254. } else { VIXL_ASSERT(IsFloat64()); @@ -6307,11 +6346,11 @@ T Simulator::FPRecipEstimate(T op, FPRounding rounding) { fraction = (UINT64_C(1) << 50) | Bits(fraction, 51, 2); result_exp = 0; } - if (IsFloat16()) { + if constexpr (IsFloat16()) { uint16_t exp_bits = static_cast(Bits(result_exp, 4, 0)); uint16_t frac_bits = static_cast(Bits(fraction, 51, 42)); return Float16Pack(sign, exp_bits, frac_bits); - } else if (IsFloat32()) { + } else if constexpr (IsFloat32()) { uint32_t exp_bits = static_cast(Bits(result_exp, 7, 0)); uint32_t frac_bits = static_cast(Bits(fraction, 51, 29)); return FloatPack(sign, exp_bits, frac_bits); @@ -6457,12 +6496,12 @@ LogicVRegister Simulator::frecpx(VectorFormat vform, } else { int exp; uint32_t sign; - if (IsFloat16()) { + if constexpr (IsFloat16()) { sign = Float16Sign(op); exp = Float16Exp(op); exp = (exp == 0) ? 
(0x1F - 1) : static_cast(Bits(~exp, 4, 0)); result = Float16Pack(sign, exp, 0); - } else if (IsFloat32()) { + } else if constexpr (IsFloat32()) { sign = FloatSign(op); exp = FloatExp(op); exp = (exp == 0) ? (0xFF - 1) : static_cast(Bits(~exp, 7, 0)); @@ -6766,18 +6805,21 @@ LogicVRegister Simulator::fexpa(VectorFormat vform, if (lane_size == kHRegSize) { index_highbit = 4; - VIXL_ASSERT(ArrayLength(fexpa_coeff16) == (1U << (index_highbit + 1))); + VIXL_ASSERT(ArrayLength(fexpa_coeff16) == + (uint64_t{1} << (index_highbit + 1))); fexpa_coeff = fexpa_coeff16; op_highbit = 9; op_shift = 10; } else if (lane_size == kSRegSize) { - VIXL_ASSERT(ArrayLength(fexpa_coeff32) == (1U << (index_highbit + 1))); + VIXL_ASSERT(ArrayLength(fexpa_coeff32) == + (uint64_t{1} << (index_highbit + 1))); fexpa_coeff = fexpa_coeff32; op_highbit = 13; op_shift = 23; } else { VIXL_ASSERT(lane_size == kDRegSize); - VIXL_ASSERT(ArrayLength(fexpa_coeff64) == (1U << (index_highbit + 1))); + VIXL_ASSERT(ArrayLength(fexpa_coeff64) == + (uint64_t{1} << (index_highbit + 1))); fexpa_coeff = fexpa_coeff64; op_highbit = 16; op_shift = 52; @@ -7274,7 +7316,9 @@ void Simulator::SVEStructuredStoreHelper(VectorFormat vform, for (int r = 0; r < reg_count; r++) { uint64_t element_address = addr.GetElementAddress(i, r); - StoreLane(zt[r], unpack_vform, i << unpack_shift, element_address); + if (!StoreLane(zt[r], unpack_vform, i << unpack_shift, element_address)) { + return; + } } } @@ -7298,7 +7342,7 @@ void Simulator::SVEStructuredStoreHelper(VectorFormat vform, } } -void Simulator::SVEStructuredLoadHelper(VectorFormat vform, +bool Simulator::SVEStructuredLoadHelper(VectorFormat vform, const LogicPRegister& pg, unsigned zt_code, const LogicSVEAddressVector& addr, @@ -7333,9 +7377,13 @@ void Simulator::SVEStructuredLoadHelper(VectorFormat vform, } if (is_signed) { - LoadIntToLane(zt[r], vform, msize_in_bytes, i, element_address); + if (!LoadIntToLane(zt[r], vform, msize_in_bytes, i, element_address)) { + 
return false; + } } else { - LoadUintToLane(zt[r], vform, msize_in_bytes, i, element_address); + if (!LoadUintToLane(zt[r], vform, msize_in_bytes, i, element_address)) { + return false; + } } } } @@ -7354,6 +7402,7 @@ void Simulator::SVEStructuredLoadHelper(VectorFormat vform, "<-", addr); } + return true; } LogicPRegister Simulator::brka(LogicPRegister pd, @@ -7448,7 +7497,7 @@ void Simulator::SVEFaultTolerantLoadHelper(VectorFormat vform, // Non-faulting loads are allowed to fail arbitrarily. To stress user // code, fail a random element in roughly one in eight full-vector loads. - uint32_t rnd = static_cast(jrand48(rand_state_)); + uint32_t rnd = static_cast(rand_gen_()); int fake_fault_at_lane = rnd % (LaneCountFromFormat(vform) * 8); for (int i = 0; i < LaneCountFromFormat(vform); i++) { @@ -7461,7 +7510,9 @@ void Simulator::SVEFaultTolerantLoadHelper(VectorFormat vform, // First-faulting loads always load the first active element, regardless // of FFR. The result will be discarded if its FFR lane is inactive, but // it could still generate a fault. - value = MemReadUint(msize_in_bytes, element_address); + VIXL_DEFINE_OR_RETURN(mem_result, + MemReadUint(msize_in_bytes, element_address)); + value = mem_result; // All subsequent elements have non-fault semantics. type = kSVENonFaultLoad; @@ -7473,7 +7524,9 @@ void Simulator::SVEFaultTolerantLoadHelper(VectorFormat vform, bool can_read = (i < fake_fault_at_lane) && CanReadMemory(element_address, msize_in_bytes); if (can_read) { - value = MemReadUint(msize_in_bytes, element_address); + VIXL_DEFINE_OR_RETURN(mem_result, + MemReadUint(msize_in_bytes, element_address)); + value = mem_result; } else { // Propagate the fault to the end of FFR. 
for (int j = i; j < LaneCountFromFormat(vform); j++) { @@ -7851,6 +7904,653 @@ LogicVRegister Simulator::fmatmul(VectorFormat vform, return dst; } +template <> +uint64_t CryptoOp<"choose"_h>(uint64_t x, uint64_t y, uint64_t z) { + return ((y ^ z) & x) ^ z; +} + +template <> +uint64_t CryptoOp<"majority"_h>(uint64_t x, uint64_t y, uint64_t z) { + return (x & y) | ((x | y) & z); +} + +template <> +uint64_t CryptoOp<"parity"_h>(uint64_t x, uint64_t y, uint64_t z) { + return x ^ y ^ z; +} + +template +static uint64_t SHASigma(uint64_t x) { + return static_cast(RotateRight(x, A, sizeof(T) * kBitsPerByte) ^ + RotateRight(x, B, sizeof(T) * kBitsPerByte) ^ + RotateRight(x, C, sizeof(T) * kBitsPerByte)); +} + +LogicVRegister Simulator::sha2h(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2, + bool part1) { + uint64_t x[4] = {}; + uint64_t y[4] = {}; + if (part1) { + // Switch input order based on which part is being handled. + srcdst.UintArray(kFormat4S, x); + src1.UintArray(kFormat4S, y); + } else { + src1.UintArray(kFormat4S, x); + srcdst.UintArray(kFormat4S, y); + } + + for (unsigned i = 0; i < ArrayLength(x); i++) { + uint64_t chs = CryptoOp<"choose"_h>(y[0], y[1], y[2]); + uint64_t maj = CryptoOp<"majority"_h>(x[0], x[1], x[2]); + + uint64_t w = src2.Uint(kFormat4S, i); + uint64_t t = y[3] + SHASigma(y[0]) + chs + w; + + x[3] += t; + y[3] = t + SHASigma(x[0]) + maj; + + // y:x = ROL(y:x, 32) + SHARotateEltsLeftOne(x); + SHARotateEltsLeftOne(y); + std::swap(x[0], y[0]); + } + + srcdst.SetUintArray(kFormat4S, part1 ? 
x : y); + return srcdst; +} + +template +static uint64_t SHASURotate(uint64_t x) { + return RotateRight(x, A, sizeof(T) * kBitsPerByte) ^ + RotateRight(x, B, sizeof(T) * kBitsPerByte) ^ + ((x & ~static_cast(0)) >> C); +} + +LogicVRegister Simulator::sha2su0(LogicVRegister srcdst, + const LogicVRegister& src1) { + uint64_t w[4] = {}; + uint64_t result[4]; + srcdst.UintArray(kFormat4S, w); + uint64_t x = src1.Uint(kFormat4S, 0); + + result[0] = SHASURotate(w[1]) + w[0]; + result[1] = SHASURotate(w[2]) + w[1]; + result[2] = SHASURotate(w[3]) + w[2]; + result[3] = SHASURotate(x) + w[3]; + + srcdst.SetUintArray(kFormat4S, result); + return srcdst; +} + +LogicVRegister Simulator::sha2su1(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2) { + uint64_t w[4] = {}; + uint64_t x[4] = {}; + uint64_t y[4] = {}; + uint64_t result[4]; + srcdst.UintArray(kFormat4S, w); + src1.UintArray(kFormat4S, x); + src2.UintArray(kFormat4S, y); + + result[0] = SHASURotate(y[2]) + w[0] + x[1]; + result[1] = SHASURotate(y[3]) + w[1] + x[2]; + result[2] = SHASURotate(result[0]) + w[2] + x[3]; + result[3] = SHASURotate(result[1]) + w[3] + y[0]; + + srcdst.SetUintArray(kFormat4S, result); + return srcdst; +} + +LogicVRegister Simulator::sha512h(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2) { + uint64_t w[2] = {}; + uint64_t x[2] = {}; + uint64_t y[2] = {}; + uint64_t result[2] = {}; + srcdst.UintArray(kFormat2D, w); + src1.UintArray(kFormat2D, x); + src2.UintArray(kFormat2D, y); + + result[1] = (y[1] & x[0]) ^ (~y[1] & x[1]); + result[1] += SHASigma(y[1]) + w[1]; + + uint64_t tmp = result[1] + y[0]; + + result[0] = (tmp & y[1]) ^ (~tmp & x[0]); + result[0] += SHASigma(tmp) + w[0]; + + srcdst.SetUintArray(kFormat2D, result); + return srcdst; +} + +LogicVRegister Simulator::sha512h2(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2) { + uint64_t w[2] = {}; + uint64_t x[2] = {}; + uint64_t y[2] = {}; + 
uint64_t result[2] = {}; + srcdst.UintArray(kFormat2D, w); + src1.UintArray(kFormat2D, x); + src2.UintArray(kFormat2D, y); + + result[1] = (x[0] & y[1]) ^ (x[0] & y[0]) ^ (y[1] & y[0]); + result[1] += SHASigma(y[0]) + w[1]; + + result[0] = (result[1] & y[0]) ^ (result[1] & y[1]) ^ (y[1] & y[0]); + result[0] += SHASigma(result[1]) + w[0]; + + srcdst.SetUintArray(kFormat2D, result); + return srcdst; +} + +LogicVRegister Simulator::sha512su0(LogicVRegister srcdst, + const LogicVRegister& src1) { + uint64_t w[2] = {}; + uint64_t x[2] = {}; + uint64_t result[2] = {}; + srcdst.UintArray(kFormat2D, w); + src1.UintArray(kFormat2D, x); + + result[0] = SHASURotate(w[1]) + w[0]; + result[1] = SHASURotate(x[0]) + w[1]; + + srcdst.SetUintArray(kFormat2D, result); + return srcdst; +} + +LogicVRegister Simulator::sha512su1(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2) { + uint64_t w[2] = {}; + uint64_t x[2] = {}; + uint64_t y[2] = {}; + uint64_t result[2] = {}; + srcdst.UintArray(kFormat2D, w); + src1.UintArray(kFormat2D, x); + src2.UintArray(kFormat2D, y); + + result[1] = w[1] + SHASURotate(x[1]) + y[1]; + result[0] = w[0] + SHASURotate(x[0]) + y[0]; + + srcdst.SetUintArray(kFormat2D, result); + return srcdst; +} + +static uint8_t GalMul(int table, uint64_t x) { + // Galois multiplication lookup tables. 
+ static const uint8_t ffmul02[256] = { + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, + 0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, + 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, 0x40, 0x42, 0x44, 0x46, + 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e, + 0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, + 0x78, 0x7a, 0x7c, 0x7e, 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, + 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, 0xa0, 0xa2, 0xa4, 0xa6, + 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe, + 0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, + 0xd8, 0xda, 0xdc, 0xde, 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, + 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, 0x1b, 0x19, 0x1f, 0x1d, + 0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05, + 0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d, + 0x23, 0x21, 0x27, 0x25, 0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, + 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45, 0x7b, 0x79, 0x7f, 0x7d, + 0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65, + 0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d, + 0x83, 0x81, 0x87, 0x85, 0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, + 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5, 0xdb, 0xd9, 0xdf, 0xdd, + 0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5, + 0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, + 0xe3, 0xe1, 0xe7, 0xe5, + }; + + static const uint8_t ffmul03[256] = { + 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, + 0x14, 0x17, 0x12, 0x11, 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, + 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21, 0x60, 0x63, 0x66, 0x65, + 0x6c, 0x6f, 0x6a, 0x69, 0x78, 0x7b, 0x7e, 0x7d, 0x74, 0x77, 0x72, 0x71, + 0x50, 0x53, 0x56, 0x55, 0x5c, 
0x5f, 0x5a, 0x59, 0x48, 0x4b, 0x4e, 0x4d, + 0x44, 0x47, 0x42, 0x41, 0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9, + 0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1, 0xf0, 0xf3, 0xf6, 0xf5, + 0xfc, 0xff, 0xfa, 0xf9, 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1, + 0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9, 0xb8, 0xbb, 0xbe, 0xbd, + 0xb4, 0xb7, 0xb2, 0xb1, 0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99, + 0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81, 0x9b, 0x98, 0x9d, 0x9e, + 0x97, 0x94, 0x91, 0x92, 0x83, 0x80, 0x85, 0x86, 0x8f, 0x8c, 0x89, 0x8a, + 0xab, 0xa8, 0xad, 0xae, 0xa7, 0xa4, 0xa1, 0xa2, 0xb3, 0xb0, 0xb5, 0xb6, + 0xbf, 0xbc, 0xb9, 0xba, 0xfb, 0xf8, 0xfd, 0xfe, 0xf7, 0xf4, 0xf1, 0xf2, + 0xe3, 0xe0, 0xe5, 0xe6, 0xef, 0xec, 0xe9, 0xea, 0xcb, 0xc8, 0xcd, 0xce, + 0xc7, 0xc4, 0xc1, 0xc2, 0xd3, 0xd0, 0xd5, 0xd6, 0xdf, 0xdc, 0xd9, 0xda, + 0x5b, 0x58, 0x5d, 0x5e, 0x57, 0x54, 0x51, 0x52, 0x43, 0x40, 0x45, 0x46, + 0x4f, 0x4c, 0x49, 0x4a, 0x6b, 0x68, 0x6d, 0x6e, 0x67, 0x64, 0x61, 0x62, + 0x73, 0x70, 0x75, 0x76, 0x7f, 0x7c, 0x79, 0x7a, 0x3b, 0x38, 0x3d, 0x3e, + 0x37, 0x34, 0x31, 0x32, 0x23, 0x20, 0x25, 0x26, 0x2f, 0x2c, 0x29, 0x2a, + 0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, + 0x1f, 0x1c, 0x19, 0x1a, + }; + + static const uint8_t ffmul09[256] = { + 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, + 0x6c, 0x65, 0x7e, 0x77, 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, + 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7, 0x3b, 0x32, 0x29, 0x20, + 0x1f, 0x16, 0x0d, 0x04, 0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c, + 0xab, 0xa2, 0xb9, 0xb0, 0x8f, 0x86, 0x9d, 0x94, 0xe3, 0xea, 0xf1, 0xf8, + 0xc7, 0xce, 0xd5, 0xdc, 0x76, 0x7f, 0x64, 0x6d, 0x52, 0x5b, 0x40, 0x49, + 0x3e, 0x37, 0x2c, 0x25, 0x1a, 0x13, 0x08, 0x01, 0xe6, 0xef, 0xf4, 0xfd, + 0xc2, 0xcb, 0xd0, 0xd9, 0xae, 0xa7, 0xbc, 0xb5, 0x8a, 0x83, 0x98, 0x91, + 0x4d, 0x44, 0x5f, 0x56, 0x69, 0x60, 0x7b, 0x72, 0x05, 0x0c, 0x17, 0x1e, + 0x21, 0x28, 0x33, 0x3a, 0xdd, 
0xd4, 0xcf, 0xc6, 0xf9, 0xf0, 0xeb, 0xe2, + 0x95, 0x9c, 0x87, 0x8e, 0xb1, 0xb8, 0xa3, 0xaa, 0xec, 0xe5, 0xfe, 0xf7, + 0xc8, 0xc1, 0xda, 0xd3, 0xa4, 0xad, 0xb6, 0xbf, 0x80, 0x89, 0x92, 0x9b, + 0x7c, 0x75, 0x6e, 0x67, 0x58, 0x51, 0x4a, 0x43, 0x34, 0x3d, 0x26, 0x2f, + 0x10, 0x19, 0x02, 0x0b, 0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8, + 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0, 0x47, 0x4e, 0x55, 0x5c, + 0x63, 0x6a, 0x71, 0x78, 0x0f, 0x06, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30, + 0x9a, 0x93, 0x88, 0x81, 0xbe, 0xb7, 0xac, 0xa5, 0xd2, 0xdb, 0xc0, 0xc9, + 0xf6, 0xff, 0xe4, 0xed, 0x0a, 0x03, 0x18, 0x11, 0x2e, 0x27, 0x3c, 0x35, + 0x42, 0x4b, 0x50, 0x59, 0x66, 0x6f, 0x74, 0x7d, 0xa1, 0xa8, 0xb3, 0xba, + 0x85, 0x8c, 0x97, 0x9e, 0xe9, 0xe0, 0xfb, 0xf2, 0xcd, 0xc4, 0xdf, 0xd6, + 0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, + 0x5d, 0x54, 0x4f, 0x46, + }; + + static const uint8_t ffmul0b[256] = { + 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, + 0x74, 0x7f, 0x62, 0x69, 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, + 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9, 0x7b, 0x70, 0x6d, 0x66, + 0x57, 0x5c, 0x41, 0x4a, 0x23, 0x28, 0x35, 0x3e, 0x0f, 0x04, 0x19, 0x12, + 0xcb, 0xc0, 0xdd, 0xd6, 0xe7, 0xec, 0xf1, 0xfa, 0x93, 0x98, 0x85, 0x8e, + 0xbf, 0xb4, 0xa9, 0xa2, 0xf6, 0xfd, 0xe0, 0xeb, 0xda, 0xd1, 0xcc, 0xc7, + 0xae, 0xa5, 0xb8, 0xb3, 0x82, 0x89, 0x94, 0x9f, 0x46, 0x4d, 0x50, 0x5b, + 0x6a, 0x61, 0x7c, 0x77, 0x1e, 0x15, 0x08, 0x03, 0x32, 0x39, 0x24, 0x2f, + 0x8d, 0x86, 0x9b, 0x90, 0xa1, 0xaa, 0xb7, 0xbc, 0xd5, 0xde, 0xc3, 0xc8, + 0xf9, 0xf2, 0xef, 0xe4, 0x3d, 0x36, 0x2b, 0x20, 0x11, 0x1a, 0x07, 0x0c, + 0x65, 0x6e, 0x73, 0x78, 0x49, 0x42, 0x5f, 0x54, 0xf7, 0xfc, 0xe1, 0xea, + 0xdb, 0xd0, 0xcd, 0xc6, 0xaf, 0xa4, 0xb9, 0xb2, 0x83, 0x88, 0x95, 0x9e, + 0x47, 0x4c, 0x51, 0x5a, 0x6b, 0x60, 0x7d, 0x76, 0x1f, 0x14, 0x09, 0x02, + 0x33, 0x38, 0x25, 0x2e, 0x8c, 0x87, 0x9a, 0x91, 0xa0, 0xab, 0xb6, 0xbd, + 0xd4, 0xdf, 0xc2, 0xc9, 0xf8, 
0xf3, 0xee, 0xe5, 0x3c, 0x37, 0x2a, 0x21, + 0x10, 0x1b, 0x06, 0x0d, 0x64, 0x6f, 0x72, 0x79, 0x48, 0x43, 0x5e, 0x55, + 0x01, 0x0a, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x59, 0x52, 0x4f, 0x44, + 0x75, 0x7e, 0x63, 0x68, 0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80, + 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8, 0x7a, 0x71, 0x6c, 0x67, + 0x56, 0x5d, 0x40, 0x4b, 0x22, 0x29, 0x34, 0x3f, 0x0e, 0x05, 0x18, 0x13, + 0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, + 0xbe, 0xb5, 0xa8, 0xa3, + }; + + static const uint8_t ffmul0d[256] = { + 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, + 0x5c, 0x51, 0x46, 0x4b, 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, + 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b, 0xbb, 0xb6, 0xa1, 0xac, + 0x8f, 0x82, 0x95, 0x98, 0xd3, 0xde, 0xc9, 0xc4, 0xe7, 0xea, 0xfd, 0xf0, + 0x6b, 0x66, 0x71, 0x7c, 0x5f, 0x52, 0x45, 0x48, 0x03, 0x0e, 0x19, 0x14, + 0x37, 0x3a, 0x2d, 0x20, 0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e, + 0x05, 0x08, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26, 0xbd, 0xb0, 0xa7, 0xaa, + 0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6, + 0xd6, 0xdb, 0xcc, 0xc1, 0xe2, 0xef, 0xf8, 0xf5, 0xbe, 0xb3, 0xa4, 0xa9, + 0x8a, 0x87, 0x90, 0x9d, 0x06, 0x0b, 0x1c, 0x11, 0x32, 0x3f, 0x28, 0x25, + 0x6e, 0x63, 0x74, 0x79, 0x5a, 0x57, 0x40, 0x4d, 0xda, 0xd7, 0xc0, 0xcd, + 0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91, + 0x0a, 0x07, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75, + 0x56, 0x5b, 0x4c, 0x41, 0x61, 0x6c, 0x7b, 0x76, 0x55, 0x58, 0x4f, 0x42, + 0x09, 0x04, 0x13, 0x1e, 0x3d, 0x30, 0x27, 0x2a, 0xb1, 0xbc, 0xab, 0xa6, + 0x85, 0x88, 0x9f, 0x92, 0xd9, 0xd4, 0xc3, 0xce, 0xed, 0xe0, 0xf7, 0xfa, + 0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8, + 0xeb, 0xe6, 0xf1, 0xfc, 0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44, + 0x0f, 0x02, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c, 0x0c, 0x01, 0x16, 0x1b, + 0x38, 0x35, 0x22, 0x2f, 0x64, 
0x69, 0x7e, 0x73, 0x50, 0x5d, 0x4a, 0x47, + 0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, + 0x80, 0x8d, 0x9a, 0x97, + }; + + static const uint8_t ffmul0e[256] = { + 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, + 0x48, 0x46, 0x54, 0x5a, 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, + 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba, 0xdb, 0xd5, 0xc7, 0xc9, + 0xe3, 0xed, 0xff, 0xf1, 0xab, 0xa5, 0xb7, 0xb9, 0x93, 0x9d, 0x8f, 0x81, + 0x3b, 0x35, 0x27, 0x29, 0x03, 0x0d, 0x1f, 0x11, 0x4b, 0x45, 0x57, 0x59, + 0x73, 0x7d, 0x6f, 0x61, 0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87, + 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7, 0x4d, 0x43, 0x51, 0x5f, + 0x75, 0x7b, 0x69, 0x67, 0x3d, 0x33, 0x21, 0x2f, 0x05, 0x0b, 0x19, 0x17, + 0x76, 0x78, 0x6a, 0x64, 0x4e, 0x40, 0x52, 0x5c, 0x06, 0x08, 0x1a, 0x14, + 0x3e, 0x30, 0x22, 0x2c, 0x96, 0x98, 0x8a, 0x84, 0xae, 0xa0, 0xb2, 0xbc, + 0xe6, 0xe8, 0xfa, 0xf4, 0xde, 0xd0, 0xc2, 0xcc, 0x41, 0x4f, 0x5d, 0x53, + 0x79, 0x77, 0x65, 0x6b, 0x31, 0x3f, 0x2d, 0x23, 0x09, 0x07, 0x15, 0x1b, + 0xa1, 0xaf, 0xbd, 0xb3, 0x99, 0x97, 0x85, 0x8b, 0xd1, 0xdf, 0xcd, 0xc3, + 0xe9, 0xe7, 0xf5, 0xfb, 0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0, + 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0, 0x7a, 0x74, 0x66, 0x68, + 0x42, 0x4c, 0x5e, 0x50, 0x0a, 0x04, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20, + 0xec, 0xe2, 0xf0, 0xfe, 0xd4, 0xda, 0xc8, 0xc6, 0x9c, 0x92, 0x80, 0x8e, + 0xa4, 0xaa, 0xb8, 0xb6, 0x0c, 0x02, 0x10, 0x1e, 0x34, 0x3a, 0x28, 0x26, + 0x7c, 0x72, 0x60, 0x6e, 0x44, 0x4a, 0x58, 0x56, 0x37, 0x39, 0x2b, 0x25, + 0x0f, 0x01, 0x13, 0x1d, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d, + 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, + 0x9f, 0x91, 0x83, 0x8d, + }; + + x &= 255; + switch (table) { + case 0x2: + return ffmul02[x]; + case 0x3: + return ffmul03[x]; + case 0x9: + return ffmul09[x]; + case 0xb: + return ffmul0b[x]; + case 0xd: + return ffmul0d[x]; + case 0xe: + return 
ffmul0e[x]; + case 0: + // Case 0 indicates no table lookup, used for some forward mix stages. + return static_cast(x); + default: + VIXL_UNREACHABLE(); + return static_cast(x); + } +} + + +static uint8_t AESMixInner(uint64_t* x, int stage, bool inverse) { + VIXL_ASSERT(IsUint2(stage)); + + int imc_gm[7] = {0xb, 0xd, 0x9, 0xe}; + int mc_gm[7] = {0x3, 0x0, 0x0, 0x2}; + + int* gm = inverse ? imc_gm : mc_gm; + int index = 3 - stage; + + uint8_t result = 0; + for (int i = 0; i < 4; i++) { + result ^= GalMul(gm[(index + i) % 4], x[i]); + } + return result; +} + + +LogicVRegister Simulator::aesmix(LogicVRegister dst, + const LogicVRegister& src, + bool inverse) { + uint64_t in[16] = {}; + src.UintArray(kFormat16B, in); + dst.ClearForWrite(kFormat16B); + + for (int c = 0; c < 16; c++) { + int cmod4 = c % 4; + int d = c - cmod4; + VIXL_ASSERT((d == 0) || (d == 4) || (d == 8) || (d == 12)); + dst.SetUint(kFormat16B, c, AESMixInner(&in[d], cmod4, inverse)); + } + + return dst; +} + +LogicVRegister Simulator::aes(LogicVRegister dst, + const LogicVRegister& src, + bool decrypt) { + dst.ClearForWrite(kFormat16B); + + // (Inverse) shift rows. + uint8_t shift[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11}; + uint8_t shift_inv[] = {0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3}; + for (int i = 0; i < LaneCountFromFormat(kFormat16B); i++) { + uint8_t index = decrypt ? shift_inv[i] : shift[i]; + dst.SetUint(kFormat16B, i, src.Uint(kFormat16B, index)); + } + + // (Inverse) substitute bytes. 
+ static const uint8_t gf2[256] = { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, + 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, + 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, + 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, + 0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, + 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, + 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, + 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, + 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, + 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, + 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, + 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d, + 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, + 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, + 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, + 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, + 0xb0, 0x54, 0xbb, 0x16, + }; + static const uint8_t gf2_inv[256] = { + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, + 0x81, 0xf3, 0xd7, 0xfb, 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, + 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, 0x54, 0x7b, 0x94, 0x32, + 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 
0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, + 0x6d, 0x8b, 0xd1, 0x25, 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, 0x6c, 0x70, 0x48, 0x50, + 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, + 0xb8, 0xb3, 0x45, 0x06, 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, + 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, 0x3a, 0x91, 0x11, 0x41, + 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, + 0x1c, 0x75, 0xdf, 0x6e, 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, + 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, 0xfc, 0x56, 0x3e, 0x4b, + 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, + 0x27, 0x80, 0xec, 0x5f, 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, + 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, 0xa0, 0xe0, 0x3b, 0x4d, + 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, + 0x55, 0x21, 0x0c, 0x7d, + }; + + for (int i = 0; i < LaneCountFromFormat(kFormat16B); i++) { + const uint8_t* table = decrypt ? 
gf2_inv : gf2; + dst.SetUint(kFormat16B, i, table[dst.Uint(kFormat16B, i)]); + } + return dst; +} + +LogicVRegister Simulator::sm3partw1(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2) { + using namespace std::placeholders; + auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize); + + SimVRegister temp; + + ext(kFormat16B, temp, src2, temp, 4); + rol(kFormat4S, temp, temp, 15); + eor(kFormat4S, temp, temp, src1); + LogicVRegister r = eor(kFormat4S, temp, temp, srcdst); + + uint64_t result[4] = {}; + r.UintArray(kFormat4S, result); + for (int i = 0; i < 4; i++) { + if (i == 3) { + // result[3] already contains srcdst[3] ^ src1[3] from the operations + // above. + result[i] ^= ROL(result[0], 15); + } + result[i] ^= ROL(result[i], 15) ^ ROL(result[i], 23); + } + srcdst.SetUintArray(kFormat4S, result); + return srcdst; +} + +LogicVRegister Simulator::sm3partw2(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2) { + using namespace std::placeholders; + auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize); + + SimVRegister temp; + VectorFormat vf = kFormat4S; + + rol(vf, temp, src2, 7); + LogicVRegister r = eor(vf, temp, temp, src1); + eor(vf, srcdst, temp, srcdst); + + uint64_t tmp2 = ROL(r.Uint(vf, 0), 15); + tmp2 ^= ROL(tmp2, 15) ^ ROL(tmp2, 23); + srcdst.SetUint(vf, 3, srcdst.Uint(vf, 3) ^ tmp2); + return srcdst; +} + +LogicVRegister Simulator::sm3ss1(LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + const LogicVRegister& src3) { + using namespace std::placeholders; + auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize); + + VectorFormat vf = kFormat4S; + uint64_t result = ROL(src1.Uint(vf, 3), 12); + result += src2.Uint(vf, 3) + src3.Uint(vf, 3); + dst.Clear(); + dst.SetUint(vf, 3, ROL(result, 7)); + return dst; +} + +LogicVRegister Simulator::sm3tt1(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2, + int index, + bool is_a) { + VectorFormat 
vf = kFormat4S; + using namespace std::placeholders; + auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize); + auto sd = std::bind(&LogicVRegister::Uint, srcdst, vf, _1); + + VIXL_ASSERT(IsUint2(index)); + + uint64_t wjprime = src2.Uint(vf, index); + uint64_t ss2 = src1.Uint(vf, 3) ^ ROL(sd(3), 12); + + uint64_t tt1; + if (is_a) { + tt1 = CryptoOp<"parity"_h>(sd(1), sd(2), sd(3)); + } else { + tt1 = CryptoOp<"majority"_h>(sd(1), sd(2), sd(3)); + } + tt1 += sd(0) + ss2 + wjprime; + + ext(kFormat16B, srcdst, srcdst, srcdst, 4); + srcdst.SetUint(vf, 1, ROL(sd(1), 9)); + srcdst.SetUint(vf, 3, tt1); + return srcdst; +} + +LogicVRegister Simulator::sm3tt2(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2, + int index, + bool is_a) { + VectorFormat vf = kFormat4S; + using namespace std::placeholders; + auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize); + auto sd = std::bind(&LogicVRegister::Uint, srcdst, vf, _1); + + VIXL_ASSERT(IsUint2(index)); + + uint64_t wj = src2.Uint(vf, index); + + uint64_t tt2; + if (is_a) { + tt2 = CryptoOp<"parity"_h>(sd(1), sd(2), sd(3)); + } else { + tt2 = CryptoOp<"choose"_h>(sd(3), sd(2), sd(1)); + } + tt2 += sd(0) + src1.Uint(vf, 3) + wj; + + ext(kFormat16B, srcdst, srcdst, srcdst, 4); + srcdst.SetUint(vf, 1, ROL(sd(1), 19)); + tt2 ^= ROL(tt2, 9) ^ ROL(tt2, 17); + srcdst.SetUint(vf, 3, tt2); + return srcdst; +} + +static uint64_t SM4SBox(uint64_t x) { + static const uint8_t sbox[256] = { + 0x48, 0x39, 0xcb, 0xd7, 0x3e, 0x5f, 0xee, 0x79, 0x20, 0x4d, 0xdc, 0x3a, + 0xec, 0x7d, 0xf0, 0x18, 0x84, 0xc6, 0x6e, 0xc5, 0x09, 0xf1, 0xb9, 0x65, + 0x7e, 0x77, 0x96, 0x0c, 0x4a, 0x97, 0x69, 0x89, 0xb0, 0xb4, 0xe5, 0xb8, + 0x12, 0xd0, 0x74, 0x2d, 0xbd, 0x7b, 0xcd, 0xa5, 0x88, 0x31, 0xc1, 0x0a, + 0xd8, 0x5a, 0x10, 0x1f, 0x41, 0x5c, 0xd9, 0x11, 0x7f, 0xbc, 0xdd, 0xbb, + 0x92, 0xaf, 0x1b, 0x8d, 0x51, 0x5b, 0x6c, 0x6d, 0x72, 0x6a, 0xff, 0x03, + 0x2f, 0x8e, 0xfd, 0xde, 0x45, 0x37, 0xdb, 0xd5, 0x6f, 0x4e, 0x53, 0x0d, + 0xab, 
0x23, 0x29, 0xc0, 0x60, 0xca, 0x66, 0x82, 0x2e, 0xe2, 0xf6, 0x1d, + 0xe3, 0xb1, 0x8c, 0xf5, 0x30, 0x32, 0x93, 0xad, 0x55, 0x1a, 0x34, 0x9b, + 0xa4, 0x5d, 0xae, 0xe0, 0xa1, 0x15, 0x61, 0xf9, 0xce, 0xf2, 0xf7, 0xa3, + 0xb5, 0x38, 0xc7, 0x40, 0xd2, 0x8a, 0xbf, 0xea, 0x9e, 0xc8, 0xc4, 0xa0, + 0xe7, 0x02, 0x36, 0x4c, 0x52, 0x27, 0xd3, 0x9f, 0x57, 0x46, 0x00, 0xd4, + 0x87, 0x78, 0x21, 0x01, 0x3b, 0x7c, 0x22, 0x25, 0xa2, 0xd1, 0x58, 0x63, + 0x5e, 0x0e, 0x24, 0x1e, 0x35, 0x9d, 0x56, 0x70, 0x4b, 0x0f, 0xeb, 0xf8, + 0x8b, 0xda, 0x64, 0x71, 0xb2, 0x81, 0x6b, 0x68, 0xa8, 0x4f, 0x85, 0xe6, + 0x19, 0x3c, 0x59, 0x83, 0xba, 0x17, 0x73, 0xf3, 0xfc, 0xa7, 0x07, 0x47, + 0xa6, 0x3f, 0x8f, 0x75, 0xfa, 0x94, 0xdf, 0x80, 0x95, 0xe8, 0x08, 0xc9, + 0xa9, 0x1c, 0xb3, 0xe4, 0x62, 0xac, 0xcf, 0xed, 0x43, 0x0b, 0x54, 0x33, + 0x7a, 0x98, 0xef, 0x91, 0xf4, 0x50, 0x42, 0x9c, 0x99, 0x06, 0x86, 0x49, + 0x26, 0x13, 0x44, 0xaa, 0xc3, 0x04, 0xbe, 0x2a, 0x76, 0x9a, 0x67, 0x2b, + 0x05, 0x2c, 0xfb, 0x28, 0xc2, 0x14, 0xb6, 0x16, 0xb7, 0x3d, 0xe1, 0xcc, + 0xfe, 0xe9, 0x90, 0xd6, + }; + uint64_t result = 0; + for (int j = 24; j >= 0; j -= 8) { + uint8_t s = 255 - ((x >> j) & 0xff); + result = (result << 8) | sbox[s]; + } + return result; +} + +LogicVRegister Simulator::sm4(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2, + bool is_key) { + using namespace std::placeholders; + auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize); + + VectorFormat vf = kFormat4S; + uint64_t result[4] = {}; + if (is_key) { + src1.UintArray(vf, result); + } else { + srcdst.UintArray(vf, result); + } + + for (int i = 0; i < 4; i++) { + uint64_t k = is_key ? 
src2.Uint(vf, i) : src1.Uint(vf, i); + uint64_t intval = result[3] ^ result[2] ^ result[1] ^ k; + intval = SM4SBox(intval); + + if (is_key) { + intval ^= ROL(intval, 13) ^ ROL(intval, 23); + } else { + intval ^= + ROL(intval, 2) ^ ROL(intval, 10) ^ ROL(intval, 18) ^ ROL(intval, 24); + } + + intval ^= result[0]; + + result[0] = result[1]; + result[1] = result[2]; + result[2] = result[3]; + result[3] = intval; + } + srcdst.SetUintArray(vf, result); + return srcdst; +} + } // namespace aarch64 } // namespace vixl diff --git a/src/aarch64/macro-assembler-aarch64.cc b/src/aarch64/macro-assembler-aarch64.cc index 8e1bb2f2..51669fd0 100644 --- a/src/aarch64/macro-assembler-aarch64.cc +++ b/src/aarch64/macro-assembler-aarch64.cc @@ -1240,11 +1240,14 @@ void MacroAssembler::Ccmp(const Register& rn, StatusFlags nzcv, Condition cond) { VIXL_ASSERT(allow_macro_instructions_); - if (operand.IsImmediate() && (operand.GetImmediate() < 0)) { - ConditionalCompareMacro(rn, -operand.GetImmediate(), nzcv, cond, CCMN); - } else { - ConditionalCompareMacro(rn, operand, nzcv, cond, CCMP); + if (operand.IsImmediate()) { + int64_t imm = operand.GetImmediate(); + if ((imm < 0) && CanBeNegated(imm)) { + ConditionalCompareMacro(rn, -imm, nzcv, cond, CCMN); + return; + } } + ConditionalCompareMacro(rn, operand, nzcv, cond, CCMP); } @@ -1253,11 +1256,14 @@ void MacroAssembler::Ccmn(const Register& rn, StatusFlags nzcv, Condition cond) { VIXL_ASSERT(allow_macro_instructions_); - if (operand.IsImmediate() && (operand.GetImmediate() < 0)) { - ConditionalCompareMacro(rn, -operand.GetImmediate(), nzcv, cond, CCMP); - } else { - ConditionalCompareMacro(rn, operand, nzcv, cond, CCMN); + if (operand.IsImmediate()) { + int64_t imm = operand.GetImmediate(); + if ((imm < 0) && CanBeNegated(imm)) { + ConditionalCompareMacro(rn, -imm, nzcv, cond, CCMP); + return; + } } + ConditionalCompareMacro(rn, operand, nzcv, cond, CCMN); } @@ -1491,8 +1497,7 @@ void MacroAssembler::Add(const Register& rd, 
VIXL_ASSERT(allow_macro_instructions_); if (operand.IsImmediate()) { int64_t imm = operand.GetImmediate(); - if ((imm < 0) && (imm != std::numeric_limits::min()) && - IsImmAddSub(-imm)) { + if ((imm < 0) && CanBeNegated(imm) && IsImmAddSub(-imm)) { AddSubMacro(rd, rn, -imm, S, SUB); return; } @@ -1579,8 +1584,7 @@ void MacroAssembler::Sub(const Register& rd, VIXL_ASSERT(allow_macro_instructions_); if (operand.IsImmediate()) { int64_t imm = operand.GetImmediate(); - if ((imm < 0) && (imm != std::numeric_limits::min()) && - IsImmAddSub(-imm)) { + if ((imm < 0) && CanBeNegated(imm) && IsImmAddSub(-imm)) { AddSubMacro(rd, rn, -imm, S, ADD); return; } @@ -1749,7 +1753,7 @@ void MacroAssembler::Fmov(VRegister vd, Float16 imm) { void MacroAssembler::Neg(const Register& rd, const Operand& operand) { VIXL_ASSERT(allow_macro_instructions_); - if (operand.IsImmediate()) { + if (operand.IsImmediate() && CanBeNegated(operand.GetImmediate())) { Mov(rd, -operand.GetImmediate()); } else { Sub(rd, AppropriateZeroRegFor(rd), operand); @@ -2065,6 +2069,22 @@ void MacroAssembler::Setf16(const Register& wn) { setf16(wn); } +void MacroAssembler::Chkfeat(const Register& xdn) { + VIXL_ASSERT(allow_macro_instructions_); + MacroEmissionCheckScope guard(this); + if (xdn.Is(x16)) { + chkfeat(xdn); + } else { + UseScratchRegisterScope temps(this); + if (temps.TryAcquire(x16)) { + Mov(x16, xdn); + chkfeat(x16); + Mov(xdn, x16); + } else { + VIXL_ABORT(); + } + } +} #define DEFINE_FUNCTION(FN, REGTYPE, REG, OP) \ void MacroAssembler::FN(const REGTYPE REG, const MemOperand& addr) { \ diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h index 0d231f62..4666550a 100644 --- a/src/aarch64/macro-assembler-aarch64.h +++ b/src/aarch64/macro-assembler-aarch64.h @@ -2844,6 +2844,27 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer, subps(xd, xn, xm); } void Cmpp(const Register& xn, const Register& xm) { Subps(xzr, xn, xm); } + void Chkfeat(const Register& 
xdn); + void Gcspushm(const Register& rt) { + VIXL_ASSERT(allow_macro_instructions_); + SingleEmissionCheckScope guard(this); + gcspushm(rt); + } + void Gcspopm(const Register& rt = xzr) { + VIXL_ASSERT(allow_macro_instructions_); + SingleEmissionCheckScope guard(this); + gcspopm(rt); + } + void Gcsss1(const Register& rt) { + VIXL_ASSERT(allow_macro_instructions_); + SingleEmissionCheckScope guard(this); + gcsss1(rt); + } + void Gcsss2(const Register& rt) { + VIXL_ASSERT(allow_macro_instructions_); + SingleEmissionCheckScope guard(this); + gcsss2(rt); + } // NEON 3 vector register instructions. #define NEON_3VREG_MACRO_LIST(V) \ @@ -2893,6 +2914,7 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer, V(pmull2, Pmull2) \ V(raddhn, Raddhn) \ V(raddhn2, Raddhn2) \ + V(rax1, Rax1) \ V(rsubhn, Rsubhn) \ V(rsubhn2, Rsubhn2) \ V(saba, Saba) \ @@ -2905,8 +2927,21 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer, V(saddl2, Saddl2) \ V(saddw, Saddw) \ V(saddw2, Saddw2) \ + V(sha1c, Sha1c) \ + V(sha1m, Sha1m) \ + V(sha1p, Sha1p) \ + V(sha1su0, Sha1su0) \ + V(sha256h, Sha256h) \ + V(sha256h2, Sha256h2) \ + V(sha256su1, Sha256su1) \ + V(sha512h, Sha512h) \ + V(sha512h2, Sha512h2) \ + V(sha512su1, Sha512su1) \ V(shadd, Shadd) \ V(shsub, Shsub) \ + V(sm3partw1, Sm3partw1) \ + V(sm3partw2, Sm3partw2) \ + V(sm4ekey, Sm4ekey) \ V(smax, Smax) \ V(smaxp, Smaxp) \ V(smin, Smin) \ @@ -3001,6 +3036,10 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer, V(abs, Abs) \ V(addp, Addp) \ V(addv, Addv) \ + V(aesd, Aesd) \ + V(aese, Aese) \ + V(aesimc, Aesimc) \ + V(aesmc, Aesmc) \ V(cls, Cls) \ V(clz, Clz) \ V(cnt, Cnt) \ @@ -3049,6 +3088,11 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer, V(sadalp, Sadalp) \ V(saddlp, Saddlp) \ V(saddlv, Saddlv) \ + V(sha1h, Sha1h) \ + V(sha1su1, Sha1su1) \ + V(sha256su0, Sha256su0) \ + V(sha512su0, Sha512su0) \ + V(sm4e, Sm4e) \ V(smaxv, Smaxv) \ V(sminv, Sminv) \ V(sqabs, Sqabs) \ @@ -3139,7 +3183,11 @@ 
MacroAssembler(PandaAllocator* allocator, byte* buffer, V(umlsl, Umlsl) \ V(umlsl2, Umlsl2) \ V(sudot, Sudot) \ - V(usdot, Usdot) + V(usdot, Usdot) \ + V(sm3tt1a, Sm3tt1a) \ + V(sm3tt1b, Sm3tt1b) \ + V(sm3tt2a, Sm3tt2a) \ + V(sm3tt2b, Sm3tt2b) #define DEFINE_MACRO_ASM_FUNC(ASM, MASM) \ @@ -3258,6 +3306,14 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer, SVE_3VREG_COMMUTATIVE_MACRO_LIST(DEFINE_MACRO_ASM_FUNC) #undef DEFINE_MACRO_ASM_FUNC + void Bcax(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + const VRegister& va) { + VIXL_ASSERT(allow_macro_instructions_); + SingleEmissionCheckScope guard(this); + bcax(vd, vn, vm, va); + } void Bic(const VRegister& vd, const int imm8, const int left_shift = 0) { VIXL_ASSERT(allow_macro_instructions_); SingleEmissionCheckScope guard(this); @@ -3298,6 +3354,14 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer, SingleEmissionCheckScope guard(this); dup(vd, rn); } + void Eor3(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + const VRegister& va) { + VIXL_ASSERT(allow_macro_instructions_); + SingleEmissionCheckScope guard(this); + eor3(vd, vn, vm, va); + } void Ext(const VRegister& vd, const VRegister& vn, const VRegister& vm, @@ -3594,6 +3658,14 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer, SingleEmissionCheckScope guard(this); st4(vt, vt2, vt3, vt4, lane, dst); } + void Sm3ss1(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + const VRegister& va) { + VIXL_ASSERT(allow_macro_instructions_); + SingleEmissionCheckScope guard(this); + sm3ss1(vd, vn, vm, va); + } void Smov(const Register& rd, const VRegister& vn, int vn_index) { VIXL_ASSERT(allow_macro_instructions_); SingleEmissionCheckScope guard(this); @@ -3604,6 +3676,14 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer, SingleEmissionCheckScope guard(this); umov(rd, vn, vn_index); } + void Xar(const VRegister& vd, + const VRegister& vn, + const VRegister& vm, + int rotate) { + 
VIXL_ASSERT(allow_macro_instructions_); + SingleEmissionCheckScope guard(this); + xar(vd, vn, vm, rotate); + } void Crc32b(const Register& rd, const Register& rn, const Register& rm) { VIXL_ASSERT(allow_macro_instructions_); SingleEmissionCheckScope guard(this); @@ -8311,9 +8391,10 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer, UseScratchRegisterScope* scratch_scope); bool LabelIsOutOfRange(Label* label, ImmBranchType branch_type) { + int64_t offset = label->GetLocation() - GetCursorOffset(); + VIXL_ASSERT(IsMultiple(offset, kInstructionSize)); return !Instruction::IsValidImmPCOffset(branch_type, - label->GetLocation() - - GetCursorOffset()); + offset / kInstructionSize); } void ConfigureSimulatorCPUFeaturesHelper(const CPUFeatures& features, @@ -8727,6 +8808,16 @@ class UseScratchRegisterScope { return AcquireFrom(available, kGoverningPRegisterMask).P(); } + // TODO: extend to other scratch register lists. + bool TryAcquire(const Register& required_reg) { + CPURegList* list = masm_->GetScratchRegisterList(); + if (list->IncludesAliasOf(required_reg)) { + list->Remove(required_reg); + return true; + } + return false; + } + Register AcquireRegisterOfSize(int size_in_bits); Register AcquireSameSizeAs(const Register& reg) { return AcquireRegisterOfSize(reg.GetSizeInBits()); diff --git a/src/aarch64/pointer-auth-aarch64.cc b/src/aarch64/pointer-auth-aarch64.cc index a33f39a8..6bc3751d 100644 --- a/src/aarch64/pointer-auth-aarch64.cc +++ b/src/aarch64/pointer-auth-aarch64.cc @@ -151,7 +151,7 @@ uint64_t Simulator::AuthPAC(uint64_t ptr, uint64_t pac = ComputePAC(original_ptr, context, key); - uint64_t error_code = 1 << key.number; + uint64_t error_code = uint64_t{1} << key.number; if ((pac & pac_mask) == (ptr & pac_mask)) { return original_ptr; } else { diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc index 8572774b..88827bff 100644 --- a/src/aarch64/simulator-aarch64.cc +++ b/src/aarch64/simulator-aarch64.cc @@ -32,8 +32,23 @@ 
#include #include #include + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#undef MultiplyHigh +#include +#else #include #include +#endif + +#ifdef _MSC_VER +#define VIXL_SYNC() MemoryBarrier() +#else +#define VIXL_SYNC() __sync_synchronize() +#endif namespace vixl { namespace aarch64 { @@ -42,6 +57,25 @@ using vixl::internal::SimFloat16; const Instruction* Simulator::kEndOfSimAddress = NULL; +MemoryAccessResult TryMemoryAccess(uintptr_t address, uintptr_t access_size) { +#ifdef VIXL_ENABLE_IMPLICIT_CHECKS + for (uintptr_t i = 0; i < access_size; i++) { + if (_vixl_internal_ReadMemory(address, i) == MemoryAccessResult::Failure) { + // The memory access failed. + return MemoryAccessResult::Failure; + } + } + + // Either the memory access did not raise a signal or the signal handler did + // not correctly return MemoryAccessResult::Failure. + return MemoryAccessResult::Success; +#else + USE(address); + USE(access_size); + return MemoryAccessResult::Success; +#endif // VIXL_ENABLE_IMPLICIT_CHECKS +} + bool MetaDataDepot::MetaDataMTE::is_active = false; void SimSystemRegister::SetBits(int msb, int lsb, uint32_t bits) { @@ -498,21 +532,27 @@ Simulator::GetFormToVisitorFnMap() { Simulator::Simulator(Decoder* decoder, FILE* stream, SimStack::Allocated stack) : memory_(std::move(stack)), last_instr_(NULL), - cpu_features_auditor_(decoder, CPUFeatures::All()) { + cpu_features_auditor_(decoder, CPUFeatures::All()), + gcs_(kGCSNoStack), + gcs_enabled_(false) { #else Simulator::Simulator(PandaAllocator* allocator, Decoder* decoder, SimStack::Allocated stack, FILE* stream) : memory_(std::move(stack)), last_instr_(NULL), allocator_(allocator), cpu_features_auditor_(decoder, CPUFeatures::All()), - saved_cpu_features_(allocator_.Adapter()) { + saved_cpu_features_(allocator_.Adapter()), + gcs_(kGCSNoStack), + gcs_enabled_(false) { #endif // Ensure that shift operations act as the simulator expects. 
VIXL_ASSERT((static_cast(-1) >> 1) == -1); VIXL_ASSERT((static_cast(-1) >> 1) == 0x7fffffff); // Set up a placeholder pipe for CanReadMemory. +#ifndef _WIN32 VIXL_CHECK(pipe(placeholder_pipe_fd_) == 0); +#endif // Set up the decoder. decoder_ = decoder; @@ -554,9 +594,8 @@ Simulator::Simulator(PandaAllocator* allocator, Decoder* decoder, SimStack::Allo guard_pages_ = false; // Initialize the common state of RNDR and RNDRRS. - uint16_t seed[3] = {11, 22, 33}; - VIXL_STATIC_ASSERT(sizeof(seed) == sizeof(rand_state_)); - memcpy(rand_state_, seed, sizeof(rand_state_)); + uint64_t seed = (11 + (22 << 16) + (static_cast(33) << 32)); + rand_gen_.seed(seed); // Initialize all bits of pseudo predicate register to true. LogicPRegister ones(pregister_all_true_); @@ -634,6 +673,8 @@ void Simulator::ResetState() { ResetPRegisters(); WriteSp(memory_.GetStack().GetBase()); + ResetGCSState(); + EnableGCSCheck(); pc_ = NULL; pc_modified_ = false; @@ -671,9 +712,16 @@ Simulator::~Simulator() { #ifdef PANDA_BUILD allocator_.DeleteObject(print_disasm_); allocator_.DeleteObject(debugger_); +#else + delete print_disasm_; #endif +#ifndef _WIN32 close(placeholder_pipe_fd_[0]); close(placeholder_pipe_fd_[1]); +#endif + if (IsAllocatedGCS(gcs_)) { + GetGCSManager().FreeStack(gcs_); + } } @@ -974,6 +1022,19 @@ vixl_uint128_t Simulator::Add128(vixl_uint128_t x, vixl_uint128_t y) { return std::make_pair(sum_hi.first, sum_lo.first); } +vixl_uint128_t Simulator::Lsl128(vixl_uint128_t x, unsigned shift) const { + VIXL_ASSERT(shift <= 64); + if (shift == 0) return x; + if (shift == 64) return std::make_pair(x.second, 0); + uint64_t lo = x.second << shift; + uint64_t hi = (x.first << shift) | (x.second >> (64 - shift)); + return std::make_pair(hi, lo); +} + +vixl_uint128_t Simulator::Eor128(vixl_uint128_t x, vixl_uint128_t y) const { + return std::make_pair(x.first ^ y.first, x.second ^ y.second); +} + vixl_uint128_t Simulator::Neg128(vixl_uint128_t x) { // Negate the integer value. 
Throw an assertion when the input is INT128_MIN. VIXL_ASSERT((x.first != GetSignMask(64)) || (x.second != 0)); @@ -985,11 +1046,11 @@ vixl_uint128_t Simulator::Neg128(vixl_uint128_t x) { vixl_uint128_t Simulator::Mul64(uint64_t x, uint64_t y) { bool neg_result = false; if ((x >> 63) == 1) { - x = -x; + x = UnsignedNegate(x); neg_result = !neg_result; } if ((y >> 63) == 1) { - y = -y; + y = UnsignedNegate(y); neg_result = !neg_result; } @@ -1008,10 +1069,25 @@ vixl_uint128_t Simulator::Mul64(uint64_t x, uint64_t y) { vixl_uint128_t result = Add128(a, b); result = Add128(result, c); result = Add128(result, d); - return neg_result ? std::make_pair(-result.first - 1, -result.second) + return neg_result ? std::make_pair(UnsignedNegate(result.first) - 1, + UnsignedNegate(result.second)) : result; } +vixl_uint128_t Simulator::PolynomialMult128(uint64_t op1, + uint64_t op2, + int lane_size_in_bits) const { + VIXL_ASSERT(static_cast(lane_size_in_bits) <= kDRegSize); + vixl_uint128_t result = std::make_pair(0, 0); + vixl_uint128_t op2q = std::make_pair(0, op2); + for (int i = 0; i < lane_size_in_bits; i++) { + if ((op1 >> i) & 1) { + result = Eor128(result, Lsl128(op2q, i)); + } + } + return result; +} + int64_t Simulator::ShiftOperand(unsigned reg_size, uint64_t uvalue, Shift shift_type, @@ -1747,6 +1823,18 @@ void Simulator::PrintSystemRegister(SystemRegister id) { } } +void Simulator::PrintGCS(bool is_push, uint64_t addr, size_t entry) { + const char* arrow = is_push ? 
"<-" : "->"; + fprintf(stream_, + "# %sgcs0x%04" PRIx64 "[%" PRIu64 "]: %s %s 0x%016" PRIxPTR "\n", + clr_flag_name, + gcs_, + entry, + clr_normal, + arrow, + addr); +} + uint16_t Simulator::PrintPartialAccess(uint16_t access_mask, uint16_t future_access_mask, int struct_element_count, @@ -1794,8 +1882,9 @@ uint16_t Simulator::PrintPartialAccess(uint16_t access_mask, const char* sep = ""; for (int i = struct_element_count - 1; i >= 0; i--) { int offset = lane_size_in_bytes * i; - uint64_t nibble = MemReadUint(lane_size_in_bytes, address + offset); - fprintf(stream_, "%s%0*" PRIx64, sep, lane_size_in_nibbles, nibble); + auto nibble = MemReadUint(lane_size_in_bytes, address + offset); + VIXL_ASSERT(nibble); + fprintf(stream_, "%s%0*" PRIx64, sep, lane_size_in_nibbles, *nibble); sep = "'"; } fprintf(stream_, @@ -2812,6 +2901,23 @@ void Simulator::SimulateSVEInterleavedArithLong(const Instruction* instr) { } } +void Simulator::SimulateSVEPmull128(const Instruction* instr) { + SimVRegister& zd = ReadVRegister(instr->GetRd()); + SimVRegister& zm = ReadVRegister(instr->GetRm()); + SimVRegister& zn = ReadVRegister(instr->GetRn()); + SimVRegister zn_temp, zm_temp; + + if (form_hash_ == "pmullb_z_zz_q"_h) { + pack_even_elements(kFormatVnD, zn_temp, zn); + pack_even_elements(kFormatVnD, zm_temp, zm); + } else { + VIXL_ASSERT(form_hash_ == "pmullt_z_zz_q"_h); + pack_odd_elements(kFormatVnD, zn_temp, zn); + pack_odd_elements(kFormatVnD, zm_temp, zm); + } + pmull(kFormatVnQ, zd, zn_temp, zm_temp); +} + void Simulator::SimulateSVEIntMulLongVec(const Instruction* instr) { VectorFormat vform = instr->GetSVEVectorFormat(); SimVRegister& zd = ReadVRegister(instr->GetRd()); @@ -2826,15 +2932,15 @@ void Simulator::SimulateSVEIntMulLongVec(const Instruction* instr) { switch (form_hash_) { case "pmullb_z_zz"_h: - // '00' is reserved for Q-sized lane. - if (vform == kFormatVnB) { + // Size '10' is undefined. 
+ if (vform == kFormatVnS) { VIXL_UNIMPLEMENTED(); } pmull(vform, zd, zn_b, zm_b); break; case "pmullt_z_zz"_h: - // '00' is reserved for Q-sized lane. - if (vform == kFormatVnB) { + // Size '10' is undefined. + if (vform == kFormatVnS) { VIXL_UNIMPLEMENTED(); } pmull(vform, zd, zn_t, zm_t); @@ -3723,6 +3829,7 @@ void Simulator::VisitUnconditionalBranch(const Instruction* instr) { switch (instr->Mask(UnconditionalBranchMask)) { case BL: WriteLr(instr->GetNextInstruction()); + GCSPush(reinterpret_cast(instr->GetNextInstruction())); VIXL_FALLTHROUGH(); case B: WritePc(instr->GetImmPCOffsetTarget()); @@ -3766,6 +3873,7 @@ void Simulator::VisitUnconditionalBranchToRegister(const Instruction* instr) { bool authenticate = false; bool link = false; bool ret = false; + bool compare_gcs = false; uint64_t addr = ReadXRegister(instr->GetRn()); uint64_t context = 0; @@ -3802,16 +3910,13 @@ void Simulator::VisitUnconditionalBranchToRegister(const Instruction* instr) { context = ReadXRegister(31, Reg31IsStackPointer); VIXL_FALLTHROUGH(); case RET: + compare_gcs = true; ret = true; break; default: VIXL_UNREACHABLE(); } - if (link) { - WriteLr(instr->GetNextInstruction()); - } - if (authenticate) { PACKey key = (instr->ExtractBit(10) == 0) ? 
kPACKeyIA : kPACKeyIB; addr = AuthPAC(addr, context, key, kInstructionPointer); @@ -3822,6 +3927,34 @@ void Simulator::VisitUnconditionalBranchToRegister(const Instruction* instr) { } } + if (compare_gcs) { + uint64_t expected_lr = GCSPeek(); + char msg[128]; + if (expected_lr != 0) { + if ((expected_lr & 0x3) != 0) { + snprintf(msg, + sizeof(msg), + "GCS contains misaligned return address: 0x%016" PRIx64 "\n", + expected_lr); + ReportGCSFailure(msg); + } else if ((addr != 0) && (addr != expected_lr)) { + snprintf(msg, + sizeof(msg), + "GCS mismatch: lr = 0x%016" PRIx64 ", gcs = 0x%016" PRIx64 + "\n", + addr, + expected_lr); + ReportGCSFailure(msg); + } + GCSPop(); + } + } + + if (link) { + WriteLr(instr->GetNextInstruction()); + GCSPush(reinterpret_cast(instr->GetNextInstruction())); + } + if (!ret) { // Check for interceptions to the target address, if one is found, call it. MetaDataDepot::BranchInterceptionAbstract* interception = @@ -4143,10 +4276,12 @@ void Simulator::LoadAcquireRCpcUnscaledOffsetHelper(const Instruction* instr) { VIXL_ALIGNMENT_EXCEPTION(); } - WriteRegister(rt, static_cast(MemRead(address))); + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + + WriteRegister(rt, static_cast(value)); // Approximate load-acquire by issuing a full barrier after the load. - __sync_synchronize(); + VIXL_SYNC(); LogRead(rt, GetPrintRegisterFormat(element_size), address); } @@ -4171,9 +4306,9 @@ void Simulator::StoreReleaseUnscaledOffsetHelper(const Instruction* instr) { } // Approximate store-release by issuing a full barrier after the load. - __sync_synchronize(); + VIXL_SYNC(); - MemWrite(address, ReadRegister(rt)); + if (!MemWrite(address, ReadRegister(rt))) return; LogWrite(rt, GetPrintRegisterFormat(element_size), address); } @@ -4260,7 +4395,9 @@ void Simulator::VisitLoadStorePAC(const Instruction* instr) { // Verify that the calculated address is available to the host. 
VIXL_ASSERT(address == addr_ptr); - WriteXRegister(dst, MemRead(addr_ptr), NoRegLog); + VIXL_DEFINE_OR_RETURN(value, MemRead(addr_ptr)); + + WriteXRegister(dst, value, NoRegLog); unsigned access_size = 1 << 3; LogRead(dst, GetPrintRegisterFormatForSize(access_size), addr_ptr); } @@ -4287,93 +4424,121 @@ void Simulator::LoadStoreHelper(const Instruction* instr, int extend_to_size = 0; LoadStoreOp op = static_cast(instr->Mask(LoadStoreMask)); switch (op) { - case LDRB_w: - WriteWRegister(srcdst, MemRead(address), NoRegLog); + case LDRB_w: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteWRegister(srcdst, value, NoRegLog); extend_to_size = kWRegSizeInBytes; break; - case LDRH_w: - WriteWRegister(srcdst, MemRead(address), NoRegLog); + } + case LDRH_w: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteWRegister(srcdst, value, NoRegLog); extend_to_size = kWRegSizeInBytes; break; - case LDR_w: - WriteWRegister(srcdst, MemRead(address), NoRegLog); + } + case LDR_w: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteWRegister(srcdst, value, NoRegLog); extend_to_size = kWRegSizeInBytes; break; - case LDR_x: - WriteXRegister(srcdst, MemRead(address), NoRegLog); + } + case LDR_x: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteXRegister(srcdst, value, NoRegLog); extend_to_size = kXRegSizeInBytes; break; - case LDRSB_w: - WriteWRegister(srcdst, MemRead(address), NoRegLog); + } + case LDRSB_w: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteWRegister(srcdst, value, NoRegLog); extend_to_size = kWRegSizeInBytes; break; - case LDRSH_w: - WriteWRegister(srcdst, MemRead(address), NoRegLog); + } + case LDRSH_w: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteWRegister(srcdst, value, NoRegLog); extend_to_size = kWRegSizeInBytes; break; - case LDRSB_x: - WriteXRegister(srcdst, MemRead(address), NoRegLog); + } + case LDRSB_x: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteXRegister(srcdst, value, NoRegLog); 
extend_to_size = kXRegSizeInBytes; break; - case LDRSH_x: - WriteXRegister(srcdst, MemRead(address), NoRegLog); + } + case LDRSH_x: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteXRegister(srcdst, value, NoRegLog); extend_to_size = kXRegSizeInBytes; break; - case LDRSW_x: - WriteXRegister(srcdst, MemRead(address), NoRegLog); + } + case LDRSW_x: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteXRegister(srcdst, value, NoRegLog); extend_to_size = kXRegSizeInBytes; break; - case LDR_b: - WriteBRegister(srcdst, MemRead(address), NoRegLog); + } + case LDR_b: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteBRegister(srcdst, value, NoRegLog); rt_is_vreg = true; break; - case LDR_h: - WriteHRegister(srcdst, MemRead(address), NoRegLog); + } + case LDR_h: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteHRegister(srcdst, value, NoRegLog); rt_is_vreg = true; break; - case LDR_s: - WriteSRegister(srcdst, MemRead(address), NoRegLog); + } + case LDR_s: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteSRegister(srcdst, value, NoRegLog); rt_is_vreg = true; break; - case LDR_d: - WriteDRegister(srcdst, MemRead(address), NoRegLog); + } + case LDR_d: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteDRegister(srcdst, value, NoRegLog); rt_is_vreg = true; break; - case LDR_q: - WriteQRegister(srcdst, MemRead(address), NoRegLog); + } + case LDR_q: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteQRegister(srcdst, value, NoRegLog); rt_is_vreg = true; break; + } case STRB_w: - MemWrite(address, ReadWRegister(srcdst)); + if (!MemWrite(address, ReadWRegister(srcdst))) return; break; case STRH_w: - MemWrite(address, ReadWRegister(srcdst)); + if (!MemWrite(address, ReadWRegister(srcdst))) return; break; case STR_w: - MemWrite(address, ReadWRegister(srcdst)); + if (!MemWrite(address, ReadWRegister(srcdst))) return; break; case STR_x: - MemWrite(address, ReadXRegister(srcdst)); + if (!MemWrite(address, 
ReadXRegister(srcdst))) return; break; case STR_b: - MemWrite(address, ReadBRegister(srcdst)); + if (!MemWrite(address, ReadBRegister(srcdst))) return; rt_is_vreg = true; break; case STR_h: - MemWrite(address, ReadHRegisterBits(srcdst)); + if (!MemWrite(address, ReadHRegisterBits(srcdst))) return; rt_is_vreg = true; break; case STR_s: - MemWrite(address, ReadSRegister(srcdst)); + if (!MemWrite(address, ReadSRegister(srcdst))) return; rt_is_vreg = true; break; case STR_d: - MemWrite(address, ReadDRegister(srcdst)); + if (!MemWrite(address, ReadDRegister(srcdst))) return; rt_is_vreg = true; break; case STR_q: - MemWrite(address, ReadQRegister(srcdst)); + if (!MemWrite(address, ReadQRegister(srcdst))) return; rt_is_vreg = true; break; @@ -4454,64 +4619,76 @@ void Simulator::LoadStorePairHelper(const Instruction* instr, // Use NoRegLog to suppress the register trace (LOG_REGS, LOG_FP_REGS). We // will print a more detailed log. case LDP_w: { - WriteWRegister(rt, MemRead(address), NoRegLog); - WriteWRegister(rt2, MemRead(address2), NoRegLog); + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + VIXL_DEFINE_OR_RETURN(value2, MemRead(address2)); + WriteWRegister(rt, value, NoRegLog); + WriteWRegister(rt2, value2, NoRegLog); break; } case LDP_s: { - WriteSRegister(rt, MemRead(address), NoRegLog); - WriteSRegister(rt2, MemRead(address2), NoRegLog); + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + VIXL_DEFINE_OR_RETURN(value2, MemRead(address2)); + WriteSRegister(rt, value, NoRegLog); + WriteSRegister(rt2, value2, NoRegLog); rt_is_vreg = true; break; } case LDP_x: { - WriteXRegister(rt, MemRead(address), NoRegLog); - WriteXRegister(rt2, MemRead(address2), NoRegLog); + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + VIXL_DEFINE_OR_RETURN(value2, MemRead(address2)); + WriteXRegister(rt, value, NoRegLog); + WriteXRegister(rt2, value2, NoRegLog); break; } case LDP_d: { - WriteDRegister(rt, MemRead(address), NoRegLog); - WriteDRegister(rt2, MemRead(address2), NoRegLog); + 
VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + VIXL_DEFINE_OR_RETURN(value2, MemRead(address2)); + WriteDRegister(rt, value, NoRegLog); + WriteDRegister(rt2, value2, NoRegLog); rt_is_vreg = true; break; } case LDP_q: { - WriteQRegister(rt, MemRead(address), NoRegLog); - WriteQRegister(rt2, MemRead(address2), NoRegLog); + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + VIXL_DEFINE_OR_RETURN(value2, MemRead(address2)); + WriteQRegister(rt, value, NoRegLog); + WriteQRegister(rt2, value2, NoRegLog); rt_is_vreg = true; break; } case LDPSW_x: { - WriteXRegister(rt, MemRead(address), NoRegLog); - WriteXRegister(rt2, MemRead(address2), NoRegLog); + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + VIXL_DEFINE_OR_RETURN(value2, MemRead(address2)); + WriteXRegister(rt, value, NoRegLog); + WriteXRegister(rt2, value2, NoRegLog); sign_extend = true; break; } case STP_w: { - MemWrite(address, ReadWRegister(rt)); - MemWrite(address2, ReadWRegister(rt2)); + if (!MemWrite(address, ReadWRegister(rt))) return; + if (!MemWrite(address2, ReadWRegister(rt2))) return; break; } case STP_s: { - MemWrite(address, ReadSRegister(rt)); - MemWrite(address2, ReadSRegister(rt2)); + if (!MemWrite(address, ReadSRegister(rt))) return; + if (!MemWrite(address2, ReadSRegister(rt2))) return; rt_is_vreg = true; break; } case STP_x: { - MemWrite(address, ReadXRegister(rt)); - MemWrite(address2, ReadXRegister(rt2)); + if (!MemWrite(address, ReadXRegister(rt))) return; + if (!MemWrite(address2, ReadXRegister(rt2))) return; break; } case STP_d: { - MemWrite(address, ReadDRegister(rt)); - MemWrite(address2, ReadDRegister(rt2)); + if (!MemWrite(address, ReadDRegister(rt))) return; + if (!MemWrite(address2, ReadDRegister(rt2))) return; rt_is_vreg = true; break; } case STP_q: { - MemWrite(address, ReadQRegister(rt)); - MemWrite(address2, ReadQRegister(rt2)); + if (!MemWrite(address, ReadQRegister(rt))) return; + if (!MemWrite(address2, ReadQRegister(rt2))) return; rt_is_vreg = true; break; } @@ -4571,18 
+4748,19 @@ void Simulator::CompareAndSwapHelper(const Instruction* instr) { // associated with that location, even if the compare subsequently fails. local_monitor_.Clear(); - T data = MemRead(address); + VIXL_DEFINE_OR_RETURN(data, MemRead(address)); + if (is_acquire) { // Approximate load-acquire by issuing a full barrier after the load. - __sync_synchronize(); + VIXL_SYNC(); } if (data == comparevalue) { if (is_release) { // Approximate store-release by issuing a full barrier before the store. - __sync_synchronize(); + VIXL_SYNC(); } - MemWrite(address, newvalue); + if (!MemWrite(address, newvalue)) return; LogWrite(rt, GetPrintRegisterFormatForSize(element_size), address); } WriteRegister(rs, data, NoRegLog); @@ -4618,12 +4796,12 @@ void Simulator::CompareAndSwapPairHelper(const Instruction* instr) { // associated with that location, even if the compare subsequently fails. local_monitor_.Clear(); - T data_low = MemRead(address); - T data_high = MemRead(address2); + VIXL_DEFINE_OR_RETURN(data_low, MemRead(address)); + VIXL_DEFINE_OR_RETURN(data_high, MemRead(address2)); if (is_acquire) { // Approximate load-acquire by issuing a full barrier after the load. - __sync_synchronize(); + VIXL_SYNC(); } bool same = @@ -4631,11 +4809,11 @@ void Simulator::CompareAndSwapPairHelper(const Instruction* instr) { if (same) { if (is_release) { // Approximate store-release by issuing a full barrier before the store. - __sync_synchronize(); + VIXL_SYNC(); } - MemWrite(address, newvalue_low); - MemWrite(address2, newvalue_high); + if (!MemWrite(address, newvalue_low)) return; + if (!MemWrite(address2, newvalue_high)) return; } WriteRegister(rs + 1, data_high, NoRegLog); @@ -4652,6 +4830,7 @@ void Simulator::CompareAndSwapPairHelper(const Instruction* instr) { } bool Simulator::CanReadMemory(uintptr_t address, size_t size) { +#ifndef _WIN32 // To simulate fault-tolerant loads, we need to know what host addresses we // can access without generating a real fault. 
One way to do that is to // attempt to `write()` the memory to a placeholder pipe[1]. This is more @@ -4709,6 +4888,44 @@ bool Simulator::CanReadMemory(uintptr_t address, size_t size) { } return can_read; +#else + // To simulate fault-tolerant loads, we need to know what host addresses we + // can access without generating a real fault + // The pipe code above is almost but not fully compatible with Windows + // Instead, use the platform specific API VirtualQuery() + // + // [2]: https://stackoverflow.com/a/18395247/9109981 + + bool can_read = true; + MEMORY_BASIC_INFORMATION pageInfo; + + size_t checked = 0; + while (can_read && (checked < size)) { + size_t result = VirtualQuery(reinterpret_cast(address + checked), + &pageInfo, + sizeof(pageInfo)); + + if (result < 0) { + can_read = false; + break; + } + + if (pageInfo.State != MEM_COMMIT) { + can_read = false; + break; + } + + if (pageInfo.Protect == PAGE_NOACCESS || pageInfo.Protect == PAGE_EXECUTE) { + can_read = false; + break; + } + checked += pageInfo.RegionSize - + ((address + checked) - + reinterpret_cast(pageInfo.BaseAddress)); + } + + return can_read; +#endif } void Simulator::PrintExclusiveAccessWarning() { @@ -4802,54 +5019,66 @@ void Simulator::VisitLoadStoreExclusive(const Instruction* instr) { case LDXRB_w: case LDAXRB_w: case LDARB_w: - case LDLARB: - WriteWRegister(rt, MemRead(address), NoRegLog); + case LDLARB: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteWRegister(rt, value, NoRegLog); reg_size = kWRegSizeInBytes; break; + } case LDXRH_w: case LDAXRH_w: case LDARH_w: - case LDLARH: - WriteWRegister(rt, MemRead(address), NoRegLog); + case LDLARH: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteWRegister(rt, value, NoRegLog); reg_size = kWRegSizeInBytes; break; + } case LDXR_w: case LDAXR_w: case LDAR_w: - case LDLAR_w: - WriteWRegister(rt, MemRead(address), NoRegLog); + case LDLAR_w: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteWRegister(rt, value, 
NoRegLog); reg_size = kWRegSizeInBytes; break; + } case LDXR_x: case LDAXR_x: case LDAR_x: - case LDLAR_x: - WriteXRegister(rt, MemRead(address), NoRegLog); + case LDLAR_x: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteXRegister(rt, value, NoRegLog); reg_size = kXRegSizeInBytes; break; + } case LDXP_w: - case LDAXP_w: - WriteWRegister(rt, MemRead(address), NoRegLog); - WriteWRegister(rt2, - MemRead(address + element_size), - NoRegLog); + case LDAXP_w: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + VIXL_DEFINE_OR_RETURN(value2, + MemRead(address + element_size)); + WriteWRegister(rt, value, NoRegLog); + WriteWRegister(rt2, value2, NoRegLog); reg_size = kWRegSizeInBytes; break; + } case LDXP_x: - case LDAXP_x: - WriteXRegister(rt, MemRead(address), NoRegLog); - WriteXRegister(rt2, - MemRead(address + element_size), - NoRegLog); + case LDAXP_x: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + VIXL_DEFINE_OR_RETURN(value2, + MemRead(address + element_size)); + WriteXRegister(rt, value, NoRegLog); + WriteXRegister(rt2, value2, NoRegLog); reg_size = kXRegSizeInBytes; break; + } default: VIXL_UNREACHABLE(); } if (is_acquire_release) { // Approximate load-acquire by issuing a full barrier after the load. - __sync_synchronize(); + VIXL_SYNC(); } PrintRegisterFormat format = GetPrintRegisterFormatForSize(reg_size); @@ -4861,7 +5090,7 @@ void Simulator::VisitLoadStoreExclusive(const Instruction* instr) { if (is_acquire_release) { // Approximate store-release by issuing a full barrier before the // store. 
- __sync_synchronize(); + VIXL_SYNC(); } bool do_store = true; @@ -4883,35 +5112,41 @@ void Simulator::VisitLoadStoreExclusive(const Instruction* instr) { case STLXRB_w: case STLRB_w: case STLLRB: - MemWrite(address, ReadWRegister(rt)); + if (!MemWrite(address, ReadWRegister(rt))) return; break; case STXRH_w: case STLXRH_w: case STLRH_w: case STLLRH: - MemWrite(address, ReadWRegister(rt)); + if (!MemWrite(address, ReadWRegister(rt))) return; break; case STXR_w: case STLXR_w: case STLR_w: case STLLR_w: - MemWrite(address, ReadWRegister(rt)); + if (!MemWrite(address, ReadWRegister(rt))) return; break; case STXR_x: case STLXR_x: case STLR_x: case STLLR_x: - MemWrite(address, ReadXRegister(rt)); + if (!MemWrite(address, ReadXRegister(rt))) return; break; case STXP_w: case STLXP_w: - MemWrite(address, ReadWRegister(rt)); - MemWrite(address + element_size, ReadWRegister(rt2)); + if (!MemWrite(address, ReadWRegister(rt))) return; + if (!MemWrite(address + element_size, + ReadWRegister(rt2))) { + return; + } break; case STXP_x: case STLXP_x: - MemWrite(address, ReadXRegister(rt)); - MemWrite(address + element_size, ReadXRegister(rt2)); + if (!MemWrite(address, ReadXRegister(rt))) return; + if (!MemWrite(address + element_size, + ReadXRegister(rt2))) { + return; + } break; default: VIXL_UNREACHABLE(); @@ -4944,11 +5179,11 @@ void Simulator::AtomicMemorySimpleHelper(const Instruction* instr) { T value = ReadRegister(rs); - T data = MemRead(address); + VIXL_DEFINE_OR_RETURN(data, MemRead(address)); if (is_acquire) { // Approximate load-acquire by issuing a full barrier after the load. - __sync_synchronize(); + VIXL_SYNC(); } T result = 0; @@ -4982,7 +5217,7 @@ void Simulator::AtomicMemorySimpleHelper(const Instruction* instr) { if (is_release) { // Approximate store-release by issuing a full barrier before the store. 
- __sync_synchronize(); + VIXL_SYNC(); } WriteRegister(rt, data, NoRegLog); @@ -4994,7 +5229,7 @@ void Simulator::AtomicMemorySimpleHelper(const Instruction* instr) { PrintRegisterFormat format = GetPrintRegisterFormatForSize(register_size); LogExtendingRead(rt, format, element_size, address); - MemWrite(address, result); + if (!MemWrite(address, result)) return; format = GetPrintRegisterFormatForSize(element_size); LogWrite(rs, format, address); } @@ -5013,17 +5248,18 @@ void Simulator::AtomicMemorySwapHelper(const Instruction* instr) { CheckIsValidUnalignedAtomicAccess(rn, address, element_size); - T data = MemRead(address); + VIXL_DEFINE_OR_RETURN(data, MemRead(address)); + if (is_acquire) { // Approximate load-acquire by issuing a full barrier after the load. - __sync_synchronize(); + VIXL_SYNC(); } if (is_release) { // Approximate store-release by issuing a full barrier before the store. - __sync_synchronize(); + VIXL_SYNC(); } - MemWrite(address, ReadRegister(rs)); + if (!MemWrite(address, ReadRegister(rs))) return; WriteRegister(rt, data); @@ -5042,10 +5278,12 @@ void Simulator::LoadAcquireRCpcHelper(const Instruction* instr) { CheckIsValidUnalignedAtomicAccess(rn, address, element_size); - WriteRegister(rt, MemRead(address)); + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + + WriteRegister(rt, value); // Approximate load-acquire by issuing a full barrier after the load. - __sync_synchronize(); + VIXL_SYNC(); LogRead(rt, GetPrintRegisterFormatForSize(element_size), address); } @@ -5162,30 +5400,42 @@ void Simulator::VisitLoadLiteral(const Instruction* instr) { switch (instr->Mask(LoadLiteralMask)) { // Use NoRegLog to suppress the register trace (LOG_REGS, LOG_VREGS), then // print a more detailed log. 
- case LDR_w_lit: - WriteWRegister(rt, MemRead(address), NoRegLog); + case LDR_w_lit: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteWRegister(rt, value, NoRegLog); LogRead(rt, kPrintWReg, address); break; - case LDR_x_lit: - WriteXRegister(rt, MemRead(address), NoRegLog); + } + case LDR_x_lit: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteXRegister(rt, value, NoRegLog); LogRead(rt, kPrintXReg, address); break; - case LDR_s_lit: - WriteSRegister(rt, MemRead(address), NoRegLog); + } + case LDR_s_lit: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteSRegister(rt, value, NoRegLog); LogVRead(rt, kPrintSRegFP, address); break; - case LDR_d_lit: - WriteDRegister(rt, MemRead(address), NoRegLog); + } + case LDR_d_lit: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteDRegister(rt, value, NoRegLog); LogVRead(rt, kPrintDRegFP, address); break; - case LDR_q_lit: - WriteQRegister(rt, MemRead(address), NoRegLog); + } + case LDR_q_lit: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteQRegister(rt, value, NoRegLog); LogVRead(rt, kPrintReg1Q, address); break; - case LDRSW_x_lit: - WriteXRegister(rt, MemRead(address), NoRegLog); + } + case LDRSW_x_lit: { + VIXL_DEFINE_OR_RETURN(value, MemRead(address)); + WriteXRegister(rt, value, NoRegLog); LogExtendingRead(rt, kPrintXReg, kWRegSizeInBytes, address); break; + } // Ignore prfm hint instructions. case PRFM_lit: @@ -5294,7 +5544,7 @@ void Simulator::VisitConditionalSelect(const Instruction* instr) { break; case CSNEG_w: case CSNEG_x: - new_val = -new_val; + new_val = UnsignedNegate(new_val); break; default: VIXL_UNIMPLEMENTED(); @@ -6014,6 +6264,8 @@ void Simulator::VisitFPIntegerConvert(const Instruction* instr) { WriteDRegisterBits(dst, ReadXRegister(src)); break; case FMOV_d1_x: + // Zero bits beyond the MSB of a Q register. 
+ mov(kFormat16B, ReadVRegister(dst), ReadVRegister(dst)); LogicVRegister(ReadVRegister(dst)) .SetUint(kFormatD, 1, ReadXRegister(src)); break; @@ -6664,7 +6916,7 @@ bool Simulator::FPProcessNaNs(const Instruction* instr) { } -void Simulator::SysOp_W(int op, int64_t val) { +bool Simulator::SysOp_W(int op, int64_t val) { switch (op) { case IVAU: case CVAC: @@ -6683,15 +6935,30 @@ void Simulator::SysOp_W(int op, int64_t val) { // so temporarily disable MTE. bool mte_enabled = MetaDataDepot::MetaDataMTE::IsActive(); MetaDataDepot::MetaDataMTE::SetActive(false); - volatile uint8_t y = MemRead(val); + volatile uint8_t y = *MemRead(val); MetaDataDepot::MetaDataMTE::SetActive(mte_enabled); USE(y); - // TODO: Implement ZVA, GVA, GZVA. break; } + case ZVA: { + if ((dczid_ & 0x10) != 0) { // Check dc zva is enabled. + return false; + } + int blocksize = (1 << (dczid_ & 0xf)) * kWRegSizeInBytes; + VIXL_ASSERT(IsMultiple(blocksize, sizeof(uint64_t))); + uintptr_t addr = AlignDown(val, blocksize); + for (int i = 0; i < blocksize; i += sizeof(uint64_t)) { + MemWrite(addr + i, 0); + LogWriteU64(0, addr + i); + } + break; + } + // TODO: Implement GVA, GZVA. default: VIXL_UNIMPLEMENTED(); + return false; } + return true; } void Simulator::PACHelper(int dst, @@ -6763,8 +7030,8 @@ void Simulator::VisitSystem(const Instruction* instr) { break; case RNDR: case RNDRRS: { - uint64_t high = jrand48(rand_state_); - uint64_t low = jrand48(rand_state_); + uint64_t high = rand_gen_(); + uint64_t low = rand_gen_(); uint64_t rand_num = (high << 32) | (low & 0xffffffff); WriteXRegister(instr->GetRt(), rand_num); // Simulate successful random number generation. 
@@ -6774,10 +7041,21 @@ void Simulator::VisitSystem(const Instruction* instr) { LogSystemRegister(NZCV); break; } + case DCZID_EL0: + WriteXRegister(instr->GetRt(), dczid_); + break; default: VIXL_UNIMPLEMENTED(); } break; + case "chkfeat_hf_hints"_h: { + uint64_t feat_select = ReadXRegister(16); + uint64_t gcs_enabled = IsGCSCheckEnabled() ? 1 : 0; + feat_select &= ~gcs_enabled; + WriteXRegister(16, feat_select); + break; + } + case "hint_hm_hints"_h: case "nop_hi_hints"_h: case "esb_hi_hints"_h: case "csdb_hi_hints"_h: @@ -6859,11 +7137,68 @@ void Simulator::VisitSystem(const Instruction* instr) { case "dsb_bo_barriers"_h: case "dmb_bo_barriers"_h: case "isb_bi_barriers"_h: - __sync_synchronize(); + VIXL_SYNC(); break; - case "sys_cr_systeminstrs"_h: - SysOp_W(instr->GetSysOp(), ReadXRegister(instr->GetRt())); + case "sys_cr_systeminstrs"_h: { + uint64_t rt = ReadXRegister(instr->GetRt()); + uint32_t sysop = instr->GetSysOp(); + if (sysop == GCSSS1) { + uint64_t incoming_size = rt >> 32; + // Drop upper 32 bits to get GCS index. + uint64_t incoming_gcs = rt & 0xffffffff; + uint64_t outgoing_gcs = ActivateGCS(incoming_gcs); + uint64_t incoming_seal = GCSPop(); + if (((incoming_seal ^ rt) != 1) || + (GetActiveGCSPtr()->size() != incoming_size)) { + char msg[128]; + snprintf(msg, + sizeof(msg), + "GCS: invalid incoming stack: 0x%016" PRIx64 "\n", + incoming_seal); + ReportGCSFailure(msg); + } + GCSPush(outgoing_gcs + 5); + } else if (sysop == GCSPUSHM) { + GCSPush(ReadXRegister(instr->GetRt())); + } else { + if (!SysOp_W(sysop, rt)) { + VisitUnallocated(instr); + } + } break; + } + case "sysl_rc_systeminstrs"_h: { + uint32_t sysop = instr->GetSysOp(); + if (sysop == GCSPOPM) { + uint64_t addr = GCSPop(); + WriteXRegister(instr->GetRt(), addr); + } else if (sysop == GCSSS2) { + uint64_t outgoing_gcs = GCSPop(); + // Check for token inserted by gcsss1. 
+ if ((outgoing_gcs & 7) != 5) { + char msg[128]; + snprintf(msg, + sizeof(msg), + "GCS: outgoing stack has no token: 0x%016" PRIx64 "\n", + outgoing_gcs); + ReportGCSFailure(msg); + } + uint64_t incoming_gcs = ActivateGCS(outgoing_gcs); + outgoing_gcs &= ~UINT64_C(0x3ff); + + // Encode the size into the outgoing stack seal, to check later. + uint64_t size = GetActiveGCSPtr()->size(); + VIXL_ASSERT(IsUint32(size)); + VIXL_ASSERT(IsUint32(outgoing_gcs + 1)); + uint64_t outgoing_seal = (size << 32) | (outgoing_gcs + 1); + GCSPush(outgoing_seal); + ActivateGCS(incoming_gcs); + WriteXRegister(instr->GetRt(), outgoing_seal - 1); + } else { + VIXL_UNIMPLEMENTED(); + } + break; + } default: VIXL_UNIMPLEMENTED(); } @@ -6928,19 +7263,161 @@ void Simulator::VisitException(const Instruction* instr) { void Simulator::VisitCrypto2RegSHA(const Instruction* instr) { - VisitUnimplemented(instr); + SimVRegister& rd = ReadVRegister(instr->GetRd()); + SimVRegister& rn = ReadVRegister(instr->GetRn()); + + switch (form_hash_) { + case "sha1h_ss_cryptosha2"_h: + ror(kFormatS, rd, rn, 2); + break; + case "sha1su1_vv_cryptosha2"_h: { + SimVRegister temp; + + // temp = srcdst ^ (src >> 32); + ext(kFormat16B, temp, rn, temp, 4); + eor(kFormat16B, temp, rd, temp); + + // srcdst = ROL(temp, 1) ^ (ROL(temp, 2) << 96) + rol(kFormat4S, rd, temp, 1); + rol(kFormatS, temp, temp, 2); // kFormatS will zero bits <127:32> + ext(kFormat16B, temp, temp, temp, 4); + eor(kFormat16B, rd, rd, temp); + break; + } + case "sha256su0_vv_cryptosha2"_h: + sha2su0(rd, rn); + break; + } } void Simulator::VisitCrypto3RegSHA(const Instruction* instr) { - VisitUnimplemented(instr); + SimVRegister& rd = ReadVRegister(instr->GetRd()); + SimVRegister& rn = ReadVRegister(instr->GetRn()); + SimVRegister& rm = ReadVRegister(instr->GetRm()); + + switch (form_hash_) { + case "sha1c_qsv_cryptosha3"_h: + sha1<"choose"_h>(rd, rn, rm); + break; + case "sha1m_qsv_cryptosha3"_h: + sha1<"majority"_h>(rd, rn, rm); + break; + case 
"sha1p_qsv_cryptosha3"_h: + sha1<"parity"_h>(rd, rn, rm); + break; + case "sha1su0_vvv_cryptosha3"_h: { + SimVRegister temp; + ext(kFormat16B, temp, rd, rn, 8); + eor(kFormat16B, temp, temp, rd); + eor(kFormat16B, rd, temp, rm); + break; + } + case "sha256h_qqv_cryptosha3"_h: + sha2h(rd, rn, rm, /* part1 = */ true); + break; + case "sha256h2_qqv_cryptosha3"_h: + sha2h(rd, rn, rm, /* part1 = */ false); + break; + case "sha256su1_vvv_cryptosha3"_h: + sha2su1(rd, rn, rm); + break; + } } void Simulator::VisitCryptoAES(const Instruction* instr) { - VisitUnimplemented(instr); + SimVRegister& rd = ReadVRegister(instr->GetRd()); + SimVRegister& rn = ReadVRegister(instr->GetRn()); + SimVRegister temp; + + switch (form_hash_) { + case "aesd_b_cryptoaes"_h: + eor(kFormat16B, temp, rd, rn); + aes(rd, temp, /* decrypt = */ true); + break; + case "aese_b_cryptoaes"_h: + eor(kFormat16B, temp, rd, rn); + aes(rd, temp, /* decrypt = */ false); + break; + case "aesimc_b_cryptoaes"_h: + aesmix(rd, rn, /* inverse = */ true); + break; + case "aesmc_b_cryptoaes"_h: + aesmix(rd, rn, /* inverse = */ false); + break; + } } +void Simulator::VisitCryptoSM3(const Instruction* instr) { + SimVRegister& rd = ReadVRegister(instr->GetRd()); + SimVRegister& rn = ReadVRegister(instr->GetRn()); + SimVRegister& rm = ReadVRegister(instr->GetRm()); + SimVRegister& ra = ReadVRegister(instr->GetRa()); + int index = instr->ExtractBits(13, 12); + + bool is_a = false; + switch (form_hash_) { + case "sm3partw1_vvv4_cryptosha512_3"_h: + sm3partw1(rd, rn, rm); + break; + case "sm3partw2_vvv4_cryptosha512_3"_h: + sm3partw2(rd, rn, rm); + break; + case "sm3ss1_vvv4_crypto4"_h: + sm3ss1(rd, rn, rm, ra); + break; + case "sm3tt1a_vvv4_crypto3_imm2"_h: + is_a = true; + VIXL_FALLTHROUGH(); + case "sm3tt1b_vvv4_crypto3_imm2"_h: + sm3tt1(rd, rn, rm, index, is_a); + break; + case "sm3tt2a_vvv4_crypto3_imm2"_h: + is_a = true; + VIXL_FALLTHROUGH(); + case "sm3tt2b_vvv_crypto3_imm2"_h: + sm3tt2(rd, rn, rm, index, is_a); + 
break; + } +} + +void Simulator::VisitCryptoSM4(const Instruction* instr) { + SimVRegister& rd = ReadVRegister(instr->GetRd()); + SimVRegister& rn = ReadVRegister(instr->GetRn()); + SimVRegister& rm = ReadVRegister(instr->GetRm()); + + bool is_key = false; + switch (form_hash_) { + case "sm4ekey_vvv4_cryptosha512_3"_h: + is_key = true; + VIXL_FALLTHROUGH(); + case "sm4e_vv4_cryptosha512_2"_h: + sm4(rd, rn, rm, is_key); + break; + } +} + +void Simulator::SimulateSHA512(const Instruction* instr) { + SimVRegister& rd = ReadVRegister(instr->GetRd()); + SimVRegister& rn = ReadVRegister(instr->GetRn()); + SimVRegister& rm = ReadVRegister(instr->GetRm()); + + switch (form_hash_) { + case "sha512h_qqv_cryptosha512_3"_h: + sha512h(rd, rn, rm); + break; + case "sha512h2_qqv_cryptosha512_3"_h: + sha512h2(rd, rn, rm); + break; + case "sha512su0_vv2_cryptosha512_2"_h: + sha512su0(rd, rn); + break; + case "sha512su1_vvv2_cryptosha512_3"_h: + sha512su1(rd, rn, rm); + break; + } +} void Simulator::VisitNEON2RegMisc(const Instruction* instr) { NEONFormatDecoder nfd(instr); @@ -7704,13 +8181,24 @@ void Simulator::VisitNEON3Different(const Instruction* instr) { SimVRegister& rd = ReadVRegister(instr->GetRd()); SimVRegister& rn = ReadVRegister(instr->GetRn()); SimVRegister& rm = ReadVRegister(instr->GetRm()); + int size = instr->GetNEONSize(); switch (instr->Mask(NEON3DifferentMask)) { case NEON_PMULL: - pmull(vf_l, rd, rn, rm); + if ((size == 1) || (size == 2)) { // S/D reserved. + VisitUnallocated(instr); + } else { + if (size == 3) vf_l = kFormat1Q; + pmull(vf_l, rd, rn, rm); + } break; case NEON_PMULL2: - pmull2(vf_l, rd, rn, rm); + if ((size == 1) || (size == 2)) { // S/D reserved. 
+ VisitUnallocated(instr); + } else { + if (size == 3) vf_l = kFormat1Q; + pmull2(vf_l, rd, rn, rm); + } break; case NEON_UADDL: uaddl(vf_l, rd, rn, rm); @@ -7948,22 +8436,14 @@ void Simulator::VisitNEONAcrossLanes(const Instruction* instr) { void Simulator::SimulateNEONMulByElementLong(const Instruction* instr) { NEONFormatDecoder nfd(instr); VectorFormat vf = nfd.GetVectorFormat(nfd.LongIntegerFormatMap()); - SimVRegister& rd = ReadVRegister(instr->GetRd()); SimVRegister& rn = ReadVRegister(instr->GetRn()); - int rm_reg = instr->GetRm(); - int index = (instr->GetNEONH() << 1) | instr->GetNEONL(); - if (instr->GetNEONSize() == 1) { - rm_reg = instr->GetRmLow16(); - index = (index << 1) | instr->GetNEONM(); - } - SimVRegister& rm = ReadVRegister(rm_reg); - + std::pair rm_and_index = instr->GetNEONMulRmAndIndex(); SimVRegister temp; VectorFormat indexform = VectorFormatHalfWidthDoubleLanes(VectorFormatFillQ(vf)); - dup_element(indexform, temp, rm, index); + dup_elements_to_segments(indexform, temp, rm_and_index); bool is_2 = instr->Mask(NEON_Q) ? 
true : false; @@ -8037,21 +8517,9 @@ void Simulator::SimulateNEONFPMulByElement(const Instruction* instr) { SimVRegister& rd = ReadVRegister(instr->GetRd()); SimVRegister& rn = ReadVRegister(instr->GetRn()); - int rm_reg = instr->GetRm(); - int index = - (instr->GetNEONH() << 2) | (instr->GetNEONL() << 1) | instr->GetNEONM(); - - if ((vform == kFormat4H) || (vform == kFormat8H)) { - rm_reg &= 0xf; - } else if ((vform == kFormat2S) || (vform == kFormat4S)) { - index >>= 1; - } else { - VIXL_ASSERT(vform == kFormat2D); - VIXL_ASSERT(instr->GetNEONL() == 0); - index >>= 2; - } - - SimVRegister& rm = ReadVRegister(rm_reg); + std::pair rm_and_index = instr->GetNEONMulRmAndIndex(); + SimVRegister& rm = ReadVRegister(rm_and_index.first); + int index = rm_and_index.second; switch (form_hash_) { case "fmul_asimdelem_rh_h"_h: @@ -8131,15 +8599,9 @@ void Simulator::VisitNEONByIndexedElement(const Instruction* instr) { SimVRegister& rd = ReadVRegister(instr->GetRd()); SimVRegister& rn = ReadVRegister(instr->GetRn()); - int rm_reg = instr->GetRm(); - int index = (instr->GetNEONH() << 1) | instr->GetNEONL(); - - if ((vform == kFormat4H) || (vform == kFormat8H)) { - rm_reg &= 0xf; - index = (index << 1) | instr->GetNEONM(); - } - - SimVRegister& rm = ReadVRegister(rm_reg); + std::pair rm_and_index = instr->GetNEONMulRmAndIndex(); + SimVRegister& rm = ReadVRegister(rm_and_index.first); + int index = rm_and_index.second; switch (form_hash_) { case "mul_asimdelem_r"_h: @@ -8180,8 +8642,10 @@ void Simulator::VisitNEONCopy(const Instruction* instr) { if (instr->Mask(NEONCopyInsElementMask) == NEON_INS_ELEMENT) { int imm4 = instr->GetImmNEON4(); int rn_index = ExtractSignedBitfield32(31, tz, imm4); + mov(kFormat16B, rd, rd); // Zero bits beyond the MSB of a Q register. ins_element(vf, rd, reg_index, rn, rn_index); } else if (instr->Mask(NEONCopyInsGeneralMask) == NEON_INS_GENERAL) { + mov(kFormat16B, rd, rd); // Zero bits beyond the MSB of a Q register. 
ins_immediate(vf, rd, reg_index, ReadXRegister(instr->GetRn())); } else if (instr->Mask(NEONCopyUmovMask) == NEON_UMOV) { uint64_t value = LogicVRegister(rn).Uint(vf, reg_index); @@ -8249,97 +8713,117 @@ void Simulator::NEONLoadStoreMultiStructHelper(const Instruction* instr, switch (instr->Mask(NEONLoadStoreMultiStructPostIndexMask)) { case NEON_LD1_4v: case NEON_LD1_4v_post: - ld1(vf, ReadVRegister(reg[3]), addr[3]); + if (!ld1(vf, ReadVRegister(reg[3]), addr[3])) { + return; + } reg_count++; VIXL_FALLTHROUGH(); case NEON_LD1_3v: case NEON_LD1_3v_post: - ld1(vf, ReadVRegister(reg[2]), addr[2]); + if (!ld1(vf, ReadVRegister(reg[2]), addr[2])) { + return; + } reg_count++; VIXL_FALLTHROUGH(); case NEON_LD1_2v: case NEON_LD1_2v_post: - ld1(vf, ReadVRegister(reg[1]), addr[1]); + if (!ld1(vf, ReadVRegister(reg[1]), addr[1])) { + return; + } reg_count++; VIXL_FALLTHROUGH(); case NEON_LD1_1v: case NEON_LD1_1v_post: - ld1(vf, ReadVRegister(reg[0]), addr[0]); + if (!ld1(vf, ReadVRegister(reg[0]), addr[0])) { + return; + } break; case NEON_ST1_4v: case NEON_ST1_4v_post: - st1(vf, ReadVRegister(reg[3]), addr[3]); + if (!st1(vf, ReadVRegister(reg[3]), addr[3])) return; reg_count++; VIXL_FALLTHROUGH(); case NEON_ST1_3v: case NEON_ST1_3v_post: - st1(vf, ReadVRegister(reg[2]), addr[2]); + if (!st1(vf, ReadVRegister(reg[2]), addr[2])) return; reg_count++; VIXL_FALLTHROUGH(); case NEON_ST1_2v: case NEON_ST1_2v_post: - st1(vf, ReadVRegister(reg[1]), addr[1]); + if (!st1(vf, ReadVRegister(reg[1]), addr[1])) return; reg_count++; VIXL_FALLTHROUGH(); case NEON_ST1_1v: case NEON_ST1_1v_post: - st1(vf, ReadVRegister(reg[0]), addr[0]); + if (!st1(vf, ReadVRegister(reg[0]), addr[0])) return; log_read = false; break; case NEON_LD2_post: case NEON_LD2: - ld2(vf, ReadVRegister(reg[0]), ReadVRegister(reg[1]), addr[0]); + if (!ld2(vf, ReadVRegister(reg[0]), ReadVRegister(reg[1]), addr[0])) { + return; + } struct_parts = 2; reg_count = 2; break; case NEON_ST2: case NEON_ST2_post: - st2(vf, 
ReadVRegister(reg[0]), ReadVRegister(reg[1]), addr[0]); + if (!st2(vf, ReadVRegister(reg[0]), ReadVRegister(reg[1]), addr[0])) { + return; + } struct_parts = 2; reg_count = 2; log_read = false; break; case NEON_LD3_post: case NEON_LD3: - ld3(vf, - ReadVRegister(reg[0]), - ReadVRegister(reg[1]), - ReadVRegister(reg[2]), - addr[0]); + if (!ld3(vf, + ReadVRegister(reg[0]), + ReadVRegister(reg[1]), + ReadVRegister(reg[2]), + addr[0])) { + return; + } struct_parts = 3; reg_count = 3; break; case NEON_ST3: case NEON_ST3_post: - st3(vf, - ReadVRegister(reg[0]), - ReadVRegister(reg[1]), - ReadVRegister(reg[2]), - addr[0]); + if (!st3(vf, + ReadVRegister(reg[0]), + ReadVRegister(reg[1]), + ReadVRegister(reg[2]), + addr[0])) { + return; + } struct_parts = 3; reg_count = 3; log_read = false; break; case NEON_ST4: case NEON_ST4_post: - st4(vf, - ReadVRegister(reg[0]), - ReadVRegister(reg[1]), - ReadVRegister(reg[2]), - ReadVRegister(reg[3]), - addr[0]); + if (!st4(vf, + ReadVRegister(reg[0]), + ReadVRegister(reg[1]), + ReadVRegister(reg[2]), + ReadVRegister(reg[3]), + addr[0])) { + return; + } struct_parts = 4; reg_count = 4; log_read = false; break; case NEON_LD4_post: case NEON_LD4: - ld4(vf, - ReadVRegister(reg[0]), - ReadVRegister(reg[1]), - ReadVRegister(reg[2]), - ReadVRegister(reg[3]), - addr[0]); + if (!ld4(vf, + ReadVRegister(reg[0]), + ReadVRegister(reg[1]), + ReadVRegister(reg[2]), + ReadVRegister(reg[3]), + addr[0])) { + return; + } struct_parts = 4; reg_count = 4; break; @@ -8514,75 +8998,95 @@ void Simulator::NEONLoadStoreSingleStructHelper(const Instruction* instr, reg_count = 1; if (replicating) { VIXL_ASSERT(do_load); - ld1r(vf, ReadVRegister(rt), addr); + if (!ld1r(vf, ReadVRegister(rt), addr)) { + return; + } } else if (do_load) { - ld1(vf, ReadVRegister(rt), lane, addr); + if (!ld1(vf, ReadVRegister(rt), lane, addr)) { + return; + } } else { - st1(vf, ReadVRegister(rt), lane, addr); + if (!st1(vf, ReadVRegister(rt), lane, addr)) return; } break; case 
NEONLoadStoreSingle2: reg_count = 2; if (replicating) { VIXL_ASSERT(do_load); - ld2r(vf, ReadVRegister(rt), ReadVRegister(rt2), addr); + if (!ld2r(vf, ReadVRegister(rt), ReadVRegister(rt2), addr)) { + return; + } } else if (do_load) { - ld2(vf, ReadVRegister(rt), ReadVRegister(rt2), lane, addr); + if (!ld2(vf, ReadVRegister(rt), ReadVRegister(rt2), lane, addr)) { + return; + } } else { - st2(vf, ReadVRegister(rt), ReadVRegister(rt2), lane, addr); + if (!st2(vf, ReadVRegister(rt), ReadVRegister(rt2), lane, addr)) return; } break; case NEONLoadStoreSingle3: reg_count = 3; if (replicating) { VIXL_ASSERT(do_load); - ld3r(vf, - ReadVRegister(rt), - ReadVRegister(rt2), - ReadVRegister(rt3), - addr); + if (!ld3r(vf, + ReadVRegister(rt), + ReadVRegister(rt2), + ReadVRegister(rt3), + addr)) { + return; + } } else if (do_load) { - ld3(vf, - ReadVRegister(rt), - ReadVRegister(rt2), - ReadVRegister(rt3), - lane, - addr); + if (!ld3(vf, + ReadVRegister(rt), + ReadVRegister(rt2), + ReadVRegister(rt3), + lane, + addr)) { + return; + } } else { - st3(vf, - ReadVRegister(rt), - ReadVRegister(rt2), - ReadVRegister(rt3), - lane, - addr); + if (!st3(vf, + ReadVRegister(rt), + ReadVRegister(rt2), + ReadVRegister(rt3), + lane, + addr)) { + return; + } } break; case NEONLoadStoreSingle4: reg_count = 4; if (replicating) { VIXL_ASSERT(do_load); - ld4r(vf, - ReadVRegister(rt), - ReadVRegister(rt2), - ReadVRegister(rt3), - ReadVRegister(rt4), - addr); + if (!ld4r(vf, + ReadVRegister(rt), + ReadVRegister(rt2), + ReadVRegister(rt3), + ReadVRegister(rt4), + addr)) { + return; + } } else if (do_load) { - ld4(vf, - ReadVRegister(rt), - ReadVRegister(rt2), - ReadVRegister(rt3), - ReadVRegister(rt4), - lane, - addr); + if (!ld4(vf, + ReadVRegister(rt), + ReadVRegister(rt2), + ReadVRegister(rt3), + ReadVRegister(rt4), + lane, + addr)) { + return; + } } else { - st4(vf, - ReadVRegister(rt), - ReadVRegister(rt2), - ReadVRegister(rt3), - ReadVRegister(rt4), - lane, - addr); + if (!st4(vf, + 
ReadVRegister(rt), + ReadVRegister(rt2), + ReadVRegister(rt3), + ReadVRegister(rt4), + lane, + addr)) { + return; + } } break; default: @@ -8676,7 +9180,7 @@ void Simulator::VisitNEONModifiedImmediate(const Instruction* instr) { vform = q ? kFormat2D : kFormat1D; imm = 0; for (int i = 0; i < 8; ++i) { - if (imm8 & (1 << i)) { + if (imm8 & (uint64_t{1} << i)) { imm |= (UINT64_C(0xff) << (8 * i)); } } @@ -9156,78 +9660,76 @@ void Simulator::VisitNEONScalar3SameExtra(const Instruction* instr) { void Simulator::VisitNEONScalarByIndexedElement(const Instruction* instr) { NEONFormatDecoder nfd(instr, NEONFormatDecoder::LongScalarFormatMap()); VectorFormat vf = nfd.GetVectorFormat(); - VectorFormat vf_r = nfd.GetVectorFormat(nfd.ScalarFormatMap()); - SimVRegister& rd = ReadVRegister(instr->GetRd()); SimVRegister& rn = ReadVRegister(instr->GetRn()); ByElementOp Op = NULL; - int rm_reg = instr->GetRm(); - int index = (instr->GetNEONH() << 1) | instr->GetNEONL(); - if (instr->GetNEONSize() == 1) { - rm_reg &= 0xf; - index = (index << 1) | instr->GetNEONM(); + std::pair rm_and_index = instr->GetNEONMulRmAndIndex(); + std::unordered_map handler = { + {"sqdmull_asisdelem_l"_h, &Simulator::sqdmull}, + {"sqdmlal_asisdelem_l"_h, &Simulator::sqdmlal}, + {"sqdmlsl_asisdelem_l"_h, &Simulator::sqdmlsl}, + {"sqdmulh_asisdelem_r"_h, &Simulator::sqdmulh}, + {"sqrdmulh_asisdelem_r"_h, &Simulator::sqrdmulh}, + {"sqrdmlah_asisdelem_r"_h, &Simulator::sqrdmlah}, + {"sqrdmlsh_asisdelem_r"_h, &Simulator::sqrdmlsh}, + {"fmul_asisdelem_rh_h"_h, &Simulator::fmul}, + {"fmul_asisdelem_r_sd"_h, &Simulator::fmul}, + {"fmla_asisdelem_rh_h"_h, &Simulator::fmla}, + {"fmla_asisdelem_r_sd"_h, &Simulator::fmla}, + {"fmls_asisdelem_rh_h"_h, &Simulator::fmls}, + {"fmls_asisdelem_r_sd"_h, &Simulator::fmls}, + {"fmulx_asisdelem_rh_h"_h, &Simulator::fmulx}, + {"fmulx_asisdelem_r_sd"_h, &Simulator::fmulx}, + }; + + std::unordered_map::const_iterator it = + handler.find(form_hash_); + + if (it == handler.end()) { 
+ VIXL_UNIMPLEMENTED(); + } else { + Op = it->second; } - switch (instr->Mask(NEONScalarByIndexedElementMask)) { - case NEON_SQDMULL_byelement_scalar: - Op = &Simulator::sqdmull; + switch (form_hash_) { + case "sqdmull_asisdelem_l"_h: + case "sqdmlal_asisdelem_l"_h: + case "sqdmlsl_asisdelem_l"_h: + if ((vf == kFormatB) || (vf == kFormatH)) { + VisitUnallocated(instr); + return; + } break; - case NEON_SQDMLAL_byelement_scalar: - Op = &Simulator::sqdmlal; + case "sqdmulh_asisdelem_r"_h: + case "sqrdmulh_asisdelem_r"_h: + case "sqrdmlah_asisdelem_r"_h: + case "sqrdmlsh_asisdelem_r"_h: + vf = nfd.GetVectorFormat(nfd.ScalarFormatMap()); + if ((vf == kFormatB) || (vf == kFormatD)) { + VisitUnallocated(instr); + return; + } break; - case NEON_SQDMLSL_byelement_scalar: - Op = &Simulator::sqdmlsl; - break; - case NEON_SQDMULH_byelement_scalar: - Op = &Simulator::sqdmulh; - vf = vf_r; - break; - case NEON_SQRDMULH_byelement_scalar: - Op = &Simulator::sqrdmulh; - vf = vf_r; - break; - case NEON_SQRDMLAH_byelement_scalar: - Op = &Simulator::sqrdmlah; - vf = vf_r; - break; - case NEON_SQRDMLSH_byelement_scalar: - Op = &Simulator::sqrdmlsh; - vf = vf_r; - break; - default: + case "fmul_asisdelem_r_sd"_h: + case "fmla_asisdelem_r_sd"_h: + case "fmls_asisdelem_r_sd"_h: + case "fmulx_asisdelem_r_sd"_h: vf = nfd.GetVectorFormat(nfd.FPScalarFormatMap()); - index = instr->GetNEONH(); - if (instr->GetFPType() == 0) { - index = (index << 2) | (instr->GetNEONL() << 1) | instr->GetNEONM(); - rm_reg &= 0xf; - vf = kFormatH; - } else if ((instr->GetFPType() & 1) == 0) { - index = (index << 1) | instr->GetNEONL(); - } - switch (instr->Mask(NEONScalarByIndexedElementFPMask)) { - case NEON_FMUL_H_byelement_scalar: - case NEON_FMUL_byelement_scalar: - Op = &Simulator::fmul; - break; - case NEON_FMLA_H_byelement_scalar: - case NEON_FMLA_byelement_scalar: - Op = &Simulator::fmla; - break; - case NEON_FMLS_H_byelement_scalar: - case NEON_FMLS_byelement_scalar: - Op = &Simulator::fmls; - break; - 
case NEON_FMULX_H_byelement_scalar: - case NEON_FMULX_byelement_scalar: - Op = &Simulator::fmulx; - break; - default: - VIXL_UNIMPLEMENTED(); - } + break; + case "fmul_asisdelem_rh_h"_h: + case "fmla_asisdelem_rh_h"_h: + case "fmls_asisdelem_rh_h"_h: + case "fmulx_asisdelem_rh_h"_h: + vf = kFormatH; + break; } - (this->*Op)(vf, rd, rn, ReadVRegister(rm_reg), index); + (this->*Op)(vf, + rd, + rn, + ReadVRegister(rm_and_index.first), + rm_and_index.second); } @@ -9634,6 +10136,34 @@ void Simulator::VisitNEONPerm(const Instruction* instr) { } } +void Simulator::SimulateNEONSHA3(const Instruction* instr) { + SimVRegister& rd = ReadVRegister(instr->GetRd()); + SimVRegister& rn = ReadVRegister(instr->GetRn()); + SimVRegister& rm = ReadVRegister(instr->GetRm()); + SimVRegister& ra = ReadVRegister(instr->GetRa()); + SimVRegister temp; + + switch (form_hash_) { + case "bcax_vvv16_crypto4"_h: + bic(kFormat16B, temp, rm, ra); + eor(kFormat16B, rd, rn, temp); + break; + case "eor3_vvv16_crypto4"_h: + eor(kFormat16B, temp, rm, ra); + eor(kFormat16B, rd, rn, temp); + break; + case "rax1_vvv2_cryptosha512_3"_h: + ror(kFormat2D, temp, rm, 63); // rol(1) => ror(63) + eor(kFormat2D, rd, rn, temp); + break; + case "xar_vvv2_crypto3_imm6"_h: + int rot = instr->ExtractBits(15, 10); + eor(kFormat2D, temp, rn, rm); + ror(kFormat2D, rd, temp, rot); + break; + } +} + void Simulator::VisitSVEAddressGeneration(const Instruction* instr) { SimVRegister& zd = ReadVRegister(instr->GetRd()); SimVRegister& zn = ReadVRegister(instr->GetRn()); @@ -11820,7 +12350,7 @@ void Simulator::VisitSVEBroadcastIntImm_Unpredicated(const Instruction* instr) { VectorFormat format = instr->GetSVEVectorFormat(); int64_t imm = instr->GetImmSVEIntWideSigned(); int shift = instr->ExtractBit(13) * 8; - imm *= 1 << shift; + imm *= uint64_t{1} << shift; switch (instr->Mask(SVEBroadcastIntImm_UnpredicatedMask)) { case DUP_z_i: @@ -12062,7 +12592,7 @@ void Simulator::VisitSVELoadAndBroadcastElement(const Instruction* 
instr) { VectorFormat unpack_vform = SVEFormatFromLaneSizeInBytesLog2(msize_in_bytes_log2); SimVRegister temp; - ld1r(vform, unpack_vform, temp, base, is_signed); + if (!ld1r(vform, unpack_vform, temp, base, is_signed)) return; mov_zeroing(vform, ReadVRegister(instr->GetRt()), ReadPRegister(instr->GetPgLow8()), @@ -12079,7 +12609,8 @@ void Simulator::VisitSVELoadPredicateRegister(const Instruction* instr) { uint64_t base = ReadXRegister(instr->GetRn(), Reg31IsStackPointer); uint64_t address = base + multiplier * pl; for (int i = 0; i < pl; i++) { - pt.Insert(i, MemRead(address + i)); + VIXL_DEFINE_OR_RETURN(value, MemRead(address + i)); + pt.Insert(i, value); } LogPRead(instr->GetPt(), address); break; @@ -12100,7 +12631,8 @@ void Simulator::VisitSVELoadVectorRegister(const Instruction* instr) { uint64_t base = ReadXRegister(instr->GetRn(), Reg31IsStackPointer); uint64_t address = base + multiplier * vl; for (int i = 0; i < vl; i++) { - zt.Insert(i, MemRead(address + i)); + VIXL_DEFINE_OR_RETURN(value, MemRead(address + i)); + zt.Insert(i, value); } LogZRead(instr->GetRt(), address); break; @@ -12486,7 +13018,7 @@ void Simulator::VisitSVELoadAndBroadcastQOWord_ScalarPlusImm( VectorFormat vform = SVEFormatFromLaneSizeInBytesLog2(msz); for (unsigned i = 0; i < dwords; i++) { - ld1(kFormatVnD, zt, i, addr + offset + (i * kDRegSizeInBytes)); + if (!ld1(kFormatVnD, zt, i, addr + offset + (i * kDRegSizeInBytes))) return; } mov_zeroing(vform, zt, pg, zt); dup_element(vform_dst, zt, zt, 0); @@ -12513,7 +13045,7 @@ void Simulator::VisitSVELoadAndBroadcastQOWord_ScalarPlusScalar( VectorFormat vform = SVEFormatFromLaneSizeInBytesLog2(msz); offset <<= msz; for (unsigned i = 0; i < bytes; i++) { - ld1(kFormatVnB, zt, i, addr + offset + i); + if (!ld1(kFormatVnB, zt, i, addr + offset + i)) return; } mov_zeroing(vform, zt, pg, zt); dup_element(vform_dst, zt, zt, 0); @@ -12570,7 +13102,7 @@ void Simulator::VisitSVELoadMultipleStructures_ScalarPlusScalar( case 
LD4H_z_p_br_contiguous: case LD4W_z_p_br_contiguous: { int msz = instr->ExtractBits(24, 23); - uint64_t offset = ReadXRegister(instr->GetRm()) * (1 << msz); + uint64_t offset = ReadXRegister(instr->GetRm()) * (uint64_t{1} << msz); VectorFormat vform = SVEFormatFromLaneSizeInBytesLog2(msz); LogicSVEAddressVector addr( ReadXRegister(instr->GetRn(), Reg31IsStackPointer) + offset); @@ -13006,7 +13538,7 @@ void Simulator::VisitSVEStoreMultipleStructures_ScalarPlusScalar( case ST4H_z_p_br_contiguous: case ST4W_z_p_br_contiguous: { int msz = instr->ExtractBits(24, 23); - uint64_t offset = ReadXRegister(instr->GetRm()) * (1 << msz); + uint64_t offset = ReadXRegister(instr->GetRm()) * (uint64_t{1} << msz); VectorFormat vform = SVEFormatFromLaneSizeInBytesLog2(msz); LogicSVEAddressVector addr( ReadXRegister(instr->GetRn(), Reg31IsStackPointer) + offset); @@ -13034,7 +13566,7 @@ void Simulator::VisitSVEStorePredicateRegister(const Instruction* instr) { uint64_t base = ReadXRegister(instr->GetRn(), Reg31IsStackPointer); uint64_t address = base + multiplier * pl; for (int i = 0; i < pl; i++) { - MemWrite(address + i, pt.GetLane(i)); + if (!MemWrite(address + i, pt.GetLane(i))) return; } LogPWrite(instr->GetPt(), address); break; @@ -13055,7 +13587,7 @@ void Simulator::VisitSVEStoreVectorRegister(const Instruction* instr) { uint64_t base = ReadXRegister(instr->GetRn(), Reg31IsStackPointer); uint64_t address = base + multiplier * vl; for (int i = 0; i < vl; i++) { - MemWrite(address + i, zt.GetLane(i)); + if (!MemWrite(address + i, zt.GetLane(i))) return; } LogZWrite(instr->GetRt(), address); break; @@ -14140,7 +14672,7 @@ void Simulator::SimulateMTETagMaskInsert(const Instruction* instr) { uint64_t mask = ReadXRegister(instr->GetRm()); uint64_t tag = GetAllocationTagFromAddress( ReadXRegister(instr->GetRn(), Reg31IsStackPointer)); - uint64_t mask_bit = 1 << tag; + uint64_t mask_bit = uint64_t{1} << tag; WriteXRegister(instr->GetRd(), mask | mask_bit); } @@ -14187,8 +14719,8 @@ 
void Simulator::SimulateMTEStoreTagPair(const Instruction* instr) { int tag = GetAllocationTagFromAddress(rn); meta_data_.SetMTETag(address, tag); - MemWrite(address, rt); - MemWrite(address + kXRegSizeInBytes, rt2); + if (!MemWrite(address, rt)) return; + if (!MemWrite(address + kXRegSizeInBytes, rt2)) return; } void Simulator::SimulateMTEStoreTag(const Instruction* instr) { @@ -14250,8 +14782,7 @@ void Simulator::SimulateMTEStoreTag(const Instruction* instr) { uintptr_t address = AddressModeHelper(instr->GetRn(), offset, addr_mode); if (is_zeroing) { - if (!IsAligned(reinterpret_cast(address), - kMTETagGranuleInBytes)) { + if (!IsAligned(address, kMTETagGranuleInBytes)) { VIXL_ALIGNMENT_EXCEPTION(); } VIXL_STATIC_ASSERT(kMTETagGranuleInBytes >= sizeof(uint64_t)); @@ -14264,7 +14795,7 @@ void Simulator::SimulateMTEStoreTag(const Instruction* instr) { size_t fill_offset = 0; while (fill_offset < fill_size) { - MemWrite(address + fill_offset, 0); + if (!MemWrite(address + fill_offset, 0)) return; fill_offset += sizeof(uint64_t); } } @@ -14348,8 +14879,8 @@ void Simulator::SimulateCpyM(const Instruction* instr) { } while (xn--) { - uint8_t temp = MemRead(xs); - MemWrite(xd, temp); + VIXL_DEFINE_OR_RETURN(temp, MemRead(xs)); + if (!MemWrite(xd, temp)) return; LogMemTransfer(xd, xs, temp); xs += step; xd += step; @@ -14388,7 +14919,7 @@ void Simulator::SimulateSetM(const Instruction* instr) { while (xn--) { LogWrite(instr->GetRs(), GetPrintRegPartial(kPrintRegLaneSizeB), xd); - MemWrite(xd++, xs); + if (!MemWrite(xd++, static_cast(xs))) return; } WriteXRegister(instr->GetRd(), xd); WriteXRegister(instr->GetRn(), 0); @@ -14598,22 +15129,46 @@ void Simulator::DoRuntimeCall(const Instruction* instr) { VIXL_STATIC_ASSERT(kRuntimeCallAddressSize == sizeof(uintptr_t)); // The appropriate `Simulator::SimulateRuntimeCall()` wrapper and the function // to call are passed inlined in the assembly. 
- uintptr_t call_wrapper_address = - MemRead(instr + kRuntimeCallWrapperOffset); - uintptr_t function_address = - MemRead(instr + kRuntimeCallFunctionOffset); - RuntimeCallType call_type = static_cast( - MemRead(instr + kRuntimeCallTypeOffset)); + VIXL_DEFINE_OR_RETURN(call_wrapper_address, + MemRead(instr + kRuntimeCallWrapperOffset)); + VIXL_DEFINE_OR_RETURN(function_address, + MemRead(instr + kRuntimeCallFunctionOffset)); + VIXL_DEFINE_OR_RETURN(call_type, + MemRead(instr + kRuntimeCallTypeOffset)); auto runtime_call_wrapper = reinterpret_cast(call_wrapper_address); - if (call_type == kCallRuntime) { - WriteRegister(kLinkRegCode, - instr->GetInstructionAtOffset(kRuntimeCallLength)); + if (static_cast(call_type) == kCallRuntime) { + const Instruction* addr = instr->GetInstructionAtOffset(kRuntimeCallLength); + WriteLr(addr); + GCSPush(reinterpret_cast(addr)); } runtime_call_wrapper(this, function_address); // Read the return address from `lr` and write it into `pc`. - WritePc(ReadRegister(kLinkRegCode)); + uint64_t addr = ReadRegister(kLinkRegCode); + if (IsGCSCheckEnabled()) { + uint64_t expected_lr = GCSPeek(); + char msg[128]; + if (expected_lr != 0) { + if ((expected_lr & 0x3) != 0) { + snprintf(msg, + sizeof(msg), + "GCS contains misaligned return address: 0x%016" PRIx64 "\n", + expected_lr); + ReportGCSFailure(msg); + } else if ((addr != 0) && (addr != expected_lr)) { + snprintf(msg, + sizeof(msg), + "GCS mismatch: lr = 0x%016" PRIx64 ", gcs = 0x%016" PRIx64 + "\n", + addr, + expected_lr); + ReportGCSFailure(msg); + } + GCSPop(); + } + } + WritePc(reinterpret_cast(addr)); } #else void Simulator::DoRuntimeCall(const Instruction* instr) { @@ -14638,7 +15193,7 @@ void Simulator::DoConfigureCPUFeatures(const Instruction* instr) { // Read the kNone-terminated list of features. 
CPUFeatures parameters; while (true) { - ElementType feature = MemRead(instr + offset); + VIXL_DEFINE_OR_RETURN(feature, MemRead(instr + offset)); offset += element_size; if (feature == static_cast(CPUFeatures::kNone)) break; parameters.Combine(static_cast(feature)); @@ -14681,6 +15236,7 @@ void Simulator::DoRestoreCPUFeatures(const Instruction* instr) { saved_cpu_features_.pop_back(); } +#ifdef VIXL_HAS_SIMULATED_MMAP void* Simulator::Mmap( void* address, size_t length, int prot, int flags, int fd, off_t offset) { // The underlying system `mmap` in the simulated environment doesn't recognize @@ -14713,7 +15269,7 @@ int Simulator::Munmap(void* address, size_t length, int prot) { return munmap(address, length); } - +#endif // VIXL_HAS_SIMULATED_MMAP } // namespace aarch64 } // namespace vixl diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h index cdc17834..8cf085b7 100644 --- a/src/aarch64/simulator-aarch64.h +++ b/src/aarch64/simulator-aarch64.h @@ -28,12 +28,14 @@ #define VIXL_AARCH64_SIMULATOR_AARCH64_H_ #include +#include +#include #include #include +#include "../cpu-features.h" #include "../globals-vixl.h" #include "../utils-vixl.h" -#include "cpu-features.h" #include "abi-aarch64.h" #include "cpu-features-auditor-aarch64.h" @@ -68,6 +70,28 @@ namespace aarch64 { class Simulator; struct RuntimeCallStructHelper; +enum class MemoryAccessResult { Success = 0, Failure = 1 }; + +// Try to access a piece of memory at the given address. Accessing that memory +// might raise a signal which, if handled by a custom signal handler, should +// setup the native and simulated context in order to continue. Return whether +// the memory access failed (i.e: raised a signal) or succeeded. +MemoryAccessResult TryMemoryAccess(uintptr_t address, uintptr_t access_size); + +#ifdef VIXL_ENABLE_IMPLICIT_CHECKS +// Access a byte of memory from the address at the given offset. If the memory +// could be accessed then return MemoryAccessResult::Success. 
If the memory +// could not be accessed, and therefore raised a signal, setup the simulated +// context and return MemoryAccessResult::Failure. +// +// If a signal is raised then it is expected that the signal handler will place +// MemoryAccessResult::Failure in the native return register and the address of +// _vixl_internal_AccessMemory_continue into the native instruction pointer. +extern "C" MemoryAccessResult _vixl_internal_ReadMemory(uintptr_t address, + uintptr_t offset); +extern "C" uintptr_t _vixl_internal_AccessMemory_continue(); +#endif // VIXL_ENABLE_IMPLICIT_CHECKS + class SimStack { public: SimStack() {} @@ -136,7 +160,7 @@ class SimStack { // Allocate the stack, locking the parameters. Allocated Allocate() { - size_t align_to = 1 << align_log2_; + size_t align_to = uint64_t{1} << align_log2_; size_t l = AlignUp(limit_guard_size_, align_to); size_t u = AlignUp(usable_size_, align_to); size_t b = AlignUp(base_guard_size_, align_to); @@ -366,7 +390,7 @@ class Memory { } template - T Read(A address, Instruction const* pc = nullptr) const { + std::optional Read(A address, Instruction const* pc = nullptr) const { T value; VIXL_STATIC_ASSERT((sizeof(value) == 1) || (sizeof(value) == 2) || (sizeof(value) == 4) || (sizeof(value) == 8) || @@ -378,12 +402,16 @@ class Memory { if (!IsMTETagsMatched(address, pc)) { VIXL_ABORT_WITH_MSG("Tag mismatch."); } + if (TryMemoryAccess(reinterpret_cast(base), sizeof(value)) == + MemoryAccessResult::Failure) { + return std::nullopt; + } memcpy(&value, base, sizeof(value)); return value; } template - void Write(A address, T value, Instruction const* pc = nullptr) const { + bool Write(A address, T value, Instruction const* pc = nullptr) const { VIXL_STATIC_ASSERT((sizeof(value) == 1) || (sizeof(value) == 2) || (sizeof(value) == 4) || (sizeof(value) == 8) || (sizeof(value) == 16)); @@ -394,11 +422,16 @@ class Memory { if (!IsMTETagsMatched(address, pc)) { VIXL_ABORT_WITH_MSG("Tag mismatch."); } + if 
(TryMemoryAccess(reinterpret_cast(base), sizeof(value)) == + MemoryAccessResult::Failure) { + return false; + } memcpy(base, &value, sizeof(value)); + return true; } template - uint64_t ReadUint(int size_in_bytes, A address) const { + std::optional ReadUint(int size_in_bytes, A address) const { switch (size_in_bytes) { case 1: return Read(address); @@ -414,7 +447,7 @@ class Memory { } template - int64_t ReadInt(int size_in_bytes, A address) const { + std::optional ReadInt(int size_in_bytes, A address) const { switch (size_in_bytes) { case 1: return Read(address); @@ -430,7 +463,7 @@ class Memory { } template - void Write(int size_in_bytes, A address, uint64_t value) const { + bool Write(int size_in_bytes, A address, uint64_t value) const { switch (size_in_bytes) { case 1: return Write(address, static_cast(value)); @@ -442,6 +475,7 @@ class Memory { return Write(address, value); } VIXL_UNREACHABLE(); + return false; } void AppendMetaData(MetaDataDepot* metadata_depot) { @@ -650,7 +684,7 @@ class LogicPRegister { void SetAllBits() { int chunk_size = sizeof(ChunkType) * kBitsPerByte; - ChunkType bits = GetUintMask(chunk_size); + ChunkType bits = static_cast(GetUintMask(chunk_size)); for (int lane = 0; lane < (static_cast(register_.GetSizeInBits() / chunk_size)); lane++) { @@ -703,6 +737,8 @@ class LogicPRegister { SimPRegister& register_; }; +using vixl_uint128_t = std::pair; + // Representation of a vector register, with typed getters and setters for lanes // and additional information to represent lane state. 
class LogicVRegister { @@ -831,6 +867,16 @@ class LogicVRegister { } } + void SetUint(VectorFormat vform, int index, vixl_uint128_t value) const { + if (LaneSizeInBitsFromFormat(vform) <= 64) { + SetUint(vform, index, value.second); + return; + } + VIXL_ASSERT((vform == kFormat1Q) || (vform == kFormatVnQ)); + SetUint(kFormatVnD, 2 * index, value.second); + SetUint(kFormatVnD, 2 * index + 1, value.first); + } + void SetUintArray(VectorFormat vform, const uint64_t* src) const { ClearForWrite(vform); for (int i = 0; i < LaneCountFromFormat(vform); i++) { @@ -1234,9 +1280,10 @@ class SimExclusiveGlobalMonitor { uint32_t seed_; }; - class Debugger; +template +uint64_t CryptoOp(uint64_t x, uint64_t y, uint64_t z); class Simulator : public DecoderVisitor { public: @@ -1269,7 +1316,7 @@ class Simulator : public DecoderVisitor { #if defined(VIXL_HAS_ABI_SUPPORT) && __cplusplus >= 201103L && \ - (defined(__clang__) || GCC_VERSION_OR_NEWER(4, 9, 1)) + (defined(_MSC_VER) || defined(__clang__) || GCC_VERSION_OR_NEWER(4, 9, 1)) // Templated `RunFrom` version taking care of passing arguments and returning // the result value. 
// This allows code like: @@ -1472,6 +1519,7 @@ class Simulator : public DecoderVisitor { void SimulateSVESaturatingMulAddHigh(const Instruction* instr); void SimulateSVESaturatingMulHighIndex(const Instruction* instr); void SimulateSVEFPConvertLong(const Instruction* instr); + void SimulateSVEPmull128(const Instruction* instr); void SimulateMatrixMul(const Instruction* instr); void SimulateSVEFPMatrixMul(const Instruction* instr); void SimulateNEONMulByElementLong(const Instruction* instr); @@ -1479,6 +1527,7 @@ class Simulator : public DecoderVisitor { void SimulateNEONFPMulByElementLong(const Instruction* instr); void SimulateNEONComplexMulByElement(const Instruction* instr); void SimulateNEONDotProdByElement(const Instruction* instr); + void SimulateNEONSHA3(const Instruction* instr); void SimulateMTEAddSubTag(const Instruction* instr); void SimulateMTETagMaskInsert(const Instruction* instr); void SimulateMTESubPointer(const Instruction* instr); @@ -1498,7 +1547,10 @@ class Simulator : public DecoderVisitor { void SimulateSetGM(const Instruction* instr); void SimulateSignedMinMax(const Instruction* instr); void SimulateUnsignedMinMax(const Instruction* instr); + void SimulateSHA512(const Instruction* instr); + void VisitCryptoSM3(const Instruction* instr); + void VisitCryptoSM4(const Instruction* instr); // Integer register accessors. 
@@ -2029,62 +2081,66 @@ class Simulator : public DecoderVisitor { } template - T MemRead(A address) const { + std::optional MemRead(A address) const { Instruction const* pc = ReadPc(); return memory_.Read(address, pc); } template - void MemWrite(A address, T value) const { + bool MemWrite(A address, T value) const { Instruction const* pc = ReadPc(); return memory_.Write(address, value, pc); } template - uint64_t MemReadUint(int size_in_bytes, A address) const { + std::optional MemReadUint(int size_in_bytes, A address) const { return memory_.ReadUint(size_in_bytes, address); } template - int64_t MemReadInt(int size_in_bytes, A address) const { + std::optional MemReadInt(int size_in_bytes, A address) const { return memory_.ReadInt(size_in_bytes, address); } template - void MemWrite(int size_in_bytes, A address, uint64_t value) const { + bool MemWrite(int size_in_bytes, A address, uint64_t value) const { return memory_.Write(size_in_bytes, address, value); } - void LoadLane(LogicVRegister dst, + bool LoadLane(LogicVRegister dst, VectorFormat vform, int index, uint64_t addr) const { unsigned msize_in_bytes = LaneSizeInBytesFromFormat(vform); - LoadUintToLane(dst, vform, msize_in_bytes, index, addr); + return LoadUintToLane(dst, vform, msize_in_bytes, index, addr); } - void LoadUintToLane(LogicVRegister dst, + bool LoadUintToLane(LogicVRegister dst, VectorFormat vform, unsigned msize_in_bytes, int index, uint64_t addr) const { - dst.SetUint(vform, index, MemReadUint(msize_in_bytes, addr)); + VIXL_DEFINE_OR_RETURN_FALSE(value, MemReadUint(msize_in_bytes, addr)); + dst.SetUint(vform, index, value); + return true; } - void LoadIntToLane(LogicVRegister dst, + bool LoadIntToLane(LogicVRegister dst, VectorFormat vform, unsigned msize_in_bytes, int index, uint64_t addr) const { - dst.SetInt(vform, index, MemReadInt(msize_in_bytes, addr)); + VIXL_DEFINE_OR_RETURN_FALSE(value, MemReadInt(msize_in_bytes, addr)); + dst.SetInt(vform, index, value); + return true; } - void 
StoreLane(const LogicVRegister& src, + bool StoreLane(const LogicVRegister& src, VectorFormat vform, int index, uint64_t addr) const { unsigned msize_in_bytes = LaneSizeInBytesFromFormat(vform); - MemWrite(msize_in_bytes, addr, src.Uint(vform, index)); + return MemWrite(msize_in_bytes, addr, src.Uint(vform, index)); } uint64_t ComputeMemOperandAddress(const MemOperand& mem_op) const; @@ -2095,12 +2151,14 @@ class Simulator : public DecoderVisitor { return ReadCPURegister(operand.GetCPURegister()); } else { VIXL_ASSERT(operand.IsMemOperand()); - return MemRead(ComputeMemOperandAddress(operand.GetMemOperand())); + auto res = MemRead(ComputeMemOperandAddress(operand.GetMemOperand())); + VIXL_ASSERT(res); + return *res; } } template - void WriteGenericOperand(GenericOperand operand, + bool WriteGenericOperand(GenericOperand operand, T value, RegLogMode log_mode = LogRegWrites) { if (operand.IsCPURegister()) { @@ -2116,8 +2174,9 @@ class Simulator : public DecoderVisitor { WriteCPURegister(operand.GetCPURegister(), raw, log_mode); } else { VIXL_ASSERT(operand.IsMemOperand()); - MemWrite(ComputeMemOperandAddress(operand.GetMemOperand()), value); + return MemWrite(ComputeMemOperandAddress(operand.GetMemOperand()), value); } + return true; } bool ReadN() const { return nzcv_.GetN() != 0; } @@ -2493,12 +2552,16 @@ class Simulator : public DecoderVisitor { // Other state updates, including system registers. void PrintSystemRegister(SystemRegister id); void PrintTakenBranch(const Instruction* target); + void PrintGCS(bool is_push, uint64_t addr, size_t entry); void LogSystemRegister(SystemRegister id) { if (ShouldTraceSysRegs()) PrintSystemRegister(id); } void LogTakenBranch(const Instruction* target) { if (ShouldTraceBranches()) PrintTakenBranch(target); } + void LogGCS(bool is_push, uint64_t addr, size_t entry) { + if (ShouldTraceSysRegs()) PrintGCS(is_push, addr, entry); + } // Trace memory accesses. 
@@ -2528,6 +2591,14 @@ class Simulator : public DecoderVisitor { void PrintPWrite(int rt_code, uintptr_t address) { PrintPAccess(rt_code, "->", address); } + void PrintWriteU64(uint64_t x, uintptr_t address) { + fprintf(stream_, + "# 0x%016lx -> %s0x%016" PRIxPTR "%s\n", + x, + clr_memory_address, + address, + clr_normal); + } // Like Print* (above), but respect GetTraceParameters(). void LogRead(int rt_code, PrintRegisterFormat format, uintptr_t address) { @@ -2562,6 +2633,9 @@ class Simulator : public DecoderVisitor { void LogPWrite(int rt_code, uintptr_t address) { if (ShouldTraceWrites()) PrintPWrite(rt_code, address); } + void LogWriteU64(uint64_t x, uintptr_t address) { + if (ShouldTraceWrites()) PrintWriteU64(x, address); + } void LogMemTransfer(uintptr_t dst, uintptr_t src, uint8_t value) { if (ShouldTraceWrites()) PrintMemTransfer(dst, src, value); } @@ -2860,7 +2934,7 @@ class Simulator : public DecoderVisitor { } if (offset == 0) { - while ((exclude & (1 << tag)) != 0) { + while ((exclude & (uint64_t{1} << tag)) != 0) { tag = (tag + 1) % 16; } } @@ -2868,7 +2942,7 @@ class Simulator : public DecoderVisitor { while (offset > 0) { offset--; tag = (tag + 1) % 16; - while ((exclude & (1 << tag)) != 0) { + while ((exclude & (uint64_t{1} << tag)) != 0) { tag = (tag + 1) % 16; } } @@ -2880,12 +2954,15 @@ class Simulator : public DecoderVisitor { return (addr & ~(UINT64_C(0xf) << 56)) | (tag << 56); } +#if __linux__ +#define VIXL_HAS_SIMULATED_MMAP // Create or remove a mapping with memory protection. Memory attributes such // as MTE and BTI are represented by metadata in Simulator. void* Mmap( void* address, size_t length, int prot, int flags, int fd, off_t offset); int Munmap(void* address, size_t length, int prot); +#endif // The common CPUFeatures interface with the set of available features. 
@@ -2908,7 +2985,7 @@ class Simulator : public DecoderVisitor { // Also, the initialisation of the tuples in RuntimeCall(Non)Void is incorrect // in GCC before 4.9.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51253 #if defined(VIXL_HAS_ABI_SUPPORT) && __cplusplus >= 201103L && \ - (defined(__clang__) || GCC_VERSION_OR_NEWER(4, 9, 1)) + (defined(_MSC_VER) || defined(__clang__) || GCC_VERSION_OR_NEWER(4, 9, 1)) #define VIXL_HAS_SIMULATED_RUNTIME_CALL_SUPPORT @@ -2966,7 +3043,10 @@ class Simulator : public DecoderVisitor { R return_value = DoRuntimeCall(function, argument_operands, __local_index_sequence_for{}); - WriteGenericOperand(abi.GetReturnGenericOperand(), return_value); + bool succeeded = + WriteGenericOperand(abi.GetReturnGenericOperand(), return_value); + USE(succeeded); + VIXL_ASSERT(succeeded); } template @@ -3154,6 +3234,43 @@ class Simulator : public DecoderVisitor { #endif } +#ifdef VIXL_ENABLE_IMPLICIT_CHECKS + // Returns true if the faulting instruction address (usually the program + // counter or instruction pointer) comes from an internal VIXL memory access. + // This can be used by signal handlers to check if a signal was raised from + // the simulator (via TryMemoryAccess) before the actual + // access occurs. + bool IsSimulatedMemoryAccess(uintptr_t fault_pc) const { + return (fault_pc == + reinterpret_cast(&_vixl_internal_ReadMemory)); + } + + // Get the instruction address of the internal VIXL memory access continuation + // label. Signal handlers can resume execution at this address to return to + // TryMemoryAccess which will continue simulation. + uintptr_t GetSignalReturnAddress() const { + return reinterpret_cast(&_vixl_internal_AccessMemory_continue); + } + + // Replace the fault address reported by the kernel with the actual faulting + // address. 
+ // + // This is required because TryMemoryAccess reads a section of + // memory 1 byte at a time meaning the fault address reported may not be the + // base address of memory being accessed. + void ReplaceFaultAddress(siginfo_t* siginfo, void* context) { +#ifdef __x86_64__ + // The base address being accessed is passed in as the first argument to + // _vixl_internal_ReadMemory. + ucontext_t* uc = reinterpret_cast(context); + siginfo->si_addr = reinterpret_cast(uc->uc_mcontext.gregs[REG_RDI]); +#else + USE(siginfo); + USE(context); +#endif // __x86_64__ + } +#endif // VIXL_ENABLE_IMPLICIT_CHECKS + protected: const char* clr_normal; const char* clr_flag_name; @@ -3234,8 +3351,9 @@ class Simulator : public DecoderVisitor { uint64_t left, uint64_t right, int carry_in); - using vixl_uint128_t = std::pair; vixl_uint128_t Add128(vixl_uint128_t x, vixl_uint128_t y); + vixl_uint128_t Lsl128(vixl_uint128_t x, unsigned shift) const; + vixl_uint128_t Eor128(vixl_uint128_t x, vixl_uint128_t y) const; vixl_uint128_t Mul64(uint64_t x, uint64_t y); vixl_uint128_t Neg128(vixl_uint128_t x); void LogicalHelper(const Instruction* instr, int64_t op2); @@ -3317,92 +3435,95 @@ class Simulator : public DecoderVisitor { uint64_t PolynomialMult(uint64_t op1, uint64_t op2, int lane_size_in_bits) const; + vixl_uint128_t PolynomialMult128(uint64_t op1, + uint64_t op2, + int lane_size_in_bits) const; - void ld1(VectorFormat vform, LogicVRegister dst, uint64_t addr); - void ld1(VectorFormat vform, LogicVRegister dst, int index, uint64_t addr); - void ld1r(VectorFormat vform, LogicVRegister dst, uint64_t addr); - void ld1r(VectorFormat vform, + bool ld1(VectorFormat vform, LogicVRegister dst, uint64_t addr); + bool ld1(VectorFormat vform, LogicVRegister dst, int index, uint64_t addr); + bool ld1r(VectorFormat vform, LogicVRegister dst, uint64_t addr); + bool ld1r(VectorFormat vform, VectorFormat unpack_vform, LogicVRegister dst, uint64_t addr, bool is_signed = false); - void ld2(VectorFormat 
vform, + bool ld2(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, uint64_t addr); - void ld2(VectorFormat vform, + bool ld2(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, int index, uint64_t addr); - void ld2r(VectorFormat vform, + bool ld2r(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, uint64_t addr); - void ld3(VectorFormat vform, + bool ld3(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, LogicVRegister dst3, uint64_t addr); - void ld3(VectorFormat vform, + bool ld3(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, LogicVRegister dst3, int index, uint64_t addr); - void ld3r(VectorFormat vform, + bool ld3r(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, LogicVRegister dst3, uint64_t addr); - void ld4(VectorFormat vform, + bool ld4(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, LogicVRegister dst3, LogicVRegister dst4, uint64_t addr); - void ld4(VectorFormat vform, + bool ld4(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, LogicVRegister dst3, LogicVRegister dst4, int index, uint64_t addr); - void ld4r(VectorFormat vform, + bool ld4r(VectorFormat vform, LogicVRegister dst1, LogicVRegister dst2, LogicVRegister dst3, LogicVRegister dst4, uint64_t addr); - void st1(VectorFormat vform, LogicVRegister src, uint64_t addr); - void st1(VectorFormat vform, LogicVRegister src, int index, uint64_t addr); - void st2(VectorFormat vform, + bool st1(VectorFormat vform, LogicVRegister src, uint64_t addr); + bool st1(VectorFormat vform, LogicVRegister src, int index, uint64_t addr); + bool st2(VectorFormat vform, LogicVRegister src, LogicVRegister src2, uint64_t addr); - void st2(VectorFormat vform, + bool st2(VectorFormat vform, LogicVRegister src, LogicVRegister src2, int index, uint64_t addr); - void st3(VectorFormat vform, + bool st3(VectorFormat vform, LogicVRegister src, LogicVRegister src2, LogicVRegister src3, uint64_t addr); - void st3(VectorFormat vform, + 
bool st3(VectorFormat vform, LogicVRegister src, LogicVRegister src2, LogicVRegister src3, int index, uint64_t addr); - void st4(VectorFormat vform, + bool st4(VectorFormat vform, LogicVRegister src, LogicVRegister src2, LogicVRegister src3, LogicVRegister src4, uint64_t addr); - void st4(VectorFormat vform, + bool st4(VectorFormat vform, LogicVRegister src, LogicVRegister src2, LogicVRegister src3, @@ -3688,6 +3809,10 @@ class Simulator : public DecoderVisitor { LogicVRegister dst, const LogicVRegister& src, int rotation); + LogicVRegister rol(VectorFormat vform, + LogicVRegister dst, + const LogicVRegister& src, + int rotation); LogicVRegister ext(VectorFormat vform, LogicVRegister dst, const LogicVRegister& src1, @@ -4412,6 +4537,95 @@ class Simulator : public DecoderVisitor { LogicVRegister srcdst, const LogicVRegister& src1, const LogicVRegister& src2); + + template + static void SHARotateEltsLeftOne(uint64_t (&x)[N]) { + VIXL_STATIC_ASSERT(N == 4); + uint64_t temp = x[3]; + x[3] = x[2]; + x[2] = x[1]; + x[1] = x[0]; + x[0] = temp; + } + + template + LogicVRegister sha1(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2) { + uint64_t y = src1.Uint(kFormat4S, 0); + uint64_t sd[4] = {}; + srcdst.UintArray(kFormat4S, sd); + + for (unsigned i = 0; i < ArrayLength(sd); i++) { + uint64_t t = CryptoOp(sd[1], sd[2], sd[3]); + + y += RotateLeft(sd[0], 5, kSRegSize) + t; + y += src2.Uint(kFormat4S, i); + + sd[1] = RotateLeft(sd[1], 30, kSRegSize); + + // y:sd = ROL(y:sd, 32) + SHARotateEltsLeftOne(sd); + std::swap(sd[0], y); + } + + srcdst.SetUintArray(kFormat4S, sd); + return srcdst; + } + + LogicVRegister sha2h(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2, + bool part1); + LogicVRegister sha2su0(LogicVRegister srcdst, const LogicVRegister& src1); + LogicVRegister sha2su1(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2); + LogicVRegister sha512h(LogicVRegister srcdst, 
+ const LogicVRegister& src1, + const LogicVRegister& src2); + LogicVRegister sha512h2(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2); + LogicVRegister sha512su0(LogicVRegister srcdst, const LogicVRegister& src1); + LogicVRegister sha512su1(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2); + + + LogicVRegister aes(LogicVRegister srcdst, + const LogicVRegister& src1, + bool decrypt); + LogicVRegister aesmix(LogicVRegister srcdst, + const LogicVRegister& src1, + bool inverse); + + LogicVRegister sm3partw1(LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2); + LogicVRegister sm3partw2(LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2); + LogicVRegister sm3ss1(LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + const LogicVRegister& src3); + LogicVRegister sm3tt1(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2, + int index, + bool is_a); + LogicVRegister sm3tt2(LogicVRegister srcdst, + const LogicVRegister& src1, + const LogicVRegister& src2, + int index, + bool is_a); + + LogicVRegister sm4(LogicVRegister dst, + const LogicVRegister& src1, + const LogicVRegister& src2, + bool is_key); + #define NEON_3VREG_LOGIC_LIST(V) \ V(addhn) \ V(addhn2) \ @@ -4825,7 +5039,7 @@ class Simulator : public DecoderVisitor { uint32_t Crc32Checksum(uint32_t acc, T val, uint32_t poly); uint32_t Crc32Checksum(uint32_t acc, uint64_t val, uint32_t poly); - void SysOp_W(int op, int64_t val); + bool SysOp_W(int op, int64_t val); template T FPRecipSqrtEstimate(T op); @@ -4979,7 +5193,8 @@ class Simulator : public DecoderVisitor { unsigned zt_code, const LogicSVEAddressVector& addr); // Load each active zt[lane] from `addr.GetElementAddress(lane, ...)`. - void SVEStructuredLoadHelper(VectorFormat vform, + // Returns false if a load failed. 
+ bool SVEStructuredLoadHelper(VectorFormat vform, const LogicPRegister& pg, unsigned zt_code, const LogicSVEAddressVector& addr, @@ -5178,10 +5393,12 @@ class Simulator : public DecoderVisitor { bool CanReadMemory(uintptr_t address, size_t size); +#ifndef _WIN32 // CanReadMemory needs placeholder file descriptors, so we use a pipe. We can // save some system call overhead by opening them on construction, rather than // on every call to CanReadMemory. int placeholder_pipe_fd_[2]; +#endif template static T FPDefaultNaN(); @@ -5265,15 +5482,22 @@ class Simulator : public DecoderVisitor { Vectorsaved_cpu_features_; #endif - // State for *rand48 functions, used to simulate randomness with repeatable + // linear_congruential_engine, used to simulate randomness with repeatable // behaviour (so that tests are deterministic). This is used to simulate RNDR // and RNDRRS, as well as to simulate a source of entropy for architecturally // undefined behaviour. - uint16_t rand_state_[3]; + std::linear_congruential_engine(1) << 48> + rand_gen_; // A configurable size of SVE vector registers. unsigned vector_length_; + // DC ZVA enable (= 0) status and block size. + unsigned dczid_ = (0 << 4) | 4; // 2^4 words => 64-byte block size. + // Representation of memory attributes such as MTE tagging and BTI page // protection in addition to branch interceptions. MetaDataDepot meta_data_; @@ -5287,6 +5511,161 @@ class Simulator : public DecoderVisitor { #else Debugger* debugger_{nullptr}; #endif + + // The Guarded Control Stack is represented using a vector, where the more + // recently stored addresses are at higher-numbered indices. + using GuardedControlStack = std::vector; + + // The GCSManager handles the synchronisation of GCS across multiple + // Simulator instances. Each Simulator has its own stack, but all share + // a GCSManager instance. This allows exchanging stacks between Simulators + // in a threaded application. 
+ class GCSManager { + public: + // Allocate a new Guarded Control Stack and add it to the vector of stacks. + uint64_t AllocateStack() { + const std::lock_guard lock(stacks_mtx_); + + GuardedControlStack* new_stack = new GuardedControlStack; + uint64_t result; + + // Put the new stack into the first available slot. + for (result = 0; result < stacks_.size(); result++) { + if (stacks_[result] == nullptr) { + stacks_[result] = new_stack; + break; + } + } + + // If there were no slots, create a new one. + if (result == stacks_.size()) { + stacks_.push_back(new_stack); + } + + // Shift the index to look like a stack pointer aligned to a page. + result <<= kPageSizeLog2; + + // Push the tagged index onto the new stack as a seal. + new_stack->push_back(result + 1); + return result; + } + + // Free a Guarded Control Stack and set the stacks_ slot to null. + void FreeStack(uint64_t gcs) { + const std::lock_guard lock(stacks_mtx_); + uint64_t gcs_index = GetGCSIndex(gcs); + GuardedControlStack* gcsptr = stacks_[gcs_index]; + if (gcsptr == nullptr) { + VIXL_ABORT_WITH_MSG("Tried to free unallocated GCS "); + } else { + delete gcsptr; + stacks_[gcs_index] = nullptr; + } + } + + // Get a pointer to the GCS vector using a GCS id. + GuardedControlStack* GetGCSPtr(uint64_t gcs) const { + return stacks_[GetGCSIndex(gcs)]; + } + + private: + uint64_t GetGCSIndex(uint64_t gcs) const { return gcs >> 12; } + + std::vector stacks_; + std::mutex stacks_mtx_; + }; + + // A GCS id indicating no GCS has been allocated. 
+ static const uint64_t kGCSNoStack = kPageSize - 1; + uint64_t gcs_; + bool gcs_enabled_; + + public: + GCSManager& GetGCSManager() { + static GCSManager manager; + return manager; + } + + void EnableGCSCheck() { gcs_enabled_ = true; } + void DisableGCSCheck() { gcs_enabled_ = false; } + bool IsGCSCheckEnabled() const { return gcs_enabled_; } + + private: + bool IsAllocatedGCS(uint64_t gcs) const { return gcs != kGCSNoStack; } + void ResetGCSState() { + GCSManager& m = GetGCSManager(); + if (IsAllocatedGCS(gcs_)) { + m.FreeStack(gcs_); + } + ActivateGCS(m.AllocateStack()); + GCSPop(); // Remove seal. + } + + GuardedControlStack* GetGCSPtr(uint64_t gcs) { + GCSManager& m = GetGCSManager(); + GuardedControlStack* result = m.GetGCSPtr(gcs); + return result; + } + GuardedControlStack* GetActiveGCSPtr() { return GetGCSPtr(gcs_); } + + uint64_t ActivateGCS(uint64_t gcs) { + uint64_t outgoing_gcs = gcs_; + gcs_ = gcs; + return outgoing_gcs; + } + + void GCSPush(uint64_t addr) { + GetActiveGCSPtr()->push_back(addr); + size_t entry = GetActiveGCSPtr()->size() - 1; + LogGCS(/* is_push = */ true, addr, entry); + } + + uint64_t GCSPop() { + GuardedControlStack* gcs = GetActiveGCSPtr(); + if (gcs->empty()) { + return 0; + } + uint64_t return_addr = gcs->back(); + size_t entry = gcs->size() - 1; + gcs->pop_back(); + LogGCS(/* is_push = */ false, return_addr, entry); + return return_addr; + } + + uint64_t GCSPeek() { + GuardedControlStack* gcs = GetActiveGCSPtr(); + if (gcs->empty()) { + return 0; + } + uint64_t return_addr = gcs->back(); + return return_addr; + } + + void ReportGCSFailure(const char* msg) { + if (IsGCSCheckEnabled()) { + GuardedControlStack* gcs = GetActiveGCSPtr(); + printf("%s", msg); + if (gcs == nullptr) { + printf("GCS pointer is null\n"); + } else { + printf("GCS records, most recent first:\n"); + int most_recent_index = static_cast(gcs->size()) - 1; + for (int i = 0; i < 8; i++) { + if (!gcs->empty()) { + uint64_t entry = gcs->back(); + gcs->pop_back(); 
+ int index = most_recent_index - i; + printf(" gcs%" PRIu64 "[%d]: 0x%016" PRIx64 "\n", + gcs_, + index, + entry); + } + } + printf("End of GCS records.\n"); + } + VIXL_ABORT_WITH_MSG("GCS failed "); + } + } }; #if defined(VIXL_HAS_SIMULATED_RUNTIME_CALL_SUPPORT) && __cplusplus < 201402L diff --git a/src/cpu-features.h b/src/cpu-features.h index 97eb661a..1a041f66 100644 --- a/src/cpu-features.h +++ b/src/cpu-features.h @@ -201,7 +201,8 @@ namespace vixl { /* Extended BFloat16 instructions */ \ V(kEBF16, "EBF16", "ebf16") \ V(kSVE_EBF16, "EBF16 (SVE)", "sveebf16") \ - V(kCSSC, "CSSC", "cssc") + V(kCSSC, "CSSC", "cssc") \ + V(kGCS, "GCS", "gcs") // clang-format on diff --git a/src/globals-vixl.h b/src/globals-vixl.h index 2efed250..b096c7f3 100644 --- a/src/globals-vixl.h +++ b/src/globals-vixl.h @@ -215,6 +215,18 @@ inline void USE(const T1&, const T2&, const T3&, const T4&) {} } while (0) #endif +// Evaluate 'init' to an std::optional and return if it's empty. If 'init' is +// not empty then define a variable 'name' with the value inside the +// std::optional. +#define VIXL_DEFINE_OR_RETURN(name, init) \ + auto opt##name = init; \ + if (!opt##name) return; \ + auto name = *opt##name; +#define VIXL_DEFINE_OR_RETURN_FALSE(name, init) \ + auto opt##name = init; \ + if (!opt##name) return false; \ + auto name = *opt##name; + #if __cplusplus >= 201103L #define VIXL_NO_RETURN [[noreturn]] #else diff --git a/src/invalset-vixl.h b/src/invalset-vixl.h index bdd66025..12de273d 100644 --- a/src/invalset-vixl.h +++ b/src/invalset-vixl.h @@ -1,4 +1,3 @@ -// Copyright 2015, VIXL authors // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -95,7 +94,7 @@ class InvalSet { #else InvalSet() = delete; InvalSet(AllocatorWrapper alocator); - InvalSet(InvalSet&&) = default; + InvalSet(InvalSet&&) = default; // movable #endif ~InvalSet() VIXL_NEGATIVE_TESTING_ALLOW_EXCEPTION; diff --git a/src/pool-manager-impl.h b/src/pool-manager-impl.h index 91bc4369..2dfd09c0 100644 --- a/src/pool-manager-impl.h +++ b/src/pool-manager-impl.h @@ -491,7 +491,7 @@ void PoolManager::Release(T pc) { } template -PoolManager::~PoolManager() VIXL_NEGATIVE_TESTING_ALLOW_EXCEPTION { +PoolManager::~PoolManager() VIXL_NEGATIVE_TESTING_ALLOW_EXCEPTION { #ifdef VIXL_DEBUG // Check for unbound objects. for (objects_iter iter = objects_.begin(); iter != objects_.end(); ++iter) { diff --git a/src/utils-vixl.h b/src/utils-vixl.h index 9b0dbc29..9e08ba7d 100644 --- a/src/utils-vixl.h +++ b/src/utils-vixl.h @@ -385,6 +385,11 @@ inline uint64_t RotateRight(uint64_t value, return value & width_mask; } +inline uint64_t RotateLeft(uint64_t value, + unsigned int rotate, + unsigned int width) { + return RotateRight(value, width - rotate, width); +} // Wrapper class for passing FP16 values through the assembler. // This is purely to aid with type checking/casting. @@ -437,6 +442,12 @@ T UnsignedNegate(T value) { return ~value + 1; } +template +bool CanBeNegated(T value) { + VIXL_STATIC_ASSERT(std::is_signed::value); + return (value == std::numeric_limits::min()) ? false : true; +} + // An absolute operation for signed integers that is defined for results outside // the representable range. Specifically, Abs(MIN_INT) is MIN_INT. 
template @@ -694,13 +705,14 @@ inline T SignExtend(T val, int size_in_bits) { template T ReverseBytes(T value, int block_bytes_log2) { VIXL_ASSERT((sizeof(value) == 4) || (sizeof(value) == 8)); - VIXL_ASSERT((1U << block_bytes_log2) <= sizeof(value)); + VIXL_ASSERT((uint64_t{1} << block_bytes_log2) <= sizeof(value)); // Split the 64-bit value into an 8-bit array, where b[0] is the least // significant byte, and b[7] is the most significant. uint8_t bytes[8]; uint64_t mask = UINT64_C(0xff00000000000000); for (int i = 7; i >= 0; i--) { - bytes[i] = (static_cast(value) & mask) >> (i * 8); + bytes[i] = + static_cast((static_cast(value) & mask) >> (i * 8)); mask >>= 8; } @@ -757,6 +769,39 @@ bool IsWordAligned(T pointer) { return IsAligned<4>(pointer); } +template +bool IsRepeatingPattern(T value) { + VIXL_STATIC_ASSERT(std::is_unsigned::value); + VIXL_ASSERT(IsMultiple(sizeof(value) * kBitsPerByte, BITS)); + VIXL_ASSERT(IsMultiple(BITS, 2)); + VIXL_STATIC_ASSERT(BITS >= 2); +#if (defined(__x86_64__) || defined(__i386)) && __clang_major__ >= 17 && \ + __clang_major__ <= 19 + // Workaround for https://github.com/llvm/llvm-project/issues/108722 + unsigned hbits = BITS / 2; + T midmask = (~static_cast(0) >> BITS) << hbits; + // E.g. for bytes in a word (0xb3b2b1b0): .b3b2b1. == .b2b1b0. + return (((value >> hbits) & midmask) == ((value << hbits) & midmask)); +#else + return value == RotateRight(value, BITS, sizeof(value) * kBitsPerByte); +#endif +} + +template +bool AllBytesMatch(T value) { + return IsRepeatingPattern(value); +} + +template +bool AllHalfwordsMatch(T value) { + return IsRepeatingPattern(value); +} + +template +bool AllWordsMatch(T value) { + return IsRepeatingPattern(value); +} + // Increment a pointer until it has the specified alignment. The alignment must // be a power of two. 
template diff --git a/test/aarch32/test-assembler-aarch32.cc b/test/aarch32/test-assembler-aarch32.cc index 3432a806..d97e18be 100644 --- a/test/aarch32/test-assembler-aarch32.cc +++ b/test/aarch32/test-assembler-aarch32.cc @@ -177,17 +177,23 @@ namespace aarch32 { #ifdef VIXL_INCLUDE_SIMULATOR_AARCH32 // No simulator yet. We can't test the results. -#define ASSERT_EQUAL_32(expected, result) +#define ASSERT_EQUAL_32(expected, result) \ + USE(expected, result) -#define ASSERT_EQUAL_64(expected, result) +#define ASSERT_EQUAL_64(expected, result) \ + USE(expected, result) -#define ASSERT_EQUAL_128(expected_h, expected_l, result) +#define ASSERT_EQUAL_128(expected_h, expected_l, result) \ + USE(expected_h, expected_l, result) -#define ASSERT_EQUAL_FP32(expected, result) +#define ASSERT_EQUAL_FP32(expected, result) \ + USE(expected, result) -#define ASSERT_EQUAL_FP64(expected, result) +#define ASSERT_EQUAL_FP64(expected, result) \ + USE(expected, result) -#define ASSERT_EQUAL_NZCV(expected) +#define ASSERT_EQUAL_NZCV(expected) \ + USE(expected) #else @@ -3634,8 +3640,6 @@ static void NearBranchAndLiteralFuzzHelper(InstructionSet isa, const int label_count = 15; const int literal_count = 31; Label* labels; - uint64_t* literal_values; - Literal* literals[literal_count]; // Use multiple iterations, as each produces a different predictably random // sequence. @@ -3679,12 +3683,13 @@ static void NearBranchAndLiteralFuzzHelper(InstructionSet isa, labels = new Label[label_count]; // Create new literal values. - literal_values = new uint64_t[literal_count]; + std::vector literal_values; + std::vector> literals; for (int lit = 0; lit < literal_count; lit++) { // TODO: Generate pseudo-random data for literals. At the moment, the // disassembler breaks if we do this. 
- literal_values[lit] = lit; - literals[lit] = new Literal(literal_values[lit]); + literal_values.push_back(lit); + literals.emplace_back(Literal(literal_values[lit])); } for (;;) { @@ -3736,13 +3741,13 @@ static void NearBranchAndLiteralFuzzHelper(InstructionSet isa, __ Nop(); break; case 4: - __ Ldr(r2, literals[literal_index]); + __ Ldr(r2, &literals[literal_index]); __ Cmp(r2, static_cast(literal_values[literal_index])); __ B(ne, &fail); __ Mov(r2, 0); break; case 5: - __ Ldrb(r2, literals[literal_index]); + __ Ldrb(r2, &literals[literal_index]); __ Cmp(r2, static_cast(literal_values[literal_index]) & 0xff); @@ -3750,7 +3755,7 @@ static void NearBranchAndLiteralFuzzHelper(InstructionSet isa, __ Mov(r2, 0); break; case 6: - __ Ldrd(r2, r3, literals[literal_index]); + __ Ldrd(r2, r3, &literals[literal_index]); __ Cmp(r2, static_cast(literal_values[literal_index])); __ B(ne, &fail); __ Mov(r2, 0); @@ -3761,7 +3766,7 @@ static void NearBranchAndLiteralFuzzHelper(InstructionSet isa, __ Mov(r3, 0); break; case 7: - __ Vldr(s0, literals[literal_index]); + __ Vldr(s0, &literals[literal_index]); __ Vmov(s1, static_cast(literal_values[literal_index])); __ Vcmp(s0, s1); __ B(ne, &fail); @@ -3875,9 +3880,6 @@ static void NearBranchAndLiteralFuzzHelper(InstructionSet isa, // independent. masm.FinalizeCode(MacroAssembler::kFallThrough); delete[] labels; - for (int lit = 0; lit < literal_count; lit++) { - delete literals[lit]; - } } } } diff --git a/test/aarch32/test-disasm-a32.cc b/test/aarch32/test-disasm-a32.cc index c229c2fb..95bb26ee 100644 --- a/test/aarch32/test-disasm-a32.cc +++ b/test/aarch32/test-disasm-a32.cc @@ -1700,6 +1700,41 @@ TEST(macro_assembler_Cbz) { } +TEST(macro_assembler_b_cond_t32) { + SETUP(); + +#ifdef VIXL_INCLUDE_TARGET_T32 + // Ensure backward conditional branches are veneered correctly. + __ UseT32(); + int pc_off = __ GetArchitectureStatePCOffset(); + + // Largest encodable backwards offset. 
+ int curs = __ GetCursorOffset() + pc_off; + Label label_neg1m(curs - 1048576); + COMPARE_T32(B(ne, &label_neg1m), "bne 0xfff00004\n"); + + // Next largest cannot be encoded. + curs = __ GetCursorOffset() + pc_off; + Label label_neg1m_plus_inst(curs - (1048576 + 2)); + COMPARE_T32(B(ne, &label_neg1m_plus_inst), "beq 0x00000006\n" + "b 0xfff00002\n"); + + // Offset that requires largest unconditional branch in veneer. + curs = __ GetCursorOffset() + pc_off; + Label label_neg16m(curs - (16777216 - 2)); + COMPARE_T32(B(ne, &label_neg16m), "beq 0x00000006\n" + "b 0xff000006\n"); + + // Next largest cannot be veneered. + curs = __ GetCursorOffset() + pc_off; + Label label_neg16m_plus_inst(curs - 16777216); + MUST_FAIL_TEST_T32(B(ne, &label_neg16m_plus_inst), + "Conditional branch too far for veneer.\n"); +#endif + + CLEANUP(); +} + #ifdef VIXL_NEGATIVE_TESTING TEST(assembler_crc_negative) { SETUP(); diff --git a/test/aarch64/test-api-aarch64.cc b/test/aarch64/test-api-aarch64.cc index c724f178..3ac9efb7 100644 --- a/test/aarch64/test-api-aarch64.cc +++ b/test/aarch64/test-api-aarch64.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include "test-runner.h" #include "test-utils.h" @@ -1763,6 +1764,24 @@ TEST(sim_stack) { VIXL_CHECK(s.IsAccessInGuardRegion(s.GetLimit() - 1280, 2048)); VIXL_CHECK(s.IsAccessInGuardRegion(s.GetLimit() - 1280, 10000)); } + +void AllocateAndFreeGCS() { + Decoder d; + Simulator s(&d); + + for (int i = 0; i < 100000; i++) { + uint64_t gcs = s.GetGCSManager().AllocateStack(); + s.GetGCSManager().FreeStack(gcs); + } +} + +TEST(sim_gcs_manager) { + std::thread t1(AllocateAndFreeGCS); + std::thread t2(AllocateAndFreeGCS); + + t1.join(); + t2.join(); +} #endif } // namespace aarch64 diff --git a/test/aarch64/test-assembler-aarch64.cc b/test/aarch64/test-assembler-aarch64.cc index 00155471..a86b32e2 100644 --- a/test/aarch64/test-assembler-aarch64.cc +++ b/test/aarch64/test-assembler-aarch64.cc @@ -1634,11 +1634,19 @@ 
TEST(pacia_pacib_autia_autib) { START(); Register pointer = x24; - Register modifier = x25; + Register retry_limit = x25; + Register modifier = x26; + Label retry; + // There is a small but not negligible chance (1 in 127 runs) that the PAC + // codes for keys A and B will collide, so retry a few times with different + // pointers. __ Mov(pointer, 0x0000000012345678); + __ Mov(retry_limit, 0x0000000012345678 + 32); __ Mov(modifier, 0x477d469dec0b8760); + __ Bind(&retry); + // Generate PACs using keys A and B. __ Mov(x0, pointer); __ Pacia(x0, modifier); @@ -1660,21 +1668,24 @@ TEST(pacia_pacib_autia_autib) { __ Mov(x5, x0); __ Autib(x5, modifier); - // Mask out just the PAC code bits. - // TODO: use Simulator::CalculatePACMask in a nice way. - __ And(x0, x0, 0x007f000000000000); - __ And(x1, x1, 0x007f000000000000); + // Retry on collisions. + __ Cmp(x0, x1); + __ Ccmp(pointer, x0, ZFlag, ne); + __ Ccmp(pointer, x1, ZFlag, ne); + __ Ccmp(pointer, x4, ZFlag, ne); + __ Ccmp(pointer, x5, ZFlag, ne); + __ Ccmp(pointer, retry_limit, ZFlag, eq); + __ Cinc(pointer, pointer, ne); + __ B(ne, &retry); END(); if (CAN_RUN()) { RUN(); - // Check PAC codes have been generated and aren't equal. - // NOTE: with a different ComputePAC implementation, there may be a - // collision. - ASSERT_NOT_EQUAL_64(0, x0); - ASSERT_NOT_EQUAL_64(0, x1); + // Check PAC codes have been generated. + ASSERT_NOT_EQUAL_64(pointer, x0); + ASSERT_NOT_EQUAL_64(pointer, x1); ASSERT_NOT_EQUAL_64(x0, x1); // Pointers correctly authenticated. @@ -1682,8 +1693,13 @@ TEST(pacia_pacib_autia_autib) { ASSERT_EQUAL_64(pointer, x3); // Pointers corrupted after failing to authenticate. 
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 ASSERT_EQUAL_64(0x0020000012345678, x4); ASSERT_EQUAL_64(0x0040000012345678, x5); +#else + ASSERT_NOT_EQUAL_64(pointer, x4); + ASSERT_NOT_EQUAL_64(pointer, x5); +#endif } } @@ -1694,8 +1710,16 @@ TEST(paciza_pacizb_autiza_autizb) { START(); Register pointer = x24; + Register retry_limit = x25; + Label retry; + // There is a small but not negligible chance (1 in 127 runs) that the PAC + // codes for keys A and B will collide, so retry a few times with different + // pointers. __ Mov(pointer, 0x0000000012345678); + __ Mov(retry_limit, 0x0000000012345678 + 32); + + __ Bind(&retry); // Generate PACs using keys A and B. __ Mov(x0, pointer); @@ -1718,21 +1742,24 @@ TEST(paciza_pacizb_autiza_autizb) { __ Mov(x5, x0); __ Autizb(x5); - // Mask out just the PAC code bits. - // TODO: use Simulator::CalculatePACMask in a nice way. - __ And(x0, x0, 0x007f000000000000); - __ And(x1, x1, 0x007f000000000000); + // Retry on collisions. + __ Cmp(x0, x1); + __ Ccmp(pointer, x0, ZFlag, ne); + __ Ccmp(pointer, x1, ZFlag, ne); + __ Ccmp(pointer, x4, ZFlag, ne); + __ Ccmp(pointer, x5, ZFlag, ne); + __ Ccmp(pointer, retry_limit, ZFlag, eq); + __ Cinc(pointer, pointer, ne); + __ B(ne, &retry); END(); if (CAN_RUN()) { RUN(); - // Check PAC codes have been generated and aren't equal. - // NOTE: with a different ComputePAC implementation, there may be a - // collision. - ASSERT_NOT_EQUAL_64(0, x0); - ASSERT_NOT_EQUAL_64(0, x1); + // Check PAC codes have been generated. + ASSERT_NOT_EQUAL_64(pointer, x0); + ASSERT_NOT_EQUAL_64(pointer, x1); ASSERT_NOT_EQUAL_64(x0, x1); // Pointers correctly authenticated. @@ -1740,8 +1767,13 @@ TEST(paciza_pacizb_autiza_autizb) { ASSERT_EQUAL_64(pointer, x3); // Pointers corrupted after failing to authenticate. 
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 ASSERT_EQUAL_64(0x0020000012345678, x4); ASSERT_EQUAL_64(0x0040000012345678, x5); +#else + ASSERT_NOT_EQUAL_64(pointer, x4); + ASSERT_NOT_EQUAL_64(pointer, x5); +#endif } } @@ -1752,11 +1784,19 @@ TEST(pacda_pacdb_autda_autdb) { START(); Register pointer = x24; - Register modifier = x25; + Register retry_limit = x25; + Register modifier = x26; + Label retry; + // There is a small but not negligible chance (1 in 127 runs) that the PAC + // codes for keys A and B will collide, so retry a few times with different + // pointers. __ Mov(pointer, 0x0000000012345678); + __ Mov(retry_limit, 0x0000000012345678 + 32); __ Mov(modifier, 0x477d469dec0b8760); + __ Bind(&retry); + // Generate PACs using keys A and B. __ Mov(x0, pointer); __ Pacda(x0, modifier); @@ -1778,21 +1818,24 @@ TEST(pacda_pacdb_autda_autdb) { __ Mov(x5, x0); __ Autdb(x5, modifier); - // Mask out just the PAC code bits. - // TODO: use Simulator::CalculatePACMask in a nice way. - __ And(x0, x0, 0x007f000000000000); - __ And(x1, x1, 0x007f000000000000); + // Retry on collisions. + __ Cmp(x0, x1); + __ Ccmp(pointer, x0, ZFlag, ne); + __ Ccmp(pointer, x1, ZFlag, ne); + __ Ccmp(pointer, x4, ZFlag, ne); + __ Ccmp(pointer, x5, ZFlag, ne); + __ Ccmp(pointer, retry_limit, ZFlag, eq); + __ Cinc(pointer, pointer, ne); + __ B(ne, &retry); END(); if (CAN_RUN()) { RUN(); - // Check PAC codes have been generated and aren't equal. - // NOTE: with a different ComputePAC implementation, there may be a - // collision. - ASSERT_NOT_EQUAL_64(0, x0); - ASSERT_NOT_EQUAL_64(0, x1); + // Check PAC codes have been generated. + ASSERT_NOT_EQUAL_64(pointer, x0); + ASSERT_NOT_EQUAL_64(pointer, x1); ASSERT_NOT_EQUAL_64(x0, x1); // Pointers correctly authenticated. @@ -1800,8 +1843,13 @@ TEST(pacda_pacdb_autda_autdb) { ASSERT_EQUAL_64(pointer, x3); // Pointers corrupted after failing to authenticate. 
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 ASSERT_EQUAL_64(0x0020000012345678, x4); ASSERT_EQUAL_64(0x0040000012345678, x5); +#else + ASSERT_NOT_EQUAL_64(pointer, x4); + ASSERT_NOT_EQUAL_64(pointer, x5); +#endif } } @@ -1812,8 +1860,16 @@ TEST(pacdza_pacdzb_autdza_autdzb) { START(); Register pointer = x24; + Register retry_limit = x25; + Label retry; + // There is a small but not negligible chance (1 in 127 runs) that the PAC + // codes for keys A and B will collide, so retry a few times with different + // pointers. __ Mov(pointer, 0x0000000012345678); + __ Mov(retry_limit, 0x0000000012345678 + 32); + + __ Bind(&retry); // Generate PACs using keys A and B. __ Mov(x0, pointer); @@ -1836,21 +1892,24 @@ TEST(pacdza_pacdzb_autdza_autdzb) { __ Mov(x5, x0); __ Autdzb(x5); - // Mask out just the PAC code bits. - // TODO: use Simulator::CalculatePACMask in a nice way. - __ And(x0, x0, 0x007f000000000000); - __ And(x1, x1, 0x007f000000000000); + // Retry on collisions. + __ Cmp(x0, x1); + __ Ccmp(pointer, x0, ZFlag, ne); + __ Ccmp(pointer, x1, ZFlag, ne); + __ Ccmp(pointer, x4, ZFlag, ne); + __ Ccmp(pointer, x5, ZFlag, ne); + __ Ccmp(pointer, retry_limit, ZFlag, eq); + __ Cinc(pointer, pointer, ne); + __ B(ne, &retry); END(); if (CAN_RUN()) { RUN(); - // Check PAC codes have been generated and aren't equal. - // NOTE: with a different ComputePAC implementation, there may be a - // collision. - ASSERT_NOT_EQUAL_64(0, x0); - ASSERT_NOT_EQUAL_64(0, x1); + // Check PAC codes have been generated. + ASSERT_NOT_EQUAL_64(pointer, x0); + ASSERT_NOT_EQUAL_64(pointer, x1); ASSERT_NOT_EQUAL_64(x0, x1); // Pointers correctly authenticated. @@ -1858,8 +1917,13 @@ TEST(pacdza_pacdzb_autdza_autdzb) { ASSERT_EQUAL_64(pointer, x3); // Pointers corrupted after failing to authenticate. 
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 ASSERT_EQUAL_64(0x0020000012345678, x4); ASSERT_EQUAL_64(0x0040000012345678, x5); +#else + ASSERT_NOT_EQUAL_64(pointer, x4); + ASSERT_NOT_EQUAL_64(pointer, x5); +#endif } } @@ -1870,11 +1934,19 @@ TEST(pacga_xpaci_xpacd) { START(); Register pointer = x24; - Register modifier = x25; + Register retry_limit = x25; + Register modifier = x26; + Label retry; + // There is a small but not negligible chance (1 in 127 runs) that the PAC + // codes for keys A and B will collide, so retry a few times with different + // pointers. __ Mov(pointer, 0x0000000012345678); + __ Mov(retry_limit, 0x0000000012345678 + 32); __ Mov(modifier, 0x477d469dec0b8760); + __ Bind(&retry); + // Generate generic PAC. __ Pacga(x0, pointer, modifier); @@ -1890,25 +1962,24 @@ TEST(pacga_xpaci_xpacd) { __ Xpaci(x3); __ Xpacd(x4); - // Mask out just the PAC code bits. - // TODO: use Simulator::CalculatePACMask in a nice way. - __ And(x0, x0, 0xffffffff00000000); - __ And(x1, x1, 0x007f000000000000); - __ And(x2, x2, 0x007f000000000000); + // Retry on collisions. + __ Cmp(x1, x2); + __ Ccmp(pointer, x0, ZFlag, ne); + __ Ccmp(pointer, x1, ZFlag, ne); + __ Ccmp(pointer, x2, ZFlag, ne); + __ Ccmp(pointer, retry_limit, ZFlag, eq); + __ Cinc(pointer, pointer, ne); + __ B(ne, &retry); END(); if (CAN_RUN()) { RUN(); - - // Check PAC codes have been generated and aren't equal. - // NOTE: with a different ComputePAC implementation, there may be a - // collision. - ASSERT_NOT_EQUAL_64(0, x0); - - ASSERT_NOT_EQUAL_64(0, x1); - ASSERT_NOT_EQUAL_64(0, x2); + // Check PAC codes have been generated. 
+ ASSERT_NOT_EQUAL_64(pointer, x0); + ASSERT_NOT_EQUAL_64(pointer, x1); + ASSERT_NOT_EQUAL_64(pointer, x2); ASSERT_NOT_EQUAL_64(x1, x2); ASSERT_EQUAL_64(pointer, x3); @@ -2576,13 +2647,18 @@ TEST(return_to_reg_auth_guarded) { if (CAN_RUN()) { #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 simulator.SetGuardedPages(true); -#else - VIXL_UNIMPLEMENTED(); #endif + // On hardware, we'll run the test anyway, but mark it as SKIPPED until + // we've implemented a mechanism for marking Guarded pages. + RUN(); ASSERT_EQUAL_64(42, x0); ASSERT_EQUAL_64(84, x1); + +#ifndef VIXL_INCLUDE_SIMULATOR_AARCH64 + printf("SKIPPED: marking guarded pages is unimplemented on hardware"); +#endif } } @@ -2615,7 +2691,11 @@ TEST(branch_to_reg_auth_fail) { END(); if (CAN_RUN()) { +#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 MUST_FAIL_WITH_MESSAGE(RUN(), "Failed to authenticate pointer."); +#else + printf("SKIPPED: negative PAuth tests are unimplemented on hardware."); +#endif } } #endif // VIXL_NEGATIVE_TESTING @@ -2651,7 +2731,11 @@ TEST(return_to_reg_auth_fail) { END(); if (CAN_RUN()) { +#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 MUST_FAIL_WITH_MESSAGE(RUN(), "Failed to authenticate pointer."); +#else + printf("SKIPPED: negative PAuth tests are unimplemented on hardware."); +#endif } } #endif // VIXL_NEGATIVE_TESTING @@ -3654,7 +3738,11 @@ TEST(load_pauth_negative_test) { END(); if (CAN_RUN()) { +#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 MUST_FAIL_WITH_MESSAGE(RUN(), "Failed to authenticate pointer."); +#else + printf("SKIPPED: negative PAuth tests are unimplemented on hardware."); +#endif } } #endif // VIXL_NEGATIVE_TESTING @@ -5837,6 +5925,10 @@ TEST(rmif) { START(); __ Mov(x0, 0x0123456789abcdef); + // Clear bits of `rmif` masks leave NZCV unmodified, so we need to initialise + // it to a known state to make the test reproducible. 
+ __ Msr(NZCV, x0); + // Set NZCV to 0b1011 (0xb) __ Rmif(x0, 0, NCVFlag); __ Mrs(x1, NZCV); @@ -5883,6 +5975,9 @@ TEST(setf8_setf16) { __ Mov(x7, 0x10001); __ Mov(x8, 0xfffffffff); + // These instruction don't modify 'C', so give it a consistent value. + __ Ands(xzr, xzr, 0); + __ Setf8(w0); __ Mrs(x9, NZCV); __ Setf8(w1); @@ -7231,23 +7326,32 @@ TEST(system_pauth_a) { temps.Exclude(x16, x17); temps.Include(x10, x11); - // Backup stack pointer. + Register pointer = x21; + Register retry_limit = x22; + Label retry; + + __ Mov(pointer, 0x0000000012345678); + __ Mov(retry_limit, 0x0000000012345678 + 32); + + // Back up stack pointer. __ Mov(x20, sp); // Modifiers __ Mov(x16, 0x477d469dec0b8760); __ Mov(sp, 0x477d469dec0b8760); + __ Bind(&retry); + // Generate PACs using the 3 system instructions. - __ Mov(x17, 0x0000000012345678); + __ Mov(x17, pointer); __ Pacia1716(); __ Mov(x0, x17); - __ Mov(lr, 0x0000000012345678); + __ Mov(lr, pointer); __ Paciaz(); __ Mov(x1, lr); - __ Mov(lr, 0x0000000012345678); + __ Mov(lr, pointer); __ Paciasp(); __ Mov(x2, lr); @@ -7282,41 +7386,51 @@ TEST(system_pauth_a) { __ Xpaclri(); __ Mov(x9, lr); + // Retry on collisions. + __ Cmp(x0, x1); + __ Ccmp(pointer, x0, ZFlag, ne); + __ Ccmp(pointer, x1, ZFlag, ne); + __ Ccmp(pointer, x2, ZFlag, ne); + __ Ccmp(pointer, x6, ZFlag, ne); + __ Ccmp(pointer, x7, ZFlag, ne); + __ Ccmp(pointer, x8, ZFlag, ne); + __ Ccmp(pointer, retry_limit, ZFlag, eq); + __ Cinc(pointer, pointer, ne); + __ B(ne, &retry); + // Restore stack pointer. __ Mov(sp, x20); - // Mask out just the PAC code bits. - // TODO: use Simulator::CalculatePACMask in a nice way. - __ And(x0, x0, 0x007f000000000000); - __ And(x1, x1, 0x007f000000000000); - __ And(x2, x2, 0x007f000000000000); - END(); if (CAN_RUN()) { RUN(); - // Check PAC codes have been generated and aren't equal. - // NOTE: with a different ComputePAC implementation, there may be a - // collision. 
- ASSERT_NOT_EQUAL_64(0, x0); - ASSERT_NOT_EQUAL_64(0, x1); - ASSERT_NOT_EQUAL_64(0, x2); + // Check PAC codes have been generated. + ASSERT_NOT_EQUAL_64(pointer, x0); + ASSERT_NOT_EQUAL_64(pointer, x1); + ASSERT_NOT_EQUAL_64(pointer, x2); ASSERT_NOT_EQUAL_64(x0, x1); ASSERT_EQUAL_64(x0, x2); // Pointers correctly authenticated. - ASSERT_EQUAL_64(0x0000000012345678, x3); - ASSERT_EQUAL_64(0x0000000012345678, x4); - ASSERT_EQUAL_64(0x0000000012345678, x5); + ASSERT_EQUAL_64(pointer, x3); + ASSERT_EQUAL_64(pointer, x4); + ASSERT_EQUAL_64(pointer, x5); // Pointers corrupted after failing to authenticate. +#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 ASSERT_EQUAL_64(0x0020000012345678, x6); ASSERT_EQUAL_64(0x0020000012345678, x7); ASSERT_EQUAL_64(0x0020000012345678, x8); +#else + ASSERT_NOT_EQUAL_64(pointer, x6); + ASSERT_NOT_EQUAL_64(pointer, x7); + ASSERT_NOT_EQUAL_64(pointer, x8); +#endif // Pointer with code stripped. - ASSERT_EQUAL_64(0x0000000012345678, x9); + ASSERT_EQUAL_64(pointer, x9); } } @@ -7331,13 +7445,22 @@ TEST(system_pauth_b) { temps.Exclude(x16, x17); temps.Include(x10, x11); - // Backup stack pointer. + Register pointer = x21; + Register retry_limit = x22; + Label retry; + + __ Mov(pointer, 0x0000000012345678); + __ Mov(retry_limit, 0x0000000012345678 + 32); + + // Back up stack pointer. __ Mov(x20, sp); // Modifiers __ Mov(x16, 0x477d469dec0b8760); __ Mov(sp, 0x477d469dec0b8760); + __ Bind(&retry); + // Generate PACs using the 3 system instructions. __ Mov(x17, 0x0000000012345678); __ Pacib1716(); @@ -7382,15 +7505,21 @@ TEST(system_pauth_b) { __ Xpaclri(); __ Mov(x9, lr); + // Retry on collisions. + __ Cmp(x0, x1); + __ Ccmp(pointer, x0, ZFlag, ne); + __ Ccmp(pointer, x1, ZFlag, ne); + __ Ccmp(pointer, x2, ZFlag, ne); + __ Ccmp(pointer, x6, ZFlag, ne); + __ Ccmp(pointer, x7, ZFlag, ne); + __ Ccmp(pointer, x8, ZFlag, ne); + __ Ccmp(pointer, retry_limit, ZFlag, eq); + __ Cinc(pointer, pointer, ne); + __ B(ne, &retry); + // Restore stack pointer. 
__ Mov(sp, x20); - // Mask out just the PAC code bits. - // TODO: use Simulator::CalculatePACMask in a nice way. - __ And(x0, x0, 0x007f000000000000); - __ And(x1, x1, 0x007f000000000000); - __ And(x2, x2, 0x007f000000000000); - END(); if (CAN_RUN()) { @@ -7399,24 +7528,30 @@ TEST(system_pauth_b) { // Check PAC codes have been generated and aren't equal. // NOTE: with a different ComputePAC implementation, there may be a // collision. - ASSERT_NOT_EQUAL_64(0, x0); - ASSERT_NOT_EQUAL_64(0, x1); - ASSERT_NOT_EQUAL_64(0, x2); + ASSERT_NOT_EQUAL_64(pointer, x0); + ASSERT_NOT_EQUAL_64(pointer, x1); + ASSERT_NOT_EQUAL_64(pointer, x2); ASSERT_NOT_EQUAL_64(x0, x1); ASSERT_EQUAL_64(x0, x2); // Pointers correctly authenticated. - ASSERT_EQUAL_64(0x0000000012345678, x3); - ASSERT_EQUAL_64(0x0000000012345678, x4); - ASSERT_EQUAL_64(0x0000000012345678, x5); + ASSERT_EQUAL_64(pointer, x3); + ASSERT_EQUAL_64(pointer, x4); + ASSERT_EQUAL_64(pointer, x5); // Pointers corrupted after failing to authenticate. +#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 ASSERT_EQUAL_64(0x0040000012345678, x6); ASSERT_EQUAL_64(0x0040000012345678, x7); ASSERT_EQUAL_64(0x0040000012345678, x8); +#else + ASSERT_NOT_EQUAL_64(pointer, x6); + ASSERT_NOT_EQUAL_64(pointer, x7); + ASSERT_NOT_EQUAL_64(pointer, x8); +#endif // Pointer with code stripped. - ASSERT_EQUAL_64(0x0000000012345678, x9); + ASSERT_EQUAL_64(pointer, x9); } } @@ -7501,11 +7636,12 @@ static void BtiHelper(Register ipreg) { __ Blr(x0); __ Adr(ipreg, &jump_call_target); __ Blr(ipreg); - __ Adr(lr, &done); // Make Ret return to done label. + __ Mov(lr, 0); // Zero lr so we branch to done. 
__ Br(ipreg); __ Bind(&call_target, EmitBTI_c); __ Ret(); __ Bind(&jump_call_target, EmitBTI_jc); + __ Cbz(lr, &done); __ Ret(); __ Bind(&done); END(); @@ -7513,10 +7649,15 @@ static void BtiHelper(Register ipreg) { if (CAN_RUN()) { #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 simulator.SetGuardedPages(true); -#else - VIXL_UNIMPLEMENTED(); #endif + // On hardware, we'll run the test anyway, but mark it as SKIPPED until + // we've implemented a mechanism for marking Guarded pages. + RUN(); + +#ifndef VIXL_INCLUDE_SIMULATOR_AARCH64 + printf("SKIPPED: marking guarded pages is unimplemented on hardware"); +#endif } } @@ -7529,36 +7670,42 @@ TEST(unguarded_bti_is_nop) { SETUP_WITH_FEATURES(CPUFeatures::kBTI); Label start, none, c, j, jc; + Label jump_to_c, call_to_j; START(); __ B(&start); __ Bind(&none, EmitBTI); __ Bind(&c, EmitBTI_c); __ Bind(&j, EmitBTI_j); __ Bind(&jc, EmitBTI_jc); - VIXL_CHECK(__ GetSizeOfCodeGeneratedSince(&none) == 4 * kInstructionSize); + __ Hint(BTI); + __ Hint(BTI_c); + __ Hint(BTI_j); + __ Hint(BTI_jc); + VIXL_CHECK(__ GetSizeOfCodeGeneratedSince(&none) == 8 * kInstructionSize); + __ Cmp(x1, 1); + __ B(lt, &jump_to_c); + __ B(eq, &call_to_j); __ Ret(); - Label jump_to_c, call_to_j; __ Bind(&start); __ Adr(x0, &none); - __ Adr(lr, &jump_to_c); + __ Mov(x1, 0); __ Br(x0); __ Bind(&jump_to_c); __ Adr(x0, &c); - __ Adr(lr, &call_to_j); + __ Mov(x1, 1); __ Br(x0); __ Bind(&call_to_j); __ Adr(x0, &j); + __ Mov(x1, 2); __ Blr(x0); END(); if (CAN_RUN()) { #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 simulator.SetGuardedPages(false); -#else - VIXL_UNIMPLEMENTED(); #endif RUN(); } @@ -7582,12 +7729,12 @@ TEST(bti_jump_to_ip_unidentified) { if (CAN_RUN()) { #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 simulator.SetGuardedPages(true); -#else - VIXL_UNIMPLEMENTED(); -#endif MUST_FAIL_WITH_MESSAGE(RUN(), "Executing non-BTI instruction with wrong " "BType."); +#else + printf("SKIPPED: marking guarded pages is unimplemented on hardware"); +#endif } } @@ -7606,12 +7753,12 @@ 
TEST(bti_jump_to_unidentified) { if (CAN_RUN()) { #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 simulator.SetGuardedPages(true); -#else - VIXL_UNIMPLEMENTED(); -#endif MUST_FAIL_WITH_MESSAGE(RUN(), "Executing non-BTI instruction with wrong " "BType."); +#else + printf("SKIPPED: marking guarded pages is unimplemented on hardware"); +#endif } } @@ -7630,12 +7777,12 @@ TEST(bti_call_to_unidentified) { if (CAN_RUN()) { #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 simulator.SetGuardedPages(true); -#else - VIXL_UNIMPLEMENTED(); -#endif MUST_FAIL_WITH_MESSAGE(RUN(), "Executing non-BTI instruction with wrong " "BType."); +#else + printf("SKIPPED: marking guarded pages is unimplemented on hardware"); +#endif } } @@ -7655,10 +7802,10 @@ TEST(bti_jump_to_c) { if (CAN_RUN()) { #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 simulator.SetGuardedPages(true); -#else - VIXL_UNIMPLEMENTED(); -#endif MUST_FAIL_WITH_MESSAGE(RUN(), "Executing BTI c with wrong BType."); +#else + printf("SKIPPED: marking guarded pages is unimplemented on hardware"); +#endif } } @@ -7678,10 +7825,10 @@ TEST(bti_call_to_j) { if (CAN_RUN()) { #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 simulator.SetGuardedPages(true); -#else - VIXL_UNIMPLEMENTED(); -#endif MUST_FAIL_WITH_MESSAGE(RUN(), "Executing BTI j with wrong BType."); +#else + printf("SKIPPED: marking guarded pages is unimplemented on hardware"); +#endif } } #endif // VIXL_NEGATIVE_TESTING @@ -7706,12 +7853,17 @@ TEST(fall_through_bti) { if (CAN_RUN()) { #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 simulator.SetGuardedPages(true); -#else - VIXL_UNIMPLEMENTED(); #endif + // On hardware, we'll run the test anyway, but mark it as SKIPPED until + // we've implemented a mechanism for marking Guarded pages. 
+ RUN(); ASSERT_EQUAL_64(4, x0); + +#ifndef VIXL_INCLUDE_SIMULATOR_AARCH64 + printf("SKIPPED: marking guarded pages is unimplemented on hardware"); +#endif } } @@ -13364,6 +13516,76 @@ TEST(collision_literal_veneer_pools) { END(); } +static void VeneerBackwardBranchHelper(ImmBranchType type, int limit) { + SETUP(); + START(); + + // This is a code generation test. The code generated is not executed. + + __ Mov(x0, 1); + + // Non-veneer case: generate 'limit' instructions, plus the branch itself. + Label start0; + __ Bind(&start0); + for (int i = 0; i < limit; i++) { + __ Nop(); + } + switch (type) { + case CompareBranchType: + __ Cbz(x0, &start0); + break; + case TestBranchType: + __ Tbz(x0, 0, &start0); + break; + default: + VIXL_ASSERT(type == CondBranchType); + __ B(eq, &start0); + } + VIXL_CHECK(masm.GetSizeOfCodeGeneratedSince(&start0) == + ((limit + 1) * kInstructionSize)); + + // Veneer case: As above, plus one extra nop and a branch for the veneer; we + // expect a total of limit + 3 instructions. + // + // start1: + // nop x (limit + 1) + // tbnz skip_veneer + // b start1 + // skip_veneer: + // + Label start1; + __ Bind(&start1); + for (int i = 0; i < limit; i++) { + __ Nop(); + } + __ Nop(); // One extra instruction to exceed branch range. 
+ switch (type) { + case CompareBranchType: + __ Cbz(x0, &start0); + break; + case TestBranchType: + __ Tbz(x0, 0, &start0); + break; + default: + VIXL_ASSERT(type == CondBranchType); + __ B(eq, &start0); + } + VIXL_CHECK(masm.GetSizeOfCodeGeneratedSince(&start1) == + ((limit + 3) * kInstructionSize)); + + END(); + DISASSEMBLE(); +} + +TEST(veneer_backward_tbz) { VeneerBackwardBranchHelper(TestBranchType, 8192); } + +TEST(veneer_backward_cbz) { + VeneerBackwardBranchHelper(CompareBranchType, 262144); +} + +TEST(veneer_backward_bcond) { + VeneerBackwardBranchHelper(CondBranchType, 262144); +} TEST(ldr_literal_explicit) { SETUP(); @@ -14085,20 +14307,24 @@ TEST(mte_irg) { __ Bind(&done); - // Insert random tags, excluding oddly-numbered tags, then orr them together. - // After 128 rounds, it's statistically likely that all but the least - // significant bit will be set. + // Insert random tags, excluding oddly-numbered tags, and set a bit in a + // result register for each tag used. + // After 128 rounds, it's statistically likely that all even bits in the + // least-significant half word will be set. __ Mov(x3, 0); + __ Mov(x4, 1); __ Mov(x10, 128); __ Mov(x11, 0xaaaa); Label loop2; __ Bind(&loop2); __ Irg(x2, x1, x11); + __ Lsr(x2, x2, 56); + __ Lsl(x2, x4, x2); __ Orr(x3, x3, x2); __ Subs(x10, x10, 1); __ B(ne, &loop2); - __ Lsr(x2, x3, 56); + __ Mov(x2, x3); // Check that excluding all tags results in zero tag insertion. __ Mov(x3, 0xffffffffffffffff); @@ -14109,7 +14335,7 @@ TEST(mte_irg) { RUN(); ASSERT_EQUAL_64(0, x1); - ASSERT_EQUAL_64(0xe, x2); + ASSERT_EQUAL_64(0x5555, x2); ASSERT_EQUAL_64(0xf0ffffffffffffff, x3); } } @@ -14131,23 +14357,36 @@ TEST(mops_set) { __ Setp(x1, x2, x3); __ Setm(x1, x2, x3); __ Sete(x1, x2, x3); + __ Mrs(x20, NZCV); // x2 is now zero, so this should do nothing. __ Setp(x1, x2, x3); __ Setm(x1, x2, x3); __ Sete(x1, x2, x3); + __ Mrs(x21, NZCV); // Set dst[15] to zero using the masm helper. 
__ Add(x1, x0, 15); __ Mov(x2, 1); __ Set(x1, x2, xzr); + __ Mrs(x22, NZCV); // Load dst for comparison. __ Ldp(x10, x11, MemOperand(x0)); END(); if (CAN_RUN()) { + // Permitted results: + // NZCV Xd Xn + // Option A: .... end of buffer 0 + // Option B: ..C. end of buffer 0 + + std::vector allowed_flags = {NoFlag, CFlag}; + RUN(); + ASSERT_EQUAL_64(allowed_flags, x20); + ASSERT_EQUAL_64(allowed_flags, x21); + ASSERT_EQUAL_64(allowed_flags, x22); ASSERT_EQUAL_64(dst_addr + 16, x1); ASSERT_EQUAL_64(0, x2); ASSERT_EQUAL_64(0x1234aa, x3); @@ -14171,11 +14410,20 @@ TEST(mops_setn) { __ Mov(x2, 16); __ Mov(x3, 0x42); __ Setn(x1, x2, x3); + __ Mrs(x20, NZCV); __ Ldp(x10, x11, MemOperand(x0)); END(); if (CAN_RUN()) { + // Permitted results: + // NZCV Xd Xn + // Option A: .... end of buffer 0 + // Option B: ..C. end of buffer 0 + + std::vector allowed_flags = {NoFlag, CFlag}; + RUN(); + ASSERT_EQUAL_64(allowed_flags, x20); ASSERT_EQUAL_64(dst_addr + 16, x1); ASSERT_EQUAL_64(0, x2); ASSERT_EQUAL_64(0x42, x3); @@ -14187,10 +14435,10 @@ TEST(mops_setn) { TEST(mops_setg) { SETUP_WITH_FEATURES(CPUFeatures::kMOPS, CPUFeatures::kMTE); - uint8_t* dst_addr = nullptr; + uint8_t* dst = nullptr; #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 const int dst_size = 32; - dst_addr = reinterpret_cast( + dst = reinterpret_cast( simulator.Mmap(NULL, dst_size * sizeof(uint8_t), PROT_READ | PROT_WRITE | PROT_MTE, @@ -14198,32 +14446,47 @@ TEST(mops_setg) { -1, 0)); - VIXL_ASSERT(dst_addr != nullptr); - uint8_t* untagged_ptr = AddressUntag(dst_addr); + VIXL_ASSERT(dst != nullptr); + uint8_t* untagged_ptr = AddressUntag(dst); memset(untagged_ptr, 0xc9, dst_size); #else // TODO: Port the memory allocation to work on MTE supported platform natively. // Note that `CAN_RUN` prevents running in MTE-unsupported environments. 
#endif + uintptr_t dst_addr = reinterpret_cast(dst); + uint64_t tag_mask = 0xf0ff'ffff'ffff'ffff; + START(); - __ Mov(x0, reinterpret_cast(dst_addr)); + __ Mov(x0, dst_addr); __ Gmi(x2, x0, xzr); __ Irg(x1, x0, x2); // Choose new tag for setg destination. __ Mov(x2, 16); __ Mov(x3, 0x42); __ Setg(x1, x2, x3); + __ Mrs(x20, NZCV); __ Ubfx(x4, x1, 56, 4); // Extract new tag. __ Bfi(x0, x4, 56, 4); // Tag dst_addr so set region can be loaded. __ Ldp(x10, x11, MemOperand(x0)); - __ Mov(x0, reinterpret_cast(dst_addr)); + __ Mov(x0, dst_addr); __ Ldp(x12, x13, MemOperand(x0, 16)); // Unset region has original tag. + + __ And(x1, x1, tag_mask); // Strip tag for repeatable checks. END(); if (CAN_RUN()) { + // Permitted results: + // NZCV Xd Xn + // Option A: .... end of buffer 0 + // Option B: ..C. end of buffer 0 + + std::vector allowed_flags = {NoFlag, CFlag}; + RUN(); + ASSERT_EQUAL_64(allowed_flags, x20); + ASSERT_EQUAL_64((dst_addr & tag_mask) + 16, x1); ASSERT_EQUAL_64(0, x2); ASSERT_EQUAL_64(0x42, x3); ASSERT_EQUAL_64(0x4242'4242'4242'4242, x10); @@ -14233,7 +14496,7 @@ TEST(mops_setg) { } #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 - simulator.Munmap(dst_addr, dst_size, PROT_MTE); + simulator.Munmap(dst, dst_size, PROT_MTE); #endif } @@ -14251,38 +14514,73 @@ TEST(mops_cpy) { __ Mov(x0, buf_addr); // Copy first eight bytes into second eight. - __ Mov(x2, x0); // src = &buf[0] - __ Add(x3, x0, 8); // dst = &buf[8] - __ Mov(x4, 8); // count = 8 - __ Cpyp(x3, x2, x4); - __ Cpym(x3, x2, x4); - __ Cpye(x3, x2, x4); + __ Mov(x1, x0); // src = &buf[0] + __ Add(x2, x0, 8); // dst = &buf[8] + __ Mov(x3, 8); // count = 8 + __ Cpyp(x2, x1, x3); + __ Cpym(x2, x1, x3); + __ Cpye(x2, x1, x3); __ Ldp(x10, x11, MemOperand(x0)); __ Mrs(x20, NZCV); - // Copy first eight bytes to overlapping offset, causing reverse copy. 
- __ Mov(x5, x0); // src = &buf[0] - __ Add(x6, x0, 4); // dst = &buf[4] - __ Mov(x7, 8); // count = 8 - __ Cpy(x6, x5, x7); + // Copy first eight bytes to overlapping offset, forcing backwards copy. + __ Mov(x4, x0); // src = &buf[0] + __ Add(x5, x0, 4); // dst = &buf[4] + __ Mov(x6, 8); // count = 8 + __ Cpy(x5, x4, x6); __ Ldp(x12, x13, MemOperand(x0)); + __ Mrs(x21, NZCV); + + // Copy last eight bytes to overlapping offset, forcing forwards copy. + __ Add(x7, x0, 8); // src = &buf[8] + __ Add(x8, x0, 6); // dst = &buf[6] + __ Mov(x9, 8); // count = 8 + __ Cpy(x8, x7, x9); + __ Ldp(x14, x15, MemOperand(x0)); + __ Mrs(x22, NZCV); END(); if (CAN_RUN()) { + // Permitted results: + // NZCV Xs/Xd Xn + // Option A (forwards) : .... ends of buffers 0 + // Option A (backwards): .... starts of buffers 0 + // Option B (forwards) : ..C. ends of buffers 0 + // Option B (backwards): N.C. starts of buffers 0 + + std::vector allowed_backwards_flags = {NoFlag, NCFlag}; + std::vector allowed_forwards_flags = {NoFlag, CFlag}; + RUN(); - ASSERT_EQUAL_64(buf_addr + 8, x2); - ASSERT_EQUAL_64(buf_addr + 16, x3); - ASSERT_EQUAL_64(0, x4); + // IMPLEMENTATION DEFINED direction + if (static_cast(core.xreg(2)) > buf_addr) { + // Forwards + ASSERT_EQUAL_64(buf_addr + 8, x1); + ASSERT_EQUAL_64(buf_addr + 16, x2); + ASSERT_EQUAL_64(allowed_forwards_flags, x20); + } else { + // Backwards + ASSERT_EQUAL_64(buf_addr, x1); + ASSERT_EQUAL_64(buf_addr + 8, x2); + ASSERT_EQUAL_64(allowed_backwards_flags, x20); + } + ASSERT_EQUAL_64(0, x3); // Xn ASSERT_EQUAL_64(0x0706'0504'0302'0100, x10); ASSERT_EQUAL_64(0x0706'0504'0302'0100, x11); - ASSERT_EQUAL_64(CFlag, x20); - ASSERT_EQUAL_64(buf_addr, x5); - ASSERT_EQUAL_64(buf_addr + 4, x6); - ASSERT_EQUAL_64(0, x7); + ASSERT_EQUAL_64(buf_addr, x4); // Xs + ASSERT_EQUAL_64(buf_addr + 4, x5); // Xd + ASSERT_EQUAL_64(0, x6); // Xn ASSERT_EQUAL_64(0x0302'0100'0302'0100, x12); ASSERT_EQUAL_64(0x0706'0504'0706'0504, x13); - ASSERT_EQUAL_NZCV(NCFlag); + 
ASSERT_EQUAL_64(allowed_backwards_flags, x21); + + ASSERT_EQUAL_64(buf_addr + 16, x7); // Xs + ASSERT_EQUAL_64(buf_addr + 14, x8); // Xd + ASSERT_EQUAL_64(0, x9); // Xn + ASSERT_EQUAL_64(0x0504'0100'0302'0100, x14); + ASSERT_EQUAL_64(0x0706'0706'0504'0706, x15); + ASSERT_EQUAL_64(allowed_forwards_flags, x22); } } @@ -14302,44 +14600,61 @@ TEST(mops_cpyn) { START(); __ Mov(x0, buf_addr); - __ Add(x2, x0, 1); // src = &buf[1] - __ Mov(x3, x0); // dst = &buf[0] - __ Mov(x4, 15); // count = 15 - __ Cpyn(x3, x2, x4); + __ Add(x1, x0, 1); // src = &buf[1] + __ Mov(x2, x0); // dst = &buf[0] + __ Mov(x3, 15); // count = 15 + __ Cpyn(x2, x1, x3); __ Ldp(x10, x11, MemOperand(x0)); + __ Mrs(x20, NZCV); - __ Add(x5, x0, 1); // src = &buf[1] - __ Mov(x6, x0); // dst = &buf[0] - __ Mov(x4, 15); // count = 15 - __ Cpyrn(x6, x5, x4); + __ Add(x4, x0, 1); // src = &buf[1] + __ Mov(x5, x0); // dst = &buf[0] + __ Mov(x6, 15); // count = 15 + __ Cpyrn(x5, x4, x6); __ Ldp(x12, x13, MemOperand(x0)); + __ Mrs(x21, NZCV); __ Add(x7, x0, 1); // src = &buf[1] __ Mov(x8, x0); // dst = &buf[0] - __ Mov(x4, 15); // count = 15 - __ Cpywn(x8, x7, x4); + __ Mov(x9, 15); // count = 15 + __ Cpywn(x8, x7, x9); __ Ldp(x14, x15, MemOperand(x0)); + __ Mrs(x22, NZCV); END(); if (CAN_RUN()) { + // Permitted results: + // NZCV Xs/Xd Xn + // Option A (forwards) : .... ends of buffers 0 + // Option A (backwards): .... starts of buffers 0 + // Option B (forwards) : ..C. ends of buffers 0 + // Option B (backwards): N.C. starts of buffers 0 + // + // All cases overlap to force a forwards copy. 
+ + std::vector allowed_forwards_flags = {NoFlag, CFlag}; + RUN(); - ASSERT_EQUAL_64(buf_addr + 16, x2); - ASSERT_EQUAL_64(buf_addr + 15, x3); + ASSERT_EQUAL_64(buf_addr + 16, x1); // Xs + ASSERT_EQUAL_64(buf_addr + 15, x2); // Xd + ASSERT_EQUAL_64(0, x3); // Xn + ASSERT_EQUAL_64(allowed_forwards_flags, x20); ASSERT_EQUAL_64(0x0807'0605'0403'0201, x10); ASSERT_EQUAL_64(0x0f0f'0e0d'0c0b'0a09, x11); - ASSERT_EQUAL_64(buf_addr + 16, x5); - ASSERT_EQUAL_64(buf_addr + 15, x6); + ASSERT_EQUAL_64(buf_addr + 16, x4); // Xs + ASSERT_EQUAL_64(buf_addr + 15, x5); // Xd + ASSERT_EQUAL_64(0, x6); // Xn + ASSERT_EQUAL_64(allowed_forwards_flags, x21); ASSERT_EQUAL_64(0x0908'0706'0504'0302, x12); ASSERT_EQUAL_64(0x0f0f'0f0e'0d0c'0b0a, x13); - ASSERT_EQUAL_64(buf_addr + 16, x7); - ASSERT_EQUAL_64(buf_addr + 15, x8); + ASSERT_EQUAL_64(buf_addr + 16, x7); // Xs + ASSERT_EQUAL_64(buf_addr + 15, x8); // Xd + ASSERT_EQUAL_64(0, x9); // Xn + ASSERT_EQUAL_64(allowed_forwards_flags, x22); ASSERT_EQUAL_64(0x0a09'0807'0605'0403, x14); ASSERT_EQUAL_64(0x0f0f'0f0f'0e0d'0c0b, x15); - - ASSERT_EQUAL_64(0, x4); - ASSERT_EQUAL_NZCV(CFlag); } } @@ -14353,46 +14668,79 @@ TEST(mops_cpyf) { buf[i] = i; } - // This test matches the cpy variant above, but using cpyf will result in a - // different answer for the overlapping copy. + // As `mops_cpy`, but `cpyf` always copies forwards, so is only useful for + // non-overlapping buffers, or those where the source address is greater than + // the destination address. + START(); __ Mov(x0, buf_addr); - // Copy first eight bytes into second eight. - __ Mov(x2, x0); // src = &buf[0] - __ Add(x3, x0, 8); // dst = &buf[8] - __ Mov(x4, 8); // count = 8 - __ Cpyf(x3, x2, x4); + // Copy first eight bytes into second eight, without overlap. 
+ __ Mov(x1, x0); // src = &buf[0] + __ Add(x2, x0, 8); // dst = &buf[8] + __ Mov(x3, 8); // count = 8 + __ Cpyfp(x2, x1, x3); + __ Cpyfm(x2, x1, x3); + __ Cpyfe(x2, x1, x3); __ Ldp(x10, x11, MemOperand(x0)); __ Mrs(x20, NZCV); - // Copy first eight bytes to overlapping offset. - __ Mov(x5, x0); // src = &buf[0] - __ Add(x6, x0, 4); // dst = &buf[4] - __ Mov(x7, 8); // count = 8 - __ Cpyf(x6, x5, x7); + // Copy last eight bytes to overlapping offset where src < dst. + __ Add(x4, x0, 8); // src = &buf[8] + __ Add(x5, x0, 6); // dst = &buf[6] + __ Mov(x6, 8); // count = 8 + __ Cpyf(x5, x4, x6); __ Ldp(x12, x13, MemOperand(x0)); + __ Mrs(x21, NZCV); + + // Copy first eight bytes to overlapping offset where src > dst. + __ Mov(x7, x0); // src = &buf[0] + __ Add(x8, x0, 4); // dst = &buf[4] + __ Mov(x9, 8); // count = 8 + __ Cpyf(x8, x7, x9); + // The only testable result is the first and last four bytes, which are not + // written at all. + __ Ldr(w14, MemOperand(x0)); + __ Ldr(w15, MemOperand(x0, 12)); + __ Mrs(x22, NZCV); + END(); if (CAN_RUN()) { + // Permitted results: + // NZCV Xs/Xd Xn + // Option A: .... ends of buffers 0 + // Option B: ..C. ends of buffers 0 + + std::vector allowed_forwards_flags = {NoFlag, CFlag}; + RUN(); - ASSERT_EQUAL_64(buf_addr + 8, x2); - ASSERT_EQUAL_64(buf_addr + 16, x3); - ASSERT_EQUAL_64(0, x4); + + // No overlap. + ASSERT_EQUAL_64(buf_addr + 8, x1); // Xs + ASSERT_EQUAL_64(buf_addr + 16, x2); // Xd + ASSERT_EQUAL_64(0, x3); // Xn + ASSERT_EQUAL_64(allowed_forwards_flags, x20); ASSERT_EQUAL_64(0x0706'0504'0302'0100, x10); ASSERT_EQUAL_64(0x0706'0504'0302'0100, x11); - ASSERT_EQUAL_64(CFlag, x20); - ASSERT_EQUAL_64(buf_addr + 8, x5); - ASSERT_EQUAL_64(buf_addr + 12, x6); - ASSERT_EQUAL_64(0, x7); - ASSERT_EQUAL_NZCV(CFlag); + // Overlap, src > dst. 
+ ASSERT_EQUAL_64(buf_addr + 16, x4); // Xs + ASSERT_EQUAL_64(buf_addr + 14, x5); // Xd + ASSERT_EQUAL_64(0, x6); // Xn + ASSERT_EQUAL_64(0x0100'0504'0302'0100, x12); + ASSERT_EQUAL_64(0x0706'0706'0504'0302, x13); + ASSERT_EQUAL_64(allowed_forwards_flags, x21); - // These results are not architecturally defined. They may change if the - // simulator is implemented in a different, but still architecturally - // correct, way. - ASSERT_EQUAL_64(0x0302'0100'0302'0100, x12); - ASSERT_EQUAL_64(0x0706'0504'0302'0100, x13); + // Overlap, src < dst. + ASSERT_EQUAL_64(buf_addr + 8, x7); // Xs + ASSERT_EQUAL_64(buf_addr + 12, x8); // Xd + ASSERT_EQUAL_64(0, x9); // Xn + // We can only reliably test that the operation didn't write outside the + // specified region. + ASSERT_EQUAL_32(0x0302'0100, w14); + ASSERT_EQUAL_32(0x0706'0706, w15); + ASSERT_EQUAL_64(allowed_forwards_flags, x22); } } @@ -14412,44 +14760,57 @@ TEST(mops_cpyfn) { START(); __ Mov(x0, buf_addr); - __ Add(x2, x0, 1); // src = &buf[1] - __ Mov(x3, x0); // dst = &buf[0] - __ Mov(x4, 15); // count = 15 - __ Cpyfn(x3, x2, x4); + __ Add(x1, x0, 1); // src = &buf[1] + __ Mov(x2, x0); // dst = &buf[0] + __ Mov(x3, 15); // count = 15 + __ Cpyfn(x2, x1, x3); __ Ldp(x10, x11, MemOperand(x0)); + __ Mrs(x20, NZCV); - __ Add(x5, x0, 1); // src = &buf[1] - __ Mov(x6, x0); // dst = &buf[0] - __ Mov(x4, 15); // count = 15 - __ Cpyfrn(x6, x5, x4); + __ Add(x4, x0, 1); // src = &buf[1] + __ Mov(x5, x0); // dst = &buf[0] + __ Mov(x6, 15); // count = 15 + __ Cpyfrn(x5, x4, x6); __ Ldp(x12, x13, MemOperand(x0)); + __ Mrs(x21, NZCV); __ Add(x7, x0, 1); // src = &buf[1] __ Mov(x8, x0); // dst = &buf[0] - __ Mov(x4, 15); // count = 15 - __ Cpyfwn(x8, x7, x4); + __ Mov(x9, 15); // count = 15 + __ Cpyfwn(x8, x7, x9); __ Ldp(x14, x15, MemOperand(x0)); + __ Mrs(x22, NZCV); END(); if (CAN_RUN()) { + // Permitted results: + // NZCV Xs/Xd Xn + // Option A: .... ends of buffers 0 + // Option B: ..C. 
ends of buffers 0 + + std::vector allowed_flags = {NoFlag, CFlag}; + RUN(); - ASSERT_EQUAL_64(buf_addr + 16, x2); - ASSERT_EQUAL_64(buf_addr + 15, x3); + ASSERT_EQUAL_64(buf_addr + 16, x1); // Xs + ASSERT_EQUAL_64(buf_addr + 15, x2); // Xd + ASSERT_EQUAL_64(0, x3); // Xn + ASSERT_EQUAL_64(allowed_flags, x20); ASSERT_EQUAL_64(0x0807'0605'0403'0201, x10); ASSERT_EQUAL_64(0x0f0f'0e0d'0c0b'0a09, x11); - ASSERT_EQUAL_64(buf_addr + 16, x5); - ASSERT_EQUAL_64(buf_addr + 15, x6); + ASSERT_EQUAL_64(buf_addr + 16, x4); // Xs + ASSERT_EQUAL_64(buf_addr + 15, x5); // Xd + ASSERT_EQUAL_64(0, x6); // Xn + ASSERT_EQUAL_64(allowed_flags, x21); ASSERT_EQUAL_64(0x0908'0706'0504'0302, x12); ASSERT_EQUAL_64(0x0f0f'0f0e'0d0c'0b0a, x13); - ASSERT_EQUAL_64(buf_addr + 16, x7); - ASSERT_EQUAL_64(buf_addr + 15, x8); + ASSERT_EQUAL_64(buf_addr + 16, x7); // Xs + ASSERT_EQUAL_64(buf_addr + 15, x8); // Xd + ASSERT_EQUAL_64(0, x9); // Xn + ASSERT_EQUAL_64(allowed_flags, x22); ASSERT_EQUAL_64(0x0a09'0807'0605'0403, x14); ASSERT_EQUAL_64(0x0f0f'0f0f'0e0d'0c0b, x15); - - ASSERT_EQUAL_64(0, x4); - ASSERT_EQUAL_NZCV(CFlag); } } @@ -14723,6 +15084,298 @@ TEST(cssc_smax) { MinMaxHelper(op, true, s64min, s64max, 0, s64max); } +static void ChkfeatHelper(uint64_t initial, + uint64_t chkfeat, + CPUFeatures require) { + SETUP_WITH_FEATURES(require); + + START(); + __ Mov(x16, initial); + __ Chkfeat(x16); + __ Mov(x0, x16); + + __ Mov(x1, initial); + __ Chkfeat(x1); + END(); + + if (CAN_RUN()) { + RUN_WITHOUT_SEEN_FEATURE_CHECK(); + ASSERT_EQUAL_64(chkfeat, x0); + ASSERT_EQUAL_64(x0, x1); + } +} + +TEST(chkfeat) { ChkfeatHelper(0x0, 0x0, CPUFeatures::None()); } + +TEST(chkfeat_gcs) { ChkfeatHelper(0x1, 0x0, CPUFeatures::kGCS); } + +TEST(chkfeat_unused) { + // Bits 1-63 are reserved. This test ensures that they are unmodified by + // `chkfeat`, but it will need to be updated if these bits are assigned in the + // future. 
+ ChkfeatHelper(0xffff'ffff'ffff'fffe, + 0xffff'ffff'ffff'fffe, + CPUFeatures::None()); +} + +TEST(gcs_feature_off) { + SETUP(); + + START(); +#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 + simulator.DisableGCSCheck(); +#else +// TODO: Disable GCS via operating system for this test, here and in the +// gcs_off_pac_on test below. +#endif + __ Mov(x16, 0x0123'4567'89ab'cdef); + __ Chkfeat(x16); + + // This sequence would fail with GCS enabled. + Label lab, end; + __ Bl(&lab); + __ B(&end); + + __ Bind(&lab); + __ Adr(lr, &end); + __ Ret(); + + __ Bind(&end); + END(); + + if (CAN_RUN()) { + // TODO: This will currently fail on GCS-supporting hardware. + RUN(); + ASSERT_EQUAL_64(0x0123'4567'89ab'cdef, x16); + } +} + +TEST(gcs_gcspushm) { + SETUP_WITH_FEATURES(CPUFeatures::kGCS); + + Label ret; + START(); + __ Adr(x0, &ret); + __ Gcspushm(x0); + __ Ret(x0); + __ Nop(); + __ Bind(&ret); + END(); + + if (CAN_RUN()) { + RUN(); + } +} + +TEST(gcs_gcspopm) { + SETUP_WITH_FEATURES(CPUFeatures::kGCS); + + Label lab, ret; + START(); + __ Adr(x0, &ret); + __ Bl(&lab); + __ Bind(&ret); + __ Nop(); + __ Bind(&lab); + __ Gcspopm(x1); + END(); + + if (CAN_RUN()) { + RUN(); + ASSERT_EQUAL_64(x0, x1); + } +} + +TEST(gcs_gcsss1) { + SETUP_WITH_FEATURES(CPUFeatures::kGCS); + + START(); +#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 + uint64_t new_gcs = simulator.GetGCSManager().AllocateStack(); + __ Mov(x0, new_gcs); +#else +// TODO: Request new GCS from the operating system. +#endif + + // Partial stack swap to check GCS has changed, and a token is at the top + // of the new stack. + __ Gcsss1(x0); + __ Gcspopm(x1); + + __ Bic(x0, x0, 7); // Clear LSB of new GCS. + __ Bic(x2, x1, 7); // Clear LSB of old GCS. + __ Cmp(x0, x2); + __ Cset(x0, eq); + __ And(x1, x1, 7); // In progress token. + END(); + + if (CAN_RUN()) { + RUN(); + ASSERT_EQUAL_64(0, x0); // GCS must not be equal. + ASSERT_EQUAL_64(5, x1); // In progress token must be present. 
+ } +} + +// TODO: Add extra tests for combinations of PAC and GCS enabled. +TEST(gcs_stack_swap) { + SETUP_WITH_FEATURES(CPUFeatures::kGCS); + + START(); + Label stack_swap, sub_fn, end; +#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 + uint64_t new_gcs = simulator.GetGCSManager().AllocateStack(); + __ Mov(x0, new_gcs); +#else +// TODO: Request new GCS from the operating system. +#endif + __ Bl(&stack_swap); + __ B(&end); + + __ Bind(&stack_swap); + __ Gcsss1(x0); // x0 = new GCS. + __ Gcsss2(x1); // x1 = old GCS. + __ Mov(x29, lr); + __ Bl(&sub_fn); + __ Mov(lr, x29); + __ Gcsss1(x1); // Restore old GCS. + __ Gcsss2(x0); + __ Ret(); + + __ Bind(&sub_fn); + __ Mov(x2, 42); + __ Ret(); + + __ Bind(&end); + END(); + + if (CAN_RUN()) { + RUN(); + ASSERT_EQUAL_64(42, x2); + } +} + +TEST(gcs_off_pac_on) { + SETUP_WITH_FEATURES(CPUFeatures::kPAuth); + + START(); +#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 + simulator.DisableGCSCheck(); +#else +// TODO: Disable GCS via operating system for this test, and enable for native. +#endif + __ Mov(x16, 1); + __ Chkfeat(x16); + __ Mov(x1, x16); + + Label fn1, after_fn1; + + __ Mov(x28, sp); + __ Mov(x29, lr); + __ Mov(sp, 0x477d469dec0b8760); + + __ Mov(x0, 0); + __ B(&after_fn1); + + __ Bind(&fn1); + __ Mov(x0, 42); + __ Paciasp(); + __ Retaa(); + + __ Bind(&after_fn1); + __ Bl(&fn1); + + __ Mov(sp, x28); + __ Mov(lr, x29); + END(); + + if (CAN_RUN()) { + RUN(); + + ASSERT_EQUAL_64(42, x0); + ASSERT_EQUAL_64(1, x1); + } +} + +#ifdef VIXL_NEGATIVE_TESTING +TEST(gcs_negative_test) { + SETUP_WITH_FEATURES(CPUFeatures::kGCS); + + Label fn, bad_return_addr, done; + START(); + __ Bl(&fn); + __ Nop(); // GCS enforces that fn() returns here... + + __ Bind(&bad_return_addr); + __ B(&done); // ... but this test attempts to return here. 
+ + __ Bind(&fn); + __ Adr(lr, &bad_return_addr); + __ Ret(); + + __ Bind(&done); + END(); + + if (CAN_RUN()) { + MUST_FAIL_WITH_MESSAGE(RUN(), "GCS failed"); + } +} +#endif // VIXL_NEGATIVE_TESTING + +TEST(dc_zva) { + SETUP_WITH_FEATURES(CPUFeatures::kNEON); + + const int zva_blocksize = 64; // Assumed blocksize. + uint8_t buf[2 * zva_blocksize]; + uintptr_t buf_addr = reinterpret_cast(buf); + uintptr_t aligned_addr = AlignUp(buf_addr, zva_blocksize); + + START(); + // Skip this test if the ZVA blocksize is not 64 bytes. + // Set up initial register values to allow the test to pass when skipped. + Label skip; + __ Movi(q0.V16B(), 0); + __ Movi(q1.V16B(), 0); + __ Movi(q2.V16B(), 0); + __ Movi(q3.V16B(), 0); + + __ Mrs(x1, DCZID_EL0); + __ Cmp(x1, 4); // 4 => DC ZVA enabled with 64-byte blocks. + __ B(ne, &skip); + + // Fill aligned region with a pattern. + __ Mov(x0, aligned_addr); + __ Movi(q0.V16B(), 0x55); + __ Movi(q1.V16B(), 0xaa); + __ Movi(q2.V16B(), 0x55); + __ Movi(q3.V16B(), 0xaa); + __ St4(q0.V16B(), q1.V16B(), q2.V16B(), q3.V16B(), MemOperand(x0)); + + // Misalign the address to check DC ZVA re-aligns. + __ Add(x0, x0, 42); + + // Clear the aligned region. + __ Dc(ZVA, x0); + + // Reload the aligned region to check contents. + __ Mov(x0, aligned_addr); + __ Ld1(q0.V16B(), q1.V16B(), q2.V16B(), q3.V16B(), MemOperand(x0)); + + __ Bind(&skip); + END(); + + if (CAN_RUN()) { + RUN(); + if (core.xreg(1) == 4) { + ASSERT_EQUAL_128(0, 0, q0); + ASSERT_EQUAL_128(0, 0, q1); + ASSERT_EQUAL_128(0, 0, q2); + ASSERT_EQUAL_128(0, 0, q3); + } else { + printf("SKIPPED: DC ZVA chunksize not 64-bytes"); + } + } +} + #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64 // Test the pseudo-instructions that control CPUFeatures dynamically in the // Simulator. 
These are used by the test infrastructure itself, but in a fairly diff --git a/test/aarch64/test-assembler-fp-aarch64.cc b/test/aarch64/test-assembler-fp-aarch64.cc index 56073592..22010298 100644 --- a/test/aarch64/test-assembler-fp-aarch64.cc +++ b/test/aarch64/test-assembler-fp-aarch64.cc @@ -3670,720 +3670,280 @@ TEST(fcvt_half) { } } +typedef void (MacroAssembler::*FcvtFn2)(const Register& rd, + const VRegister& vn); +typedef void (MacroAssembler::*FcvtFn3)(const Register& rd, + const VRegister& vn, + int fbits); + +static void GenFcvt(MacroAssembler* m, + FcvtFn2 fn, + const Register& rd, + const VRegister& vn) { + (m->*fn)(rd, vn); +} +static void GenFcvt(MacroAssembler* m, + FcvtFn3 fn, + const Register& rd, + const VRegister& vn) { + (m->*fn)(rd, vn, 0); +} + +template +static void FcvtHelper(F fn, + const T (&inputs)[N], + const uint64_t (&expected)[N], + int dstsize) { + VIXL_STATIC_ASSERT(N < 16); // Use no more than 16 registers. + + SETUP_WITH_FEATURES(CPUFeatures::kFP); + START(); + + for (unsigned i = 0; i < N; i++) { + Register wi = WRegister(i); + Register xi = XRegister(i); + VRegister si = SRegister(i); + VRegister di = DRegister(i); + + if (std::is_same::value) { + __ Fmov(si, inputs[i]); + if (dstsize == kWRegSize) { + GenFcvt(&masm, fn, wi, si); + } else { + VIXL_ASSERT(dstsize == kXRegSize); + GenFcvt(&masm, fn, xi, si); + } + } else { + __ Fmov(di, inputs[i]); + if (dstsize == kWRegSize) { + GenFcvt(&masm, fn, wi, di); + } else { + VIXL_ASSERT(dstsize == kXRegSize); + GenFcvt(&masm, fn, xi, di); + } + } + } + + END(); + if (CAN_RUN()) { + RUN(); + + for (unsigned i = 0; i < N; i++) { + ASSERT_EQUAL_64(expected[i], XRegister(i)); + } + } +} + +// Largest float/double < INT32_MAX. +static const float kLargestF32ltI32Max = RawbitsToFloat(0x4effffff); +static const double kLargestF64ltI32Max = kWMaxInt - 1; + +// Smallest float/double > INT32_MIN. 
+static const float kSmallestF32gtI32Min = RawbitsToFloat(0xceffffff); +static const double kSmallestF64gtI32Min = kWMinInt + 1; + +// Largest float/double < INT64_MAX. +static const float kLargestF32ltI64Max = RawbitsToFloat(0x5effffff); +static const double kLargestF64ltI64Max = RawbitsToDouble(0x43dfffffffffffff); + +// Smallest float/double > INT64_MIN. +static const float kSmallestF32gtI64Min = RawbitsToFloat(0xdeffffff); +static const double kSmallestF64gtI64Min = RawbitsToDouble(0xc3dfffffffffffff); + +// Largest float/double < UINT32_MAX. +static const float kLargestF32ltU32Max = 0xffffff00; +static const double kLargestF64ltU32Max = 0xfffffffe; + +// Largest float/double < UINT64_MAX. +static const float kLargestF32ltU64Max = 0xffffff0000000000; +static const double kLargestF64ltU64Max = 0xfffffffffffff800; + +TEST(fcvt_infinity) { + float inputs_s[] = {kFP32PositiveInfinity, kFP32NegativeInfinity}; + double inputs_d[] = {kFP64PositiveInfinity, kFP64NegativeInfinity}; + uint64_t expected_w[] = {0x7fffffff, 0x80000000}; + uint64_t expected_x[] = {0x7fffffffffffffff, 0x8000000000000000}; + + // Test all combinations of fcvt, input size and output size. 
+ FcvtHelper(&MacroAssembler::Fcvtas, inputs_s, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtms, inputs_s, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtns, inputs_s, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtzs, inputs_s, expected_w, kWRegSize); + + FcvtHelper(&MacroAssembler::Fcvtas, inputs_d, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtms, inputs_d, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtns, inputs_d, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtzs, inputs_d, expected_w, kWRegSize); + + FcvtHelper(&MacroAssembler::Fcvtas, inputs_s, expected_x, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtms, inputs_s, expected_x, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtns, inputs_s, expected_x, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtzs, inputs_s, expected_x, kXRegSize); + + FcvtHelper(&MacroAssembler::Fcvtas, inputs_d, expected_x, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtms, inputs_d, expected_x, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtns, inputs_d, expected_x, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtzs, inputs_d, expected_x, kXRegSize); +} + +TEST(fcvt_ws_minmax) { + float inputs[] = {kLargestF32ltI32Max, kSmallestF32gtI32Min}; + uint64_t expected[] = {0x7fffff80, 0x80000080}; + FcvtHelper(&MacroAssembler::Fcvtas, inputs, expected, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtms, inputs, expected, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtns, inputs, expected, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtzs, inputs, expected, kWRegSize); + + float inputs_u[] = {kLargestF32ltU32Max}; + uint64_t expected_u[] = {0xffffff00}; + FcvtHelper(&MacroAssembler::Fcvtau, inputs_u, expected_u, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtmu, inputs_u, expected_u, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtnu, inputs_u, expected_u, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtzu, inputs_u, expected_u, kWRegSize); +} + +TEST(fcvt_wd_minmax) { + double inputs[] = 
{kLargestF64ltI32Max, kSmallestF64gtI32Min}; + uint64_t expected[] = {0x7ffffffe, 0x80000001}; + FcvtHelper(&MacroAssembler::Fcvtas, inputs, expected, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtms, inputs, expected, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtns, inputs, expected, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtzs, inputs, expected, kWRegSize); + + double inputs_u[] = {kLargestF64ltU32Max}; + uint64_t expected_u[] = {0xfffffffe}; + FcvtHelper(&MacroAssembler::Fcvtau, inputs_u, expected_u, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtmu, inputs_u, expected_u, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtnu, inputs_u, expected_u, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtzu, inputs_u, expected_u, kWRegSize); +} + +TEST(fcvt_xs_minmax) { + float inputs[] = {kLargestF32ltI64Max, kSmallestF32gtI64Min}; + uint64_t expected[] = {0x7fffff8000000000, 0x8000008000000000}; + FcvtHelper(&MacroAssembler::Fcvtas, inputs, expected, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtms, inputs, expected, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtns, inputs, expected, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtzs, inputs, expected, kXRegSize); + + float inputs_u[] = {kLargestF32ltU64Max}; + uint64_t expected_u[] = {0xffffff0000000000}; + FcvtHelper(&MacroAssembler::Fcvtau, inputs_u, expected_u, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtmu, inputs_u, expected_u, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtnu, inputs_u, expected_u, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtzu, inputs_u, expected_u, kXRegSize); +} + +TEST(fcvt_xd_minmax) { + double inputs[] = {kLargestF64ltI64Max, kSmallestF64gtI64Min}; + uint64_t expected[] = {0x7ffffffffffffc00, 0x8000000000000400}; + FcvtHelper(&MacroAssembler::Fcvtas, inputs, expected, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtms, inputs, expected, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtns, inputs, expected, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtzs, inputs, expected, kXRegSize); + + double 
inputs_u[] = {kLargestF64ltU64Max}; + uint64_t expected_u[] = {0xfffffffffffff800}; + FcvtHelper(&MacroAssembler::Fcvtau, inputs_u, expected_u, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtmu, inputs_u, expected_u, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtnu, inputs_u, expected_u, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtzu, inputs_u, expected_u, kXRegSize); +} TEST(fcvtas) { - SETUP_WITH_FEATURES(CPUFeatures::kFP); + float inputs_s[] = {1.0, 1.1, 2.5, -2.5}; + double inputs_d[] = {1.0, 1.1, 2.5, -2.5}; + uint64_t expected_w[] = {1, 1, 3, 0xfffffffd}; + uint64_t expected_x[] = {1, 1, 3, 0xfffffffffffffffd}; - START(); - __ Fmov(s0, 1.0); - __ Fmov(s1, 1.1); - __ Fmov(s2, 2.5); - __ Fmov(s3, -2.5); - __ Fmov(s4, kFP32PositiveInfinity); - __ Fmov(s5, kFP32NegativeInfinity); - __ Fmov(s6, 0x7fffff80); // Largest float < INT32_MAX. - __ Fneg(s7, s6); // Smallest float > INT32_MIN. - __ Fmov(d8, 1.0); - __ Fmov(d9, 1.1); - __ Fmov(d10, 2.5); - __ Fmov(d11, -2.5); - __ Fmov(d12, kFP64PositiveInfinity); - __ Fmov(d13, kFP64NegativeInfinity); - __ Fmov(d14, kWMaxInt - 1); - __ Fmov(d15, kWMinInt + 1); - __ Fmov(s17, 1.1); - __ Fmov(s18, 2.5); - __ Fmov(s19, -2.5); - __ Fmov(s20, kFP32PositiveInfinity); - __ Fmov(s21, kFP32NegativeInfinity); - __ Fmov(s22, 0x7fffff8000000000); // Largest float < INT64_MAX. - __ Fneg(s23, s22); // Smallest float > INT64_MIN. - __ Fmov(d24, 1.1); - __ Fmov(d25, 2.5); - __ Fmov(d26, -2.5); - __ Fmov(d27, kFP64PositiveInfinity); - __ Fmov(d28, kFP64NegativeInfinity); - __ Fmov(d29, 0x7ffffffffffffc00); // Largest double < INT64_MAX. - __ Fneg(d30, d29); // Smallest double > INT64_MIN. 
- - __ Fcvtas(w0, s0); - __ Fcvtas(w1, s1); - __ Fcvtas(w2, s2); - __ Fcvtas(w3, s3); - __ Fcvtas(w4, s4); - __ Fcvtas(w5, s5); - __ Fcvtas(w6, s6); - __ Fcvtas(w7, s7); - __ Fcvtas(w8, d8); - __ Fcvtas(w9, d9); - __ Fcvtas(w10, d10); - __ Fcvtas(w11, d11); - __ Fcvtas(w12, d12); - __ Fcvtas(w13, d13); - __ Fcvtas(w14, d14); - __ Fcvtas(w15, d15); - __ Fcvtas(x17, s17); - __ Fcvtas(x18, s18); - __ Fcvtas(x19, s19); - __ Fcvtas(x20, s20); - __ Fcvtas(x21, s21); - __ Fcvtas(x22, s22); - __ Fcvtas(x23, s23); - __ Fcvtas(x24, d24); - __ Fcvtas(x25, d25); - __ Fcvtas(x26, d26); - __ Fcvtas(x27, d27); - __ Fcvtas(x28, d28); - __ Fcvtas(x29, d29); - __ Fcvtas(x30, d30); - END(); - - if (CAN_RUN()) { - RUN(); - - ASSERT_EQUAL_64(1, x0); - ASSERT_EQUAL_64(1, x1); - ASSERT_EQUAL_64(3, x2); - ASSERT_EQUAL_64(0xfffffffd, x3); - ASSERT_EQUAL_64(0x7fffffff, x4); - ASSERT_EQUAL_64(0x80000000, x5); - ASSERT_EQUAL_64(0x7fffff80, x6); - ASSERT_EQUAL_64(0x80000080, x7); - ASSERT_EQUAL_64(1, x8); - ASSERT_EQUAL_64(1, x9); - ASSERT_EQUAL_64(3, x10); - ASSERT_EQUAL_64(0xfffffffd, x11); - ASSERT_EQUAL_64(0x7fffffff, x12); - ASSERT_EQUAL_64(0x80000000, x13); - ASSERT_EQUAL_64(0x7ffffffe, x14); - ASSERT_EQUAL_64(0x80000001, x15); - ASSERT_EQUAL_64(1, x17); - ASSERT_EQUAL_64(3, x18); - ASSERT_EQUAL_64(0xfffffffffffffffd, x19); - ASSERT_EQUAL_64(0x7fffffffffffffff, x20); - ASSERT_EQUAL_64(0x8000000000000000, x21); - ASSERT_EQUAL_64(0x7fffff8000000000, x22); - ASSERT_EQUAL_64(0x8000008000000000, x23); - ASSERT_EQUAL_64(1, x24); - ASSERT_EQUAL_64(3, x25); - ASSERT_EQUAL_64(0xfffffffffffffffd, x26); - ASSERT_EQUAL_64(0x7fffffffffffffff, x27); - ASSERT_EQUAL_64(0x8000000000000000, x28); - ASSERT_EQUAL_64(0x7ffffffffffffc00, x29); - ASSERT_EQUAL_64(0x8000000000000400, x30); - } + FcvtHelper(&MacroAssembler::Fcvtas, inputs_s, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtas, inputs_d, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtas, inputs_s, expected_x, kXRegSize); + 
FcvtHelper(&MacroAssembler::Fcvtas, inputs_d, expected_x, kXRegSize); } - TEST(fcvtau) { - SETUP_WITH_FEATURES(CPUFeatures::kFP); + float inputs_s[] = {1.0, 1.1, 2.5, -2.5, 0x100000000}; + double inputs_d[] = {1.0, 1.1, 2.5, -2.5, 0x100000000}; + uint64_t expected_w[] = {1, 1, 3, 0, 0xffffffff}; + uint64_t expected_x[] = {1, 1, 3, 0, 0x100000000}; - START(); - __ Fmov(s0, 1.0); - __ Fmov(s1, 1.1); - __ Fmov(s2, 2.5); - __ Fmov(s3, -2.5); - __ Fmov(s4, kFP32PositiveInfinity); - __ Fmov(s5, kFP32NegativeInfinity); - __ Fmov(s6, 0xffffff00); // Largest float < UINT32_MAX. - __ Fmov(d8, 1.0); - __ Fmov(d9, 1.1); - __ Fmov(d10, 2.5); - __ Fmov(d11, -2.5); - __ Fmov(d12, kFP64PositiveInfinity); - __ Fmov(d13, kFP64NegativeInfinity); - __ Fmov(d14, 0xfffffffe); - __ Fmov(s16, 1.0); - __ Fmov(s17, 1.1); - __ Fmov(s18, 2.5); - __ Fmov(s19, -2.5); - __ Fmov(s20, kFP32PositiveInfinity); - __ Fmov(s21, kFP32NegativeInfinity); - __ Fmov(s22, 0xffffff0000000000); // Largest float < UINT64_MAX. - __ Fmov(d24, 1.1); - __ Fmov(d25, 2.5); - __ Fmov(d26, -2.5); - __ Fmov(d27, kFP64PositiveInfinity); - __ Fmov(d28, kFP64NegativeInfinity); - __ Fmov(d29, 0xfffffffffffff800); // Largest double < UINT64_MAX. 
- __ Fmov(s30, 0x100000000); - - __ Fcvtau(w0, s0); - __ Fcvtau(w1, s1); - __ Fcvtau(w2, s2); - __ Fcvtau(w3, s3); - __ Fcvtau(w4, s4); - __ Fcvtau(w5, s5); - __ Fcvtau(w6, s6); - __ Fcvtau(w8, d8); - __ Fcvtau(w9, d9); - __ Fcvtau(w10, d10); - __ Fcvtau(w11, d11); - __ Fcvtau(w12, d12); - __ Fcvtau(w13, d13); - __ Fcvtau(w14, d14); - __ Fcvtau(w15, d15); - __ Fcvtau(x16, s16); - __ Fcvtau(x17, s17); - __ Fcvtau(x18, s18); - __ Fcvtau(x19, s19); - __ Fcvtau(x20, s20); - __ Fcvtau(x21, s21); - __ Fcvtau(x22, s22); - __ Fcvtau(x24, d24); - __ Fcvtau(x25, d25); - __ Fcvtau(x26, d26); - __ Fcvtau(x27, d27); - __ Fcvtau(x28, d28); - __ Fcvtau(x29, d29); - __ Fcvtau(w30, s30); - END(); - - if (CAN_RUN()) { - RUN(); - - ASSERT_EQUAL_64(1, x0); - ASSERT_EQUAL_64(1, x1); - ASSERT_EQUAL_64(3, x2); - ASSERT_EQUAL_64(0, x3); - ASSERT_EQUAL_64(0xffffffff, x4); - ASSERT_EQUAL_64(0, x5); - ASSERT_EQUAL_64(0xffffff00, x6); - ASSERT_EQUAL_64(1, x8); - ASSERT_EQUAL_64(1, x9); - ASSERT_EQUAL_64(3, x10); - ASSERT_EQUAL_64(0, x11); - ASSERT_EQUAL_64(0xffffffff, x12); - ASSERT_EQUAL_64(0, x13); - ASSERT_EQUAL_64(0xfffffffe, x14); - ASSERT_EQUAL_64(1, x16); - ASSERT_EQUAL_64(1, x17); - ASSERT_EQUAL_64(3, x18); - ASSERT_EQUAL_64(0, x19); - ASSERT_EQUAL_64(0xffffffffffffffff, x20); - ASSERT_EQUAL_64(0, x21); - ASSERT_EQUAL_64(0xffffff0000000000, x22); - ASSERT_EQUAL_64(1, x24); - ASSERT_EQUAL_64(3, x25); - ASSERT_EQUAL_64(0, x26); - ASSERT_EQUAL_64(0xffffffffffffffff, x27); - ASSERT_EQUAL_64(0, x28); - ASSERT_EQUAL_64(0xfffffffffffff800, x29); - ASSERT_EQUAL_64(0xffffffff, x30); - } + FcvtHelper(&MacroAssembler::Fcvtau, inputs_s, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtau, inputs_d, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtau, inputs_s, expected_x, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtau, inputs_d, expected_x, kXRegSize); } - TEST(fcvtms) { - SETUP_WITH_FEATURES(CPUFeatures::kFP); + float inputs_s[] = {1.0, 1.1, 1.5, -1.5}; + double inputs_d[] = 
{1.0, 1.1, 1.5, -1.5}; + uint64_t expected_w[] = {1, 1, 1, 0xfffffffe}; + uint64_t expected_x[] = {1, 1, 1, 0xfffffffffffffffe}; - START(); - __ Fmov(s0, 1.0); - __ Fmov(s1, 1.1); - __ Fmov(s2, 1.5); - __ Fmov(s3, -1.5); - __ Fmov(s4, kFP32PositiveInfinity); - __ Fmov(s5, kFP32NegativeInfinity); - __ Fmov(s6, 0x7fffff80); // Largest float < INT32_MAX. - __ Fneg(s7, s6); // Smallest float > INT32_MIN. - __ Fmov(d8, 1.0); - __ Fmov(d9, 1.1); - __ Fmov(d10, 1.5); - __ Fmov(d11, -1.5); - __ Fmov(d12, kFP64PositiveInfinity); - __ Fmov(d13, kFP64NegativeInfinity); - __ Fmov(d14, kWMaxInt - 1); - __ Fmov(d15, kWMinInt + 1); - __ Fmov(s17, 1.1); - __ Fmov(s18, 1.5); - __ Fmov(s19, -1.5); - __ Fmov(s20, kFP32PositiveInfinity); - __ Fmov(s21, kFP32NegativeInfinity); - __ Fmov(s22, 0x7fffff8000000000); // Largest float < INT64_MAX. - __ Fneg(s23, s22); // Smallest float > INT64_MIN. - __ Fmov(d24, 1.1); - __ Fmov(d25, 1.5); - __ Fmov(d26, -1.5); - __ Fmov(d27, kFP64PositiveInfinity); - __ Fmov(d28, kFP64NegativeInfinity); - __ Fmov(d29, 0x7ffffffffffffc00); // Largest double < INT64_MAX. - __ Fneg(d30, d29); // Smallest double > INT64_MIN. 
- - __ Fcvtms(w0, s0); - __ Fcvtms(w1, s1); - __ Fcvtms(w2, s2); - __ Fcvtms(w3, s3); - __ Fcvtms(w4, s4); - __ Fcvtms(w5, s5); - __ Fcvtms(w6, s6); - __ Fcvtms(w7, s7); - __ Fcvtms(w8, d8); - __ Fcvtms(w9, d9); - __ Fcvtms(w10, d10); - __ Fcvtms(w11, d11); - __ Fcvtms(w12, d12); - __ Fcvtms(w13, d13); - __ Fcvtms(w14, d14); - __ Fcvtms(w15, d15); - __ Fcvtms(x17, s17); - __ Fcvtms(x18, s18); - __ Fcvtms(x19, s19); - __ Fcvtms(x20, s20); - __ Fcvtms(x21, s21); - __ Fcvtms(x22, s22); - __ Fcvtms(x23, s23); - __ Fcvtms(x24, d24); - __ Fcvtms(x25, d25); - __ Fcvtms(x26, d26); - __ Fcvtms(x27, d27); - __ Fcvtms(x28, d28); - __ Fcvtms(x29, d29); - __ Fcvtms(x30, d30); - END(); - - if (CAN_RUN()) { - RUN(); - - ASSERT_EQUAL_64(1, x0); - ASSERT_EQUAL_64(1, x1); - ASSERT_EQUAL_64(1, x2); - ASSERT_EQUAL_64(0xfffffffe, x3); - ASSERT_EQUAL_64(0x7fffffff, x4); - ASSERT_EQUAL_64(0x80000000, x5); - ASSERT_EQUAL_64(0x7fffff80, x6); - ASSERT_EQUAL_64(0x80000080, x7); - ASSERT_EQUAL_64(1, x8); - ASSERT_EQUAL_64(1, x9); - ASSERT_EQUAL_64(1, x10); - ASSERT_EQUAL_64(0xfffffffe, x11); - ASSERT_EQUAL_64(0x7fffffff, x12); - ASSERT_EQUAL_64(0x80000000, x13); - ASSERT_EQUAL_64(0x7ffffffe, x14); - ASSERT_EQUAL_64(0x80000001, x15); - ASSERT_EQUAL_64(1, x17); - ASSERT_EQUAL_64(1, x18); - ASSERT_EQUAL_64(0xfffffffffffffffe, x19); - ASSERT_EQUAL_64(0x7fffffffffffffff, x20); - ASSERT_EQUAL_64(0x8000000000000000, x21); - ASSERT_EQUAL_64(0x7fffff8000000000, x22); - ASSERT_EQUAL_64(0x8000008000000000, x23); - ASSERT_EQUAL_64(1, x24); - ASSERT_EQUAL_64(1, x25); - ASSERT_EQUAL_64(0xfffffffffffffffe, x26); - ASSERT_EQUAL_64(0x7fffffffffffffff, x27); - ASSERT_EQUAL_64(0x8000000000000000, x28); - ASSERT_EQUAL_64(0x7ffffffffffffc00, x29); - ASSERT_EQUAL_64(0x8000000000000400, x30); - } + FcvtHelper(&MacroAssembler::Fcvtms, inputs_s, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtms, inputs_d, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtms, inputs_s, expected_x, kXRegSize); + 
FcvtHelper(&MacroAssembler::Fcvtms, inputs_d, expected_x, kXRegSize); } - TEST(fcvtmu) { - SETUP_WITH_FEATURES(CPUFeatures::kFP); + float inputs_s[] = {1.0, 1.1, 1.5, -1.5}; + double inputs_d[] = {1.0, 1.1, 1.5, -1.5}; + uint64_t expected_w[] = {1, 1, 1, 0}; + uint64_t expected_x[] = {1, 1, 1, 0}; - START(); - __ Fmov(s0, 1.0); - __ Fmov(s1, 1.1); - __ Fmov(s2, 1.5); - __ Fmov(s3, -1.5); - __ Fmov(s4, kFP32PositiveInfinity); - __ Fmov(s5, kFP32NegativeInfinity); - __ Fmov(s6, 0x7fffff80); // Largest float < INT32_MAX. - __ Fneg(s7, s6); // Smallest float > INT32_MIN. - __ Fmov(d8, 1.0); - __ Fmov(d9, 1.1); - __ Fmov(d10, 1.5); - __ Fmov(d11, -1.5); - __ Fmov(d12, kFP64PositiveInfinity); - __ Fmov(d13, kFP64NegativeInfinity); - __ Fmov(d14, kWMaxInt - 1); - __ Fmov(d15, kWMinInt + 1); - __ Fmov(s17, 1.1); - __ Fmov(s18, 1.5); - __ Fmov(s19, -1.5); - __ Fmov(s20, kFP32PositiveInfinity); - __ Fmov(s21, kFP32NegativeInfinity); - __ Fmov(s22, 0x7fffff8000000000); // Largest float < INT64_MAX. - __ Fneg(s23, s22); // Smallest float > INT64_MIN. - __ Fmov(d24, 1.1); - __ Fmov(d25, 1.5); - __ Fmov(d26, -1.5); - __ Fmov(d27, kFP64PositiveInfinity); - __ Fmov(d28, kFP64NegativeInfinity); - __ Fmov(d29, 0x7ffffffffffffc00); // Largest double < INT64_MAX. - __ Fneg(d30, d29); // Smallest double > INT64_MIN. 
- - __ Fcvtmu(w0, s0); - __ Fcvtmu(w1, s1); - __ Fcvtmu(w2, s2); - __ Fcvtmu(w3, s3); - __ Fcvtmu(w4, s4); - __ Fcvtmu(w5, s5); - __ Fcvtmu(w6, s6); - __ Fcvtmu(w7, s7); - __ Fcvtmu(w8, d8); - __ Fcvtmu(w9, d9); - __ Fcvtmu(w10, d10); - __ Fcvtmu(w11, d11); - __ Fcvtmu(w12, d12); - __ Fcvtmu(w13, d13); - __ Fcvtmu(w14, d14); - __ Fcvtmu(x17, s17); - __ Fcvtmu(x18, s18); - __ Fcvtmu(x19, s19); - __ Fcvtmu(x20, s20); - __ Fcvtmu(x21, s21); - __ Fcvtmu(x22, s22); - __ Fcvtmu(x23, s23); - __ Fcvtmu(x24, d24); - __ Fcvtmu(x25, d25); - __ Fcvtmu(x26, d26); - __ Fcvtmu(x27, d27); - __ Fcvtmu(x28, d28); - __ Fcvtmu(x29, d29); - __ Fcvtmu(x30, d30); - END(); - - if (CAN_RUN()) { - RUN(); - - ASSERT_EQUAL_64(1, x0); - ASSERT_EQUAL_64(1, x1); - ASSERT_EQUAL_64(1, x2); - ASSERT_EQUAL_64(0, x3); - ASSERT_EQUAL_64(0xffffffff, x4); - ASSERT_EQUAL_64(0, x5); - ASSERT_EQUAL_64(0x7fffff80, x6); - ASSERT_EQUAL_64(0, x7); - ASSERT_EQUAL_64(1, x8); - ASSERT_EQUAL_64(1, x9); - ASSERT_EQUAL_64(1, x10); - ASSERT_EQUAL_64(0, x11); - ASSERT_EQUAL_64(0xffffffff, x12); - ASSERT_EQUAL_64(0, x13); - ASSERT_EQUAL_64(0x7ffffffe, x14); - ASSERT_EQUAL_64(1, x17); - ASSERT_EQUAL_64(1, x18); - ASSERT_EQUAL_64(0, x19); - ASSERT_EQUAL_64(0xffffffffffffffff, x20); - ASSERT_EQUAL_64(0, x21); - ASSERT_EQUAL_64(0x7fffff8000000000, x22); - ASSERT_EQUAL_64(0, x23); - ASSERT_EQUAL_64(1, x24); - ASSERT_EQUAL_64(1, x25); - ASSERT_EQUAL_64(0, x26); - ASSERT_EQUAL_64(0xffffffffffffffff, x27); - ASSERT_EQUAL_64(0, x28); - ASSERT_EQUAL_64(0x7ffffffffffffc00, x29); - ASSERT_EQUAL_64(0, x30); - } + FcvtHelper(&MacroAssembler::Fcvtmu, inputs_s, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtmu, inputs_d, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtmu, inputs_s, expected_x, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtmu, inputs_d, expected_x, kXRegSize); } - TEST(fcvtns) { - SETUP_WITH_FEATURES(CPUFeatures::kFP); + float inputs_s[] = {1.0, 1.1, 1.5, -1.5}; + double inputs_d[] = {1.0, 1.1, 
1.5, -1.5}; + uint64_t expected_w[] = {1, 1, 2, 0xfffffffe}; + uint64_t expected_x[] = {1, 1, 2, 0xfffffffffffffffe}; - START(); - __ Fmov(s0, 1.0); - __ Fmov(s1, 1.1); - __ Fmov(s2, 1.5); - __ Fmov(s3, -1.5); - __ Fmov(s4, kFP32PositiveInfinity); - __ Fmov(s5, kFP32NegativeInfinity); - __ Fmov(s6, 0x7fffff80); // Largest float < INT32_MAX. - __ Fneg(s7, s6); // Smallest float > INT32_MIN. - __ Fmov(d8, 1.0); - __ Fmov(d9, 1.1); - __ Fmov(d10, 1.5); - __ Fmov(d11, -1.5); - __ Fmov(d12, kFP64PositiveInfinity); - __ Fmov(d13, kFP64NegativeInfinity); - __ Fmov(d14, kWMaxInt - 1); - __ Fmov(d15, kWMinInt + 1); - __ Fmov(s17, 1.1); - __ Fmov(s18, 1.5); - __ Fmov(s19, -1.5); - __ Fmov(s20, kFP32PositiveInfinity); - __ Fmov(s21, kFP32NegativeInfinity); - __ Fmov(s22, 0x7fffff8000000000); // Largest float < INT64_MAX. - __ Fneg(s23, s22); // Smallest float > INT64_MIN. - __ Fmov(d24, 1.1); - __ Fmov(d25, 1.5); - __ Fmov(d26, -1.5); - __ Fmov(d27, kFP64PositiveInfinity); - __ Fmov(d28, kFP64NegativeInfinity); - __ Fmov(d29, 0x7ffffffffffffc00); // Largest double < INT64_MAX. - __ Fneg(d30, d29); // Smallest double > INT64_MIN. 
- - __ Fcvtns(w0, s0); - __ Fcvtns(w1, s1); - __ Fcvtns(w2, s2); - __ Fcvtns(w3, s3); - __ Fcvtns(w4, s4); - __ Fcvtns(w5, s5); - __ Fcvtns(w6, s6); - __ Fcvtns(w7, s7); - __ Fcvtns(w8, d8); - __ Fcvtns(w9, d9); - __ Fcvtns(w10, d10); - __ Fcvtns(w11, d11); - __ Fcvtns(w12, d12); - __ Fcvtns(w13, d13); - __ Fcvtns(w14, d14); - __ Fcvtns(w15, d15); - __ Fcvtns(x17, s17); - __ Fcvtns(x18, s18); - __ Fcvtns(x19, s19); - __ Fcvtns(x20, s20); - __ Fcvtns(x21, s21); - __ Fcvtns(x22, s22); - __ Fcvtns(x23, s23); - __ Fcvtns(x24, d24); - __ Fcvtns(x25, d25); - __ Fcvtns(x26, d26); - __ Fcvtns(x27, d27); - __ Fcvtns(x28, d28); - __ Fcvtns(x29, d29); - __ Fcvtns(x30, d30); - END(); - - if (CAN_RUN()) { - RUN(); - - ASSERT_EQUAL_64(1, x0); - ASSERT_EQUAL_64(1, x1); - ASSERT_EQUAL_64(2, x2); - ASSERT_EQUAL_64(0xfffffffe, x3); - ASSERT_EQUAL_64(0x7fffffff, x4); - ASSERT_EQUAL_64(0x80000000, x5); - ASSERT_EQUAL_64(0x7fffff80, x6); - ASSERT_EQUAL_64(0x80000080, x7); - ASSERT_EQUAL_64(1, x8); - ASSERT_EQUAL_64(1, x9); - ASSERT_EQUAL_64(2, x10); - ASSERT_EQUAL_64(0xfffffffe, x11); - ASSERT_EQUAL_64(0x7fffffff, x12); - ASSERT_EQUAL_64(0x80000000, x13); - ASSERT_EQUAL_64(0x7ffffffe, x14); - ASSERT_EQUAL_64(0x80000001, x15); - ASSERT_EQUAL_64(1, x17); - ASSERT_EQUAL_64(2, x18); - ASSERT_EQUAL_64(0xfffffffffffffffe, x19); - ASSERT_EQUAL_64(0x7fffffffffffffff, x20); - ASSERT_EQUAL_64(0x8000000000000000, x21); - ASSERT_EQUAL_64(0x7fffff8000000000, x22); - ASSERT_EQUAL_64(0x8000008000000000, x23); - ASSERT_EQUAL_64(1, x24); - ASSERT_EQUAL_64(2, x25); - ASSERT_EQUAL_64(0xfffffffffffffffe, x26); - ASSERT_EQUAL_64(0x7fffffffffffffff, x27); - ASSERT_EQUAL_64(0x8000000000000000, x28); - ASSERT_EQUAL_64(0x7ffffffffffffc00, x29); - ASSERT_EQUAL_64(0x8000000000000400, x30); - } + FcvtHelper(&MacroAssembler::Fcvtns, inputs_s, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtns, inputs_d, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtns, inputs_s, expected_x, kXRegSize); + 
FcvtHelper(&MacroAssembler::Fcvtns, inputs_d, expected_x, kXRegSize); } - TEST(fcvtnu) { - SETUP_WITH_FEATURES(CPUFeatures::kFP); + float inputs_s[] = {1.0, 1.1, 1.5, -1.5, 0x100000000}; + double inputs_d[] = {1.0, 1.1, 1.5, -1.5, 0x100000000}; + uint64_t expected_w[] = {1, 1, 2, 0, 0xffffffff}; + uint64_t expected_x[] = {1, 1, 2, 0, 0x100000000}; - START(); - __ Fmov(s0, 1.0); - __ Fmov(s1, 1.1); - __ Fmov(s2, 1.5); - __ Fmov(s3, -1.5); - __ Fmov(s4, kFP32PositiveInfinity); - __ Fmov(s5, kFP32NegativeInfinity); - __ Fmov(s6, 0xffffff00); // Largest float < UINT32_MAX. - __ Fmov(d8, 1.0); - __ Fmov(d9, 1.1); - __ Fmov(d10, 1.5); - __ Fmov(d11, -1.5); - __ Fmov(d12, kFP64PositiveInfinity); - __ Fmov(d13, kFP64NegativeInfinity); - __ Fmov(d14, 0xfffffffe); - __ Fmov(s16, 1.0); - __ Fmov(s17, 1.1); - __ Fmov(s18, 1.5); - __ Fmov(s19, -1.5); - __ Fmov(s20, kFP32PositiveInfinity); - __ Fmov(s21, kFP32NegativeInfinity); - __ Fmov(s22, 0xffffff0000000000); // Largest float < UINT64_MAX. - __ Fmov(d24, 1.1); - __ Fmov(d25, 1.5); - __ Fmov(d26, -1.5); - __ Fmov(d27, kFP64PositiveInfinity); - __ Fmov(d28, kFP64NegativeInfinity); - __ Fmov(d29, 0xfffffffffffff800); // Largest double < UINT64_MAX. 
- __ Fmov(s30, 0x100000000); - - __ Fcvtnu(w0, s0); - __ Fcvtnu(w1, s1); - __ Fcvtnu(w2, s2); - __ Fcvtnu(w3, s3); - __ Fcvtnu(w4, s4); - __ Fcvtnu(w5, s5); - __ Fcvtnu(w6, s6); - __ Fcvtnu(w8, d8); - __ Fcvtnu(w9, d9); - __ Fcvtnu(w10, d10); - __ Fcvtnu(w11, d11); - __ Fcvtnu(w12, d12); - __ Fcvtnu(w13, d13); - __ Fcvtnu(w14, d14); - __ Fcvtnu(w15, d15); - __ Fcvtnu(x16, s16); - __ Fcvtnu(x17, s17); - __ Fcvtnu(x18, s18); - __ Fcvtnu(x19, s19); - __ Fcvtnu(x20, s20); - __ Fcvtnu(x21, s21); - __ Fcvtnu(x22, s22); - __ Fcvtnu(x24, d24); - __ Fcvtnu(x25, d25); - __ Fcvtnu(x26, d26); - __ Fcvtnu(x27, d27); - __ Fcvtnu(x28, d28); - __ Fcvtnu(x29, d29); - __ Fcvtnu(w30, s30); - END(); - - if (CAN_RUN()) { - RUN(); - - ASSERT_EQUAL_64(1, x0); - ASSERT_EQUAL_64(1, x1); - ASSERT_EQUAL_64(2, x2); - ASSERT_EQUAL_64(0, x3); - ASSERT_EQUAL_64(0xffffffff, x4); - ASSERT_EQUAL_64(0, x5); - ASSERT_EQUAL_64(0xffffff00, x6); - ASSERT_EQUAL_64(1, x8); - ASSERT_EQUAL_64(1, x9); - ASSERT_EQUAL_64(2, x10); - ASSERT_EQUAL_64(0, x11); - ASSERT_EQUAL_64(0xffffffff, x12); - ASSERT_EQUAL_64(0, x13); - ASSERT_EQUAL_64(0xfffffffe, x14); - ASSERT_EQUAL_64(1, x16); - ASSERT_EQUAL_64(1, x17); - ASSERT_EQUAL_64(2, x18); - ASSERT_EQUAL_64(0, x19); - ASSERT_EQUAL_64(0xffffffffffffffff, x20); - ASSERT_EQUAL_64(0, x21); - ASSERT_EQUAL_64(0xffffff0000000000, x22); - ASSERT_EQUAL_64(1, x24); - ASSERT_EQUAL_64(2, x25); - ASSERT_EQUAL_64(0, x26); - ASSERT_EQUAL_64(0xffffffffffffffff, x27); - ASSERT_EQUAL_64(0, x28); - ASSERT_EQUAL_64(0xfffffffffffff800, x29); - ASSERT_EQUAL_64(0xffffffff, x30); - } + FcvtHelper(&MacroAssembler::Fcvtnu, inputs_s, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtnu, inputs_d, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtnu, inputs_s, expected_x, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtnu, inputs_d, expected_x, kXRegSize); } - TEST(fcvtzs) { - SETUP_WITH_FEATURES(CPUFeatures::kFP); + float inputs_s[] = {1.0, 1.1, 1.5, -1.5}; + double inputs_d[] = 
{1.0, 1.1, 1.5, -1.5}; + uint64_t expected_w[] = {1, 1, 1, 0xffffffff}; + uint64_t expected_x[] = {1, 1, 1, 0xffffffffffffffff}; - START(); - __ Fmov(s0, 1.0); - __ Fmov(s1, 1.1); - __ Fmov(s2, 1.5); - __ Fmov(s3, -1.5); - __ Fmov(s4, kFP32PositiveInfinity); - __ Fmov(s5, kFP32NegativeInfinity); - __ Fmov(s6, 0x7fffff80); // Largest float < INT32_MAX. - __ Fneg(s7, s6); // Smallest float > INT32_MIN. - __ Fmov(d8, 1.0); - __ Fmov(d9, 1.1); - __ Fmov(d10, 1.5); - __ Fmov(d11, -1.5); - __ Fmov(d12, kFP64PositiveInfinity); - __ Fmov(d13, kFP64NegativeInfinity); - __ Fmov(d14, kWMaxInt - 1); - __ Fmov(d15, kWMinInt + 1); - __ Fmov(s17, 1.1); - __ Fmov(s18, 1.5); - __ Fmov(s19, -1.5); - __ Fmov(s20, kFP32PositiveInfinity); - __ Fmov(s21, kFP32NegativeInfinity); - __ Fmov(s22, 0x7fffff8000000000); // Largest float < INT64_MAX. - __ Fneg(s23, s22); // Smallest float > INT64_MIN. - __ Fmov(d24, 1.1); - __ Fmov(d25, 1.5); - __ Fmov(d26, -1.5); - __ Fmov(d27, kFP64PositiveInfinity); - __ Fmov(d28, kFP64NegativeInfinity); - __ Fmov(d29, 0x7ffffffffffffc00); // Largest double < INT64_MAX. - __ Fneg(d30, d29); // Smallest double > INT64_MIN. 
+ FcvtHelper(&MacroAssembler::Fcvtzs, inputs_s, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtzs, inputs_d, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtzs, inputs_s, expected_x, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtzs, inputs_d, expected_x, kXRegSize); +} - __ Fcvtzs(w0, s0); - __ Fcvtzs(w1, s1); - __ Fcvtzs(w2, s2); - __ Fcvtzs(w3, s3); - __ Fcvtzs(w4, s4); - __ Fcvtzs(w5, s5); - __ Fcvtzs(w6, s6); - __ Fcvtzs(w7, s7); - __ Fcvtzs(w8, d8); - __ Fcvtzs(w9, d9); - __ Fcvtzs(w10, d10); - __ Fcvtzs(w11, d11); - __ Fcvtzs(w12, d12); - __ Fcvtzs(w13, d13); - __ Fcvtzs(w14, d14); - __ Fcvtzs(w15, d15); - __ Fcvtzs(x17, s17); - __ Fcvtzs(x18, s18); - __ Fcvtzs(x19, s19); - __ Fcvtzs(x20, s20); - __ Fcvtzs(x21, s21); - __ Fcvtzs(x22, s22); - __ Fcvtzs(x23, s23); - __ Fcvtzs(x24, d24); - __ Fcvtzs(x25, d25); - __ Fcvtzs(x26, d26); - __ Fcvtzs(x27, d27); - __ Fcvtzs(x28, d28); - __ Fcvtzs(x29, d29); - __ Fcvtzs(x30, d30); - END(); +TEST(fcvtzu) { + float inputs_s[] = {1.0, 1.1, 1.5, -1.5}; + double inputs_d[] = {1.0, 1.1, 1.5, -1.5}; + uint64_t expected_w[] = {1, 1, 1, 0}; + uint64_t expected_x[] = {1, 1, 1, 0}; - if (CAN_RUN()) { - RUN(); - - ASSERT_EQUAL_64(1, x0); - ASSERT_EQUAL_64(1, x1); - ASSERT_EQUAL_64(1, x2); - ASSERT_EQUAL_64(0xffffffff, x3); - ASSERT_EQUAL_64(0x7fffffff, x4); - ASSERT_EQUAL_64(0x80000000, x5); - ASSERT_EQUAL_64(0x7fffff80, x6); - ASSERT_EQUAL_64(0x80000080, x7); - ASSERT_EQUAL_64(1, x8); - ASSERT_EQUAL_64(1, x9); - ASSERT_EQUAL_64(1, x10); - ASSERT_EQUAL_64(0xffffffff, x11); - ASSERT_EQUAL_64(0x7fffffff, x12); - ASSERT_EQUAL_64(0x80000000, x13); - ASSERT_EQUAL_64(0x7ffffffe, x14); - ASSERT_EQUAL_64(0x80000001, x15); - ASSERT_EQUAL_64(1, x17); - ASSERT_EQUAL_64(1, x18); - ASSERT_EQUAL_64(0xffffffffffffffff, x19); - ASSERT_EQUAL_64(0x7fffffffffffffff, x20); - ASSERT_EQUAL_64(0x8000000000000000, x21); - ASSERT_EQUAL_64(0x7fffff8000000000, x22); - ASSERT_EQUAL_64(0x8000008000000000, x23); - ASSERT_EQUAL_64(1, x24); - 
ASSERT_EQUAL_64(1, x25); - ASSERT_EQUAL_64(0xffffffffffffffff, x26); - ASSERT_EQUAL_64(0x7fffffffffffffff, x27); - ASSERT_EQUAL_64(0x8000000000000000, x28); - ASSERT_EQUAL_64(0x7ffffffffffffc00, x29); - ASSERT_EQUAL_64(0x8000000000000400, x30); - } + FcvtHelper(&MacroAssembler::Fcvtzu, inputs_s, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtzu, inputs_d, expected_w, kWRegSize); + FcvtHelper(&MacroAssembler::Fcvtzu, inputs_s, expected_x, kXRegSize); + FcvtHelper(&MacroAssembler::Fcvtzu, inputs_d, expected_x, kXRegSize); } void FjcvtzsHelper(uint64_t value, uint64_t expected, uint32_t expected_z) { @@ -4489,107 +4049,6 @@ TEST(fjcvtzs) { } } -TEST(fcvtzu) { - SETUP_WITH_FEATURES(CPUFeatures::kFP); - - START(); - __ Fmov(s0, 1.0); - __ Fmov(s1, 1.1); - __ Fmov(s2, 1.5); - __ Fmov(s3, -1.5); - __ Fmov(s4, kFP32PositiveInfinity); - __ Fmov(s5, kFP32NegativeInfinity); - __ Fmov(s6, 0x7fffff80); // Largest float < INT32_MAX. - __ Fneg(s7, s6); // Smallest float > INT32_MIN. - __ Fmov(d8, 1.0); - __ Fmov(d9, 1.1); - __ Fmov(d10, 1.5); - __ Fmov(d11, -1.5); - __ Fmov(d12, kFP64PositiveInfinity); - __ Fmov(d13, kFP64NegativeInfinity); - __ Fmov(d14, kWMaxInt - 1); - __ Fmov(d15, kWMinInt + 1); - __ Fmov(s17, 1.1); - __ Fmov(s18, 1.5); - __ Fmov(s19, -1.5); - __ Fmov(s20, kFP32PositiveInfinity); - __ Fmov(s21, kFP32NegativeInfinity); - __ Fmov(s22, 0x7fffff8000000000); // Largest float < INT64_MAX. - __ Fneg(s23, s22); // Smallest float > INT64_MIN. - __ Fmov(d24, 1.1); - __ Fmov(d25, 1.5); - __ Fmov(d26, -1.5); - __ Fmov(d27, kFP64PositiveInfinity); - __ Fmov(d28, kFP64NegativeInfinity); - __ Fmov(d29, 0x7ffffffffffffc00); // Largest double < INT64_MAX. - __ Fneg(d30, d29); // Smallest double > INT64_MIN. 
- - __ Fcvtzu(w0, s0); - __ Fcvtzu(w1, s1); - __ Fcvtzu(w2, s2); - __ Fcvtzu(w3, s3); - __ Fcvtzu(w4, s4); - __ Fcvtzu(w5, s5); - __ Fcvtzu(w6, s6); - __ Fcvtzu(w7, s7); - __ Fcvtzu(w8, d8); - __ Fcvtzu(w9, d9); - __ Fcvtzu(w10, d10); - __ Fcvtzu(w11, d11); - __ Fcvtzu(w12, d12); - __ Fcvtzu(w13, d13); - __ Fcvtzu(w14, d14); - __ Fcvtzu(x17, s17); - __ Fcvtzu(x18, s18); - __ Fcvtzu(x19, s19); - __ Fcvtzu(x20, s20); - __ Fcvtzu(x21, s21); - __ Fcvtzu(x22, s22); - __ Fcvtzu(x23, s23); - __ Fcvtzu(x24, d24); - __ Fcvtzu(x25, d25); - __ Fcvtzu(x26, d26); - __ Fcvtzu(x27, d27); - __ Fcvtzu(x28, d28); - __ Fcvtzu(x29, d29); - __ Fcvtzu(x30, d30); - END(); - - if (CAN_RUN()) { - RUN(); - - ASSERT_EQUAL_64(1, x0); - ASSERT_EQUAL_64(1, x1); - ASSERT_EQUAL_64(1, x2); - ASSERT_EQUAL_64(0, x3); - ASSERT_EQUAL_64(0xffffffff, x4); - ASSERT_EQUAL_64(0, x5); - ASSERT_EQUAL_64(0x7fffff80, x6); - ASSERT_EQUAL_64(0, x7); - ASSERT_EQUAL_64(1, x8); - ASSERT_EQUAL_64(1, x9); - ASSERT_EQUAL_64(1, x10); - ASSERT_EQUAL_64(0, x11); - ASSERT_EQUAL_64(0xffffffff, x12); - ASSERT_EQUAL_64(0, x13); - ASSERT_EQUAL_64(0x7ffffffe, x14); - ASSERT_EQUAL_64(1, x17); - ASSERT_EQUAL_64(1, x18); - ASSERT_EQUAL_64(0, x19); - ASSERT_EQUAL_64(0xffffffffffffffff, x20); - ASSERT_EQUAL_64(0, x21); - ASSERT_EQUAL_64(0x7fffff8000000000, x22); - ASSERT_EQUAL_64(0, x23); - ASSERT_EQUAL_64(1, x24); - ASSERT_EQUAL_64(1, x25); - ASSERT_EQUAL_64(0, x26); - ASSERT_EQUAL_64(0xffffffffffffffff, x27); - ASSERT_EQUAL_64(0, x28); - ASSERT_EQUAL_64(0x7ffffffffffffc00, x29); - ASSERT_EQUAL_64(0, x30); - } -} - // Test that scvtf and ucvtf can convert the 64-bit input into the expected // value. All possible values of 'fbits' are tested. The expected value is // modified accordingly in each case. 
diff --git a/test/aarch64/test-assembler-neon-aarch64.cc b/test/aarch64/test-assembler-neon-aarch64.cc index 1682d13e..2155db48 100644 --- a/test/aarch64/test-assembler-neon-aarch64.cc +++ b/test/aarch64/test-assembler-neon-aarch64.cc @@ -10975,8 +10975,26 @@ TEST(neon_usdot_element) { } } +TEST(neon_pmull_regression_test) { + SETUP_WITH_FEATURES(CPUFeatures::kNEON); + + START(); + __ Movi(v0.V2D(), 0xdecafc0ffee); + __ Pmull(v0.V8H(), v0.V8B(), v0.V8B()); + + __ Movi(v1.V2D(), 0xaaaaaaaa55555555); + __ Pmull2(v1.V8H(), v1.V16B(), v1.V16B()); + END(); + + if (CAN_RUN()) { + RUN(); + ASSERT_EQUAL_128(0x0000000000515450, 0x4455500055555454, q0); + ASSERT_EQUAL_128(0x4444444444444444, 0x1111111111111111, q1); + } +} + TEST(zero_high_b) { - SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON, CPUFeatures::kRDM); + SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON); START(); __ Mov(x0, 0x55aa42ffaa42ff55); @@ -10996,7 +11014,7 @@ TEST(zero_high_b) { __ Ror(x0, x0, 8); { - ExactAssemblyScope scope(&masm, 81 * kInstructionSize); + ExactAssemblyScope scope(&masm, 75 * kInstructionSize); __ movi(q9.V16B(), 0x55); __ dci(0x5e010409); // mov b9, v0.b[0] __ orr(q30.V16B(), q30.V16B(), q9.V16B()); @@ -11013,14 +11031,6 @@ TEST(zero_high_b) { __ dci(0x7e207809); // sqneg b9, b0 __ orr(q30.V16B(), q30.V16B(), q9.V16B()); - __ movi(q9.V16B(), 0x55); - __ dci(0x7e008429); // sqrdmlah b9, b1, b0 - __ orr(q30.V16B(), q30.V16B(), q9.V16B()); - - __ movi(q9.V16B(), 0x55); - __ dci(0x7e008c29); // sqrdmlsh b9, b1, b0 - __ orr(q30.V16B(), q30.V16B(), q9.V16B()); - __ movi(q9.V16B(), 0x55); __ dci(0x5e205c29); // sqrshl b9, b1, b0 __ orr(q30.V16B(), q30.V16B(), q9.V16B()); @@ -11821,10 +11831,7 @@ TEST(zero_high_s) { } TEST(zero_high_d) { - SETUP_WITH_FEATURES(CPUFeatures::kSVE, - CPUFeatures::kNEON, - CPUFeatures::kFP, - CPUFeatures::kRDM); + SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON, CPUFeatures::kFP); START(); __ Mov(x0, 0x55aa42ffaa42ff55); @@ -11844,7 
+11851,7 @@ TEST(zero_high_d) { __ Ror(x0, x0, 8); { - ExactAssemblyScope scope(&masm, 291 * kInstructionSize); + ExactAssemblyScope scope(&masm, 285 * kInstructionSize); __ movi(q9.V16B(), 0x55); __ dci(0x5ee0b809); // abs d9, d0 __ orr(q30.V16B(), q30.V16B(), q9.V16B()); @@ -12113,14 +12120,6 @@ TEST(zero_high_d) { __ dci(0x7ee07809); // sqneg d9, d0 __ orr(q30.V16B(), q30.V16B(), q9.V16B()); - __ movi(q9.V16B(), 0x55); - __ dci(0x7ec08429); // sqrdmlah d9, d1, d0 - __ orr(q30.V16B(), q30.V16B(), q9.V16B()); - - __ movi(q9.V16B(), 0x55); - __ dci(0x7ec08c29); // sqrdmlsh d9, d1, d0 - __ orr(q30.V16B(), q30.V16B(), q9.V16B()); - __ movi(q9.V16B(), 0x55); __ dci(0x5ee05c29); // sqrshl d9, d1, d0 __ orr(q30.V16B(), q30.V16B(), q9.V16B()); diff --git a/test/aarch64/test-assembler-sve-aarch64.cc b/test/aarch64/test-assembler-sve-aarch64.cc index f16ab336..cc49d5b1 100644 --- a/test/aarch64/test-assembler-sve-aarch64.cc +++ b/test/aarch64/test-assembler-sve-aarch64.cc @@ -19729,6 +19729,709 @@ TEST_SVE(sudot_usdot) { } } +TEST_SVE(neon_ins_zero_high_regression_test) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE); + + START(); + __ Movi(v0.V2D(), 0x0f0e0d0c0b0a0908, 0x0706050403020100); + + // Check that both forms of ins zero bits + __ Index(z1.VnB(), 0, 1); + __ Ins(v1.V16B(), 0, wzr); + __ Index(z2.VnB(), 0, 1); + __ Ins(v2.V16B(), 3, v2.V16B(), 3); + END(); + + if (CAN_RUN()) { + RUN(); + ASSERT_EQUAL_SVE(z0, z1); + ASSERT_EQUAL_SVE(z0, z2); + } +} + +TEST_SVE(neon_fcvt_zero_high_regression_test) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kFP, + CPUFeatures::kNEON, + CPUFeatures::kSVE); + + START(); + __ Mov(z1.VnD(), 0); + __ Mov(z2.VnD(), 0); + __ Mov(z3.VnD(), 0); + __ Mov(z4.VnD(), 0); + __ Mov(z5.VnD(), 0); + __ Mov(z6.VnD(), 0); + __ Mov(z10.VnD(), 0); + + Label done; + // Skip calculations for VL128. 
+ __ Rdvl(x0, 1); + __ Cmp(x0, 16); + __ B(eq, &done); + + __ Movi(v0.V2D(), 0x3ff000003f800000); + __ Index(z1.VnB(), 0, 1); + __ Index(z2.VnB(), 0, 1); + __ Index(z3.VnB(), 0, 1); + __ Index(z4.VnB(), 0, 1); + __ Index(z5.VnB(), 0, 1); + __ Index(z6.VnB(), 0, 1); + + // Test zeroing bits for fcvtl, fcvtn and fcvtxn. + __ Fcvtl(v1.V2D(), v0.V2S()); + __ Fcvtl2(v2.V2D(), v0.V4S()); + + __ Fcvtn(v3.V2S(), v0.V2D()); + __ Fcvtn2(v4.V4S(), v0.V2D()); + + __ Fcvtxn(v5.V2S(), v0.V2D()); + __ Fcvtxn2(v6.V4S(), v0.V2D()); + + // Set the expected non-zero bits to zero. + __ Ext(z1.VnB(), z1.VnB(), z10.VnB(), kDRegSizeInBytes * 2); + __ Ext(z2.VnB(), z2.VnB(), z10.VnB(), kDRegSizeInBytes * 2); + __ Ext(z3.VnB(), z3.VnB(), z10.VnB(), kSRegSizeInBytes * 2); + __ Ext(z4.VnB(), z4.VnB(), z10.VnB(), kSRegSizeInBytes * 4); + __ Ext(z5.VnB(), z5.VnB(), z10.VnB(), kSRegSizeInBytes * 2); + __ Ext(z6.VnB(), z6.VnB(), z10.VnB(), kSRegSizeInBytes * 4); + + __ Bind(&done); + END(); + + if (CAN_RUN()) { + RUN(); + ASSERT_EQUAL_SVE(z10, z1); + ASSERT_EQUAL_SVE(z10, z2); + ASSERT_EQUAL_SVE(z10, z3); + ASSERT_EQUAL_SVE(z10, z4); + ASSERT_EQUAL_SVE(z10, z5); + ASSERT_EQUAL_SVE(z10, z6); + } +} + +#define TEST_ZEROING(INST) \ + __ Index(z0.VnB(), 0, 1); \ + __ INST; \ + __ Orr(z10.VnB(), z10.VnB(), z0.VnB()); + +TEST_SVE(neon_zero_high) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kFP, + CPUFeatures::kNEON, + CPUFeatures::kNEONHalf, + CPUFeatures::kSVE, + CPUFeatures::kFcma, + CPUFeatures::kFHM, + CPUFeatures::kFrintToFixedSizedInt, + CPUFeatures::kDotProduct, + CPUFeatures::kRDM, + CPUFeatures::kI8MM); + + START(); + __ Mov(z10.VnD(), 0); // Initialise cumulative result register. 
+ + TEST_ZEROING(Abs(v0.V16B(), v0.V16B())); + TEST_ZEROING(Abs(v0.V2S(), v0.V2S())); + TEST_ZEROING(Add(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Add(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Addhn2(v0.V16B(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Addhn(v0.V4H(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Addp(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Addp(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(And(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Bic(v0.V8H(), 0, 0)); + TEST_ZEROING(Bic(v0.V2S(), 255, 0)); + TEST_ZEROING(Bic(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Bif(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Bit(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Bsl(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Cls(v0.V16B(), v0.V16B())); + TEST_ZEROING(Cls(v0.V2S(), v0.V2S())); + TEST_ZEROING(Clz(v0.V16B(), v0.V16B())); + TEST_ZEROING(Clz(v0.V2S(), v0.V2S())); + TEST_ZEROING(Cmeq(v0.V16B(), v0.V16B(), 0)); + TEST_ZEROING(Cmeq(v0.V2S(), v0.V2S(), 0)); + TEST_ZEROING(Cmeq(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Cmeq(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Cmge(v0.V16B(), v0.V16B(), 0)); + TEST_ZEROING(Cmge(v0.V2S(), v0.V2S(), 0)); + TEST_ZEROING(Cmge(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Cmge(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Cmgt(v0.V16B(), v0.V16B(), 0)); + TEST_ZEROING(Cmgt(v0.V2S(), v0.V2S(), 0)); + TEST_ZEROING(Cmgt(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Cmgt(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Cmhi(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Cmhi(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Cmhs(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Cmhs(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Cmle(v0.V16B(), v0.V16B(), 0)); + TEST_ZEROING(Cmle(v0.V2S(), v0.V2S(), 0)); + TEST_ZEROING(Cmlt(v0.V16B(), v0.V16B(), 0)); + TEST_ZEROING(Cmlt(v0.V2S(), v0.V2S(), 0)); + TEST_ZEROING(Cmtst(v0.V16B(), v0.V16B(), v0.V16B())); + 
TEST_ZEROING(Cmtst(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Cnt(v0.V16B(), v0.V16B())); + TEST_ZEROING(Dup(v0.V2S(), w0)); + TEST_ZEROING(Dup(v0.V8B(), w0)); + TEST_ZEROING(Dup(v0.V2S(), v0.S(), 0)); + TEST_ZEROING(Dup(v0.V8B(), v0.B(), 0)); + TEST_ZEROING(Eor(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Ext(v0.V16B(), v0.V16B(), v0.V16B(), 0)); + TEST_ZEROING(Ext(v0.V8B(), v0.V8B(), v0.V8B(), 4)); + TEST_ZEROING(Fabd(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Fabd(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fabs(v0.V4S(), v0.V4S())); + TEST_ZEROING(Fabs(v0.V8H(), v0.V8H())); + TEST_ZEROING(Facge(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Facge(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Facgt(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Facgt(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fadd(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Fadd(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Faddp(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Faddp(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fcadd(v0.V2S(), v0.V2S(), v0.V2S(), 90)); + TEST_ZEROING(Fcadd(v0.V8H(), v0.V8H(), v0.V8H(), 90)); + TEST_ZEROING(Fcmeq(v0.V2S(), v0.V2S(), 0)); + TEST_ZEROING(Fcmeq(v0.V8H(), v0.V8H(), 0)); + TEST_ZEROING(Fcmeq(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Fcmeq(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fcmge(v0.V2S(), v0.V2S(), 0)); + TEST_ZEROING(Fcmge(v0.V8H(), v0.V8H(), 0)); + TEST_ZEROING(Fcmge(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Fcmge(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fcmgt(v0.V2S(), v0.V2S(), 0)); + TEST_ZEROING(Fcmgt(v0.V8H(), v0.V8H(), 0)); + TEST_ZEROING(Fcmgt(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Fcmgt(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fcmla(v0.V4H(), v0.V4H(), v0.H(), 0, 0)); + TEST_ZEROING(Fcmla(v0.V4S(), v0.V4S(), v0.S(), 0, 0)); + TEST_ZEROING(Fcmla(v0.V4S(), v0.V4S(), v0.V4S(), 0)); + TEST_ZEROING(Fcmla(v0.V4H(), v0.V4H(), v0.V4H(), 0)); + TEST_ZEROING(Fcmle(v0.V2S(), v0.V2S(), 
0)); + TEST_ZEROING(Fcmle(v0.V8H(), v0.V8H(), 0)); + TEST_ZEROING(Fcmlt(v0.V2S(), v0.V2S(), 0)); + TEST_ZEROING(Fcmlt(v0.V8H(), v0.V8H(), 0)); + TEST_ZEROING(Fcvtas(v0.V2S(), v0.V2S())); + TEST_ZEROING(Fcvtas(v0.V8H(), v0.V8H())); + TEST_ZEROING(Fcvtau(v0.V2S(), v0.V2S())); + TEST_ZEROING(Fcvtau(v0.V8H(), v0.V8H())); + TEST_ZEROING(Fcvtl2(v0.V4S(), v0.V8H())); + TEST_ZEROING(Fcvtl(v0.V2D(), v0.V2S())); + TEST_ZEROING(Fcvtms(v0.V2S(), v0.V2S())); + TEST_ZEROING(Fcvtms(v0.V8H(), v0.V8H())); + TEST_ZEROING(Fcvtmu(v0.V2S(), v0.V2S())); + TEST_ZEROING(Fcvtmu(v0.V8H(), v0.V8H())); + TEST_ZEROING(Fcvtn2(v0.V8H(), v0.V4S())); + TEST_ZEROING(Fcvtn(v0.V2S(), v0.V2D())); + TEST_ZEROING(Fcvtns(v0.V2S(), v0.V2S())); + TEST_ZEROING(Fcvtns(v0.V8H(), v0.V8H())); + TEST_ZEROING(Fcvtnu(v0.V2S(), v0.V2S())); + TEST_ZEROING(Fcvtnu(v0.V8H(), v0.V8H())); + TEST_ZEROING(Fcvtps(v0.V2S(), v0.V2S())); + TEST_ZEROING(Fcvtps(v0.V8H(), v0.V8H())); + TEST_ZEROING(Fcvtpu(v0.V2S(), v0.V2S())); + TEST_ZEROING(Fcvtpu(v0.V8H(), v0.V8H())); + TEST_ZEROING(Fcvtxn(v0.V2S(), v0.V2D())); + TEST_ZEROING(Fcvtxn2(v0.V4S(), v0.V2D())); + TEST_ZEROING(Fcvtzs(v0.V2S(), v0.V2S())); + TEST_ZEROING(Fcvtzs(v0.V8H(), v0.V8H())); + TEST_ZEROING(Fcvtzs(v0.V2D(), v0.V2D(), 8)); + TEST_ZEROING(Fcvtzu(v0.V2S(), v0.V2S())); + TEST_ZEROING(Fcvtzu(v0.V4H(), v0.V4H())); + TEST_ZEROING(Fcvtzu(v0.V2D(), v0.V2D(), 8)); + TEST_ZEROING(Fdiv(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Fdiv(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fmax(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Fmax(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fmaxnm(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Fmaxnm(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fmaxnmp(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Fmaxnmp(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fmaxp(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Fmaxp(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fmin(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Fmin(v0.V8H(), 
v0.V8H(), v0.V8H())); + TEST_ZEROING(Fminnm(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Fminnm(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fminnmp(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Fminnmp(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fminp(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Fminp(v0.V8H(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Fmla(v0.V4S(), v0.V4S(), v0.S(), 0)); + TEST_ZEROING(Fmla(v0.V4H(), v0.V4H(), v0.H(), 2)); + TEST_ZEROING(Fmla(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Fmla(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Fmlal2(v0.V4S(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Fmlal2(v0.V2S(), v0.V2H(), v0.H(), 2)); + TEST_ZEROING(Fmlal2(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Fmlal(v0.V4S(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Fmlal(v0.V2S(), v0.V2H(), v0.H(), 2)); + TEST_ZEROING(Fmlal(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Fmls(v0.V4S(), v0.V4S(), v0.S(), 0)); + TEST_ZEROING(Fmls(v0.V4H(), v0.V4H(), v0.H(), 2)); + TEST_ZEROING(Fmls(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Fmls(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Fmlsl2(v0.V4S(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Fmlsl2(v0.V2S(), v0.V2H(), v0.H(), 2)); + TEST_ZEROING(Fmlsl2(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Fmlsl(v0.V4S(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Fmlsl(v0.V2S(), v0.V2H(), v0.H(), 2)); + TEST_ZEROING(Fmlsl(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Fmov(v0.V2D(), 2.0000)); + TEST_ZEROING(Fmov(v0.V4H(), 2.0000)); + TEST_ZEROING(Fmov(v0.D(), 1, x1)); + TEST_ZEROING(Fmul(v0.V4S(), v0.V4S(), v0.S(), 0)); + TEST_ZEROING(Fmul(v0.V4H(), v0.V4H(), v0.H(), 2)); + TEST_ZEROING(Fmul(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Fmul(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Fmulx(v0.V4S(), v0.V4S(), v0.S(), 0)); + TEST_ZEROING(Fmulx(v0.V4H(), v0.V4H(), v0.H(), 2)); + TEST_ZEROING(Fmulx(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Fmulx(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Fneg(v0.V4S(), v0.V4S())); 
+ TEST_ZEROING(Fneg(v0.V4H(), v0.V4H())); + TEST_ZEROING(Frecpe(v0.V4S(), v0.V4S())); + TEST_ZEROING(Frecpe(v0.V4H(), v0.V4H())); + TEST_ZEROING(Frecps(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Frecps(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Frint32x(v0.V4S(), v0.V4S())); + TEST_ZEROING(Frint32z(v0.V4S(), v0.V4S())); + TEST_ZEROING(Frint64x(v0.V4S(), v0.V4S())); + TEST_ZEROING(Frint64z(v0.V4S(), v0.V4S())); + TEST_ZEROING(Frinta(v0.V4S(), v0.V4S())); + TEST_ZEROING(Frinta(v0.V4H(), v0.V4H())); + TEST_ZEROING(Frinti(v0.V4S(), v0.V4S())); + TEST_ZEROING(Frinti(v0.V4H(), v0.V4H())); + TEST_ZEROING(Frintm(v0.V4S(), v0.V4S())); + TEST_ZEROING(Frintm(v0.V4H(), v0.V4H())); + TEST_ZEROING(Frintn(v0.V4S(), v0.V4S())); + TEST_ZEROING(Frintn(v0.V4H(), v0.V4H())); + TEST_ZEROING(Frintp(v0.V4S(), v0.V4S())); + TEST_ZEROING(Frintp(v0.V4H(), v0.V4H())); + TEST_ZEROING(Frintx(v0.V4S(), v0.V4S())); + TEST_ZEROING(Frintx(v0.V4H(), v0.V4H())); + TEST_ZEROING(Frintz(v0.V4S(), v0.V4S())); + TEST_ZEROING(Frintz(v0.V4H(), v0.V4H())); + TEST_ZEROING(Frsqrte(v0.V4S(), v0.V4S())); + TEST_ZEROING(Frsqrte(v0.V4H(), v0.V4H())); + TEST_ZEROING(Frsqrts(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Frsqrts(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Fsqrt(v0.V4S(), v0.V4S())); + TEST_ZEROING(Fsqrt(v0.V4H(), v0.V4H())); + TEST_ZEROING(Fsub(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Fsub(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Mov(v0.D(), 0, x0)); + TEST_ZEROING(Mov(v0.S(), 0, w0)); + TEST_ZEROING(Mov(v0.H(), 0, w0)); + TEST_ZEROING(Mov(v0.B(), 0, w0)); + TEST_ZEROING(Mov(v0.D(), 0, v0.D(), 0)); + TEST_ZEROING(Mov(v0.S(), 0, v0.S(), 0)); + TEST_ZEROING(Mov(v0.H(), 0, v0.H(), 0)); + TEST_ZEROING(Mov(v0.B(), 0, v0.B(), 0)); + TEST_ZEROING(Mla(v0.V4S(), v0.V4S(), v0.S(), 0)); + TEST_ZEROING(Mla(v0.V4H(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Mla(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Mla(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Mls(v0.V4S(), v0.V4S(), v0.S(), 
0)); + TEST_ZEROING(Mls(v0.V4H(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Mls(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Mls(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Movi(v0.V2D(), 0xff)); + TEST_ZEROING(Movi(v0.V2S(), 0xff)); + TEST_ZEROING(Movi(v0.V4S(), 0x10, LSL, 8)); + TEST_ZEROING(Movi(v0.V2S(), 0x10, LSL, 8)); + TEST_ZEROING(Mul(v0.V4S(), v0.V4S(), v0.S(), 0)); + TEST_ZEROING(Mul(v0.V4H(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Mul(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Mul(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Mvni(v0.V4H(), 0x10, LSL, 8)); + TEST_ZEROING(Mvni(v0.V4H(), 0x10, LSL, 8)); + TEST_ZEROING(Neg(v0.V4S(), v0.V4S())); + TEST_ZEROING(Neg(v0.V4H(), v0.V4H())); + TEST_ZEROING(Mvn(v0.V16B(), v0.V16B())); + TEST_ZEROING(Mvn(v0.V8B(), v0.V8B())); + TEST_ZEROING(Orn(v0.V8B(), v0.V8B(), v0.V8B())); + TEST_ZEROING(Orn(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Orr(v0.V8H(), 0x10, 8)); + TEST_ZEROING(Orr(v0.V4H(), 0x10, 8)); + TEST_ZEROING(Mov(v0.V8B(), v0.V8B())); + TEST_ZEROING(Mov(v0.V16B(), v0.V16B())); + TEST_ZEROING(Pmul(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Pmull(v0.V8H(), v0.V8B(), v0.V8B())); + TEST_ZEROING(Pmull2(v0.V8H(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Raddhn2(v0.V16B(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Raddhn(v0.V4H(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Rbit(v0.V8B(), v0.V8B())); + TEST_ZEROING(Rbit(v0.V16B(), v0.V16B())); + TEST_ZEROING(Rsubhn2(v0.V16B(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Rsubhn(v0.V4H(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Saba(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Saba(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Saba(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sabal2(v0.V8H(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Sabal(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sabd(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Sabd(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Sabd(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sabdl2(v0.V8H(), v0.V16B(), 
v0.V16B())); + TEST_ZEROING(Sabdl(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sadalp(v0.V8H(), v0.V16B())); + TEST_ZEROING(Saddl2(v0.V8H(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Saddl(v0.V2D(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Saddl(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Saddw2(v0.V8H(), v0.V8H(), v0.V16B())); + TEST_ZEROING(Saddw(v0.V4S(), v0.V4S(), v0.V4H())); + TEST_ZEROING(Scvtf(v0.V4S(), v0.V4S())); + TEST_ZEROING(Scvtf(v0.V8H(), v0.V8H())); + TEST_ZEROING(Scvtf(v0.V2D(), v0.V2D(), 8)); + TEST_ZEROING(Sdot(v0.V4S(), v0.V16B(), v0.S4B(), 0)); + TEST_ZEROING(Sdot(v0.V2S(), v0.V8B(), v0.S4B(), 0)); + TEST_ZEROING(Sdot(v0.V4S(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Sdot(v0.V2S(), v0.V8B(), v0.V8B())); + TEST_ZEROING(Shadd(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Shadd(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Shl(v0.V2D(), v0.V2D(), 56)); + TEST_ZEROING(Shll2(v0.V8H(), v0.V16B(), 8)); + TEST_ZEROING(Shll(v0.V2D(), v0.V2S(), 32)); + TEST_ZEROING(Shsub(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Shsub(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sli(v0.V2D(), v0.V2D(), 56)); + TEST_ZEROING(Sli(v0.V2S(), v0.V2S(), 16)); + TEST_ZEROING(Smax(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Smax(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Smaxp(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Smaxp(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Smin(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Smin(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sminp(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Sminp(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Smlal2(v0.V8H(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Smlal(v0.V2D(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Smlal(v0.V2D(), v0.V2S(), v0.S(), 0)); + TEST_ZEROING(Smlsl2(v0.V8H(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Smlsl(v0.V2D(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Smlsl(v0.V2D(), v0.V2S(), v0.S(), 0)); + TEST_ZEROING(Smull2(v0.V8H(), v0.V16B(), v0.V16B())); + 
TEST_ZEROING(Smull(v0.V2D(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Smull(v0.V2D(), v0.V2S(), v0.S(), 0)); + TEST_ZEROING(Sqabs(v0.V16B(), v0.V16B())); + TEST_ZEROING(Sqabs(v0.V4H(), v0.V4H())); + TEST_ZEROING(Sqadd(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Sqadd(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sqdmlal2(v0.V4S(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Sqdmlal(v0.V2D(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Sqdmlal(v0.V2D(), v0.V2S(), v0.S(), 0)); + TEST_ZEROING(Sqdmlsl2(v0.V4S(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Sqdmlsl(v0.V2D(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Sqdmlsl(v0.V2D(), v0.V2S(), v0.S(), 0)); + TEST_ZEROING(Sqdmulh(v0.V4S(), v0.V4S(), v0.S(), 0)); + TEST_ZEROING(Sqdmulh(v0.V4H(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Sqdmulh(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Sqdmulh(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sqdmull2(v0.V2D(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Sqdmull(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sqdmull2(v0.V2D(), v0.V4S(), v0.S(), 0)); + TEST_ZEROING(Sqdmull(v0.V4S(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Sqneg(v0.V16B(), v0.V16B())); + TEST_ZEROING(Sqneg(v0.V2S(), v0.V2S())); + TEST_ZEROING(Sqrdmlah(v0.V4S(), v0.V4S(), v0.S(), 0)); + TEST_ZEROING(Sqrdmlah(v0.V4H(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Sqrdmlah(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Sqrdmlah(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sqrdmlsh(v0.V4S(), v0.V4S(), v0.S(), 0)); + TEST_ZEROING(Sqrdmlsh(v0.V4H(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Sqrdmlsh(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Sqrdmlsh(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sqrdmulh(v0.V4S(), v0.V4S(), v0.S(), 0)); + TEST_ZEROING(Sqrdmulh(v0.V4H(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Sqrdmulh(v0.V4S(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Sqrdmulh(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sqrshl(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Sqrshl(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sqshl(v0.V16B(), 
v0.V16B(), v0.V16B())); + TEST_ZEROING(Sqshl(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sqshl(v0.V2D(), v0.V2D(), 56)); + TEST_ZEROING(Sqshl(v0.V2S(), v0.V2S(), 16)); + TEST_ZEROING(Sqshlu(v0.V2D(), v0.V2D(), 56)); + TEST_ZEROING(Sqshlu(v0.V2S(), v0.V2S(), 16)); + TEST_ZEROING(Sqsub(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Sqsub(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sqxtn2(v0.V16B(), v0.V8H())); + TEST_ZEROING(Sqxtn(v0.V2S(), v0.V2D())); + TEST_ZEROING(Sqxtun2(v0.V16B(), v0.V8H())); + TEST_ZEROING(Sqxtun(v0.V2S(), v0.V2D())); + TEST_ZEROING(Srhadd(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Srhadd(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sri(v0.V2D(), v0.V2D(), 8)); + TEST_ZEROING(Sri(v0.V2S(), v0.V2S(), 8)); + TEST_ZEROING(Srshl(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Srshl(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Srshr(v0.V2D(), v0.V2D(), 8)); + TEST_ZEROING(Srshr(v0.V2S(), v0.V2S(), 8)); + TEST_ZEROING(Srsra(v0.V2D(), v0.V2D(), 8)); + TEST_ZEROING(Srsra(v0.V2S(), v0.V2S(), 8)); + TEST_ZEROING(Sshl(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Sshl(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Sshr(v0.V2D(), v0.V2D(), 8)); + TEST_ZEROING(Sshr(v0.V2S(), v0.V2S(), 8)); + TEST_ZEROING(Ssra(v0.V2D(), v0.V2D(), 8)); + TEST_ZEROING(Ssra(v0.V2S(), v0.V2S(), 8)); + TEST_ZEROING(Ssubl2(v0.V8H(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Ssubl(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Ssubw2(v0.V8H(), v0.V8H(), v0.V16B())); + TEST_ZEROING(Ssubw(v0.V4S(), v0.V4S(), v0.V4H())); + TEST_ZEROING(Sub(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Sub(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Subhn2(v0.V16B(), v0.V8H(), v0.V8H())); + TEST_ZEROING(Subhn(v0.V4H(), v0.V4S(), v0.V4S())); + TEST_ZEROING(Sudot(v0.V4S(), v0.V16B(), v0.S4B(), 0)); + TEST_ZEROING(Sudot(v0.V2S(), v0.V8B(), v0.S4B(), 2)); + TEST_ZEROING(Suqadd(v0.V16B(), v0.V16B())); + TEST_ZEROING(Suqadd(v0.V4H(), v0.V4H())); + TEST_ZEROING(Tbl(v0.V8B(), 
{v0.V16B()}, v0.V8B())); + TEST_ZEROING(Tbl(v0.V16B(), {v0.V16B()}, v0.V16B())); + TEST_ZEROING(Tbx(v0.V8B(), {v0.V16B()}, v0.V8B())); + TEST_ZEROING(Tbx(v0.V16B(), {v0.V16B()}, v0.V16B())); + TEST_ZEROING(Trn1(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Trn1(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Trn2(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Trn2(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Uaba(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uaba(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Uabal2(v0.V8H(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uabal(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Uabd(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uabd(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Uabdl2(v0.V8H(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uabdl(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Uadalp(v0.V8H(), v0.V16B())); + TEST_ZEROING(Uadalp(v0.V2S(), v0.V4H())); + TEST_ZEROING(Uaddl2(v0.V8H(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uaddl(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Uaddlp(v0.V8H(), v0.V16B())); + TEST_ZEROING(Uaddlp(v0.V2S(), v0.V4H())); + TEST_ZEROING(Uaddw2(v0.V8H(), v0.V8H(), v0.V16B())); + TEST_ZEROING(Uaddw(v0.V4S(), v0.V4S(), v0.V4H())); + TEST_ZEROING(Ucvtf(v0.V4S(), v0.V4S())); + TEST_ZEROING(Ucvtf(v0.V4H(), v0.V4H())); + TEST_ZEROING(Ucvtf(v0.V2D(), v0.V2D(), 8)); + TEST_ZEROING(Ucvtf(v0.V2S(), v0.V2S(), 8)); + TEST_ZEROING(Udot(v0.V4S(), v0.V16B(), v0.S4B(), 0)); + TEST_ZEROING(Udot(v0.V2S(), v0.V8B(), v0.S4B(), 0)); + TEST_ZEROING(Udot(v0.V2S(), v0.V8B(), v0.V8B())); + TEST_ZEROING(Udot(v0.V4S(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uhadd(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uhadd(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Uhsub(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uhsub(v0.V2S(), v0.V2S(), v0.V2S())); + TEST_ZEROING(Umax(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Umax(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Umaxp(v0.V16B(), v0.V16B(), 
v0.V16B())); + TEST_ZEROING(Umaxp(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Umin(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Umin(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Uminp(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uminp(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Umlal2(v0.V8H(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Umlal(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Umlal(v0.V2D(), v0.V2S(), v0.S(), 0)); + TEST_ZEROING(Umlal(v0.V4S(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Umlsl2(v0.V8H(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Umlsl(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Umlsl(v0.V2D(), v0.V2S(), v0.S(), 0)); + TEST_ZEROING(Umlsl(v0.V4S(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Umull2(v0.V8H(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Umull(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Umull(v0.V2D(), v0.V2S(), v0.S(), 0)); + TEST_ZEROING(Umull(v0.V4S(), v0.V4H(), v0.H(), 0)); + TEST_ZEROING(Uqadd(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uqadd(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Uqrshl(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uqrshl(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Uqshl(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uqshl(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Uqsub(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uqsub(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Uqxtn2(v0.V16B(), v0.V8H())); + TEST_ZEROING(Uqxtn(v0.V2S(), v0.V2D())); + TEST_ZEROING(Urecpe(v0.V2S(), v0.V2S())); + TEST_ZEROING(Urecpe(v0.V4S(), v0.V4S())); + TEST_ZEROING(Urhadd(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Urhadd(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Urshl(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Urshl(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Urshr(v0.V2D(), v0.V2D(), 8)); + TEST_ZEROING(Urshr(v0.V2S(), v0.V2S(), 8)); + TEST_ZEROING(Ursqrte(v0.V4S(), v0.V4S())); + TEST_ZEROING(Ursqrte(v0.V2S(), v0.V2S())); + TEST_ZEROING(Ursra(v0.V2D(), v0.V2D(), 8)); + 
TEST_ZEROING(Ursra(v0.V2S(), v0.V2S(), 8)); + TEST_ZEROING(Usdot(v0.V4S(), v0.V16B(), v0.S4B(), 0)); + TEST_ZEROING(Usdot(v0.V2S(), v0.V8B(), v0.S4B(), 1)); + TEST_ZEROING(Usdot(v0.V4S(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Usdot(v0.V2S(), v0.V8B(), v0.V8B())); + TEST_ZEROING(Ushl(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Ushl(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Ushr(v0.V2D(), v0.V2D(), 8)); + TEST_ZEROING(Ushr(v0.V2S(), v0.V2S(), 8)); + TEST_ZEROING(Usqadd(v0.V16B(), v0.V16B())); + TEST_ZEROING(Usqadd(v0.V4H(), v0.V4H())); + TEST_ZEROING(Usra(v0.V2D(), v0.V2D(), 8)); + TEST_ZEROING(Usra(v0.V2S(), v0.V2S(), 8)); + TEST_ZEROING(Usubl2(v0.V8H(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Usubl(v0.V4S(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Usubw2(v0.V8H(), v0.V8H(), v0.V16B())); + TEST_ZEROING(Usubw(v0.V4S(), v0.V4S(), v0.V4H())); + TEST_ZEROING(Uzp1(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uzp1(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Uzp2(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Uzp2(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Xtn2(v0.V16B(), v0.V8H())); + TEST_ZEROING(Xtn(v0.V4H(), v0.V4S())); + TEST_ZEROING(Zip1(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Zip1(v0.V4H(), v0.V4H(), v0.V4H())); + TEST_ZEROING(Zip2(v0.V16B(), v0.V16B(), v0.V16B())); + TEST_ZEROING(Zip2(v0.V4H(), v0.V4H(), v0.V4H())); + + __ Mov(z11.VnD(), 0); + + Label done, zero_127_to_0; + __ Rdvl(x0, 1); + __ Cmp(x0, 16); + __ B(gt, &zero_127_to_0); + + // For 128-bit VL, there's nothing to be tested, so zero the whole register. + __ Mov(z10.VnD(), 0); + __ B(&done); + + // Set the expected non-zero bits to zero. 
+ __ Bind(&zero_127_to_0); + __ Ext(z10.VnB(), z10.VnB(), z11.VnB(), kDRegSizeInBytes * 2); + + __ Bind(&done); + + END(); + + if (CAN_RUN()) { + RUN(); + ASSERT_EQUAL_SVE(z11, z10); + } +} + +#undef TEST_ZEROING + +#define TEST_ZEROING_1(INST) \ + __ Index(z0.VnB(), 0, 1); \ + __ INST; \ + __ Orr(z10.VnB(), z10.VnB(), z0.VnB()); +#define TEST_ZEROING_2(INST) \ + __ Index(z0.VnB(), 0, 1); \ + __ Index(z1.VnB(), 0, 1); \ + __ INST; \ + __ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \ + __ Orr(z10.VnB(), z10.VnB(), z1.VnB()); +#define TEST_ZEROING_3(INST) \ + __ Index(z0.VnB(), 0, 1); \ + __ Index(z1.VnB(), 0, 1); \ + __ Index(z2.VnB(), 0, 1); \ + __ INST; \ + __ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \ + __ Orr(z10.VnB(), z10.VnB(), z1.VnB()); \ + __ Orr(z10.VnB(), z10.VnB(), z2.VnB()); +#define TEST_ZEROING_4(INST) \ + __ Index(z0.VnB(), 0, 1); \ + __ Index(z1.VnB(), 0, 1); \ + __ Index(z2.VnB(), 0, 1); \ + __ Index(z3.VnB(), 0, 1); \ + __ INST; \ + __ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \ + __ Orr(z10.VnB(), z10.VnB(), z1.VnB()); \ + __ Orr(z10.VnB(), z10.VnB(), z2.VnB()); \ + __ Orr(z10.VnB(), z10.VnB(), z3.VnB()); + +TEST_SVE(neon_load_zero_high) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE); + + START(); + __ Mov(z10.VnD(), 0); // Initialise cumulative result register. + + // Initialise x0 to point to a buffer from which data is loaded. The contents + // does not need to be defined. 
+ int data_size = 4 * kQRegSizeInBytes; + uint8_t* data = new uint8_t[data_size]; + __ Mov(x0, reinterpret_cast(&data[data_size])); + + MemOperand mop = MemOperand(x0); + TEST_ZEROING_1(Ld1(v0.V16B(), mop)); + TEST_ZEROING_1(Ld1(v0.V4H(), mop)); + TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), mop)); + TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), mop)); + TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), v2.V16B(), mop)); + TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), v2.V4H(), mop)); + TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop)); + TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop)); + TEST_ZEROING_1(Ld1(v0.B(), 1, mop)); + TEST_ZEROING_1(Ld1(v0.D(), 1, mop)); + TEST_ZEROING_1(Ld1(v0.H(), 1, mop)); + TEST_ZEROING_1(Ld1(v0.S(), 1, mop)); + TEST_ZEROING_1(Ld1r(v0.V16B(), mop)); + TEST_ZEROING_1(Ld1r(v0.V4H(), mop)); + TEST_ZEROING_2(Ld2(v0.V16B(), v1.V16B(), mop)); + TEST_ZEROING_2(Ld2(v0.V4H(), v1.V4H(), mop)); + TEST_ZEROING_2(Ld2(v0.B(), v1.B(), 1, mop)); + TEST_ZEROING_2(Ld2(v0.D(), v1.D(), 1, mop)); + TEST_ZEROING_2(Ld2(v0.H(), v1.H(), 1, mop)); + TEST_ZEROING_2(Ld2(v0.S(), v1.S(), 1, mop)); + TEST_ZEROING_2(Ld2r(v0.V16B(), v1.V16B(), mop)); + TEST_ZEROING_2(Ld2r(v0.V4H(), v1.V4H(), mop)); + TEST_ZEROING_3(Ld3(v0.V16B(), v1.V16B(), v2.V16B(), mop)); + TEST_ZEROING_3(Ld3(v0.V4H(), v1.V4H(), v2.V4H(), mop)); + TEST_ZEROING_3(Ld3(v0.B(), v1.B(), v2.B(), 1, mop)); + TEST_ZEROING_3(Ld3(v0.D(), v1.D(), v2.D(), 1, mop)); + TEST_ZEROING_3(Ld3(v0.H(), v1.H(), v2.H(), 1, mop)); + TEST_ZEROING_3(Ld3(v0.S(), v1.S(), v2.S(), 1, mop)); + TEST_ZEROING_3(Ld3r(v0.V16B(), v1.V16B(), v2.V16B(), mop)); + TEST_ZEROING_3(Ld3r(v0.V4H(), v1.V4H(), v2.V4H(), mop)); + TEST_ZEROING_4(Ld4(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop)); + TEST_ZEROING_4(Ld4(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop)); + TEST_ZEROING_4(Ld4(v0.B(), v1.B(), v2.B(), v3.B(), 1, mop)); + TEST_ZEROING_4(Ld4(v0.D(), v1.D(), v2.D(), v3.D(), 1, mop)); + TEST_ZEROING_4(Ld4(v0.H(), v1.H(), v2.H(), 
v3.H(), 1, mop)); + TEST_ZEROING_4(Ld4(v0.S(), v1.S(), v2.S(), v3.S(), 1, mop)); + TEST_ZEROING_4(Ld4r(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop)); + TEST_ZEROING_4(Ld4r(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop)); + + __ Mov(z11.VnD(), 0); + + Label done, zero_127_to_0; + __ Rdvl(x0, 1); + __ Cmp(x0, 16); + __ B(gt, &zero_127_to_0); + + // For 128-bit VL, there's nothing to be tested, so zero the whole register. + __ Mov(z10.VnD(), 0); + __ B(&done); + + // Set the expected non-zero bits to zero. + __ Bind(&zero_127_to_0); + __ Ext(z10.VnB(), z10.VnB(), z11.VnB(), kDRegSizeInBytes * 2); + + __ Bind(&done); + + END(); + + if (CAN_RUN()) { + RUN(); + ASSERT_EQUAL_SVE(z11, z10); + } +} + +#undef TEST_ZEROING_1 +#undef TEST_ZEROING_2 +#undef TEST_ZEROING_3 +#undef TEST_ZEROING_4 + TEST_SVE(sve_load_store_sp_base_regression_test) { SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE); START(); @@ -19945,6 +20648,8 @@ TEST_SVE(sve_load_store_sp_base_regression_test) { __ dci(0xe58043e0); // str z0, [sp] } + __ Drop(128 * 2 * kXRegSizeInBytes); + END(); if (CAN_RUN()) { diff --git a/test/aarch64/test-cpu-features-aarch64.cc b/test/aarch64/test-cpu-features-aarch64.cc index 4a82127f..c018f49b 100644 --- a/test/aarch64/test-cpu-features-aarch64.cc +++ b/test/aarch64/test-cpu-features-aarch64.cc @@ -3778,5 +3778,91 @@ TEST_FP_FCMA_NEON_NEONHALF(fcmla_1, fcmla(v0.V8H(), v1.V8H(), v2.H(), 2, 180)) TEST_FP_FCMA_NEON_NEONHALF(fcmla_2, fcmla(v0.V4H(), v1.V4H(), v2.V4H(), 180)) TEST_FP_FCMA_NEON_NEONHALF(fcmla_3, fcmla(v0.V8H(), v1.V8H(), v2.V8H(), 0)) +#define TEST_FEAT(NAME, ASM) \ + TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kPmull1Q), \ + NEON_Pmull1Q_##NAME, \ + ASM) +TEST_FEAT(pmull1q_0, pmull(v5.V1Q(), v6.V1D(), v7.V1D())) +#undef TEST_FEAT + +#define TEST_NEON_SHA3(NAME, ASM) \ + TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3), \ + NEON_SHA3_##NAME, \ + ASM) +TEST_NEON_SHA3(bcax_0, bcax(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B())) 
+TEST_NEON_SHA3(eor3_0, eor3(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B())) +TEST_NEON_SHA3(xar_0, xar(v0.V2D(), v1.V2D(), v2.V2D(), 42)) +TEST_NEON_SHA3(rax1_0, rax1(v0.V2D(), v1.V2D(), v2.V2D())) + +#define TEST_NEON_SHA1(NAME, ASM) \ + TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA1), \ + NEON_SHA1_##NAME, \ + ASM) +TEST_NEON_SHA1(sha1c_0, sha1c(q0, s12, v20.V4S())) +TEST_NEON_SHA1(sha1m_0, sha1m(q22, s2, v13.V4S())) +TEST_NEON_SHA1(sha1p_0, sha1p(q31, s5, v15.V4S())) +TEST_NEON_SHA1(sha1su0_0, sha1su0(v19.V4S(), v9.V4S(), v27.V4S())) +TEST_NEON_SHA1(sha1h_0, sha1h(s12, s0)) +TEST_NEON_SHA1(sha1su1_0, sha1su1(v2.V4S(), v4.V4S())) + +#define TEST_FEAT(NAME, ASM) \ + TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA2), \ + NEON_SHA2_##NAME, \ + ASM) +TEST_FEAT(sha256h_0, sha256h(q0, q12, v20.V4S())) +TEST_FEAT(sha256h2_0, sha256h2(q22, q2, v13.V4S())) +TEST_FEAT(sha256su0_0, sha256su0(v2.V4S(), v4.V4S())) +TEST_FEAT(sha256su1_0, sha256su1(v19.V4S(), v9.V4S(), v27.V4S())) +#undef TEST_FEAT + +#define TEST_FEAT(NAME, ASM) \ + TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA512), \ + NEON_SHA512_##NAME, \ + ASM) +TEST_FEAT(sha512h_0, sha512h(q0, q12, v20.V2D())) +TEST_FEAT(sha512h2_0, sha512h2(q22, q2, v13.V2D())) +TEST_FEAT(sha512su0_0, sha512su0(v2.V2D(), v4.V2D())) +TEST_FEAT(sha512su1_0, sha512su1(v19.V2D(), v9.V2D(), v27.V2D())) +#undef TEST_FEAT + +#define TEST_FEAT(NAME, ASM) \ + TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kAES), \ + NEON_AES_##NAME, \ + ASM) +TEST_FEAT(aesd_0, aesd(v0.V16B(), v29.V16B())) +TEST_FEAT(aese_0, aese(v0.V16B(), v29.V16B())) +TEST_FEAT(aesimc_0, aesimc(v0.V16B(), v29.V16B())) +TEST_FEAT(aesmc_0, aesmc(v0.V16B(), v29.V16B())) +#undef TEST_FEAT + +#define TEST_FEAT(NAME, ASM) \ + TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSM3), \ + NEON_SM3_##NAME, \ + ASM) +TEST_FEAT(sm3partw1_0, sm3partw1(v12.V4S(), v13.V4S(), v14.V4S())) +TEST_FEAT(sm3partw2_0, 
sm3partw2(v12.V4S(), v13.V4S(), v14.V4S())) +TEST_FEAT(sm3ss1_0, sm3ss1(v13.V4S(), v15.V4S(), v17.V4S(), v21.V4S())) +TEST_FEAT(sm3tt1a_0, sm3tt1a(v30.V4S(), v29.V4S(), v9.V4S(), 1)) +TEST_FEAT(sm3tt1b_0, sm3tt1b(v30.V4S(), v29.V4S(), v9.V4S(), 3)) +TEST_FEAT(sm3tt2a_0, sm3tt2a(v30.V4S(), v29.V4S(), v9.V4S(), 2)) +TEST_FEAT(sm3tt2b_0, sm3tt2b(v30.V4S(), v29.V4S(), v9.V4S(), 0)) +#undef TEST_FEAT + +#define TEST_FEAT(NAME, ASM) \ + TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSM4), \ + NEON_SM4_##NAME, \ + ASM) +TEST_FEAT(sm4e, sm4e(v12.V4S(), v13.V4S())) +TEST_FEAT(sm4ekey, sm4ekey(v12.V4S(), v13.V4S(), v14.V4S())) +#undef TEST_FEAT + +#define TEST_FEAT(NAME, ASM) \ + TEST_TEMPLATE(CPUFeatures(CPUFeatures::kSVE2, CPUFeatures::kSVEPmull128), \ + SVE_PMULL128_##NAME, \ + ASM) +TEST_FEAT(pmullb, pmullb(z12.VnQ(), z21.VnD(), z12.VnD())) +TEST_FEAT(pmullt, pmullt(z12.VnQ(), z21.VnD(), z12.VnD())) +#undef TEST_FEAT + } // namespace aarch64 } // namespace vixl diff --git a/test/aarch64/test-disasm-aarch64.cc b/test/aarch64/test-disasm-aarch64.cc index 2babb9e6..40abef1e 100644 --- a/test/aarch64/test-disasm-aarch64.cc +++ b/test/aarch64/test-disasm-aarch64.cc @@ -2611,6 +2611,7 @@ TEST(system_mrs) { COMPARE(mrs(x15, FPCR), "mrs x15, fpcr"); COMPARE(mrs(x20, RNDR), "mrs x20, rndr"); COMPARE(mrs(x5, RNDRRS), "mrs x5, rndrrs"); + COMPARE(mrs(x9, DCZID_EL0), "mrs x9, dczid_el0"); // Test mrs that use system registers we haven't named. 
COMPARE(dci(MRS | (0x5555 << 5)), "mrs x0, S3_2_c10_c10_5"); @@ -3359,6 +3360,20 @@ TEST(cssc) { CLEANUP(); } +TEST(gcs) { + SETUP(); + + COMPARE_MACRO(Chkfeat(x16), "chkfeat x16"); + COMPARE_MACRO(Gcspopm(x0), "gcspopm x0"); + COMPARE_MACRO(Gcspopm(), "gcspopm"); + COMPARE_MACRO(Gcspopm(xzr), "gcspopm"); + COMPARE_MACRO(Gcsss1(x4), "gcsss1 x4"); + COMPARE_MACRO(Gcsss2(x2), "gcsss2 x2"); + COMPARE_MACRO(Gcspushm(x1), "gcspushm x1"); + + CLEANUP(); +} + TEST(architecture_features) { SETUP(); @@ -3543,19 +3558,19 @@ TEST(architecture_features) { COMPARE_PREFIX(dci(0xf8e08000), "swpal"); // SWPAL_64_memop // ARMv8.1 - RDM - COMPARE_PREFIX(dci(0x2e008400), "sqrdmlah"); // SQRDMLAH_asimdsame2_only - COMPARE_PREFIX(dci(0x2e008c00), "sqrdmlsh"); // SQRDMLSH_asimdsame2_only + COMPARE_PREFIX(dci(0x2e808400), "sqrdmlah"); // SQRDMLAH_asimdsame2_only + COMPARE_PREFIX(dci(0x2e808c00), "sqrdmlsh"); // SQRDMLSH_asimdsame2_only COMPARE_PREFIX(dci(0x2f40d000), "sqrdmlah"); // SQRDMLAH_asimdelem_R COMPARE_PREFIX(dci(0x2f40f000), "sqrdmlsh"); // SQRDMLSH_asimdelem_R - COMPARE_PREFIX(dci(0x7e008400), "sqrdmlah"); // SQRDMLAH_asisdsame2_only - COMPARE_PREFIX(dci(0x7e008c00), "sqrdmlsh"); // SQRDMLSH_asisdsame2_only + COMPARE_PREFIX(dci(0x7e408400), "sqrdmlah"); // SQRDMLAH_asisdsame2_only + COMPARE_PREFIX(dci(0x7e408c00), "sqrdmlsh"); // SQRDMLSH_asisdsame2_only COMPARE_PREFIX(dci(0x7f40d000), "sqrdmlah"); // SQRDMLAH_asisdelem_R COMPARE_PREFIX(dci(0x7f40f000), "sqrdmlsh"); // SQRDMLSH_asisdelem_R // ARMv8.2 - DotProd - COMPARE_PREFIX(dci(0x0e009400), "sdot"); // SDOT_asimdsame2_D + COMPARE_PREFIX(dci(0x0e809400), "sdot"); // SDOT_asimdsame2_D COMPARE_PREFIX(dci(0x0f00e000), "sdot"); // SDOT_asimdelem_D - COMPARE_PREFIX(dci(0x2e009400), "udot"); // UDOT_asimdsame2_D + COMPARE_PREFIX(dci(0x2e809400), "udot"); // UDOT_asimdsame2_D COMPARE_PREFIX(dci(0x2f00e000), "udot"); // UDOT_asimdelem_D // ARMv8.2 - FHM @@ -3775,42 +3790,39 @@ TEST(architecture_features) { 
COMPARE_PREFIX(dci(0xd503221f), "esb"); // ESB_HI_hints // ARMv8.2 - SHA3 - // COMPARE_PREFIX(dci(0xce000000), "eor3"); // EOR3_VVV16_crypto4 - // COMPARE_PREFIX(dci(0xce200000), "bcax"); // BCAX_VVV16_crypto4 - // COMPARE_PREFIX(dci(0xce608c00), "rax1"); // RAX1_VVV2_cryptosha512_3 - // COMPARE_PREFIX(dci(0xce800000), "xar"); // XAR_VVV2_crypto3_imm6 + COMPARE_PREFIX(dci(0xce000000), "eor3"); // EOR3_VVV16_crypto4 + COMPARE_PREFIX(dci(0xce200000), "bcax"); // BCAX_VVV16_crypto4 + COMPARE_PREFIX(dci(0xce608c00), "rax1"); // RAX1_VVV2_cryptosha512_3 + COMPARE_PREFIX(dci(0xce800000), "xar"); // XAR_VVV2_crypto3_imm6 // ARMv8.2 - SHA512 - // COMPARE_PREFIX(dci(0xce608000), "sha512h"); // SHA512H_QQV_cryptosha512_3 - // COMPARE_PREFIX(dci(0xce608400), "sha512h2"); // - // SHA512H2_QQV_cryptosha512_3 - // COMPARE_PREFIX(dci(0xce608800), "sha512su1"); // - // SHA512SU1_VVV2_cryptosha512_3 - // COMPARE_PREFIX(dci(0xcec08000), "sha512su0"); // - // SHA512SU0_VV2_cryptosha512_2 + COMPARE_PREFIX(dci(0xce608000), "sha512h"); // SHA512H_QQV_cryptosha512_3 + COMPARE_PREFIX(dci(0xce608400), "sha512h2"); // SHA512H2_QQV_cryptosha512_3 + COMPARE_PREFIX(dci(0xce608800), + "sha512su1"); // SHA512SU1_VVV2_cryptosha512_3 + COMPARE_PREFIX(dci(0xcec08000), "sha512su0"); // SHA512SU0_VV2_cryptosha512_2 // ARMv8.2 - SM3 - // COMPARE_PREFIX(dci(0xce400000), "sm3ss1"); // SM3SS1_VVV4_crypto4 - // COMPARE_PREFIX(dci(0xce408000), "sm3tt1a"); // SM3TT1A_VVV4_crypto3_imm2 - // COMPARE_PREFIX(dci(0xce408400), "sm3tt1b"); // SM3TT1B_VVV4_crypto3_imm2 - // COMPARE_PREFIX(dci(0xce408800), "sm3tt2a"); // SM3TT2A_VVV4_crypto3_imm2 - // COMPARE_PREFIX(dci(0xce408c00), "sm3tt2b"); // SM3TT2B_VVV_crypto3_imm2 - // COMPARE_PREFIX(dci(0xce60c000), "sm3partw1"); // - // SM3PARTW1_VVV4_cryptosha512_3 - // COMPARE_PREFIX(dci(0xce60c400), "sm3partw2"); // - // SM3PARTW2_VVV4_cryptosha512_3 + COMPARE_PREFIX(dci(0xce400000), "sm3ss1"); // SM3SS1_VVV4_crypto4 + COMPARE_PREFIX(dci(0xce408000), "sm3tt1a"); // 
SM3TT1A_VVV4_crypto3_imm2 + COMPARE_PREFIX(dci(0xce408400), "sm3tt1b"); // SM3TT1B_VVV4_crypto3_imm2 + COMPARE_PREFIX(dci(0xce408800), "sm3tt2a"); // SM3TT2A_VVV4_crypto3_imm2 + COMPARE_PREFIX(dci(0xce408c00), "sm3tt2b"); // SM3TT2B_VVV_crypto3_imm2 + COMPARE_PREFIX(dci(0xce60c000), + "sm3partw1"); // SM3PARTW1_VVV4_cryptosha512_3 + COMPARE_PREFIX(dci(0xce60c400), + "sm3partw2"); // SM3PARTW2_VVV4_cryptosha512_3 // ARMv8.2 - SM4 - // COMPARE_PREFIX(dci(0xce60c800), "sm4ekey"); // - // SM4EKEY_VVV4_cryptosha512_3 - // COMPARE_PREFIX(dci(0xcec08400), "sm4e"); // SM4E_VV4_cryptosha512_2 + COMPARE_PREFIX(dci(0xce60c800), "sm4ekey"); // SM4EKEY_VVV4_cryptosha512_3 + COMPARE_PREFIX(dci(0xcec08400), "sm4e"); // SM4E_VV4_cryptosha512_2 // ARMv8.2 - SPE // COMPARE_PREFIX(dci(0xd503223f), "psb"); // PSB_HC_hints // ARMv8.3 - FCMA COMPARE_PREFIX(dci(0x2e40c400), "fcmla"); // FCMLA_asimdsame2_C - COMPARE_PREFIX(dci(0x2e00e400), "fcadd"); // FCADD_asimdsame2_C + COMPARE_PREFIX(dci(0x2e40e400), "fcadd"); // FCADD_asimdsame2_C COMPARE_PREFIX(dci(0x2f401000), "fcmla"); // FCMLA_asimdelem_C_H COMPARE_PREFIX(dci(0x6f801000), "fcmla"); // FCMLA_asimdelem_C_S diff --git a/test/aarch64/test-disasm-neon-aarch64.cc b/test/aarch64/test-disasm-neon-aarch64.cc index 14dd18a6..f50e5a60 100644 --- a/test/aarch64/test-disasm-neon-aarch64.cc +++ b/test/aarch64/test-disasm-neon-aarch64.cc @@ -1792,6 +1792,34 @@ TEST(neon_3same) { COMPARE_MACRO(Pmul(v6.V16B(), v7.V16B(), v8.V16B()), "pmul v6.16b, v7.16b, v8.16b"); + // Check unallocated vector types for SDOT. + COMPARE(dci(0x0e009400), "unallocated (Unallocated)"); // 8B + COMPARE(dci(0x4e009400), "unallocated (Unallocated)"); // 16B + COMPARE(dci(0x0e409400), "unallocated (Unallocated)"); // 4H + COMPARE(dci(0x4e409400), "unallocated (Unallocated)"); // 8H + COMPARE(dci(0x0ec09400), "unallocated (Unallocated)"); // 1D + COMPARE(dci(0x4ec09400), "unallocated (Unallocated)"); // 2D + + // Check unallocated vector types for UDOT. 
+ COMPARE(dci(0x2e009400), "unallocated (Unallocated)"); // 8B + COMPARE(dci(0x6e009400), "unallocated (Unallocated)"); // 16B + COMPARE(dci(0x2e409400), "unallocated (Unallocated)"); // 4H + COMPARE(dci(0x6e409400), "unallocated (Unallocated)"); // 8H + COMPARE(dci(0x2ec09400), "unallocated (Unallocated)"); // 1D + COMPARE(dci(0x6ec09400), "unallocated (Unallocated)"); // 2D + + // Check unallocated vector types for SQRDMLAH. + COMPARE(dci(0x2e008400), "unallocated (Unallocated)"); // 8B + COMPARE(dci(0x6e008400), "unallocated (Unallocated)"); // 16B + COMPARE(dci(0x2ec08400), "unallocated (Unallocated)"); // 1D + COMPARE(dci(0x6ec08400), "unallocated (Unallocated)"); // 2D + + // Check unallocated vector types for SQRDMLSH. + COMPARE(dci(0x2e008c00), "unallocated (Unallocated)"); // 8B + COMPARE(dci(0x6e008c00), "unallocated (Unallocated)"); // 16B + COMPARE(dci(0x2ec08c00), "unallocated (Unallocated)"); // 1D + COMPARE(dci(0x6ec08c00), "unallocated (Unallocated)"); // 2D + CLEANUP(); } @@ -1924,6 +1952,16 @@ TEST(neon_3same_extra_fcadd) { COMPARE(dci(0x2e00ec00), "unallocated (Unallocated)"); // opcode = 0x1101 COMPARE(dci(0x2e00fc00), "unallocated (Unallocated)"); // opcode = 0x1111 + // Check unallocated vector types for FCADD. + COMPARE(dci(0x2e00e400), "unallocated (Unallocated)"); // 8B + COMPARE(dci(0x6e00e400), "unallocated (Unallocated)"); // 16B + COMPARE(dci(0x2ec0e400), "unallocated (Unallocated)"); // 1D + + // Check unallocated vector types for FCMLA. + COMPARE(dci(0x2e00c400), "unallocated (Unallocated)"); // 8B + COMPARE(dci(0x6e00c400), "unallocated (Unallocated)"); // 16B + COMPARE(dci(0x2ec0c400), "unallocated (Unallocated)"); // 1D + CLEANUP(); } @@ -2594,6 +2632,13 @@ TEST(neon_fp_byelement) { COMPARE_MACRO(Fcmla(v0.V8H(), v1.V8H(), v31.H(), 3, 0), "fcmla v0.8h, v1.8h, v31.h[3], #0"); + // Check unallocated vector types for FCMLA. 
+ COMPARE(dci(0x2f001000), "unallocated (Unallocated)"); // 8B + COMPARE(dci(0x6f001000), "unallocated (Unallocated)"); // 16B + COMPARE(dci(0x2f801000), "unallocated (Unallocated)"); // 2S + COMPARE(dci(0x2fc01000), "unallocated (Unallocated)"); // 1D + COMPARE(dci(0x6fc01000), "unallocated (Unallocated)"); // 2D + CLEANUP(); } @@ -2904,6 +2949,10 @@ TEST(neon_3different) { "pmull v0.8h, v1.8b, v2.8b"); COMPARE_MACRO(Pmull2(v2.V8H(), v3.V16B(), v4.V16B()), "pmull2 v2.8h, v3.16b, v4.16b"); + COMPARE_MACRO(Pmull(v5.V1Q(), v6.V1D(), v7.V1D()), + "pmull v5.1q, v6.1d, v7.1d"); + COMPARE_MACRO(Pmull2(v8.V1Q(), v9.V2D(), v10.V2D()), + "pmull2 v8.1q, v9.2d, v10.2d"); CLEANUP(); } @@ -4467,6 +4516,100 @@ TEST(neon_matmul) { CLEANUP(); } +TEST(neon_sha3) { + SETUP(); + + COMPARE_MACRO(Bcax(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B()), + "bcax v0.16b, v1.16b, v2.16b, v3.16b"); + COMPARE_MACRO(Eor3(v10.V16B(), v11.V16B(), v12.V16B(), v13.V16B()), + "eor3 v10.16b, v11.16b, v12.16b, v13.16b"); + COMPARE_MACRO(Xar(v20.V2D(), v21.V2D(), v22.V2D(), 42), + "xar v20.2d, v21.2d, v22.2d, #42"); + COMPARE_MACRO(Rax1(v0.V2D(), v1.V2D(), v2.V2D()), "rax1 v0.2d, v1.2d, v2.2d"); + + CLEANUP(); +} + +TEST(neon_sha1) { + SETUP(); + + COMPARE_MACRO(Sha1c(q0, s12, v20.V4S()), "sha1c q0, s12, v20.4s"); + COMPARE_MACRO(Sha1m(q22, s2, v13.V4S()), "sha1m q22, s2, v13.4s"); + COMPARE_MACRO(Sha1p(q31, s5, v15.V4S()), "sha1p q31, s5, v15.4s"); + COMPARE_MACRO(Sha1su0(v19.V4S(), v9.V4S(), v27.V4S()), + "sha1su0 v19.4s, v9.4s, v27.4s"); + COMPARE_MACRO(Sha1h(s12, s0), "sha1h s12, s0"); + COMPARE_MACRO(Sha1su1(v2.V4S(), v4.V4S()), "sha1su1 v2.4s, v4.4s"); + + CLEANUP(); +} + +TEST(neon_sha2) { + SETUP(); + + COMPARE_MACRO(Sha256h(q0, q12, v20.V4S()), "sha256h q0, q12, v20.4s"); + COMPARE_MACRO(Sha256h2(q22, q2, v13.V4S()), "sha256h2 q22, q2, v13.4s"); + COMPARE_MACRO(Sha256su0(v2.V4S(), v4.V4S()), "sha256su0 v2.4s, v4.4s"); + COMPARE_MACRO(Sha256su1(v19.V4S(), v9.V4S(), v27.V4S()), + "sha256su1 v19.4s, 
v9.4s, v27.4s"); + + CLEANUP(); +} + +TEST(neon_sha512) { + SETUP(); + + COMPARE_MACRO(Sha512h(q0, q12, v20.V2D()), "sha512h q0, q12, v20.2d"); + COMPARE_MACRO(Sha512h2(q22, q2, v13.V2D()), "sha512h2 q22, q2, v13.2d"); + COMPARE_MACRO(Sha512su0(v2.V2D(), v4.V2D()), "sha512su0 v2.2d, v4.2d"); + COMPARE_MACRO(Sha512su1(v19.V2D(), v9.V2D(), v27.V2D()), + "sha512su1 v19.2d, v9.2d, v27.2d"); + + CLEANUP(); +} + +TEST(neon_aes) { + SETUP(); + + COMPARE_MACRO(Aesd(v0.V16B(), v29.V16B()), "aesd v0.16b, v29.16b"); + COMPARE_MACRO(Aese(v0.V16B(), v29.V16B()), "aese v0.16b, v29.16b"); + COMPARE_MACRO(Aesimc(v0.V16B(), v29.V16B()), "aesimc v0.16b, v29.16b"); + COMPARE_MACRO(Aesmc(v0.V16B(), v29.V16B()), "aesmc v0.16b, v29.16b"); + + CLEANUP(); +} + +TEST(neon_sm3) { + SETUP(); + + COMPARE_MACRO(Sm3partw1(v12.V4S(), v13.V4S(), v14.V4S()), + "sm3partw1 v12.4s, v13.4s, v14.4s"); + COMPARE_MACRO(Sm3partw2(v12.V4S(), v13.V4S(), v14.V4S()), + "sm3partw2 v12.4s, v13.4s, v14.4s"); + COMPARE_MACRO(Sm3ss1(v13.V4S(), v15.V4S(), v17.V4S(), v21.V4S()), + "sm3ss1 v13.4s, v15.4s, v17.4s, v21.4s"); + COMPARE_MACRO(Sm3tt1a(v30.V4S(), v29.V4S(), v9.V4S(), 1), + "sm3tt1a v30.4s, v29.4s, v9.s[1]"); + COMPARE_MACRO(Sm3tt1b(v30.V4S(), v29.V4S(), v9.V4S(), 3), + "sm3tt1b v30.4s, v29.4s, v9.s[3]"); + COMPARE_MACRO(Sm3tt2a(v30.V4S(), v29.V4S(), v9.V4S(), 2), + "sm3tt2a v30.4s, v29.4s, v9.s[2]"); + COMPARE_MACRO(Sm3tt2b(v30.V4S(), v29.V4S(), v9.V4S(), 0), + "sm3tt2b v30.4s, v29.4s, v9.s[0]"); + + CLEANUP(); +} + +TEST(neon_sm4) { + SETUP(); + + COMPARE_MACRO(Sm4e(v12.V4S(), v13.V4S()), "sm4e v12.4s, v13.4s"); + COMPARE_MACRO(Sm4ekey(v12.V4S(), v13.V4S(), v14.V4S()), + "sm4ekey v12.4s, v13.4s, v14.4s"); + + CLEANUP(); +} + TEST(neon_unallocated_regression_test) { SETUP(); @@ -4562,8 +4705,6 @@ TEST(neon_unallocated_regression_test) { COMPARE_PREFIX(dci(0x2efb9dbd), "unallocated"); // pmul v.und, v.und, v.und COMPARE_PREFIX(dci(0x4eace101), "unallocated"); // pmull v.d, v.s, v.s 
COMPARE_PREFIX(dci(0x0e6de3ad), "unallocated"); // pmull v.s, v.h, v.h - COMPARE_PREFIX(dci(0x4ee3e2c0), "unallocated"); // pmull v.und, v.d, v.d - COMPARE_PREFIX(dci(0x0eede060), "unallocated"); // pmull v.und, v.und, v.und COMPARE_PREFIX(dci(0x6ee00afd), "unallocated"); // rev v.d, v.d COMPARE_PREFIX(dci(0x4e601975), "unallocated"); // rev v.h, v.h COMPARE_PREFIX(dci(0x4ea019f3), "unallocated"); // rev v.s, v.s @@ -4633,10 +4774,14 @@ TEST(neon_unallocated_regression_test) { COMPARE_PREFIX(dci(0x6fd6d80f), "unallocated"); // sqrdmlah v.d, v.d, v.d[] COMPARE_PREFIX(dci(0x2fecdae5), "unallocated"); // sqrdmlah v.und, v.und, v.d[] + COMPARE_PREFIX(dci(0x7e008429), "unallocated"); // sqrdmlah b9, b1, b0 + COMPARE_PREFIX(dci(0x7ec08429), "unallocated"); // sqrdmlah d9, d1, d0 COMPARE_PREFIX(dci(0x7fe0f992), "unallocated"); // sqrdmlsh d, d, v.d[] COMPARE_PREFIX(dci(0x6ff1f9df), "unallocated"); // sqrdmlsh v.d, v.d, v.d[] COMPARE_PREFIX(dci(0x2fcdfad1), "unallocated"); // sqrdmlsh v.und, v.und, v.d[] + COMPARE_PREFIX(dci(0x7e008c29), "unallocated"); // sqrdmlsh b9, b1, b0 + COMPARE_PREFIX(dci(0x7ec08c29), "unallocated"); // sqrdmlsh d9, d1, d0 COMPARE_PREFIX(dci(0x7e23b7fa), "unallocated"); // sqrdmulh b, b, b COMPARE_PREFIX(dci(0x5f1ad272), "unallocated"); // sqrdmulh b, b, v.b[] COMPARE_PREFIX(dci(0x7ef8b6e0), "unallocated"); // sqrdmulh d, d, d diff --git a/test/aarch64/test-disasm-sve-aarch64.cc b/test/aarch64/test-disasm-sve-aarch64.cc index 5e001e7e..fbdff335 100644 --- a/test/aarch64/test-disasm-sve-aarch64.cc +++ b/test/aarch64/test-disasm-sve-aarch64.cc @@ -7673,13 +7673,14 @@ TEST(sve2_integer_multiply_long_vector) { COMPARE(sqdmullt(z7.VnD(), z4.VnS(), z0.VnS(), 0), "sqdmullt z7.d, z4.s, z0.s[0]"); - // Feature `SVEPmull128` is not supported. 
- // COMPARE(pmullb(z12.VnQ(), z21.VnD(), z12.VnD()), - // "pmullb z12.q, z21.d, z12.d"); COMPARE(pmullb(z12.VnH(), z21.VnB(), z12.VnB()), "pmullb z12.h, z21.b, z12.b"); COMPARE(pmullt(z31.VnD(), z30.VnS(), z26.VnS()), "pmullt z31.d, z30.s, z26.s"); + COMPARE(pmullb(z12.VnQ(), z21.VnD(), z12.VnD()), + "pmullb z12.q, z21.d, z12.d"); + COMPARE(pmullt(z12.VnQ(), z21.VnD(), z12.VnD()), + "pmullt z12.q, z21.d, z12.d"); COMPARE(smullb(z10.VnD(), z4.VnS(), z4.VnS()), "smullb z10.d, z4.s, z4.s"); COMPARE(smullb(z11.VnH(), z14.VnB(), z14.VnB()), @@ -7701,6 +7702,10 @@ TEST(sve2_integer_multiply_long_vector) { COMPARE(umullt(z24.VnH(), z7.VnB(), z16.VnB()), "umullt z24.h, z7.b, z16.b"); COMPARE(umullt(z24.VnS(), z8.VnH(), z26.VnH()), "umullt z24.s, z8.h, z26.h"); + // Check related but undefined encodings. + COMPARE(dci(0x45806800), "unallocated (Unallocated)"); // pmullb s, h, h + COMPARE(dci(0x45806c00), "unallocated (Unallocated)"); // pmullt s, h, h + CLEANUP(); } diff --git a/test/aarch64/test-simulator-aarch64.cc b/test/aarch64/test-simulator-aarch64.cc index 0a9dabed..e9d8fdbd 100644 --- a/test/aarch64/test-simulator-aarch64.cc +++ b/test/aarch64/test-simulator-aarch64.cc @@ -102,6 +102,95 @@ namespace aarch64 { /* The simulator can run every test. */ \ *skipped = false +#ifdef VIXL_ENABLE_IMPLICIT_CHECKS +// The signal handler needs access to the simulator. +Simulator* gImplicitCheckSim; + +#ifdef __x86_64__ +#include +#include +void HandleSegFault(int sig, siginfo_t* info, void* context) { + USE(sig); + USE(info); + Simulator* sim = gImplicitCheckSim; + + // Did the signal come from the simulator? + ucontext_t* uc = reinterpret_cast(context); + uintptr_t fault_pc = uc->uc_mcontext.gregs[REG_RIP]; + VIXL_CHECK(sim->IsSimulatedMemoryAccess(fault_pc)); + + // Increment the counter (x1) each time we handle a signal. 
+ int64_t counter = reinterpret_cast(sim->ReadXRegister(1)); + sim->WriteXRegister(1, ++counter); + + // Return to the VIXL memory access continuation point, which is also the + // next instruction, after this handler. + uc->uc_mcontext.gregs[REG_RIP] = sim->GetSignalReturnAddress(); + // Return that the memory access failed. + uc->uc_mcontext.gregs[REG_RAX] = + static_cast(MemoryAccessResult::Failure); +} +#endif // __x86_64__ + +// Start an implicit check test with a counter and start label so the number of +// faults can be counted. Note: each instruction after the start will be +// expected to fault. +#define START_IMPLICIT_CHECK() \ + gImplicitCheckSim = &simulator; \ + /* Set up a signal handler to count the number of faulting instructions. */ \ + struct sigaction sa; \ + sa.sa_sigaction = HandleSegFault; \ + sigaction(SIGSEGV, &sa, NULL); \ + START(); \ + /* Reset the counter. */ \ + __ Mov(x1, 0); \ + /* Use a consistent bad address. */ \ + __ Mov(x15, xzr); \ + __ Mov(ip0, xzr); \ + /* Load an amount of data to load. */ \ + __ Mov(ip1, 4096); \ + [[maybe_unused]] MemOperand bad_memory = MemOperand(ip0); \ + if (masm.GetCPUFeatures()->Has(CPUFeatures::kSVE)) { \ + /* Turn on all lanes to ensure all loads/stores are tested. */ \ + __ Ptrue(p0.VnB()); \ + __ Ptrue(p1.VnB()); \ + __ Ptrue(p2.VnB()); \ + __ Ptrue(p3.VnB()); \ + __ Ptrue(p4.VnB()); \ + __ Ptrue(p5.VnB()); \ + __ Ptrue(p6.VnB()); \ + __ Ptrue(p7.VnB()); \ + __ Ptrue(p8.VnB()); \ + __ Ptrue(p9.VnB()); \ + __ Ptrue(p10.VnB()); \ + __ Ptrue(p11.VnB()); \ + __ Ptrue(p12.VnB()); \ + __ Ptrue(p13.VnB()); \ + __ Ptrue(p14.VnB()); \ + __ Ptrue(p15.VnB()); \ + } \ + Label l_start, l_end; \ + __ Bind(&l_start); + +#define END_IMPLICIT_CHECK() \ + __ Bind(&l_end); \ + /* Return the counter. */ \ + __ Mov(x0, x1); \ + END(); + +#define TRY_RUN_IMPLICIT_CHECK() \ + bool skipped; \ + TRY_RUN(&skipped); \ + /* Implicit checks should only be used with the simulator. 
*/ \ + VIXL_ASSERT(!skipped); \ + /* Check that each load/store instruction generated a segfault that was */ \ + /* raised and dealt with. */ \ + size_t result = simulator.ReadXRegister(0); \ + size_t num_of_faulting_instr = masm.GetSizeOfCodeGeneratedSince(&l_start) - \ + masm.GetSizeOfCodeGeneratedSince(&l_end); \ + VIXL_CHECK((result * kInstructionSize) == num_of_faulting_instr); + +#endif // VIXL_ENABLE_IMPLICIT_CHECKS #else // VIXL_INCLUDE_SIMULATOR_AARCH64 @@ -2850,7 +2939,7 @@ static void TestOpImmOpImmNEON(const char* name, } } } - VIXL_ASSERT(counted_length == expected_length); + VIXL_CHECK(counted_length == expected_length); if (error_count > kErrorReportLimit) { printf("%u other errors follow.\n", error_count - kErrorReportLimit); } @@ -5012,6 +5101,802 @@ DEFINE_TEST_NEON_FHM_BYELEMENT(fmlsl, Basic, Basic, Basic) DEFINE_TEST_NEON_FHM_BYELEMENT(fmlsl2, Basic, Basic, Basic) +#ifdef VIXL_ENABLE_IMPLICIT_CHECKS +TEST(ImplicitCheck) { + SETUP_WITH_FEATURES(CPUFeatures::kNEON); + START_IMPLICIT_CHECK(); + + EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes()); + // Invalid memory reads. 
+ __ ldar(w3, bad_memory); + __ ldar(x4, bad_memory); + __ ldarb(w5, bad_memory); + __ ldarb(x6, bad_memory); + __ ldarh(w7, bad_memory); + __ ldarh(x8, bad_memory); + __ ldaxp(w9, w10, bad_memory); + __ ldaxp(x11, x12, bad_memory); + __ ldaxr(w13, bad_memory); + __ ldaxr(x14, bad_memory); + __ ldaxrb(w15, bad_memory); + __ ldaxrb(x16, bad_memory); + __ ldaxrh(w17, bad_memory); + __ ldaxrh(x18, bad_memory); + __ ldnp(w19, w20, bad_memory); + __ ldnp(x21, x22, bad_memory); + __ ldp(w23, w24, bad_memory); + __ ldp(x25, x26, bad_memory); + __ ldpsw(x27, x28, bad_memory); + __ ldr(w29, bad_memory); + __ ldr(x2, bad_memory); + __ ldrb(w3, bad_memory); + __ ldrb(x4, bad_memory); + __ ldrh(w5, bad_memory); + __ ldrh(x6, bad_memory); + __ ldrsb(w7, bad_memory); + __ ldrsb(x8, bad_memory); + __ ldrsh(w9, bad_memory); + __ ldrsh(x10, bad_memory); + __ ldrsw(x11, bad_memory); + __ ldur(w12, bad_memory); + __ ldur(x13, bad_memory); + __ ldurb(w14, bad_memory); + __ ldurb(x15, bad_memory); + __ ldurh(w16, bad_memory); + __ ldurh(x17, bad_memory); + __ ldursb(w18, bad_memory); + __ ldursb(x19, bad_memory); + __ ldursh(w20, bad_memory); + __ ldursh(x21, bad_memory); + __ ldursw(x22, bad_memory); + __ ldxp(w23, w24, bad_memory); + __ ldxp(x25, x26, bad_memory); + __ ldxr(w27, bad_memory); + __ ldxr(x28, bad_memory); + __ ldxrb(w29, bad_memory); + __ ldxrb(x2, bad_memory); + __ ldxrh(w3, bad_memory); + __ ldxrh(x4, bad_memory); + + // Invalid memory writes. Note: exclusive store instructions are not tested + // because they can fail due to the global monitor before trying to perform a + // memory store. 
+ __ stlr(w18, bad_memory); + __ stlr(x19, bad_memory); + __ stlrb(w20, bad_memory); + __ stlrb(x21, bad_memory); + __ stlrh(w22, bad_memory); + __ stlrh(x23, bad_memory); + __ stnp(w14, w15, bad_memory); + __ stnp(x16, x17, bad_memory); + __ stp(w18, w19, bad_memory); + __ stp(x20, x21, bad_memory); + __ str(w22, bad_memory); + __ str(x23, bad_memory); + __ strb(w24, bad_memory); + __ strb(x25, bad_memory); + __ strh(w26, bad_memory); + __ strh(x27, bad_memory); + __ stur(w28, bad_memory); + __ stur(x29, bad_memory); + __ sturb(w2, bad_memory); + __ sturb(x3, bad_memory); + __ sturh(w4, bad_memory); + __ sturh(x5, bad_memory); + + END_IMPLICIT_CHECK(); + TRY_RUN_IMPLICIT_CHECK(); +} + +TEST(ImplicitCheckNeon) { + SETUP_WITH_FEATURES(CPUFeatures::kNEON); + START_IMPLICIT_CHECK(); + + EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes()); + __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), bad_memory); + __ ld1(v23.V16B(), v24.V16B(), v25.V16B(), v26.V16B(), bad_memory); + __ ld1(v5.V16B(), v6.V16B(), v7.V16B(), v8.V16B(), bad_memory); + __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), bad_memory); + __ ld1(v13.V16B(), v14.V16B(), v15.V16B(), bad_memory); + __ ld1(v19.V16B(), v20.V16B(), v21.V16B(), bad_memory); + __ ld1(v17.V16B(), v18.V16B(), bad_memory); + __ ld1(v20.V16B(), v21.V16B(), bad_memory); + __ ld1(v28.V16B(), v29.V16B(), bad_memory); + __ ld1(v29.V16B(), bad_memory); + __ ld1(v21.V16B(), bad_memory); + __ ld1(v4.V16B(), bad_memory); + __ ld1(v4.V1D(), v5.V1D(), v6.V1D(), v7.V1D(), bad_memory); + __ ld1(v17.V1D(), v18.V1D(), v19.V1D(), v20.V1D(), bad_memory); + __ ld1(v28.V1D(), v29.V1D(), v30.V1D(), v31.V1D(), bad_memory); + __ ld1(v20.V1D(), v21.V1D(), v22.V1D(), bad_memory); + __ ld1(v19.V1D(), v20.V1D(), v21.V1D(), bad_memory); + __ ld1(v12.V1D(), v13.V1D(), v14.V1D(), bad_memory); + __ ld1(v29.V1D(), v30.V1D(), bad_memory); + __ ld1(v31.V1D(), v0.V1D(), bad_memory); + __ ld1(v3.V1D(), v4.V1D(), bad_memory); + __ ld1(v28.V1D(), 
bad_memory); + __ ld1(v11.V1D(), bad_memory); + __ ld1(v29.V1D(), bad_memory); + __ ld1(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), bad_memory); + __ ld1(v8.V2D(), v9.V2D(), v10.V2D(), v11.V2D(), bad_memory); + __ ld1(v14.V2D(), v15.V2D(), v16.V2D(), v17.V2D(), bad_memory); + __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), bad_memory); + __ ld1(v5.V2D(), v6.V2D(), v7.V2D(), bad_memory); + __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), bad_memory); + __ ld1(v18.V2D(), v19.V2D(), bad_memory); + __ ld1(v21.V2D(), v22.V2D(), bad_memory); + __ ld1(v17.V2D(), v18.V2D(), bad_memory); + __ ld1(v5.V2D(), bad_memory); + __ ld1(v6.V2D(), bad_memory); + __ ld1(v15.V2D(), bad_memory); + __ ld1(v30.V2S(), v31.V2S(), v0.V2S(), v1.V2S(), bad_memory); + __ ld1(v24.V2S(), v25.V2S(), v26.V2S(), v27.V2S(), bad_memory); + __ ld1(v27.V2S(), v28.V2S(), v29.V2S(), v30.V2S(), bad_memory); + __ ld1(v11.V2S(), v12.V2S(), v13.V2S(), bad_memory); + __ ld1(v8.V2S(), v9.V2S(), v10.V2S(), bad_memory); + __ ld1(v31.V2S(), v0.V2S(), v1.V2S(), bad_memory); + __ ld1(v0.V2S(), v1.V2S(), bad_memory); + __ ld1(v13.V2S(), v14.V2S(), bad_memory); + __ ld1(v3.V2S(), v4.V2S(), bad_memory); + __ ld1(v26.V2S(), bad_memory); + __ ld1(v0.V2S(), bad_memory); + __ ld1(v11.V2S(), bad_memory); + __ ld1(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), bad_memory); + __ ld1(v24.V4H(), v25.V4H(), v26.V4H(), v27.V4H(), bad_memory); + __ ld1(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), bad_memory); + __ ld1(v30.V4H(), v31.V4H(), v0.V4H(), bad_memory); + __ ld1(v25.V4H(), v26.V4H(), v27.V4H(), bad_memory); + __ ld1(v3.V4H(), v4.V4H(), v5.V4H(), bad_memory); + __ ld1(v3.V4H(), v4.V4H(), bad_memory); + __ ld1(v3.V4H(), v4.V4H(), bad_memory); + __ ld1(v23.V4H(), v24.V4H(), bad_memory); + __ ld1(v26.V4H(), bad_memory); + __ ld1(v1.V4H(), bad_memory); + __ ld1(v14.V4H(), bad_memory); + __ ld1(v26.V4S(), v27.V4S(), v28.V4S(), v29.V4S(), bad_memory); + __ ld1(v28.V4S(), v29.V4S(), v30.V4S(), v31.V4S(), bad_memory); + __ ld1(v4.V4S(), v5.V4S(), 
v6.V4S(), v7.V4S(), bad_memory); + __ ld1(v2.V4S(), v3.V4S(), v4.V4S(), bad_memory); + __ ld1(v22.V4S(), v23.V4S(), v24.V4S(), bad_memory); + __ ld1(v15.V4S(), v16.V4S(), v17.V4S(), bad_memory); + __ ld1(v20.V4S(), v21.V4S(), bad_memory); + __ ld1(v30.V4S(), v31.V4S(), bad_memory); + __ ld1(v11.V4S(), v12.V4S(), bad_memory); + __ ld1(v15.V4S(), bad_memory); + __ ld1(v12.V4S(), bad_memory); + __ ld1(v0.V4S(), bad_memory); + __ ld1(v17.V8B(), v18.V8B(), v19.V8B(), v20.V8B(), bad_memory); + __ ld1(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), bad_memory); + __ ld1(v9.V8B(), v10.V8B(), v11.V8B(), v12.V8B(), bad_memory); + __ ld1(v4.V8B(), v5.V8B(), v6.V8B(), bad_memory); + __ ld1(v2.V8B(), v3.V8B(), v4.V8B(), bad_memory); + __ ld1(v12.V8B(), v13.V8B(), v14.V8B(), bad_memory); + __ ld1(v10.V8B(), v11.V8B(), bad_memory); + __ ld1(v11.V8B(), v12.V8B(), bad_memory); + __ ld1(v27.V8B(), v28.V8B(), bad_memory); + __ ld1(v31.V8B(), bad_memory); + __ ld1(v10.V8B(), bad_memory); + __ ld1(v28.V8B(), bad_memory); + __ ld1(v5.V8H(), v6.V8H(), v7.V8H(), v8.V8H(), bad_memory); + __ ld1(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), bad_memory); + __ ld1(v10.V8H(), v11.V8H(), v12.V8H(), v13.V8H(), bad_memory); + __ ld1(v26.V8H(), v27.V8H(), v28.V8H(), bad_memory); + __ ld1(v3.V8H(), v4.V8H(), v5.V8H(), bad_memory); + __ ld1(v17.V8H(), v18.V8H(), v19.V8H(), bad_memory); + __ ld1(v4.V8H(), v5.V8H(), bad_memory); + __ ld1(v21.V8H(), v22.V8H(), bad_memory); + __ ld1(v4.V8H(), v5.V8H(), bad_memory); + __ ld1(v9.V8H(), bad_memory); + __ ld1(v27.V8H(), bad_memory); + __ ld1(v26.V8H(), bad_memory); + __ ld1(v19.B(), 1, bad_memory); + __ ld1(v12.B(), 3, bad_memory); + __ ld1(v27.B(), 12, bad_memory); + __ ld1(v10.D(), 1, bad_memory); + __ ld1(v26.D(), 1, bad_memory); + __ ld1(v7.D(), 1, bad_memory); + __ ld1(v19.H(), 5, bad_memory); + __ ld1(v10.H(), 1, bad_memory); + __ ld1(v5.H(), 4, bad_memory); + __ ld1(v21.S(), 2, bad_memory); + __ ld1(v13.S(), 2, bad_memory); + __ ld1(v1.S(), 2, bad_memory); + __ 
ld1r(v2.V16B(), bad_memory); + __ ld1r(v2.V16B(), bad_memory); + __ ld1r(v22.V16B(), bad_memory); + __ ld1r(v25.V1D(), bad_memory); + __ ld1r(v9.V1D(), bad_memory); + __ ld1r(v23.V1D(), bad_memory); + __ ld1r(v19.V2D(), bad_memory); + __ ld1r(v21.V2D(), bad_memory); + __ ld1r(v30.V2D(), bad_memory); + __ ld1r(v24.V2S(), bad_memory); + __ ld1r(v26.V2S(), bad_memory); + __ ld1r(v28.V2S(), bad_memory); + __ ld1r(v19.V4H(), bad_memory); + __ ld1r(v1.V4H(), bad_memory); + __ ld1r(v21.V4H(), bad_memory); + __ ld1r(v15.V4S(), bad_memory); + __ ld1r(v21.V4S(), bad_memory); + __ ld1r(v23.V4S(), bad_memory); + __ ld1r(v26.V8B(), bad_memory); + __ ld1r(v14.V8B(), bad_memory); + __ ld1r(v19.V8B(), bad_memory); + __ ld1r(v13.V8H(), bad_memory); + __ ld1r(v30.V8H(), bad_memory); + __ ld1r(v27.V8H(), bad_memory); + __ ld2(v21.V16B(), v22.V16B(), bad_memory); + __ ld2(v21.V16B(), v22.V16B(), bad_memory); + __ ld2(v12.V16B(), v13.V16B(), bad_memory); + __ ld2(v14.V2D(), v15.V2D(), bad_memory); + __ ld2(v0.V2D(), v1.V2D(), bad_memory); + __ ld2(v12.V2D(), v13.V2D(), bad_memory); + __ ld2(v27.V2S(), v28.V2S(), bad_memory); + __ ld2(v2.V2S(), v3.V2S(), bad_memory); + __ ld2(v12.V2S(), v13.V2S(), bad_memory); + __ ld2(v9.V4H(), v10.V4H(), bad_memory); + __ ld2(v23.V4H(), v24.V4H(), bad_memory); + __ ld2(v1.V4H(), v2.V4H(), bad_memory); + __ ld2(v20.V4S(), v21.V4S(), bad_memory); + __ ld2(v10.V4S(), v11.V4S(), bad_memory); + __ ld2(v24.V4S(), v25.V4S(), bad_memory); + __ ld2(v17.V8B(), v18.V8B(), bad_memory); + __ ld2(v13.V8B(), v14.V8B(), bad_memory); + __ ld2(v7.V8B(), v8.V8B(), bad_memory); + __ ld2(v30.V8H(), v31.V8H(), bad_memory); + __ ld2(v4.V8H(), v5.V8H(), bad_memory); + __ ld2(v13.V8H(), v14.V8H(), bad_memory); + __ ld2(v5.B(), v6.B(), 12, bad_memory); + __ ld2(v16.B(), v17.B(), 7, bad_memory); + __ ld2(v29.B(), v30.B(), 2, bad_memory); + __ ld2(v11.D(), v12.D(), 1, bad_memory); + __ ld2(v26.D(), v27.D(), 0, bad_memory); + __ ld2(v25.D(), v26.D(), 0, bad_memory); + __ 
ld2(v18.H(), v19.H(), 7, bad_memory); + __ ld2(v17.H(), v18.H(), 5, bad_memory); + __ ld2(v30.H(), v31.H(), 2, bad_memory); + __ ld2(v29.S(), v30.S(), 3, bad_memory); + __ ld2(v28.S(), v29.S(), 0, bad_memory); + __ ld2(v6.S(), v7.S(), 1, bad_memory); + __ ld2r(v26.V16B(), v27.V16B(), bad_memory); + __ ld2r(v21.V16B(), v22.V16B(), bad_memory); + __ ld2r(v5.V16B(), v6.V16B(), bad_memory); + __ ld2r(v26.V1D(), v27.V1D(), bad_memory); + __ ld2r(v14.V1D(), v15.V1D(), bad_memory); + __ ld2r(v23.V1D(), v24.V1D(), bad_memory); + __ ld2r(v11.V2D(), v12.V2D(), bad_memory); + __ ld2r(v29.V2D(), v30.V2D(), bad_memory); + __ ld2r(v15.V2D(), v16.V2D(), bad_memory); + __ ld2r(v26.V2S(), v27.V2S(), bad_memory); + __ ld2r(v22.V2S(), v23.V2S(), bad_memory); + __ ld2r(v2.V2S(), v3.V2S(), bad_memory); + __ ld2r(v2.V4H(), v3.V4H(), bad_memory); + __ ld2r(v9.V4H(), v10.V4H(), bad_memory); + __ ld2r(v6.V4H(), v7.V4H(), bad_memory); + __ ld2r(v7.V4S(), v8.V4S(), bad_memory); + __ ld2r(v19.V4S(), v20.V4S(), bad_memory); + __ ld2r(v21.V4S(), v22.V4S(), bad_memory); + __ ld2r(v26.V8B(), v27.V8B(), bad_memory); + __ ld2r(v20.V8B(), v21.V8B(), bad_memory); + __ ld2r(v11.V8B(), v12.V8B(), bad_memory); + __ ld2r(v12.V8H(), v13.V8H(), bad_memory); + __ ld2r(v6.V8H(), v7.V8H(), bad_memory); + __ ld2r(v25.V8H(), v26.V8H(), bad_memory); + __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), bad_memory); + __ ld3(v28.V16B(), v29.V16B(), v30.V16B(), bad_memory); + __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), bad_memory); + __ ld3(v21.V2D(), v22.V2D(), v23.V2D(), bad_memory); + __ ld3(v18.V2D(), v19.V2D(), v20.V2D(), bad_memory); + __ ld3(v27.V2D(), v28.V2D(), v29.V2D(), bad_memory); + __ ld3(v7.V2S(), v8.V2S(), v9.V2S(), bad_memory); + __ ld3(v20.V2S(), v21.V2S(), v22.V2S(), bad_memory); + __ ld3(v26.V2S(), v27.V2S(), v28.V2S(), bad_memory); + __ ld3(v27.V4H(), v28.V4H(), v29.V4H(), bad_memory); + __ ld3(v28.V4H(), v29.V4H(), v30.V4H(), bad_memory); + __ ld3(v7.V4H(), v8.V4H(), v9.V4H(), bad_memory); + __ 
ld3(v2.V4S(), v3.V4S(), v4.V4S(), bad_memory); + __ ld3(v24.V4S(), v25.V4S(), v26.V4S(), bad_memory); + __ ld3(v11.V4S(), v12.V4S(), v13.V4S(), bad_memory); + __ ld3(v29.V8B(), v30.V8B(), v31.V8B(), bad_memory); + __ ld3(v1.V8B(), v2.V8B(), v3.V8B(), bad_memory); + __ ld3(v12.V8B(), v13.V8B(), v14.V8B(), bad_memory); + __ ld3(v22.V8H(), v23.V8H(), v24.V8H(), bad_memory); + __ ld3(v13.V8H(), v14.V8H(), v15.V8H(), bad_memory); + __ ld3(v28.V8H(), v29.V8H(), v30.V8H(), bad_memory); + __ ld3(v21.B(), v22.B(), v23.B(), 11, bad_memory); + __ ld3(v5.B(), v6.B(), v7.B(), 9, bad_memory); + __ ld3(v23.B(), v24.B(), v25.B(), 0, bad_memory); + __ ld3(v16.D(), v17.D(), v18.D(), 0, bad_memory); + __ ld3(v30.D(), v31.D(), v0.D(), 0, bad_memory); + __ ld3(v28.D(), v29.D(), v30.D(), 1, bad_memory); + __ ld3(v13.H(), v14.H(), v15.H(), 2, bad_memory); + __ ld3(v22.H(), v23.H(), v24.H(), 7, bad_memory); + __ ld3(v14.H(), v15.H(), v16.H(), 3, bad_memory); + __ ld3(v22.S(), v23.S(), v24.S(), 3, bad_memory); + __ ld3(v30.S(), v31.S(), v0.S(), 2, bad_memory); + __ ld3(v12.S(), v13.S(), v14.S(), 1, bad_memory); + __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), bad_memory); + __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), bad_memory); + __ ld3r(v3.V16B(), v4.V16B(), v5.V16B(), bad_memory); + __ ld3r(v4.V1D(), v5.V1D(), v6.V1D(), bad_memory); + __ ld3r(v7.V1D(), v8.V1D(), v9.V1D(), bad_memory); + __ ld3r(v17.V1D(), v18.V1D(), v19.V1D(), bad_memory); + __ ld3r(v16.V2D(), v17.V2D(), v18.V2D(), bad_memory); + __ ld3r(v20.V2D(), v21.V2D(), v22.V2D(), bad_memory); + __ ld3r(v14.V2D(), v15.V2D(), v16.V2D(), bad_memory); + __ ld3r(v10.V2S(), v11.V2S(), v12.V2S(), bad_memory); + __ ld3r(v0.V2S(), v1.V2S(), v2.V2S(), bad_memory); + __ ld3r(v23.V2S(), v24.V2S(), v25.V2S(), bad_memory); + __ ld3r(v22.V4H(), v23.V4H(), v24.V4H(), bad_memory); + __ ld3r(v6.V4H(), v7.V4H(), v8.V4H(), bad_memory); + __ ld3r(v7.V4H(), v8.V4H(), v9.V4H(), bad_memory); + __ ld3r(v26.V4S(), v27.V4S(), v28.V4S(), bad_memory); + __ 
ld3r(v0.V4S(), v1.V4S(), v2.V4S(), bad_memory); + __ ld3r(v30.V4S(), v31.V4S(), v0.V4S(), bad_memory); + __ ld3r(v2.V8B(), v3.V8B(), v4.V8B(), bad_memory); + __ ld3r(v10.V8B(), v11.V8B(), v12.V8B(), bad_memory); + __ ld3r(v28.V8B(), v29.V8B(), v30.V8B(), bad_memory); + __ ld3r(v6.V8H(), v7.V8H(), v8.V8H(), bad_memory); + __ ld3r(v29.V8H(), v30.V8H(), v31.V8H(), bad_memory); + __ ld3r(v7.V8H(), v8.V8H(), v9.V8H(), bad_memory); + __ ld4(v3.V16B(), v4.V16B(), v5.V16B(), v6.V16B(), bad_memory); + __ ld4(v2.V16B(), v3.V16B(), v4.V16B(), v5.V16B(), bad_memory); + __ ld4(v5.V16B(), v6.V16B(), v7.V16B(), v8.V16B(), bad_memory); + __ ld4(v18.V2D(), v19.V2D(), v20.V2D(), v21.V2D(), bad_memory); + __ ld4(v4.V2D(), v5.V2D(), v6.V2D(), v7.V2D(), bad_memory); + __ ld4(v29.V2D(), v30.V2D(), v31.V2D(), v0.V2D(), bad_memory); + __ ld4(v27.V2S(), v28.V2S(), v29.V2S(), v30.V2S(), bad_memory); + __ ld4(v24.V2S(), v25.V2S(), v26.V2S(), v27.V2S(), bad_memory); + __ ld4(v4.V2S(), v5.V2S(), v6.V2S(), v7.V2S(), bad_memory); + __ ld4(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), bad_memory); + __ ld4(v23.V4H(), v24.V4H(), v25.V4H(), v26.V4H(), bad_memory); + __ ld4(v2.V4H(), v3.V4H(), v4.V4H(), v5.V4H(), bad_memory); + __ ld4(v7.V4S(), v8.V4S(), v9.V4S(), v10.V4S(), bad_memory); + __ ld4(v28.V4S(), v29.V4S(), v30.V4S(), v31.V4S(), bad_memory); + __ ld4(v29.V4S(), v30.V4S(), v31.V4S(), v0.V4S(), bad_memory); + __ ld4(v15.V8B(), v16.V8B(), v17.V8B(), v18.V8B(), bad_memory); + __ ld4(v27.V8B(), v28.V8B(), v29.V8B(), v30.V8B(), bad_memory); + __ ld4(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), bad_memory); + __ ld4(v25.V8H(), v26.V8H(), v27.V8H(), v28.V8H(), bad_memory); + __ ld4(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), bad_memory); + __ ld4(v20.V8H(), v21.V8H(), v22.V8H(), v23.V8H(), bad_memory); + __ ld4(v20.B(), v21.B(), v22.B(), v23.B(), 3, bad_memory); + __ ld4(v12.B(), v13.B(), v14.B(), v15.B(), 3, bad_memory); + __ ld4(v27.B(), v28.B(), v29.B(), v30.B(), 6, bad_memory); + __ ld4(v28.D(), 
v29.D(), v30.D(), v31.D(), 1, bad_memory); + __ ld4(v15.D(), v16.D(), v17.D(), v18.D(), 1, bad_memory); + __ ld4(v16.D(), v17.D(), v18.D(), v19.D(), 1, bad_memory); + __ ld4(v2.H(), v3.H(), v4.H(), v5.H(), 6, bad_memory); + __ ld4(v5.H(), v6.H(), v7.H(), v8.H(), 3, bad_memory); + __ ld4(v7.H(), v8.H(), v9.H(), v10.H(), 6, bad_memory); + __ ld4(v6.S(), v7.S(), v8.S(), v9.S(), 1, bad_memory); + __ ld4(v25.S(), v26.S(), v27.S(), v28.S(), 2, bad_memory); + __ ld4(v8.S(), v9.S(), v10.S(), v11.S(), 3, bad_memory); + __ ld4r(v14.V16B(), v15.V16B(), v16.V16B(), v17.V16B(), bad_memory); + __ ld4r(v13.V16B(), v14.V16B(), v15.V16B(), v16.V16B(), bad_memory); + __ ld4r(v9.V16B(), v10.V16B(), v11.V16B(), v12.V16B(), bad_memory); + __ ld4r(v8.V1D(), v9.V1D(), v10.V1D(), v11.V1D(), bad_memory); + __ ld4r(v4.V1D(), v5.V1D(), v6.V1D(), v7.V1D(), bad_memory); + __ ld4r(v26.V1D(), v27.V1D(), v28.V1D(), v29.V1D(), bad_memory); + __ ld4r(v19.V2D(), v20.V2D(), v21.V2D(), v22.V2D(), bad_memory); + __ ld4r(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), bad_memory); + __ ld4r(v15.V2D(), v16.V2D(), v17.V2D(), v18.V2D(), bad_memory); + __ ld4r(v31.V2S(), v0.V2S(), v1.V2S(), v2.V2S(), bad_memory); + __ ld4r(v28.V2S(), v29.V2S(), v30.V2S(), v31.V2S(), bad_memory); + __ ld4r(v11.V2S(), v12.V2S(), v13.V2S(), v14.V2S(), bad_memory); + __ ld4r(v19.V4H(), v20.V4H(), v21.V4H(), v22.V4H(), bad_memory); + __ ld4r(v22.V4H(), v23.V4H(), v24.V4H(), v25.V4H(), bad_memory); + __ ld4r(v20.V4H(), v21.V4H(), v22.V4H(), v23.V4H(), bad_memory); + __ ld4r(v16.V4S(), v17.V4S(), v18.V4S(), v19.V4S(), bad_memory); + __ ld4r(v25.V4S(), v26.V4S(), v27.V4S(), v28.V4S(), bad_memory); + __ ld4r(v23.V4S(), v24.V4S(), v25.V4S(), v26.V4S(), bad_memory); + __ ld4r(v22.V8B(), v23.V8B(), v24.V8B(), v25.V8B(), bad_memory); + __ ld4r(v27.V8B(), v28.V8B(), v29.V8B(), v30.V8B(), bad_memory); + __ ld4r(v29.V8B(), v30.V8B(), v31.V8B(), v0.V8B(), bad_memory); + __ ld4r(v28.V8H(), v29.V8H(), v30.V8H(), v31.V8H(), bad_memory); + __ 
ld4r(v25.V8H(), v26.V8H(), v27.V8H(), v28.V8H(), bad_memory); + __ ld4r(v22.V8H(), v23.V8H(), v24.V8H(), v25.V8H(), bad_memory); + + __ st1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), bad_memory); + __ st1(v10.V16B(), v11.V16B(), v12.V16B(), v13.V16B(), bad_memory); + __ st1(v27.V16B(), v28.V16B(), v29.V16B(), v30.V16B(), bad_memory); + __ st1(v16.V16B(), v17.V16B(), v18.V16B(), bad_memory); + __ st1(v21.V16B(), v22.V16B(), v23.V16B(), bad_memory); + __ st1(v9.V16B(), v10.V16B(), v11.V16B(), bad_memory); + __ st1(v7.V16B(), v8.V16B(), bad_memory); + __ st1(v26.V16B(), v27.V16B(), bad_memory); + __ st1(v22.V16B(), v23.V16B(), bad_memory); + __ st1(v23.V16B(), bad_memory); + __ st1(v28.V16B(), bad_memory); + __ st1(v2.V16B(), bad_memory); + __ st1(v29.V1D(), v30.V1D(), v31.V1D(), v0.V1D(), bad_memory); + __ st1(v12.V1D(), v13.V1D(), v14.V1D(), v15.V1D(), bad_memory); + __ st1(v30.V1D(), v31.V1D(), v0.V1D(), v1.V1D(), bad_memory); + __ st1(v16.V1D(), v17.V1D(), v18.V1D(), bad_memory); + __ st1(v3.V1D(), v4.V1D(), v5.V1D(), bad_memory); + __ st1(v14.V1D(), v15.V1D(), v16.V1D(), bad_memory); + __ st1(v18.V1D(), v19.V1D(), bad_memory); + __ st1(v5.V1D(), v6.V1D(), bad_memory); + __ st1(v2.V1D(), v3.V1D(), bad_memory); + __ st1(v4.V1D(), bad_memory); + __ st1(v27.V1D(), bad_memory); + __ st1(v23.V1D(), bad_memory); + __ st1(v2.V2D(), v3.V2D(), v4.V2D(), v5.V2D(), bad_memory); + __ st1(v22.V2D(), v23.V2D(), v24.V2D(), v25.V2D(), bad_memory); + __ st1(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), bad_memory); + __ st1(v17.V2D(), v18.V2D(), v19.V2D(), bad_memory); + __ st1(v16.V2D(), v17.V2D(), v18.V2D(), bad_memory); + __ st1(v22.V2D(), v23.V2D(), v24.V2D(), bad_memory); + __ st1(v21.V2D(), v22.V2D(), bad_memory); + __ st1(v6.V2D(), v7.V2D(), bad_memory); + __ st1(v27.V2D(), v28.V2D(), bad_memory); + __ st1(v21.V2D(), bad_memory); + __ st1(v29.V2D(), bad_memory); + __ st1(v20.V2D(), bad_memory); + __ st1(v22.V2S(), v23.V2S(), v24.V2S(), v25.V2S(), bad_memory); + __ 
st1(v8.V2S(), v9.V2S(), v10.V2S(), v11.V2S(), bad_memory); + __ st1(v15.V2S(), v16.V2S(), v17.V2S(), v18.V2S(), bad_memory); + __ st1(v2.V2S(), v3.V2S(), v4.V2S(), bad_memory); + __ st1(v23.V2S(), v24.V2S(), v25.V2S(), bad_memory); + __ st1(v7.V2S(), v8.V2S(), v9.V2S(), bad_memory); + __ st1(v28.V2S(), v29.V2S(), bad_memory); + __ st1(v29.V2S(), v30.V2S(), bad_memory); + __ st1(v23.V2S(), v24.V2S(), bad_memory); + __ st1(v6.V2S(), bad_memory); + __ st1(v11.V2S(), bad_memory); + __ st1(v17.V2S(), bad_memory); + __ st1(v6.V4H(), v7.V4H(), v8.V4H(), v9.V4H(), bad_memory); + __ st1(v9.V4H(), v10.V4H(), v11.V4H(), v12.V4H(), bad_memory); + __ st1(v25.V4H(), v26.V4H(), v27.V4H(), v28.V4H(), bad_memory); + __ st1(v11.V4H(), v12.V4H(), v13.V4H(), bad_memory); + __ st1(v10.V4H(), v11.V4H(), v12.V4H(), bad_memory); + __ st1(v12.V4H(), v13.V4H(), v14.V4H(), bad_memory); + __ st1(v13.V4H(), v14.V4H(), bad_memory); + __ st1(v15.V4H(), v16.V4H(), bad_memory); + __ st1(v21.V4H(), v22.V4H(), bad_memory); + __ st1(v16.V4H(), bad_memory); + __ st1(v8.V4H(), bad_memory); + __ st1(v30.V4H(), bad_memory); + __ st1(v3.V4S(), v4.V4S(), v5.V4S(), v6.V4S(), bad_memory); + __ st1(v25.V4S(), v26.V4S(), v27.V4S(), v28.V4S(), bad_memory); + __ st1(v5.V4S(), v6.V4S(), v7.V4S(), v8.V4S(), bad_memory); + __ st1(v31.V4S(), v0.V4S(), v1.V4S(), bad_memory); + __ st1(v30.V4S(), v31.V4S(), v0.V4S(), bad_memory); + __ st1(v6.V4S(), v7.V4S(), v8.V4S(), bad_memory); + __ st1(v17.V4S(), v18.V4S(), bad_memory); + __ st1(v31.V4S(), v0.V4S(), bad_memory); + __ st1(v1.V4S(), v2.V4S(), bad_memory); + __ st1(v26.V4S(), bad_memory); + __ st1(v15.V4S(), bad_memory); + __ st1(v13.V4S(), bad_memory); + __ st1(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), bad_memory); + __ st1(v10.V8B(), v11.V8B(), v12.V8B(), v13.V8B(), bad_memory); + __ st1(v15.V8B(), v16.V8B(), v17.V8B(), v18.V8B(), bad_memory); + __ st1(v19.V8B(), v20.V8B(), v21.V8B(), bad_memory); + __ st1(v31.V8B(), v0.V8B(), v1.V8B(), bad_memory); + __ 
st1(v9.V8B(), v10.V8B(), v11.V8B(), bad_memory); + __ st1(v12.V8B(), v13.V8B(), bad_memory); + __ st1(v2.V8B(), v3.V8B(), bad_memory); + __ st1(v0.V8B(), v1.V8B(), bad_memory); + __ st1(v16.V8B(), bad_memory); + __ st1(v25.V8B(), bad_memory); + __ st1(v31.V8B(), bad_memory); + __ st1(v4.V8H(), v5.V8H(), v6.V8H(), v7.V8H(), bad_memory); + __ st1(v3.V8H(), v4.V8H(), v5.V8H(), v6.V8H(), bad_memory); + __ st1(v26.V8H(), v27.V8H(), v28.V8H(), v29.V8H(), bad_memory); + __ st1(v10.V8H(), v11.V8H(), v12.V8H(), bad_memory); + __ st1(v21.V8H(), v22.V8H(), v23.V8H(), bad_memory); + __ st1(v18.V8H(), v19.V8H(), v20.V8H(), bad_memory); + __ st1(v26.V8H(), v27.V8H(), bad_memory); + __ st1(v24.V8H(), v25.V8H(), bad_memory); + __ st1(v17.V8H(), v18.V8H(), bad_memory); + __ st1(v29.V8H(), bad_memory); + __ st1(v19.V8H(), bad_memory); + __ st1(v23.V8H(), bad_memory); + __ st1(v19.B(), 15, bad_memory); + __ st1(v25.B(), 9, bad_memory); + __ st1(v4.B(), 8, bad_memory); + __ st1(v13.D(), 0, bad_memory); + __ st1(v30.D(), 0, bad_memory); + __ st1(v3.D(), 0, bad_memory); + __ st1(v22.H(), 0, bad_memory); + __ st1(v31.H(), 7, bad_memory); + __ st1(v23.H(), 3, bad_memory); + __ st1(v0.S(), 0, bad_memory); + __ st1(v11.S(), 3, bad_memory); + __ st1(v24.S(), 3, bad_memory); + __ st2(v7.V16B(), v8.V16B(), bad_memory); + __ st2(v5.V16B(), v6.V16B(), bad_memory); + __ st2(v18.V16B(), v19.V16B(), bad_memory); + __ st2(v14.V2D(), v15.V2D(), bad_memory); + __ st2(v7.V2D(), v8.V2D(), bad_memory); + __ st2(v24.V2D(), v25.V2D(), bad_memory); + __ st2(v22.V2S(), v23.V2S(), bad_memory); + __ st2(v4.V2S(), v5.V2S(), bad_memory); + __ st2(v2.V2S(), v3.V2S(), bad_memory); + __ st2(v23.V4H(), v24.V4H(), bad_memory); + __ st2(v8.V4H(), v9.V4H(), bad_memory); + __ st2(v7.V4H(), v8.V4H(), bad_memory); + __ st2(v17.V4S(), v18.V4S(), bad_memory); + __ st2(v6.V4S(), v7.V4S(), bad_memory); + __ st2(v26.V4S(), v27.V4S(), bad_memory); + __ st2(v31.V8B(), v0.V8B(), bad_memory); + __ st2(v0.V8B(), v1.V8B(), 
bad_memory); + __ st2(v21.V8B(), v22.V8B(), bad_memory); + __ st2(v7.V8H(), v8.V8H(), bad_memory); + __ st2(v22.V8H(), v23.V8H(), bad_memory); + __ st2(v4.V8H(), v5.V8H(), bad_memory); + __ st2(v8.B(), v9.B(), 15, bad_memory); + __ st2(v8.B(), v9.B(), 15, bad_memory); + __ st2(v7.B(), v8.B(), 4, bad_memory); + __ st2(v25.D(), v26.D(), 0, bad_memory); + __ st2(v17.D(), v18.D(), 1, bad_memory); + __ st2(v3.D(), v4.D(), 1, bad_memory); + __ st2(v4.H(), v5.H(), 3, bad_memory); + __ st2(v0.H(), v1.H(), 5, bad_memory); + __ st2(v22.H(), v23.H(), 2, bad_memory); + __ st2(v14.S(), v15.S(), 3, bad_memory); + __ st2(v23.S(), v24.S(), 3, bad_memory); + __ st2(v0.S(), v1.S(), 2, bad_memory); + __ st3(v26.V16B(), v27.V16B(), v28.V16B(), bad_memory); + __ st3(v21.V16B(), v22.V16B(), v23.V16B(), bad_memory); + __ st3(v24.V16B(), v25.V16B(), v26.V16B(), bad_memory); + __ st3(v17.V2D(), v18.V2D(), v19.V2D(), bad_memory); + __ st3(v23.V2D(), v24.V2D(), v25.V2D(), bad_memory); + __ st3(v10.V2D(), v11.V2D(), v12.V2D(), bad_memory); + __ st3(v9.V2S(), v10.V2S(), v11.V2S(), bad_memory); + __ st3(v13.V2S(), v14.V2S(), v15.V2S(), bad_memory); + __ st3(v22.V2S(), v23.V2S(), v24.V2S(), bad_memory); + __ st3(v31.V4H(), v0.V4H(), v1.V4H(), bad_memory); + __ st3(v8.V4H(), v9.V4H(), v10.V4H(), bad_memory); + __ st3(v19.V4H(), v20.V4H(), v21.V4H(), bad_memory); + __ st3(v18.V4S(), v19.V4S(), v20.V4S(), bad_memory); + __ st3(v25.V4S(), v26.V4S(), v27.V4S(), bad_memory); + __ st3(v16.V4S(), v17.V4S(), v18.V4S(), bad_memory); + __ st3(v27.V8B(), v28.V8B(), v29.V8B(), bad_memory); + __ st3(v29.V8B(), v30.V8B(), v31.V8B(), bad_memory); + __ st3(v30.V8B(), v31.V8B(), v0.V8B(), bad_memory); + __ st3(v8.V8H(), v9.V8H(), v10.V8H(), bad_memory); + __ st3(v18.V8H(), v19.V8H(), v20.V8H(), bad_memory); + __ st3(v18.V8H(), v19.V8H(), v20.V8H(), bad_memory); + __ st3(v31.B(), v0.B(), v1.B(), 10, bad_memory); + __ st3(v4.B(), v5.B(), v6.B(), 5, bad_memory); + __ st3(v5.B(), v6.B(), v7.B(), 1, bad_memory); + __ 
st3(v5.D(), v6.D(), v7.D(), 0, bad_memory); + __ st3(v6.D(), v7.D(), v8.D(), 0, bad_memory); + __ st3(v0.D(), v1.D(), v2.D(), 0, bad_memory); + __ st3(v31.H(), v0.H(), v1.H(), 2, bad_memory); + __ st3(v14.H(), v15.H(), v16.H(), 5, bad_memory); + __ st3(v21.H(), v22.H(), v23.H(), 6, bad_memory); + __ st3(v21.S(), v22.S(), v23.S(), 0, bad_memory); + __ st3(v11.S(), v12.S(), v13.S(), 1, bad_memory); + __ st3(v15.S(), v16.S(), v17.S(), 0, bad_memory); + __ st4(v22.V16B(), v23.V16B(), v24.V16B(), v25.V16B(), bad_memory); + __ st4(v24.V16B(), v25.V16B(), v26.V16B(), v27.V16B(), bad_memory); + __ st4(v15.V16B(), v16.V16B(), v17.V16B(), v18.V16B(), bad_memory); + __ st4(v16.V2D(), v17.V2D(), v18.V2D(), v19.V2D(), bad_memory); + __ st4(v17.V2D(), v18.V2D(), v19.V2D(), v20.V2D(), bad_memory); + __ st4(v9.V2D(), v10.V2D(), v11.V2D(), v12.V2D(), bad_memory); + __ st4(v23.V2S(), v24.V2S(), v25.V2S(), v26.V2S(), bad_memory); + __ st4(v15.V2S(), v16.V2S(), v17.V2S(), v18.V2S(), bad_memory); + __ st4(v24.V2S(), v25.V2S(), v26.V2S(), v27.V2S(), bad_memory); + __ st4(v14.V4H(), v15.V4H(), v16.V4H(), v17.V4H(), bad_memory); + __ st4(v18.V4H(), v19.V4H(), v20.V4H(), v21.V4H(), bad_memory); + __ st4(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), bad_memory); + __ st4(v13.V4S(), v14.V4S(), v15.V4S(), v16.V4S(), bad_memory); + __ st4(v6.V4S(), v7.V4S(), v8.V4S(), v9.V4S(), bad_memory); + __ st4(v15.V4S(), v16.V4S(), v17.V4S(), v18.V4S(), bad_memory); + __ st4(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), bad_memory); + __ st4(v25.V8B(), v26.V8B(), v27.V8B(), v28.V8B(), bad_memory); + __ st4(v19.V8B(), v20.V8B(), v21.V8B(), v22.V8B(), bad_memory); + __ st4(v19.V8H(), v20.V8H(), v21.V8H(), v22.V8H(), bad_memory); + __ st4(v15.V8H(), v16.V8H(), v17.V8H(), v18.V8H(), bad_memory); + __ st4(v31.V8H(), v0.V8H(), v1.V8H(), v2.V8H(), bad_memory); + __ st4(v0.B(), v1.B(), v2.B(), v3.B(), 13, bad_memory); + __ st4(v4.B(), v5.B(), v6.B(), v7.B(), 10, bad_memory); + __ st4(v9.B(), v10.B(), v11.B(), v12.B(), 
9, bad_memory); + __ st4(v2.D(), v3.D(), v4.D(), v5.D(), 1, bad_memory); + __ st4(v7.D(), v8.D(), v9.D(), v10.D(), 0, bad_memory); + __ st4(v31.D(), v0.D(), v1.D(), v2.D(), 1, bad_memory); + __ st4(v2.H(), v3.H(), v4.H(), v5.H(), 1, bad_memory); + __ st4(v27.H(), v28.H(), v29.H(), v30.H(), 3, bad_memory); + __ st4(v24.H(), v25.H(), v26.H(), v27.H(), 4, bad_memory); + __ st4(v18.S(), v19.S(), v20.S(), v21.S(), 2, bad_memory); + __ st4(v6.S(), v7.S(), v8.S(), v9.S(), 2, bad_memory); + __ st4(v25.S(), v26.S(), v27.S(), v28.S(), 1, bad_memory); + + END_IMPLICIT_CHECK(); + TRY_RUN_IMPLICIT_CHECK(); +} + +TEST(ImplicitCheckSve) { + SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kSVE2, + CPUFeatures::kNEON); + START_IMPLICIT_CHECK(); + + SVEMemOperand bad_sve_memory = SVEMemOperand(ip0); + + EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes()); + // Simple, unpredicated loads and stores. + __ Str(p12.VnD(), bad_sve_memory); + __ Str(p13.VnS(), bad_sve_memory); + __ Str(p14.VnH(), bad_sve_memory); + __ Str(p15.VnB(), bad_sve_memory); + __ Ldr(p8.VnD(), bad_sve_memory); + __ Ldr(p9.VnS(), bad_sve_memory); + __ Ldr(p10.VnH(), bad_sve_memory); + __ Ldr(p11.VnB(), bad_sve_memory); + + __ Str(z0.VnD(), bad_sve_memory); + __ Str(z1.VnS(), bad_sve_memory); + __ Str(z2.VnH(), bad_sve_memory); + __ Str(z3.VnB(), bad_sve_memory); + __ Ldr(z20.VnD(), bad_sve_memory); + __ Ldr(z21.VnS(), bad_sve_memory); + __ Ldr(z22.VnH(), bad_sve_memory); + __ Ldr(z23.VnB(), bad_sve_memory); + + // Structured accesses. + __ St1b(z0.VnB(), p2, bad_sve_memory); + __ St1h(z1.VnH(), p1, bad_sve_memory); + __ St1w(z2.VnS(), p1, bad_sve_memory); + __ St1d(z3.VnD(), p2, bad_sve_memory); + __ Ld1b(z20.VnB(), p1.Zeroing(), bad_sve_memory); + __ Ld1h(z21.VnH(), p2.Zeroing(), bad_sve_memory); + __ Ld1w(z22.VnS(), p1.Zeroing(), bad_sve_memory); + __ Ld1d(z23.VnD(), p1.Zeroing(), bad_sve_memory); + + // Structured, packed accesses. 
+ __ St1b(z2.VnH(), p1, bad_sve_memory); + __ St1b(z3.VnS(), p2, bad_sve_memory); + __ St1b(z4.VnD(), p2, bad_sve_memory); + __ St1h(z0.VnS(), p1, bad_sve_memory); + __ St1h(z1.VnD(), p1, bad_sve_memory); + __ St1w(z2.VnD(), p1, bad_sve_memory); + __ Ld1b(z20.VnH(), p1.Zeroing(), bad_sve_memory); + __ Ld1b(z21.VnS(), p1.Zeroing(), bad_sve_memory); + __ Ld1b(z22.VnD(), p1.Zeroing(), bad_sve_memory); + __ Ld1h(z23.VnS(), p2.Zeroing(), bad_sve_memory); + __ Ld1h(z24.VnD(), p2.Zeroing(), bad_sve_memory); + __ Ld1w(z20.VnD(), p1.Zeroing(), bad_sve_memory); + __ Ld1sb(z21.VnH(), p1.Zeroing(), bad_sve_memory); + __ Ld1sb(z22.VnS(), p1.Zeroing(), bad_sve_memory); + __ Ld1sb(z23.VnD(), p2.Zeroing(), bad_sve_memory); + __ Ld1sh(z24.VnS(), p2.Zeroing(), bad_sve_memory); + __ Ld1sh(z20.VnD(), p1.Zeroing(), bad_sve_memory); + __ Ld1sw(z21.VnD(), p1.Zeroing(), bad_sve_memory); + + // Structured, interleaved accesses. + __ St2b(z0.VnB(), z1.VnB(), p4, bad_sve_memory); + __ St2h(z1.VnH(), z2.VnH(), p4, bad_sve_memory); + __ St2w(z2.VnS(), z3.VnS(), p3, bad_sve_memory); + __ St2d(z3.VnD(), z4.VnD(), p4, bad_sve_memory); + __ Ld2b(z20.VnB(), z21.VnB(), p5.Zeroing(), bad_sve_memory); + __ Ld2h(z21.VnH(), z22.VnH(), p6.Zeroing(), bad_sve_memory); + __ Ld2w(z22.VnS(), z23.VnS(), p6.Zeroing(), bad_sve_memory); + __ Ld2d(z23.VnD(), z24.VnD(), p5.Zeroing(), bad_sve_memory); + + __ St3b(z4.VnB(), z5.VnB(), z6.VnB(), p4, bad_sve_memory); + __ St3h(z5.VnH(), z6.VnH(), z7.VnH(), p4, bad_sve_memory); + __ St3w(z6.VnS(), z7.VnS(), z8.VnS(), p3, bad_sve_memory); + __ St3d(z7.VnD(), z8.VnD(), z9.VnD(), p4, bad_sve_memory); + __ Ld3b(z24.VnB(), z25.VnB(), z26.VnB(), p5.Zeroing(), bad_sve_memory); + __ Ld3h(z25.VnH(), z26.VnH(), z27.VnH(), p6.Zeroing(), bad_sve_memory); + __ Ld3w(z26.VnS(), z27.VnS(), z28.VnS(), p6.Zeroing(), bad_sve_memory); + __ Ld3d(z27.VnD(), z28.VnD(), z29.VnD(), p5.Zeroing(), bad_sve_memory); + + __ St4b(z31.VnB(), z0.VnB(), z1.VnB(), z2.VnB(), p4, bad_sve_memory); + __ 
St4h(z0.VnH(), z1.VnH(), z2.VnH(), z3.VnH(), p4, bad_sve_memory); + __ St4w(z1.VnS(), z2.VnS(), z3.VnS(), z4.VnS(), p3, bad_sve_memory); + __ St4d(z2.VnD(), z3.VnD(), z4.VnD(), z5.VnD(), p4, bad_sve_memory); + __ Ld4b(z25.VnB(), + z26.VnB(), + z27.VnB(), + z28.VnB(), + p5.Zeroing(), + bad_sve_memory); + __ Ld4h(z26.VnH(), + z27.VnH(), + z28.VnH(), + z29.VnH(), + p6.Zeroing(), + bad_sve_memory); + __ Ld4w(z27.VnS(), + z28.VnS(), + z29.VnS(), + z30.VnS(), + p6.Zeroing(), + bad_sve_memory); + __ Ld4d(z28.VnD(), + z29.VnD(), + z30.VnD(), + z31.VnD(), + p5.Zeroing(), + bad_sve_memory); + + END_IMPLICIT_CHECK(); + TRY_RUN_IMPLICIT_CHECK(); +} + +TEST(ImplicitCheckAtomics) { + SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kAtomics); + START_IMPLICIT_CHECK(); + + EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes()); +#define INST_LIST(OP) \ + __ Ld##OP##b(w0, w0, bad_memory); \ + __ Ld##OP##ab(w0, w1, bad_memory); \ + __ Ld##OP##lb(w0, w2, bad_memory); \ + __ Ld##OP##alb(w0, w3, bad_memory); \ + __ Ld##OP##h(w0, w0, bad_memory); \ + __ Ld##OP##ah(w0, w1, bad_memory); \ + __ Ld##OP##lh(w0, w2, bad_memory); \ + __ Ld##OP##alh(w0, w3, bad_memory); \ + __ Ld##OP(w0, w0, bad_memory); \ + __ Ld##OP##a(w0, w1, bad_memory); \ + __ Ld##OP##l(w0, w2, bad_memory); \ + __ Ld##OP##al(w0, w3, bad_memory); \ + __ Ld##OP(x0, x0, bad_memory); \ + __ Ld##OP##a(x0, x1, bad_memory); \ + __ Ld##OP##l(x0, x2, bad_memory); \ + __ Ld##OP##al(x0, x3, bad_memory); \ + __ St##OP##b(w0, bad_memory); \ + __ St##OP##lb(w0, bad_memory); \ + __ St##OP##h(w0, bad_memory); \ + __ St##OP##lh(w0, bad_memory); \ + __ St##OP(w0, bad_memory); \ + __ St##OP##l(w0, bad_memory); \ + __ St##OP(x0, bad_memory); \ + __ St##OP##l(x0, bad_memory); + + INST_LIST(add); + INST_LIST(set); + INST_LIST(eor); + INST_LIST(smin); + INST_LIST(smax); + INST_LIST(umin); + INST_LIST(umax); + INST_LIST(clr); + +#undef INST_LIST + + END_IMPLICIT_CHECK(); + TRY_RUN_IMPLICIT_CHECK(); +} + 
+TEST(ImplicitCheckMops) { + SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kMOPS); + START_IMPLICIT_CHECK(); + + EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes()); + __ Set(x15, ip1, ip0); + __ Setn(x15, ip1, ip0); + __ Setg(x15, ip1, ip0); + __ Setgn(x15, ip1, ip0); + + __ Cpy(x15, ip0, ip1); + __ Cpyn(x15, ip0, ip1); + __ Cpyrn(x15, ip0, ip1); + __ Cpywn(x15, ip0, ip1); + __ Cpyf(x15, ip0, ip1); + __ Cpyfn(x15, ip0, ip1); + __ Cpyfrn(x15, ip0, ip1); + __ Cpyfwn(x15, ip0, ip1); + + // The macro-assembler expands each instruction into prologue, main and + // epilogue instructions where only the main instruction will fail. Increase + // the counter to account for those additional instructions and the following + // instructions. + __ Mov(x0, 3); + __ Mul(x1, x1, x0); + __ Add(x1, x1, x0); + + END_IMPLICIT_CHECK(); + TRY_RUN_IMPLICIT_CHECK(); +} +#endif // VIXL_ENABLE_IMPLICIT_CHECKS + #undef __ #define __ masm-> @@ -5140,6 +6025,7 @@ TEST(RunFrom) { 3.0); VIXL_CHECK(res_double == 6.0); } + #endif diff --git a/test/aarch64/test-simulator-sve-aarch64.cc b/test/aarch64/test-simulator-sve-aarch64.cc index 51f7d82f..6b5b9582 100644 --- a/test/aarch64/test-simulator-sve-aarch64.cc +++ b/test/aarch64/test-simulator-sve-aarch64.cc @@ -267,5 +267,1776 @@ TEST_SVE(sve_fmatmul_s) { } } +// Below here, there are tests for Neon instructions. As these forms of test +// check the entire register state, they also need SVE features. 
+ +TEST_SVE(neon_pmull) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kPmull1Q); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 40 * kInstructionSize); + __ dci(0x4e20e000); // pmull2 v0.8h, v0.16b, v0.16b + // vl128 state = 0x5eba4d4f + __ dci(0x4e20e228); // pmull2 v8.8h, v17.16b, v0.16b + // vl128 state = 0x86bceb87 + __ dci(0x4ee0e22a); // pmull2 v10.1q, v17.2d, v0.2d + // vl128 state = 0x1332fe02 + __ dci(0x0ee8e222); // pmull v2.1q, v17.1d, v8.1d + // vl128 state = 0xd357dc7b + __ dci(0x4eece226); // pmull2 v6.1q, v17.2d, v12.2d + // vl128 state = 0xdff409ad + __ dci(0x0eece276); // pmull v22.1q, v19.1d, v12.1d + // vl128 state = 0xd8af1dc6 + __ dci(0x0eede232); // pmull v18.1q, v17.1d, v13.1d + // vl128 state = 0x41e6ed0e + __ dci(0x0efde216); // pmull v22.1q, v16.1d, v29.1d + // vl128 state = 0x1f10365f + __ dci(0x0effe23e); // pmull v30.1q, v17.1d, v31.1d + // vl128 state = 0x9779ece5 + __ dci(0x0ee7e23f); // pmull v31.1q, v17.1d, v7.1d + // vl128 state = 0x11fc8ce9 + __ dci(0x0ee2e23e); // pmull v30.1q, v17.1d, v2.1d + // vl128 state = 0x101d5a6f + __ dci(0x0ee2e23c); // pmull v28.1q, v17.1d, v2.1d + // vl128 state = 0xcc4fe26e + __ dci(0x0eeae27d); // pmull v29.1q, v19.1d, v10.1d + // vl128 state = 0xc84be9f4 + __ dci(0x4eeae24d); // pmull2 v13.1q, v18.2d, v10.2d + // vl128 state = 0x2fc540b4 + __ dci(0x4eeae25d); // pmull2 v29.1q, v18.2d, v10.2d + // vl128 state = 0x1b2d99cd + __ dci(0x4eeae2ed); // pmull2 v13.1q, v23.2d, v10.2d + // vl128 state = 0x8a278b95 + __ dci(0x4eeae2e9); // pmull2 v9.1q, v23.2d, v10.2d + // vl128 state = 0x3359b4c8 + __ dci(0x4efee2e8); // pmull2 v8.1q, v23.2d, v30.2d + // vl128 state = 0x5c25ed31 + __ dci(0x4effe3e0); // pmull2 v0.1q, v31.2d, v31.2d + // vl128 state = 0x28ff67d1 + __ dci(0x4eefe3d0); // pmull2 v16.1q, v30.2d, v15.2d + // vl128 state = 0x1543436d + __ dci(0x4ee7e2d1); // pmull2 v17.1q, 
v22.2d, v7.2d + // vl128 state = 0x71b8bc90 + __ dci(0x4eefe3d5); // pmull2 v21.1q, v30.2d, v15.2d + // vl128 state = 0x3d35ca02 + __ dci(0x4eefe314); // pmull2 v20.1q, v24.2d, v15.2d + // vl128 state = 0x40e8fade + __ dci(0x4eefe310); // pmull2 v16.1q, v24.2d, v15.2d + // vl128 state = 0xb8affb87 + __ dci(0x4eefe300); // pmull2 v0.1q, v24.2d, v15.2d + // vl128 state = 0x4824ee5c + __ dci(0x4eede350); // pmull2 v16.1q, v26.2d, v13.2d + // vl128 state = 0x39202868 + __ dci(0x4ee7e354); // pmull2 v20.1q, v26.2d, v7.2d + // vl128 state = 0xc8fde340 + __ dci(0x4e27e356); // pmull2 v22.8h, v26.16b, v7.16b + // vl128 state = 0x0f02316b + __ dci(0x4e37e15e); // pmull2 v30.8h, v10.16b, v23.16b + // vl128 state = 0xced4f8bd + __ dci(0x4e33e05f); // pmull2 v31.8h, v2.16b, v19.16b + // vl128 state = 0x0c76bdb3 + __ dci(0x0e23e05e); // pmull v30.8h, v2.8b, v3.8b + // vl128 state = 0x0e36962b + __ dci(0x4e23e25f); // pmull2 v31.8h, v18.16b, v3.16b + // vl128 state = 0x11a8dcc3 + __ dci(0x4e23e25b); // pmull2 v27.8h, v18.16b, v3.16b + // vl128 state = 0xf01bfe16 + __ dci(0x4e23e259); // pmull2 v25.8h, v18.16b, v3.16b + // vl128 state = 0xea351afe + __ dci(0x4e22e2c9); // pmull2 v9.8h, v22.16b, v2.16b + // vl128 state = 0x16e933ef + __ dci(0x4e3ae2c8); // pmull2 v8.8h, v22.16b, v26.16b + // vl128 state = 0x02528a2a + __ dci(0x4e32e249); // pmull2 v9.8h, v18.16b, v18.16b + // vl128 state = 0xe7e20633 + __ dci(0x4e36e20d); // pmull2 v13.8h, v16.16b, v22.16b + // vl128 state = 0x6f231732 + __ dci(0x4e36e205); // pmull2 v5.8h, v16.16b, v22.16b + // vl128 state = 0x423eb7ea + __ dci(0x4e22e20d); // pmull2 v13.8h, v16.16b, v2.16b + // vl128 state = 0xfc0d1c14 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast<uint64_t>(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0xfc0d1c14, + 0x4cb040a3, + 0x4b913ebe, + 0xfa35b836, + 0x78745d20, + 0x6666b09a, + 0xee2868f4, + 0x1936a795, +
0x1025244a, + 0xe8551950, + 0xae73af02, + 0x0fdd5fc7, + 0x22e9827b, + 0x384ce1ac, + 0xc833cbeb, + 0x255baab5, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sha1_2reg) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSHA1); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 30 * kInstructionSize); + __ dci(0x5e280800); // sha1h s0, s0 + // vl128 state = 0xc388d4f8 + __ dci(0x5e280a28); // sha1h s8, s17 + // vl128 state = 0x5c88b904 + __ dci(0x5e280a2a); // sha1h s10, s17 + // vl128 state = 0x6f63c596 + __ dci(0x5e281aae); // sha1su1 v14.4s, v21.4s + // vl128 state = 0x85e1119d + __ dci(0x5e281abe); // sha1su1 v30.4s, v21.4s + // vl128 state = 0x9b814260 + __ dci(0x5e281a0e); // sha1su1 v14.4s, v16.4s + // vl128 state = 0x8ccca0ab + __ dci(0x5e281a0a); // sha1su1 v10.4s, v16.4s + // vl128 state = 0x42262836 + __ dci(0x5e281acb); // sha1su1 v11.4s, v22.4s + // vl128 state = 0xabcde33d + __ dci(0x5e281acf); // sha1su1 v15.4s, v22.4s + // vl128 state = 0xdf44e7be + __ dci(0x5e281adf); // sha1su1 v31.4s, v22.4s + // vl128 state = 0x48c332a3 + __ dci(0x5e280a9d); // sha1h s29, s20 + // vl128 state = 0x56bafe13 + __ dci(0x5e28188d); // sha1su1 v13.4s, v4.4s + // vl128 state = 0x218eb351 + __ dci(0x5e2808cf); // sha1h s15, s6 + // vl128 state = 0xc1720d9f + __ dci(0x5e2808cb); // sha1h s11, s6 + // vl128 state = 0x67119e1c + __ dci(0x5e2808c9); // sha1h s9, s6 + // vl128 state = 0x31f69637 + __ dci(0x5e2808c1); // sha1h s1, s6 + // vl128 state = 0x214a25ff + __ dci(0x5e280871); // sha1h s17, s3 + // vl128 state = 0xa5e88b55 + __ dci(0x5e280815); // sha1h s21, s0 + // vl128 state = 0xc8c91e29 + __ dci(0x5e28185d); // sha1su1 v29.4s, v2.4s + // vl128 state = 0x5582c6a8 + __ dci(0x5e28185f); // sha1su1 v31.4s, v2.4s + // vl128 state = 0xd3288a61 + __ dci(0x5e28087e); // sha1h s30, s3 + // vl128 state = 
0x350b39c2 + __ dci(0x5e28093f); // sha1h s31, s9 + // vl128 state = 0xbdc1ac98 + __ dci(0x5e28093b); // sha1h s27, s9 + // vl128 state = 0x62f828bf + __ dci(0x5e28092b); // sha1h s11, s9 + // vl128 state = 0xc8f2f671 + __ dci(0x5e2819bb); // sha1su1 v27.4s, v13.4s + // vl128 state = 0x24ec8c34 + __ dci(0x5e281b93); // sha1su1 v19.4s, v28.4s + // vl128 state = 0x71e188de + __ dci(0x5e281b97); // sha1su1 v23.4s, v28.4s + // vl128 state = 0x22490375 + __ dci(0x5e281b95); // sha1su1 v21.4s, v28.4s + // vl128 state = 0x016b70d1 + __ dci(0x5e281b51); // sha1su1 v17.4s, v26.4s + // vl128 state = 0xa6252086 + __ dci(0x5e2819d3); // sha1su1 v19.4s, v14.4s + // vl128 state = 0x78683885 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast<uint64_t>(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0x78683885, + 0x59574c2a, + 0x449978bf, + 0x0ddab775, + 0x1a043ef3, + 0xf501e2e7, + 0xa219e725, + 0xf17f57c8, + 0x4ccdbf99, + 0x419d4fc3, + 0x7302571d, + 0xd6bee170, + 0x7d81c301, + 0xbaa7d729, + 0xf33f0bc4, + 0xff8b070a, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sha1_3reg) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSHA1); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 30 * kInstructionSize); + __ dci(0x5e1f02bd); // sha1c q29, s21, v31.4s + // vl128 state = 0xec2a37ad + __ dci(0x5e0810af); // sha1p q15, s5, v8.4s + // vl128 state = 0x3fe9252a + __ dci(0x5e122227); // sha1m q7, s17, v18.4s + // vl128 state = 0x4465789e + __ dci(0x5e0b039d); // sha1c q29, s28, v11.4s + // vl128 state = 0x2186488a + __ dci(0x5e1a03e9); // sha1c q9, s31, v26.4s + // vl128 state = 0x9eddf8e3 + __ dci(0x5e0c138c); // sha1p q12, s28, v12.4s + // vl128 state = 0x0ca7cd3d + __ dci(0x5e1f1316); // sha1p q22, s24, v31.4s +
// vl128 state = 0xb80a61c0 + __ dci(0x5e052204); // sha1m q4, s16, v5.4s + // vl128 state = 0x941821ca + __ dci(0x5e0a00d6); // sha1c q22, s6, v10.4s + // vl128 state = 0x5e71ccae + __ dci(0x5e0e032e); // sha1c q14, s25, v14.4s + // vl128 state = 0x7ed4486a + __ dci(0x5e1d1098); // sha1p q24, s4, v29.4s + // vl128 state = 0x0978a637 + __ dci(0x5e0400d9); // sha1c q25, s6, v4.4s + // vl128 state = 0x34c8609e + __ dci(0x5e1a330e); // sha1su0 v14.4s, v24.4s, v26.4s + // vl128 state = 0xcb078fad + __ dci(0x5e1e30f5); // sha1su0 v21.4s, v7.4s, v30.4s + // vl128 state = 0x885200be + __ dci(0x5e1e32e1); // sha1su0 v1.4s, v23.4s, v30.4s + // vl128 state = 0xabc6a188 + __ dci(0x5e0733d3); // sha1su0 v19.4s, v30.4s, v7.4s + // vl128 state = 0x37a4fe6f + __ dci(0x5e0b22e6); // sha1m q6, s23, v11.4s + // vl128 state = 0x68b788d2 + __ dci(0x5e011210); // sha1p q16, s16, v1.4s + // vl128 state = 0x6b36b092 + __ dci(0x5e1702e1); // sha1c q1, s23, v23.4s + // vl128 state = 0x74ef56f5 + __ dci(0x5e1e30f6); // sha1su0 v22.4s, v7.4s, v30.4s + // vl128 state = 0x5a150dfd + __ dci(0x5e1b3348); // sha1su0 v8.4s, v26.4s, v27.4s + // vl128 state = 0xe0a45d9c + __ dci(0x5e0a3041); // sha1su0 v1.4s, v2.4s, v10.4s + // vl128 state = 0x6ba02d02 + __ dci(0x5e17119a); // sha1p q26, s12, v23.4s + // vl128 state = 0x3bf511fc + __ dci(0x5e0b32c7); // sha1su0 v7.4s, v22.4s, v11.4s + // vl128 state = 0xf5c513b6 + __ dci(0x5e063016); // sha1su0 v22.4s, v0.4s, v6.4s + // vl128 state = 0x3eb44b28 + __ dci(0x5e05323c); // sha1su0 v28.4s, v17.4s, v5.4s + // vl128 state = 0x7c2d3adf + __ dci(0x5e1d132a); // sha1p q10, s25, v29.4s + // vl128 state = 0x2b0963c4 + __ dci(0x5e13003c); // sha1c q28, s1, v19.4s + // vl128 state = 0x4a582d00 + __ dci(0x5e13322c); // sha1su0 v12.4s, v17.4s, v19.4s + // vl128 state = 0x7bb2cc8c + __ dci(0x5e032330); // sha1m q16, s25, v3.4s + // vl128 state = 0x2a8b4c0d + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ 
Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0x2a8b4c0d, + 0x114e25bb, + 0x4f035af9, + 0x23db7966, + 0x3d106b42, + 0x62651fcf, + 0x44c20879, + 0xadf71d73, + 0xe6858f82, + 0x93a74ae5, + 0xc270310e, + 0x3d07058c, + 0x69f83d0e, + 0x28c5813b, + 0xbb9de2c1, + 0xe06b94cd, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sha2h) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSHA2); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 30 * kInstructionSize); + __ dci(0x5e0152a2); // sha256h2 q2, q21, v1.4s + // vl128 state = 0x6bda8984 + __ dci(0x5e1552b2); // sha256h2 q18, q21, v21.4s + // vl128 state = 0xe985c68a + __ dci(0x5e055293); // sha256h2 q19, q20, v5.4s + // vl128 state = 0xab18a98b + __ dci(0x5e055297); // sha256h2 q23, q20, v5.4s + // vl128 state = 0x896bad28 + __ dci(0x5e0752a7); // sha256h2 q7, q21, v7.4s + // vl128 state = 0x4e00ba08 + __ dci(0x5e175223); // sha256h2 q3, q17, v23.4s + // vl128 state = 0x380f3893 + __ dci(0x5e1f5262); // sha256h2 q2, q19, v31.4s + // vl128 state = 0xb431122d + __ dci(0x5e1f5272); // sha256h2 q18, q19, v31.4s + // vl128 state = 0x18140047 + __ dci(0x5e1e4262); // sha256h q2, q19, v30.4s + // vl128 state = 0x721779be + __ dci(0x5e164363); // sha256h q3, q27, v22.4s + // vl128 state = 0x383ad878 + __ dci(0x5e175361); // sha256h2 q1, q27, v23.4s + // vl128 state = 0xd985bd85 + __ dci(0x5e115360); // sha256h2 q0, q27, v17.4s + // vl128 state = 0xfa5e77f3 + __ dci(0x5e135270); // sha256h2 q16, q19, v19.4s + // vl128 state = 0x4fc1f5cc + __ dci(0x5e195260); // sha256h2 q0, q19, v25.4s + // vl128 state = 0x89435952 + __ dci(0x5e1952c4); // sha256h2 q4, q22, v25.4s + // vl128 state = 0x93c60c86 + __ dci(0x5e1a52c6); // sha256h2 q6, q22, v26.4s + // vl128 state = 0xedc42105 + __ dci(0x5e1a52c4); // sha256h2 q4, 
q22, v26.4s + // vl128 state = 0xd5d638a8 + __ dci(0x5e1a4285); // sha256h q5, q20, v26.4s + // vl128 state = 0x9f9da446 + __ dci(0x5e1a428d); // sha256h q13, q20, v26.4s + // vl128 state = 0x87d49cfb + __ dci(0x5e1b42cf); // sha256h q15, q22, v27.4s + // vl128 state = 0xa6802b10 + __ dci(0x5e1b43ed); // sha256h q13, q31, v27.4s + // vl128 state = 0x2e346937 + __ dci(0x5e0b436f); // sha256h q15, q27, v11.4s + // vl128 state = 0x1005f372 + __ dci(0x5e03433f); // sha256h q31, q25, v3.4s + // vl128 state = 0xd908918c + __ dci(0x5e13532f); // sha256h2 q15, q25, v19.4s + // vl128 state = 0x31c73fe0 + __ dci(0x5e01533f); // sha256h2 q31, q25, v1.4s + // vl128 state = 0x84e35a20 + __ dci(0x5e03523d); // sha256h2 q29, q17, v3.4s + // vl128 state = 0x40da34aa + __ dci(0x5e0b527c); // sha256h2 q28, q19, v11.4s + // vl128 state = 0x506a21d9 + __ dci(0x5e0f5238); // sha256h2 q24, q17, v15.4s + // vl128 state = 0x6a67f033 + __ dci(0x5e0d5210); // sha256h2 q16, q16, v13.4s + // vl128 state = 0x317e084c + __ dci(0x5e0d5214); // sha256h2 q20, q16, v13.4s + // vl128 state = 0xdd0eb379 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0xdd0eb379, + 0x15384d69, + 0x32bbc73a, + 0xc5879e77, + 0x9241294d, + 0xfc01bad8, + 0xf5e79af5, + 0xee66e696, + 0x535158e8, + 0x09cfa8b6, + 0x8cd83eae, + 0x93ff18b0, + 0x561444e4, + 0xa6249eea, + 0x830e4c73, + 0xb516eaae, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sha2su0) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSHA2); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 30 * kInstructionSize); + __ dci(0x5e2828e3); // sha256su0 v3.4s, v7.4s + // vl128 state = 0xbc7a7764 + __ dci(0x5e282be1); // sha256su0 
v1.4s, v31.4s + // vl128 state = 0x6138a856 + __ dci(0x5e282be9); // sha256su0 v9.4s, v31.4s + // vl128 state = 0x49c6be17 + __ dci(0x5e282beb); // sha256su0 v11.4s, v31.4s + // vl128 state = 0xca658743 + __ dci(0x5e2829bb); // sha256su0 v27.4s, v13.4s + // vl128 state = 0x1bf1d233 + __ dci(0x5e2829ba); // sha256su0 v26.4s, v13.4s + // vl128 state = 0xafb0c6ae + __ dci(0x5e2829aa); // sha256su0 v10.4s, v13.4s + // vl128 state = 0x2182e90d + __ dci(0x5e282b2e); // sha256su0 v14.4s, v25.4s + // vl128 state = 0x401d297d + __ dci(0x5e282aaf); // sha256su0 v15.4s, v21.4s + // vl128 state = 0x6c01fefa + __ dci(0x5e282aad); // sha256su0 v13.4s, v21.4s + // vl128 state = 0x0f4c191d + __ dci(0x5e282a7d); // sha256su0 v29.4s, v19.4s + // vl128 state = 0xcf26aa1b + __ dci(0x5e282ad9); // sha256su0 v25.4s, v22.4s + // vl128 state = 0xae04081e + __ dci(0x5e282ac9); // sha256su0 v9.4s, v22.4s + // vl128 state = 0x08149009 + __ dci(0x5e282acb); // sha256su0 v11.4s, v22.4s + // vl128 state = 0xa691e487 + __ dci(0x5e282ac3); // sha256su0 v3.4s, v22.4s + // vl128 state = 0xd728e1b5 + __ dci(0x5e282ac7); // sha256su0 v7.4s, v22.4s + // vl128 state = 0x120fac30 + __ dci(0x5e282ac5); // sha256su0 v5.4s, v22.4s + // vl128 state = 0x88086f82 + __ dci(0x5e282ac4); // sha256su0 v4.4s, v22.4s + // vl128 state = 0x625160b7 + __ dci(0x5e282a65); // sha256su0 v5.4s, v19.4s + // vl128 state = 0x308feecd + __ dci(0x5e282a6d); // sha256su0 v13.4s, v19.4s + // vl128 state = 0x65f03097 + __ dci(0x5e282a65); // sha256su0 v5.4s, v19.4s + // vl128 state = 0x44d9fbb6 + __ dci(0x5e282a67); // sha256su0 v7.4s, v19.4s + // vl128 state = 0x694fe04a + __ dci(0x5e282a17); // sha256su0 v23.4s, v16.4s + // vl128 state = 0x3d5c139b + __ dci(0x5e282a13); // sha256su0 v19.4s, v16.4s + // vl128 state = 0x922f40a5 + __ dci(0x5e282b3b); // sha256su0 v27.4s, v25.4s + // vl128 state = 0x4f9c34f2 + __ dci(0x5e282ab9); // sha256su0 v25.4s, v21.4s + // vl128 state = 0x18a4f581 + __ dci(0x5e282ab1); // sha256su0 v17.4s, 
v21.4s + // vl128 state = 0x69da3844 + __ dci(0x5e282ab9); // sha256su0 v25.4s, v21.4s + // vl128 state = 0x57f8ce0b + __ dci(0x5e282a1d); // sha256su0 v29.4s, v16.4s + // vl128 state = 0xafa03001 + __ dci(0x5e282ad5); // sha256su0 v21.4s, v22.4s + // vl128 state = 0x029b78a8 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0x029b78a8, + 0x479a8911, + 0x6bdbdb48, + 0x5ef3718b, + 0x695ce173, + 0x586543d0, + 0xd00a22be, + 0xe63a91b9, + 0x42bb89a2, + 0xea48ee79, + 0x9788ac35, + 0x1e8599a3, + 0xd0d2d6ee, + 0xfe7aaaf7, + 0x77da6831, + 0xb93fb875, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sha2su1) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSHA2); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 30 * kInstructionSize); + __ dci(0x5e1e6146); // sha256su1 v6.4s, v10.4s, v30.4s + // vl128 state = 0x3bbf7782 + __ dci(0x5e0f6144); // sha256su1 v4.4s, v10.4s, v15.4s + // vl128 state = 0xf8c83149 + __ dci(0x5e0e6174); // sha256su1 v20.4s, v11.4s, v14.4s + // vl128 state = 0x3b8c353b + __ dci(0x5e0e6170); // sha256su1 v16.4s, v11.4s, v14.4s + // vl128 state = 0x1041e30e + __ dci(0x5e0a6131); // sha256su1 v17.4s, v9.4s, v10.4s + // vl128 state = 0xe4d81cd2 + __ dci(0x5e0a6135); // sha256su1 v21.4s, v9.4s, v10.4s + // vl128 state = 0x24869db3 + __ dci(0x5e0a6131); // sha256su1 v17.4s, v9.4s, v10.4s + // vl128 state = 0xfb093436 + __ dci(0x5e0a6199); // sha256su1 v25.4s, v12.4s, v10.4s + // vl128 state = 0x0c7939ba + __ dci(0x5e0e639b); // sha256su1 v27.4s, v28.4s, v14.4s + // vl128 state = 0xa7e5c40a + __ dci(0x5e0663ab); // sha256su1 v11.4s, v29.4s, v6.4s + // vl128 state = 0xc4ae571c + __ dci(0x5e06619b); // sha256su1 v27.4s, v12.4s, 
v6.4s + // vl128 state = 0xf84ef221 + __ dci(0x5e066199); // sha256su1 v25.4s, v12.4s, v6.4s + // vl128 state = 0x24f98d3c + __ dci(0x5e0e6118); // sha256su1 v24.4s, v8.4s, v14.4s + // vl128 state = 0xcdb43a3b + __ dci(0x5e0f601a); // sha256su1 v26.4s, v0.4s, v15.4s + // vl128 state = 0x85fd37e9 + __ dci(0x5e096012); // sha256su1 v18.4s, v0.4s, v9.4s + // vl128 state = 0xabccd3f6 + __ dci(0x5e0c601a); // sha256su1 v26.4s, v0.4s, v12.4s + // vl128 state = 0x8c0232e5 + __ dci(0x5e1c602a); // sha256su1 v10.4s, v1.4s, v28.4s + // vl128 state = 0xcdcf37ba + __ dci(0x5e1e622e); // sha256su1 v14.4s, v17.4s, v30.4s + // vl128 state = 0x25129c9a + __ dci(0x5e1e623e); // sha256su1 v30.4s, v17.4s, v30.4s + // vl128 state = 0xd0a281b7 + __ dci(0x5e1e630e); // sha256su1 v14.4s, v24.4s, v30.4s + // vl128 state = 0x3ed92f18 + __ dci(0x5e1f639e); // sha256su1 v30.4s, v28.4s, v31.4s + // vl128 state = 0xda1056b9 + __ dci(0x5e0f629f); // sha256su1 v31.4s, v20.4s, v15.4s + // vl128 state = 0x367274fa + __ dci(0x5e0f63bd); // sha256su1 v29.4s, v29.4s, v15.4s + // vl128 state = 0x46a79748 + __ dci(0x5e0f63b5); // sha256su1 v21.4s, v29.4s, v15.4s + // vl128 state = 0xdc427315 + __ dci(0x5e0b63f7); // sha256su1 v23.4s, v31.4s, v11.4s + // vl128 state = 0x91547f41 + __ dci(0x5e0263e7); // sha256su1 v7.4s, v31.4s, v2.4s + // vl128 state = 0x1c233ffa + __ dci(0x5e0062f7); // sha256su1 v23.4s, v23.4s, v0.4s + // vl128 state = 0x8c2948a1 + __ dci(0x5e1062c7); // sha256su1 v7.4s, v22.4s, v16.4s + // vl128 state = 0x8b72f498 + __ dci(0x5e1062c6); // sha256su1 v6.4s, v22.4s, v16.4s + // vl128 state = 0x43d27746 + __ dci(0x5e1063ee); // sha256su1 v14.4s, v31.4s, v16.4s + // vl128 state = 0xa864e589 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0xa864e589, + 0xc588dfe0, + 0x171add38, + 0x884ca9db, + 0x5f47fb6a, + 0x0bd024c5, + 
0xa6921cce, + 0x01dc8899, + 0x0f5b4b19, + 0x948260c1, + 0x4d4faafe, + 0x76ee7ff7, + 0xd9a56156, + 0x63c8e138, + 0xe687f7c3, + 0x51785434, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sha3) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSHA3); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 60 * kInstructionSize); + __ dci(0xce608c00); // rax1 v0.2d, v0.2d, v0.2d + // vl128 state = 0x960c2b9f + __ dci(0xce608e28); // rax1 v8.2d, v17.2d, v0.2d + // vl128 state = 0x89ea3f7b + __ dci(0xce618e6c); // rax1 v12.2d, v19.2d, v1.2d + // vl128 state = 0xa7801384 + __ dci(0xce718e48); // rax1 v8.2d, v18.2d, v17.2d + // vl128 state = 0x4477d70d + __ dci(0xce738e60); // rax1 v0.2d, v19.2d, v19.2d + // vl128 state = 0xdee66854 + __ dci(0xce6b8e61); // rax1 v1.2d, v19.2d, v11.2d + // vl128 state = 0x2e383dc2 + __ dci(0xce6e8e60); // rax1 v0.2d, v19.2d, v14.2d + // vl128 state = 0xa022bb6d + __ dci(0xce6e8e62); // rax1 v2.2d, v19.2d, v14.2d + // vl128 state = 0x923f5d32 + __ dci(0xce668e23); // rax1 v3.2d, v17.2d, v6.2d + // vl128 state = 0xc2c6ca00 + __ dci(0xce260e33); // bcax v19.16b, v17.16b, v6.16b, v3.16b + // vl128 state = 0x517e85e9 + __ dci(0xce260e23); // bcax v3.16b, v17.16b, v6.16b, v3.16b + // vl128 state = 0xbcf4c332 + __ dci(0xce260e93); // bcax v19.16b, v20.16b, v6.16b, v3.16b + // vl128 state = 0x5d9d51ef + __ dci(0xce260a11); // bcax v17.16b, v16.16b, v6.16b, v2.16b + // vl128 state = 0x69ce0099 + __ dci(0xce260a15); // bcax v21.16b, v16.16b, v6.16b, v2.16b + // vl128 state = 0x9a2cdc9f + __ dci(0xce244a11); // bcax v17.16b, v16.16b, v4.16b, v18.16b + // vl128 state = 0x27eeff29 + __ dci(0xce304a10); // bcax v16.16b, v16.16b, v16.16b, v18.16b + // vl128 state = 0x6d586875 + __ dci(0xce314b18); // bcax v24.16b, v24.16b, v17.16b, v18.16b + // vl128 state = 0xe38b6054 + __ 
dci(0xce214b28); // bcax v8.16b, v25.16b, v1.16b, v18.16b + // vl128 state = 0x27a3f5f6 + __ dci(0xce294f38); // bcax v24.16b, v25.16b, v9.16b, v19.16b + // vl128 state = 0x7d7ffa9b + __ dci(0xce214e39); // bcax v25.16b, v17.16b, v1.16b, v19.16b + // vl128 state = 0x936374f0 + __ dci(0xce216a3d); // bcax v29.16b, v17.16b, v1.16b, v26.16b + // vl128 state = 0x1c5136d5 + __ dci(0xce296b39); // bcax v25.16b, v25.16b, v9.16b, v26.16b + // vl128 state = 0x75cd7131 + __ dci(0xce216338); // bcax v24.16b, v25.16b, v1.16b, v24.16b + // vl128 state = 0xcc747626 + __ dci(0xce2163f9); // bcax v25.16b, v31.16b, v1.16b, v24.16b + // vl128 state = 0x9409c8bc + __ dci(0xce2043f1); // bcax v17.16b, v31.16b, v0.16b, v16.16b + // vl128 state = 0x8db3a0c8 + __ dci(0xce2043f5); // bcax v21.16b, v31.16b, v0.16b, v16.16b + // vl128 state = 0xa55f8d7d + __ dci(0xce2043e5); // bcax v5.16b, v31.16b, v0.16b, v16.16b + // vl128 state = 0xe1960c7a + __ dci(0xce224be7); // bcax v7.16b, v31.16b, v2.16b, v18.16b + // vl128 state = 0xc9599bde + __ dci(0xce204bb7); // bcax v23.16b, v29.16b, v0.16b, v18.16b + // vl128 state = 0x7176d08d + __ dci(0xce004b9f); // eor3 v31.16b, v28.16b, v0.16b, v18.16b + // vl128 state = 0x10620821 + __ dci(0xce000baf); // eor3 v15.16b, v29.16b, v0.16b, v2.16b + // vl128 state = 0x0aba0288 + __ dci(0xce0a0bab); // eor3 v11.16b, v29.16b, v10.16b, v2.16b + // vl128 state = 0xe6517156 + __ dci(0xce0e1baf); // eor3 v15.16b, v29.16b, v14.16b, v6.16b + // vl128 state = 0x6b7021fb + __ dci(0xce0e3fa7); // eor3 v7.16b, v29.16b, v14.16b, v15.16b + // vl128 state = 0x05761b1f + __ dci(0xce0e2fe5); // eor3 v5.16b, v31.16b, v14.16b, v11.16b + // vl128 state = 0xe01822c6 + __ dci(0xce2e2fc7); // bcax v7.16b, v30.16b, v14.16b, v11.16b + // vl128 state = 0xdc6444d7 + __ dci(0xce3e2dcf); // bcax v15.16b, v14.16b, v30.16b, v11.16b + // vl128 state = 0xa5ecad2e + __ dci(0xce3e3fdf); // bcax v31.16b, v30.16b, v30.16b, v15.16b + // vl128 state = 0x2124dc42 + __ dci(0xce3a3ede); // bcax 
v30.16b, v22.16b, v26.16b, v15.16b + // vl128 state = 0x57f77204 + __ dci(0xce3a2e9c); // bcax v28.16b, v20.16b, v26.16b, v11.16b + // vl128 state = 0x6e8d303d + __ dci(0xce3a2294); // bcax v20.16b, v20.16b, v26.16b, v8.16b + // vl128 state = 0xdb53d42c + __ dci(0xce38029c); // bcax v28.16b, v20.16b, v24.16b, v0.16b + // vl128 state = 0x258d49b8 + __ dci(0xce38088c); // bcax v12.16b, v4.16b, v24.16b, v2.16b + // vl128 state = 0xe751a348 + __ dci(0xce28008e); // bcax v14.16b, v4.16b, v8.16b, v0.16b + // vl128 state = 0x8ce0aa1a + __ dci(0xce28008a); // bcax v10.16b, v4.16b, v8.16b, v0.16b + // vl128 state = 0x1fdf89a5 + __ dci(0xce280088); // bcax v8.16b, v4.16b, v8.16b, v0.16b + // vl128 state = 0xcc51f5e1 + __ dci(0xce2a1089); // bcax v9.16b, v4.16b, v10.16b, v4.16b + // vl128 state = 0xdaf766b0 + __ dci(0xce0b1081); // eor3 v1.16b, v4.16b, v11.16b, v4.16b + // vl128 state = 0x2da7deb5 + __ dci(0xce0a1011); // eor3 v17.16b, v0.16b, v10.16b, v4.16b + // vl128 state = 0xcc86f5d4 + __ dci(0xce121010); // eor3 v16.16b, v0.16b, v18.16b, v4.16b + // vl128 state = 0xfb722105 + __ dci(0xce921118); // xar v24.2d, v8.2d, v18.2d, #4 + // vl128 state = 0x9a7752e3 + __ dci(0xce9a1199); // xar v25.2d, v12.2d, v26.2d, #4 + // vl128 state = 0x83a251c2 + __ dci(0xce9e11dd); // xar v29.2d, v14.2d, v30.2d, #4 + // vl128 state = 0x1e31c9d5 + __ dci(0xce9e915c); // xar v28.2d, v10.2d, v30.2d, #36 + // vl128 state = 0x0e421d73 + __ dci(0xce1e115d); // eor3 v29.16b, v10.16b, v30.16b, v4.16b + // vl128 state = 0xb5a8c677 + __ dci(0xce3e515c); // bcax v28.16b, v10.16b, v30.16b, v20.16b + // vl128 state = 0x21587300 + __ dci(0xce3e5154); // bcax v20.16b, v10.16b, v30.16b, v20.16b + // vl128 state = 0x9459c629 + __ dci(0xce3e1056); // bcax v22.16b, v2.16b, v30.16b, v4.16b + // vl128 state = 0xdb02263a + __ dci(0xce2a105e); // bcax v30.16b, v2.16b, v10.16b, v4.16b + // vl128 state = 0xc9d210aa + __ dci(0xce3a5056); // bcax v22.16b, v2.16b, v26.16b, v20.16b + // vl128 state = 0x4cc56293 + } + 
+ uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0x4cc56293, + 0xee8bac03, + 0xc1253ac9, + 0x9fe5aa0f, + 0x43df27f4, + 0x19f03be6, + 0xd26c928b, + 0x7b9da4c4, + 0xe13149a7, + 0x9fa11ed9, + 0xe02cc4dd, + 0x7848dfe7, + 0x5ed1726f, + 0x983e0123, + 0x34166240, + 0xc4ee172f, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sha512) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSHA512); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 50 * kInstructionSize); + __ dci(0xce6583cc); // sha512h q12, q30, v5.2d + // vl128 state = 0xecc5733a + __ dci(0xce6586c8); // sha512h2 q8, q22, v5.2d + // vl128 state = 0xe05f2087 + __ dci(0xce7586e0); // sha512h2 q0, q23, v21.2d + // vl128 state = 0x1925555b + __ dci(0xce7187e8); // sha512h2 q8, q31, v17.2d + // vl128 state = 0x891dba65 + __ dci(0xce7586ec); // sha512h2 q12, q23, v21.2d + // vl128 state = 0xdfbe3239 + __ dci(0xce7580fc); // sha512h q28, q7, v21.2d + // vl128 state = 0xba49dbc1 + __ dci(0xce7580f4); // sha512h q20, q7, v21.2d + // vl128 state = 0x3ad11a23 + __ dci(0xce6780f6); // sha512h q22, q7, v7.2d + // vl128 state = 0xcf9e1803 + __ dci(0xce6780f7); // sha512h q23, q7, v7.2d + // vl128 state = 0xe2baee15 + __ dci(0xce6785e7); // sha512h2 q7, q15, v7.2d + // vl128 state = 0x900a337c + __ dci(0xce6f8565); // sha512h2 q5, q11, v15.2d + // vl128 state = 0xc6e5d7eb + __ dci(0xce6f8424); // sha512h2 q4, q1, v15.2d + // vl128 state = 0xcbcb6ac1 + __ dci(0xce6b84a6); // sha512h2 q6, q5, v11.2d + // vl128 state = 0xa3c1a679 + __ dci(0xce7b848e); // sha512h2 q14, q4, v27.2d + // vl128 state = 0x47c4e54d + __ dci(0xce7d849e); // sha512h2 q30, q4, v29.2d + // vl128 state = 0x9f519a29 + __ 
dci(0xce7f859c); // sha512h2 q28, q12, v31.2d + // vl128 state = 0xa4433415 + __ dci(0xce778494); // sha512h2 q20, q4, v23.2d + // vl128 state = 0xf03a69ec + __ dci(0xce778484); // sha512h2 q4, q4, v23.2d + // vl128 state = 0x2c728333 + __ dci(0xce77850c); // sha512h2 q12, q8, v23.2d + // vl128 state = 0xaedc423e + __ dci(0xce77815c); // sha512h q28, q10, v23.2d + // vl128 state = 0xea9346ea + __ dci(0xce7381cc); // sha512h q12, q14, v19.2d + // vl128 state = 0x05ad87d1 + __ dci(0xce7a81dc); // sha512h q28, q14, v26.2d + // vl128 state = 0x9b1cd7b3 + __ dci(0xce7285d4); // sha512h2 q20, q14, v18.2d + // vl128 state = 0x154201ac + __ dci(0xce7280d6); // sha512h q22, q6, v18.2d + // vl128 state = 0xd8640492 + __ dci(0xce7a81d4); // sha512h q20, q14, v26.2d + // vl128 state = 0x908eb258 + __ dci(0xce7281f0); // sha512h q16, q15, v18.2d + // vl128 state = 0x0067f162 + __ dci(0xce728572); // sha512h2 q18, q11, v18.2d + // vl128 state = 0xca9bc751 + __ dci(0xce728422); // sha512h2 q2, q1, v18.2d + // vl128 state = 0x06b7318d + __ dci(0xce738412); // sha512h2 q18, q0, v19.2d + // vl128 state = 0xad019588 + __ dci(0xce718016); // sha512h q22, q0, v17.2d + // vl128 state = 0x55a29e9b + __ dci(0xce718834); // sha512su1 v20.2d, v1.2d, v17.2d + // vl128 state = 0x953a9c7a + __ dci(0xce738876); // sha512su1 v22.2d, v3.2d, v19.2d + // vl128 state = 0x4f194c71 + __ dci(0xce638826); // sha512su1 v6.2d, v1.2d, v3.2d + // vl128 state = 0x08e50d47 + __ dci(0xce6b886e); // sha512su1 v14.2d, v3.2d, v11.2d + // vl128 state = 0x4bdfb870 + __ dci(0xce6b88de); // sha512su1 v30.2d, v6.2d, v11.2d + // vl128 state = 0xbcf4b6c5 + __ dci(0xce7f88df); // sha512su1 v31.2d, v6.2d, v31.2d + // vl128 state = 0x916dede1 + __ dci(0xce6f8acf); // sha512su1 v15.2d, v22.2d, v15.2d + // vl128 state = 0x3b776003 + __ dci(0xce6d8bcb); // sha512su1 v11.2d, v30.2d, v13.2d + // vl128 state = 0x5d5cb7d9 + __ dci(0xce6d83ea); // sha512h q10, q31, v13.2d + // vl128 state = 0x18df9e46 + __ dci(0xce6d8328); // 
sha512h q8, q25, v13.2d + // vl128 state = 0xde5807d0 + __ dci(0xce6583b8); // sha512h q24, q29, v5.2d + // vl128 state = 0x861020e7 + __ dci(0xce6d83f9); // sha512h q25, q31, v13.2d + // vl128 state = 0x39d960f4 + __ dci(0xce6d8b78); // sha512su1 v24.2d, v27.2d, v13.2d + // vl128 state = 0x3afc2b5c + __ dci(0xce6c8968); // sha512su1 v8.2d, v11.2d, v12.2d + // vl128 state = 0x74d44114 + __ dci(0xce6c8b49); // sha512su1 v9.2d, v26.2d, v12.2d + // vl128 state = 0x72e6b5cd + __ dci(0xce6c8b39); // sha512su1 v25.2d, v25.2d, v12.2d + // vl128 state = 0x6aaa4658 + __ dci(0xce6c8b9d); // sha512su1 v29.2d, v28.2d, v12.2d + // vl128 state = 0x7c076c9b + __ dci(0xce648b0d); // sha512su1 v13.2d, v24.2d, v4.2d + // vl128 state = 0x1082519d + __ dci(0xce648385); // sha512h q5, q28, v4.2d + // vl128 state = 0x9ed9d190 + __ dci(0xce648715); // sha512h2 q21, q24, v4.2d + // vl128 state = 0xaace5a02 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0xaace5a02, + 0x912905de, + 0xc62c0756, + 0xac6646d5, + 0xd3c2e6af, + 0x029ae35f, + 0xf5e83b54, + 0x49f8d50c, + 0xc5175320, + 0xb51c8ebd, + 0x2dc184b0, + 0x01e01875, + 0x28df0d5a, + 0x01d2fff2, + 0x5f5f5909, + 0x6aead9d8, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sha512su0) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSHA512); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 30 * kInstructionSize); + __ dci(0xcec083f6); // sha512su0 v22.2d, v31.2d + // vl128 state = 0xf7a54f2b + __ dci(0xcec083e6); // sha512su0 v6.2d, v31.2d + // vl128 state = 0x919c170d + __ dci(0xcec08347); // sha512su0 v7.2d, v26.2d + // vl128 state = 0x8a1800d6 + __ dci(0xcec082c6); // sha512su0 v6.2d, v22.2d + // vl128 state 
= 0x353aa8bf + __ dci(0xcec082c4); // sha512su0 v4.2d, v22.2d + // vl128 state = 0x87d75b6c + __ dci(0xcec082c0); // sha512su0 v0.2d, v22.2d + // vl128 state = 0xf2ee6974 + __ dci(0xcec082c1); // sha512su0 v1.2d, v22.2d + // vl128 state = 0xf2ec1e17 + __ dci(0xcec082c0); // sha512su0 v0.2d, v22.2d + // vl128 state = 0x1bcca060 + __ dci(0xcec082c4); // sha512su0 v4.2d, v22.2d + // vl128 state = 0x67773394 + __ dci(0xcec082c5); // sha512su0 v5.2d, v22.2d + // vl128 state = 0xbb344c8d + __ dci(0xcec083e1); // sha512su0 v1.2d, v31.2d + // vl128 state = 0x595e2eb0 + __ dci(0xcec081a5); // sha512su0 v5.2d, v13.2d + // vl128 state = 0x7d7f4e15 + __ dci(0xcec081a7); // sha512su0 v7.2d, v13.2d + // vl128 state = 0xba4b1bc6 + __ dci(0xcec081a3); // sha512su0 v3.2d, v13.2d + // vl128 state = 0x2c56ee6e + __ dci(0xcec083f3); // sha512su0 v19.2d, v31.2d + // vl128 state = 0xefe9b855 + __ dci(0xcec08397); // sha512su0 v23.2d, v28.2d + // vl128 state = 0x6f0d20ba + __ dci(0xcec08396); // sha512su0 v22.2d, v28.2d + // vl128 state = 0x9be77fdb + __ dci(0xcec081b7); // sha512su0 v23.2d, v13.2d + // vl128 state = 0x5d981c55 + __ dci(0xcec080ff); // sha512su0 v31.2d, v7.2d + // vl128 state = 0x9126079f + __ dci(0xcec080fd); // sha512su0 v29.2d, v7.2d + // vl128 state = 0x3199dc9e + __ dci(0xcec081dc); // sha512su0 v28.2d, v14.2d + // vl128 state = 0x20fb48d7 + __ dci(0xcec081cc); // sha512su0 v12.2d, v14.2d + // vl128 state = 0x4ae6221a + __ dci(0xcec08088); // sha512su0 v8.2d, v4.2d + // vl128 state = 0x17e8b62d + __ dci(0xcec0808a); // sha512su0 v10.2d, v4.2d + // vl128 state = 0x90d73468 + __ dci(0xcec0809a); // sha512su0 v26.2d, v4.2d + // vl128 state = 0x1f02f97f + __ dci(0xcec081de); // sha512su0 v30.2d, v14.2d + // vl128 state = 0xe5ef3e67 + __ dci(0xcec081bf); // sha512su0 v31.2d, v13.2d + // vl128 state = 0xd1bcc363 + __ dci(0xcec081bb); // sha512su0 v27.2d, v13.2d + // vl128 state = 0x8bcfab58 + __ dci(0xcec08033); // sha512su0 v19.2d, v1.2d + // vl128 state = 0x93fb8bad + 
__ dci(0xcec080fb); // sha512su0 v27.2d, v7.2d + // vl128 state = 0x3598e921 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0x3598e921, + 0x7e3ee16c, + 0x4856987c, + 0x193bda79, + 0x84154d6f, + 0x861f1795, + 0xb74d39b3, + 0x9653d8b3, + 0x6690a066, + 0x00a29b51, + 0xb2c795ce, + 0xcbd03b05, + 0x9fb2aaec, + 0x0216b732, + 0x96eb6864, + 0x4024f5c7, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_aes) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kAES); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 30 * kInstructionSize); + __ dci(0x4e285a86); // aesd v6.16b, v20.16b + // vl128 state = 0x801bfc08 + __ dci(0x4e2858ae); // aesd v14.16b, v5.16b + // vl128 state = 0xbd83a757 + __ dci(0x4e2858ac); // aesd v12.16b, v5.16b + // vl128 state = 0x9fb1dc6b + __ dci(0x4e2858ae); // aesd v14.16b, v5.16b + // vl128 state = 0xfa1fa7e4 + __ dci(0x4e28482a); // aese v10.16b, v1.16b + // vl128 state = 0xecfcfe2d + __ dci(0x4e28483a); // aese v26.16b, v1.16b + // vl128 state = 0x05e22f07 + __ dci(0x4e28488a); // aese v10.16b, v4.16b + // vl128 state = 0xdd53df5f + __ dci(0x4e28488e); // aese v14.16b, v4.16b + // vl128 state = 0x9d2ac50f + __ dci(0x4e28484f); // aese v15.16b, v2.16b + // vl128 state = 0xf45146ab + __ dci(0x4e28484b); // aese v11.16b, v2.16b + // vl128 state = 0xf1260a7c + __ dci(0x4e28485b); // aese v27.16b, v2.16b + // vl128 state = 0x3a0844da + __ dci(0x4e285819); // aesd v25.16b, v0.16b + // vl128 state = 0xaca89993 + __ dci(0x4e284a09); // aese v9.16b, v16.16b + // vl128 state = 0xef4e9a5f + __ dci(0x4e285a4b); // aesd v11.16b, v18.16b + // vl128 state = 0x209a44bc + __ dci(0x4e285a4f); // aesd v15.16b, v18.16b + // vl128 state = 
0xc6d2d718 + __ dci(0x4e285a4d); // aesd v13.16b, v18.16b + // vl128 state = 0x1aceef8f + __ dci(0x4e285a45); // aesd v5.16b, v18.16b + // vl128 state = 0x7ed056c6 + __ dci(0x4e285af5); // aesd v21.16b, v23.16b + // vl128 state = 0x429ed71e + __ dci(0x4e285a91); // aesd v17.16b, v20.16b + // vl128 state = 0xd7a1f687 + __ dci(0x4e284ad9); // aese v25.16b, v22.16b + // vl128 state = 0x8fa44574 + __ dci(0x4e284adb); // aese v27.16b, v22.16b + // vl128 state = 0xd2792169 + __ dci(0x4e285afa); // aesd v26.16b, v23.16b + // vl128 state = 0xe502f095 + __ dci(0x4e285bbb); // aesd v27.16b, v29.16b + // vl128 state = 0x0e3d3238 + __ dci(0x4e285bbf); // aesd v31.16b, v29.16b + // vl128 state = 0x0ad06592 + __ dci(0x4e285baf); // aesd v15.16b, v29.16b + // vl128 state = 0xb94f3c19 + __ dci(0x4e284b3f); // aese v31.16b, v25.16b + // vl128 state = 0xf31a0da1 + __ dci(0x4e284917); // aese v23.16b, v8.16b + // vl128 state = 0x7d2d7811 + __ dci(0x4e284913); // aese v19.16b, v8.16b + // vl128 state = 0x41b7b854 + __ dci(0x4e284911); // aese v17.16b, v8.16b + // vl128 state = 0x60600536 + __ dci(0x4e2849d5); // aese v21.16b, v14.16b + // vl128 state = 0x3e0cc74f + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0x3e0cc74f, + 0x7f17ba2e, + 0xd59f8e91, + 0x9f15a51b, + 0x11d92e66, + 0xcd53d015, + 0xbc652785, + 0x6974fa54, + 0x953d342e, + 0xf1aa56b3, + 0xde8ca1d3, + 0xba408b82, + 0x48094fa4, + 0xb757bcf1, + 0x2cc5be58, + 0x6e7a0f58, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_aesmc) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kAES); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 30 * kInstructionSize); + __ dci(0x4e287800); // aesimc v0.16b, 
v0.16b + // vl128 state = 0x03554749 + __ dci(0x4e287a28); // aesimc v8.16b, v17.16b + // vl128 state = 0x59d5fedd + __ dci(0x4e287a2a); // aesimc v10.16b, v17.16b + // vl128 state = 0xcda29514 + __ dci(0x4e286aae); // aesmc v14.16b, v21.16b + // vl128 state = 0xae8f019a + __ dci(0x4e286abe); // aesmc v30.16b, v21.16b + // vl128 state = 0x7b04c6c0 + __ dci(0x4e286a0e); // aesmc v14.16b, v16.16b + // vl128 state = 0xaf6c5ce6 + __ dci(0x4e286a0a); // aesmc v10.16b, v16.16b + // vl128 state = 0xf1d7fd2b + __ dci(0x4e286acb); // aesmc v11.16b, v22.16b + // vl128 state = 0x5d693c63 + __ dci(0x4e286acf); // aesmc v15.16b, v22.16b + // vl128 state = 0xec8971ad + __ dci(0x4e286adf); // aesmc v31.16b, v22.16b + // vl128 state = 0x6389b200 + __ dci(0x4e287a9d); // aesimc v29.16b, v20.16b + // vl128 state = 0xd69341fb + __ dci(0x4e28688d); // aesmc v13.16b, v4.16b + // vl128 state = 0x6344af95 + __ dci(0x4e2878cf); // aesimc v15.16b, v6.16b + // vl128 state = 0x5c58dfac + __ dci(0x4e2878cb); // aesimc v11.16b, v6.16b + // vl128 state = 0x7dc9cf34 + __ dci(0x4e2878c9); // aesimc v9.16b, v6.16b + // vl128 state = 0xff4b3544 + __ dci(0x4e2878c1); // aesimc v1.16b, v6.16b + // vl128 state = 0xd1937de2 + __ dci(0x4e287871); // aesimc v17.16b, v3.16b + // vl128 state = 0x7cabd208 + __ dci(0x4e287815); // aesimc v21.16b, v0.16b + // vl128 state = 0xbc06df94 + __ dci(0x4e28685d); // aesmc v29.16b, v2.16b + // vl128 state = 0xfc4478bb + __ dci(0x4e28685f); // aesmc v31.16b, v2.16b + // vl128 state = 0x0c72c200 + __ dci(0x4e28787e); // aesimc v30.16b, v3.16b + // vl128 state = 0xdd822b9d + __ dci(0x4e28793f); // aesimc v31.16b, v9.16b + // vl128 state = 0x1397dcc6 + __ dci(0x4e28793b); // aesimc v27.16b, v9.16b + // vl128 state = 0x43f3abd6 + __ dci(0x4e28792b); // aesimc v11.16b, v9.16b + // vl128 state = 0xeb8ca365 + __ dci(0x4e2869bb); // aesmc v27.16b, v13.16b + // vl128 state = 0x0a957f4f + __ dci(0x4e286b93); // aesmc v19.16b, v28.16b + // vl128 state = 0xbc5da8bd + __ 
dci(0x4e286b97); // aesmc v23.16b, v28.16b + // vl128 state = 0xc49343cc + __ dci(0x4e286b95); // aesmc v21.16b, v28.16b + // vl128 state = 0x8c80c144 + __ dci(0x4e286b51); // aesmc v17.16b, v26.16b + // vl128 state = 0xeda3255d + __ dci(0x4e2869d3); // aesmc v19.16b, v14.16b + // vl128 state = 0x8db8a9d0 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0x8db8a9d0, + 0xb13d8e1e, + 0x9f33ca70, + 0x38f7ef7a, + 0x65352b29, + 0xc4257260, + 0xf49587c2, + 0xb3f61256, + 0x8ef4a534, + 0x6e061aa9, + 0x7270527d, + 0x3e1f82f9, + 0x1fe79e60, + 0x985cab68, + 0xe77b4484, + 0xe3817f4e, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sm3) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSM3); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 10 * kInstructionSize); + __ dci(0xce591017); // sm3ss1 v23.4s, v0.4s, v25.4s, v4.4s + // vl128 state = 0xad4bba0a + __ dci(0xce49121f); // sm3ss1 v31.4s, v16.4s, v9.4s, v4.4s + // vl128 state = 0x84adef21 + __ dci(0xce49121e); // sm3ss1 v30.4s, v16.4s, v9.4s, v4.4s + // vl128 state = 0xccfd7e5a + __ dci(0xce49301a); // sm3ss1 v26.4s, v0.4s, v9.4s, v12.4s + // vl128 state = 0x60833cc7 + __ dci(0xce49720a); // sm3ss1 v10.4s, v16.4s, v9.4s, v28.4s + // vl128 state = 0x03f03263 + __ dci(0xce58721a); // sm3ss1 v26.4s, v16.4s, v24.4s, v28.4s + // vl128 state = 0x31845f40 + __ dci(0xce58702a); // sm3ss1 v10.4s, v1.4s, v24.4s, v28.4s + // vl128 state = 0x54c64f70 + __ dci(0xce58753a); // sm3ss1 v26.4s, v9.4s, v24.4s, v29.4s + // vl128 state = 0x3d5cb04f + __ dci(0xce507518); // sm3ss1 v24.4s, v8.4s, v16.4s, v29.4s + // vl128 state = 0xe02de221 + __ dci(0xce406519); // sm3ss1 v25.4s, v8.4s, v0.4s, v25.4s + // vl128 
state = 0x73d36ae8 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0x73d36ae8, + 0xcbcda2db, + 0x6ee9ad3d, + 0xa6857a16, + 0xa238ec05, + 0x1bc82d1d, + 0xe4530773, + 0xfb0d092e, + 0xe62aff0a, + 0xf56a593f, + 0x3967d590, + 0xebcd14a0, + 0xa7bedcb8, + 0x867fa43c, + 0x1679eab5, + 0x0a836861, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sm3partw12) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSM3); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 30 * kInstructionSize); + __ dci(0xce70c201); // sm3partw1 v1.4s, v16.4s, v16.4s + // vl128 state = 0x6f2069a6 + __ dci(0xce72c303); // sm3partw1 v3.4s, v24.4s, v18.4s + // vl128 state = 0x986fa56c + __ dci(0xce76c381); // sm3partw1 v1.4s, v28.4s, v22.4s + // vl128 state = 0x5dbd953c + __ dci(0xce7ec3b1); // sm3partw1 v17.4s, v29.4s, v30.4s + // vl128 state = 0xc72ccca5 + __ dci(0xce7ac1b5); // sm3partw1 v21.4s, v13.4s, v26.4s + // vl128 state = 0x33cdfd6a + __ dci(0xce7ac1b7); // sm3partw1 v23.4s, v13.4s, v26.4s + // vl128 state = 0x4303e945 + __ dci(0xce7ac1bf); // sm3partw1 v31.4s, v13.4s, v26.4s + // vl128 state = 0x56acac84 + __ dci(0xce78c1fd); // sm3partw1 v29.4s, v15.4s, v24.4s + // vl128 state = 0x5e2a2793 + __ dci(0xce78c5df); // sm3partw2 v31.4s, v14.4s, v24.4s + // vl128 state = 0xf7c457f3 + __ dci(0xce70c55d); // sm3partw2 v29.4s, v10.4s, v16.4s + // vl128 state = 0xfa3557ac + __ dci(0xce60c159); // sm3partw1 v25.4s, v10.4s, v0.4s + // vl128 state = 0xb3ae6830 + __ dci(0xce62c55b); // sm3partw2 v27.4s, v10.4s, v2.4s + // vl128 state = 0xa7747c70 + __ dci(0xce66c753); // sm3partw2 v19.4s, v26.4s, v6.4s + // vl128 state = 0xb55f5895 + __ dci(0xce67c551); // sm3partw2 v17.4s, 
v10.4s, v7.4s + // vl128 state = 0x519b1342 + __ dci(0xce65c750); // sm3partw2 v16.4s, v26.4s, v5.4s + // vl128 state = 0xc4e6e4b9 + __ dci(0xce61c718); // sm3partw2 v24.4s, v24.4s, v1.4s + // vl128 state = 0x127c483c + __ dci(0xce61c71c); // sm3partw2 v28.4s, v24.4s, v1.4s + // vl128 state = 0x92783ecc + __ dci(0xce6dc714); // sm3partw2 v20.4s, v24.4s, v13.4s + // vl128 state = 0xe11e87d3 + __ dci(0xce65c756); // sm3partw2 v22.4s, v26.4s, v5.4s + // vl128 state = 0x8b6878d0 + __ dci(0xce65c5d2); // sm3partw2 v18.4s, v14.4s, v5.4s + // vl128 state = 0xf2fb1e86 + __ dci(0xce64c550); // sm3partw2 v16.4s, v10.4s, v4.4s + // vl128 state = 0x73ad3b0f + __ dci(0xce66c578); // sm3partw2 v24.4s, v11.4s, v6.4s + // vl128 state = 0x7e03900d + __ dci(0xce76c55c); // sm3partw2 v28.4s, v10.4s, v22.4s + // vl128 state = 0x1d0b5df6 + __ dci(0xce76c54c); // sm3partw2 v12.4s, v10.4s, v22.4s + // vl128 state = 0x1a3d7a77 + __ dci(0xce7ec448); // sm3partw2 v8.4s, v2.4s, v30.4s + // vl128 state = 0x3ed2e4bd + __ dci(0xce6ec409); // sm3partw2 v9.4s, v0.4s, v14.4s + // vl128 state = 0x826dd348 + __ dci(0xce6ec52b); // sm3partw2 v11.4s, v9.4s, v14.4s + // vl128 state = 0x3ff5e482 + __ dci(0xce66c72f); // sm3partw2 v15.4s, v25.4s, v6.4s + // vl128 state = 0x6fd24cd4 + __ dci(0xce65c73f); // sm3partw2 v31.4s, v25.4s, v5.4s + // vl128 state = 0xd51ac474 + __ dci(0xce67c77b); // sm3partw2 v27.4s, v27.4s, v7.4s + // vl128 state = 0x720d7419 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0x720d7419, + 0x31445e06, + 0xd2aee240, + 0x45a27e4b, + 0xd6c46f08, + 0xcaed7f9e, + 0x734820c7, + 0x377e1f38, + 0x12e03585, + 0x1b9cbe63, + 0x1d58d49a, + 0xc160a9dc, + 0x22c2fe25, + 0x86b7af0f, + 0xfeae7bf5, + 0xf8dfcc40, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sm3tt1) { + 
SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSM3); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 20 * kInstructionSize); + __ dci(0xce53a363); // sm3tt1a v3.4s, v27.4s, v19.s[2] + // vl128 state = 0xaaa8c715 + __ dci(0xce58a7a7); // sm3tt1b v7.4s, v29.4s, v24.s[2] + // vl128 state = 0xb99a301d + __ dci(0xce5eb2b7); // sm3tt1a v23.4s, v21.4s, v30.s[3] + // vl128 state = 0xe8dabe99 + __ dci(0xce43b6ce); // sm3tt1b v14.4s, v22.4s, v3.s[3] + // vl128 state = 0xaa498ae5 + __ dci(0xce448027); // sm3tt1a v7.4s, v1.4s, v4.s[0] + // vl128 state = 0x32093547 + __ dci(0xce4286d8); // sm3tt1b v24.4s, v22.4s, v2.s[0] + // vl128 state = 0xe03e3a81 + __ dci(0xce44a0f3); // sm3tt1a v19.4s, v7.4s, v4.s[2] + // vl128 state = 0xcb555b4a + __ dci(0xce418233); // sm3tt1a v19.4s, v17.4s, v1.s[0] + // vl128 state = 0x751e4f7d + __ dci(0xce58a49f); // sm3tt1b v31.4s, v4.4s, v24.s[2] + // vl128 state = 0xcaff7580 + __ dci(0xce548326); // sm3tt1a v6.4s, v25.4s, v20.s[0] + // vl128 state = 0xc4308a78 + __ dci(0xce548124); // sm3tt1a v4.4s, v9.4s, v20.s[0] + // vl128 state = 0x1f1bfdfb + __ dci(0xce5fb282); // sm3tt1a v2.4s, v20.4s, v31.s[3] + // vl128 state = 0xa632c0b2 + __ dci(0xce549573); // sm3tt1b v19.4s, v11.4s, v20.s[1] + // vl128 state = 0x7fb7c2d3 + __ dci(0xce4387ae); // sm3tt1b v14.4s, v29.4s, v3.s[0] + // vl128 state = 0xe8d4c534 + __ dci(0xce5094eb); // sm3tt1b v11.4s, v7.4s, v16.s[1] + // vl128 state = 0xf34a4fbc + __ dci(0xce51b59f); // sm3tt1b v31.4s, v12.4s, v17.s[3] + // vl128 state = 0x98e388e9 + __ dci(0xce50a7bf); // sm3tt1b v31.4s, v29.4s, v16.s[2] + // vl128 state = 0x7cd7a6ac + __ dci(0xce5ca52e); // sm3tt1b v14.4s, v9.4s, v28.s[2] + // vl128 state = 0xce9410c5 + __ dci(0xce5aa741); // sm3tt1b v1.4s, v26.4s, v26.s[2] + // vl128 state = 0xd83fbd58 + __ dci(0xce5e94da); // sm3tt1b v26.4s, v6.4s, v30.s[1] + // vl128 state = 0xc6055fe3 + } + + 
uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0xc6055fe3, + 0xa2c33f98, + 0x1cc9a227, + 0xf29eb254, + 0xd1739d6e, + 0x1c4fff34, + 0x0c182795, + 0x96e46836, + 0x43d010c9, + 0xd7c4f94c, + 0x78c387f2, + 0x4319fef3, + 0x72407eef, + 0xa77d3869, + 0x3c81c49a, + 0x68cc20ef, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sm3tt2) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSM3); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 20 * kInstructionSize); + __ dci(0xce439d42); // sm3tt2b v2.4s, v10.4s, v3.s[1] + // vl128 state = 0x388642cc + __ dci(0xce42b89d); // sm3tt2a v29.4s, v4.4s, v2.s[3] + // vl128 state = 0x66f4e60a + __ dci(0xce4da95d); // sm3tt2a v29.4s, v10.4s, v13.s[2] + // vl128 state = 0x95d4651d + __ dci(0xce49b926); // sm3tt2a v6.4s, v9.4s, v9.s[3] + // vl128 state = 0x826919fe + __ dci(0xce5cae33); // sm3tt2b v19.4s, v17.4s, v28.s[2] + // vl128 state = 0xb5cfefb0 + __ dci(0xce478959); // sm3tt2a v25.4s, v10.4s, v7.s[0] + // vl128 state = 0xfe17b730 + __ dci(0xce549cc2); // sm3tt2b v2.4s, v6.4s, v20.s[1] + // vl128 state = 0x769a0d76 + __ dci(0xce4c9f90); // sm3tt2b v16.4s, v28.4s, v12.s[1] + // vl128 state = 0x8f633b95 + __ dci(0xce508d49); // sm3tt2b v9.4s, v10.4s, v16.s[0] + // vl128 state = 0x5eab6daa + __ dci(0xce59ad79); // sm3tt2b v25.4s, v11.4s, v25.s[2] + // vl128 state = 0xfb197616 + __ dci(0xce458fd6); // sm3tt2b v22.4s, v30.4s, v5.s[0] + // vl128 state = 0x875ff29d + __ dci(0xce4ab92c); // sm3tt2a v12.4s, v9.4s, v10.s[3] + // vl128 state = 0xad159c01 + __ dci(0xce598a1c); // sm3tt2a v28.4s, v16.4s, v25.s[0] + // vl128 state = 0x3da313e4 + __ dci(0xce43989f); // sm3tt2a v31.4s, v4.4s, v3.s[1] + // vl128 state = 
0xc0a54179 + __ dci(0xce459c8a); // sm3tt2b v10.4s, v4.4s, v5.s[1] + // vl128 state = 0x4739cdbf + __ dci(0xce539959); // sm3tt2a v25.4s, v10.4s, v19.s[1] + // vl128 state = 0xd85f84ab + __ dci(0xce429be1); // sm3tt2a v1.4s, v31.4s, v2.s[1] + // vl128 state = 0x85b5871c + __ dci(0xce5d9fe3); // sm3tt2b v3.4s, v31.4s, v29.s[1] + // vl128 state = 0x2be5bd95 + __ dci(0xce4ebe16); // sm3tt2b v22.4s, v16.4s, v14.s[3] + // vl128 state = 0x2f8146e9 + __ dci(0xce599a63); // sm3tt2a v3.4s, v19.4s, v25.s[1] + // vl128 state = 0xa6e513e2 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0xa6e513e2, + 0x6bf4ae47, + 0x74e074db, + 0xae1a57e0, + 0x0db67f09, + 0x85332e49, + 0xc40d6565, + 0x07ed81aa, + 0xfa0e10bb, + 0x9addadfa, + 0xa9cea561, + 0xa481e17b, + 0x7c2be34e, + 0xd4cf493f, + 0x8b30cc5e, + 0xe44416d3, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sm4e) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSM4); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 20 * kInstructionSize); + __ dci(0xcec08400); // sm4e v0.4s, v0.4s + // vl128 state = 0xa687bacc + __ dci(0xcec08628); // sm4e v8.4s, v17.4s + // vl128 state = 0xf174e346 + __ dci(0xcec0862a); // sm4e v10.4s, v17.4s + // vl128 state = 0xab88f8ca + __ dci(0xcec08628); // sm4e v8.4s, v17.4s + // vl128 state = 0x000d3840 + __ dci(0xcec08638); // sm4e v24.4s, v17.4s + // vl128 state = 0xd980ddc2 + __ dci(0xcec08688); // sm4e v8.4s, v20.4s + // vl128 state = 0xd501f2c2 + __ dci(0xcec0868c); // sm4e v12.4s, v20.4s + // vl128 state = 0x699d6b6f + __ dci(0xcec0864d); // sm4e v13.4s, v18.4s + // vl128 state = 0x67baf406 + __ dci(0xcec08649); // sm4e v9.4s, v18.4s + // vl128 state = 0x178b048e + __ 
dci(0xcec08659); // sm4e v25.4s, v18.4s + // vl128 state = 0x552a70d9 + __ dci(0xcec0865d); // sm4e v29.4s, v18.4s + // vl128 state = 0x3be534d1 + __ dci(0xcec0865f); // sm4e v31.4s, v18.4s + // vl128 state = 0x396fdf70 + __ dci(0xcec08657); // sm4e v23.4s, v18.4s + // vl128 state = 0x836c474b + __ dci(0xcec086e7); // sm4e v7.4s, v23.4s + // vl128 state = 0x71aebad7 + __ dci(0xcec08683); // sm4e v3.4s, v20.4s + // vl128 state = 0xadfd515c + __ dci(0xcec08681); // sm4e v1.4s, v20.4s + // vl128 state = 0xf1465ab4 + __ dci(0xcec087c0); // sm4e v0.4s, v30.4s + // vl128 state = 0x8555b40f + __ dci(0xcec087c4); // sm4e v4.4s, v30.4s + // vl128 state = 0x2cb3f99f + __ dci(0xcec087d4); // sm4e v20.4s, v30.4s + // vl128 state = 0x733336fd + __ dci(0xcec085fc); // sm4e v28.4s, v15.4s + // vl128 state = 0x11b138f9 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0x11b138f9, + 0x5993c196, + 0xb9eef6b5, + 0xf96d88cf, + 0x8e92bd49, + 0x04d27185, + 0x8833f291, + 0x77933d5b, + 0x135500cc, + 0xe5ca977f, + 0x3e4536af, + 0xb169aa9d, + 0xe0b4425b, + 0x35c1f76e, + 0x54e3448a, + 0x4dbf0c92, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + +TEST_SVE(neon_sm4ekey) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSM4); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 20 * kInstructionSize); + __ dci(0xce6fc9d4); // sm4ekey v20.4s, v14.4s, v15.4s + // vl128 state = 0x4bb7b396 + __ dci(0xce6bc8d5); // sm4ekey v21.4s, v6.4s, v11.4s + // vl128 state = 0xf4354b26 + __ dci(0xce6bc8c5); // sm4ekey v5.4s, v6.4s, v11.4s + // vl128 state = 0x0a331378 + __ dci(0xce6bc8cd); // sm4ekey v13.4s, v6.4s, v11.4s + // vl128 state = 0x7ed4c2a7 + __ dci(0xce6fc8e5); // sm4ekey v5.4s, v7.4s, 
v15.4s + // vl128 state = 0x38a433fd + __ dci(0xce6fc8e4); // sm4ekey v4.4s, v7.4s, v15.4s + // vl128 state = 0xc1ad0d76 + __ dci(0xce6bcaec); // sm4ekey v12.4s, v23.4s, v11.4s + // vl128 state = 0x81660ce3 + __ dci(0xce6bcae8); // sm4ekey v8.4s, v23.4s, v11.4s + // vl128 state = 0x79f3e5c1 + __ dci(0xce7bcaaa); // sm4ekey v10.4s, v21.4s, v27.4s + // vl128 state = 0x231e0a79 + __ dci(0xce72caa8); // sm4ekey v8.4s, v21.4s, v18.4s + // vl128 state = 0xd931c858 + __ dci(0xce7ac8aa); // sm4ekey v10.4s, v5.4s, v26.4s + // vl128 state = 0x2476ef6a + __ dci(0xce7bc888); // sm4ekey v8.4s, v4.4s, v27.4s + // vl128 state = 0xd4a9ac83 + __ dci(0xce7bc889); // sm4ekey v9.4s, v4.4s, v27.4s + // vl128 state = 0x149fd9b3 + __ dci(0xce7bc9cd); // sm4ekey v13.4s, v14.4s, v27.4s + // vl128 state = 0xece67fce + __ dci(0xce79cbc5); // sm4ekey v5.4s, v30.4s, v25.4s + // vl128 state = 0xccb45863 + __ dci(0xce71cac4); // sm4ekey v4.4s, v22.4s, v17.4s + // vl128 state = 0xafb23c9d + __ dci(0xce71c8e0); // sm4ekey v0.4s, v7.4s, v17.4s + // vl128 state = 0x5c808694 + __ dci(0xce71c882); // sm4ekey v2.4s, v4.4s, v17.4s + // vl128 state = 0x6cea5132 + __ dci(0xce73c803); // sm4ekey v3.4s, v0.4s, v19.4s + // vl128 state = 0x67e316db + __ dci(0xce71c847); // sm4ekey v7.4s, v2.4s, v17.4s + // vl128 state = 0x317aafac + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0x317aafac, + 0xbacd34de, + 0x3e92f0b2, + 0x3043dbe3, + 0x6dda4d17, + 0x6e59ba0d, + 0xa29887cf, + 0x3bee1f56, + 0xacd43191, + 0x97ab7ada, + 0x39ebcf53, + 0xea7b411e, + 0xd8e1efe9, + 0x2b99fc57, + 0xf5f62e02, + 0xd50621d1, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + } // namespace aarch64 } // namespace vixl diff --git a/test/aarch64/test-simulator-sve2-aarch64.cc b/test/aarch64/test-simulator-sve2-aarch64.cc index 
a7c0f401..621754d2 100644 --- a/test/aarch64/test-simulator-sve2-aarch64.cc +++ b/test/aarch64/test-simulator-sve2-aarch64.cc @@ -9117,5 +9117,130 @@ TEST_SVE(sve2_extract) { } } +TEST_SVE(sve2_pmull128) { + SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE, + CPUFeatures::kSVE2, + CPUFeatures::kNEON, + CPUFeatures::kCRC32, + CPUFeatures::kSVEPmull128); + START(); + + SetInitialMachineState(&masm); + // state = 0xe2bd2480 + + { + ExactAssemblyScope scope(&masm, 40 * kInstructionSize); + __ dci(0x45006800); // pmullb z0.q, z0.d, z0.d + // vl128 state = 0x4107ca0c + __ dci(0x45006a28); // pmullb z8.q, z17.d, z0.d + // vl128 state = 0xa87d231a + __ dci(0x45016a6c); // pmullb z12.q, z19.d, z1.d + // vl128 state = 0xc547fcf6 + __ dci(0x45116e68); // pmullt z8.q, z19.d, z17.d + // vl128 state = 0x6a01d521 + __ dci(0x45106a69); // pmullb z9.q, z19.d, z16.d + // vl128 state = 0x64a7ba8a + __ dci(0x45006a4d); // pmullb z13.q, z18.d, z0.d + // vl128 state = 0xe59e3f8e + __ dci(0x45086e5d); // pmullt z29.q, z18.d, z8.d + // vl128 state = 0xbfbb9316 + __ dci(0x450a6e75); // pmullt z21.q, z19.d, z10.d + // vl128 state = 0x29f6a4c7 + __ dci(0x45126e74); // pmullt z20.q, z19.d, z18.d + // vl128 state = 0x4ced9406 + __ dci(0x45176e75); // pmullt z21.q, z19.d, z23.d + // vl128 state = 0xd09e5676 + __ dci(0x45176e77); // pmullt z23.q, z19.d, z23.d + // vl128 state = 0x568c0e25 + __ dci(0x45176e75); // pmullt z21.q, z19.d, z23.d + // vl128 state = 0xb2f13c36 + __ dci(0x45176b71); // pmullb z17.q, z27.d, z23.d + // vl128 state = 0x160bec4f + __ dci(0x451f6b30); // pmullb z16.q, z25.d, z31.d + // vl128 state = 0x2d7e7f49 + __ dci(0x451f6b20); // pmullb z0.q, z25.d, z31.d + // vl128 state = 0x113d828b + __ dci(0x451f6b90); // pmullb z16.q, z28.d, z31.d + // vl128 state = 0xb8b3b3d9 + __ dci(0x451f6f12); // pmullt z18.q, z24.d, z31.d + // vl128 state = 0x277aacb8 + __ dci(0x451f6f16); // pmullt z22.q, z24.d, z31.d + // vl128 state = 0xef79c8da + __ dci(0x450b6f17); // pmullt z23.q, z24.d, z11.d 
+ // vl128 state = 0x1dc19104 + __ dci(0x450a6e1f); // pmullt z31.q, z16.d, z10.d + // vl128 state = 0x3ccb4ea8 + __ dci(0x451a6e2f); // pmullt z15.q, z17.d, z26.d + // vl128 state = 0x14e13481 + __ dci(0x45126a3f); // pmullb z31.q, z17.d, z18.d + // vl128 state = 0x4e6502f9 + __ dci(0x451a6b3e); // pmullb z30.q, z25.d, z26.d + // vl128 state = 0xf6f18478 + __ dci(0x45126a3a); // pmullb z26.q, z17.d, z18.d + // vl128 state = 0xdd4f14fb + __ dci(0x45126afb); // pmullb z27.q, z23.d, z18.d + // vl128 state = 0xcbf3bee2 + __ dci(0x45126aff); // pmullb z31.q, z23.d, z18.d + // vl128 state = 0x627bec09 + __ dci(0x45126aef); // pmullb z15.q, z23.d, z18.d + // vl128 state = 0xf5de1fa9 + __ dci(0x45106abf); // pmullb z31.q, z21.d, z16.d + // vl128 state = 0x44bb6385 + __ dci(0x451a6abb); // pmullb z27.q, z21.d, z26.d + // vl128 state = 0x5c5fa224 + __ dci(0x450a68b3); // pmullb z19.q, z5.d, z10.d + // vl128 state = 0x28b6085c + __ dci(0x450e69b2); // pmullb z18.q, z13.d, z14.d + // vl128 state = 0x450898d6 + __ dci(0x450e69b6); // pmullb z22.q, z13.d, z14.d + // vl128 state = 0x79d7911b + __ dci(0x450e69b4); // pmullb z20.q, z13.d, z14.d + // vl128 state = 0x98bf6939 + __ dci(0x450f6924); // pmullb z4.q, z9.d, z15.d + // vl128 state = 0xb8a1bbc7 + __ dci(0x45176925); // pmullb z5.q, z9.d, z23.d + // vl128 state = 0x631b41c8 + __ dci(0x451f69a4); // pmullb z4.q, z13.d, z31.d + // vl128 state = 0x617fc272 + __ dci(0x451b69e0); // pmullb z0.q, z15.d, z27.d + // vl128 state = 0x77780ac1 + __ dci(0x451b69e8); // pmullb z8.q, z15.d, z27.d + // vl128 state = 0xce5ae18f + __ dci(0x450f69e0); // pmullb z0.q, z15.d, z15.d + // vl128 state = 0xa037371a + __ dci(0x450b6be8); // pmullb z8.q, z31.d, z11.d + // vl128 state = 0xb59be233 + } + + uint32_t state; + ComputeMachineStateHash(&masm, &state); + __ Mov(x0, reinterpret_cast(&state)); + __ Ldr(w0, MemOperand(x0)); + + END(); + if (CAN_RUN()) { + RUN(); + uint32_t expected_hashes[] = { + 0xb59be233, + 0x32430624, + 0x5cc3ec66, + 
0xecfdffe7, + 0x6d77a270, + 0xa0d604f2, + 0x2178aa11, + 0xabdcbeaa, + 0xab3b974f, + 0x11a874f5, + 0xf2eb6131, + 0x6d311c6c, + 0xd4e99b72, + 0x5177ce8e, + 0x32aa02f0, + 0x681ef977, + }; + ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0); + } +} + } // namespace aarch64 } // namespace vixl diff --git a/test/aarch64/test-utils-aarch64.cc b/test/aarch64/test-utils-aarch64.cc index 398ed4fd..c23f4e8b 100644 --- a/test/aarch64/test-utils-aarch64.cc +++ b/test/aarch64/test-utils-aarch64.cc @@ -89,6 +89,34 @@ bool Equal64(uint64_t reference, } +bool Equal64(std::vector reference_list, + const RegisterDump*, + uint64_t result, + ExpectedResult option) { + switch (option) { + case kExpectEqual: + for (uint64_t reference : reference_list) { + if (result == reference) return true; + } + printf("Expected a result in (\n"); + break; + case kExpectNotEqual: + for (uint64_t reference : reference_list) { + if (result == reference) { + printf("Expected a result not in (\n"); + break; + } + } + return true; + } + for (uint64_t reference : reference_list) { + printf(" 0x%016" PRIx64 ",\n", reference); + } + printf(")\t Found 0x%016" PRIx64 "\n", result); + return false; +} + + bool Equal128(QRegisterValue expected, const RegisterDump*, QRegisterValue result) { @@ -200,6 +228,16 @@ bool Equal64(uint64_t reference, } +bool Equal64(std::vector reference_list, + const RegisterDump* core, + const Register& reg, + ExpectedResult option) { + VIXL_ASSERT(reg.Is64Bits()); + uint64_t result = core->xreg(reg.GetCode()); + return Equal64(reference_list, core, result, option); +} + + bool NotEqual64(uint64_t reference, const RegisterDump* core, const Register& reg) { diff --git a/test/aarch64/test-utils-aarch64.h b/test/aarch64/test-utils-aarch64.h index 9cf91549..40a5aa5e 100644 --- a/test/aarch64/test-utils-aarch64.h +++ b/test/aarch64/test-utils-aarch64.h @@ -345,6 +345,10 @@ bool Equal64(uint64_t reference, const RegisterDump*, uint64_t result, ExpectedResult option = 
kExpectEqual); +bool Equal64(std::vector reference_list, + const RegisterDump*, + uint64_t result, + ExpectedResult option = kExpectEqual); bool Equal128(QRegisterValue expected, const RegisterDump*, QRegisterValue result); @@ -358,6 +362,10 @@ bool Equal64(uint64_t reference, const RegisterDump* core, const Register& reg, ExpectedResult option = kExpectEqual); +bool Equal64(std::vector reference_list, + const RegisterDump* core, + const Register& reg, + ExpectedResult option = kExpectEqual); bool Equal64(uint64_t expected, const RegisterDump* core, const VRegister& vreg); diff --git a/test/test-invalset.cc b/test/test-invalset.cc index ac53a04d..548f67ea 100644 --- a/test/test-invalset.cc +++ b/test/test-invalset.cc @@ -397,5 +397,27 @@ TEST(stl_forward_iterator) { #endif } +TEST(move) { + TestSet set1; + + set1.insert(Obj(-123, 456)); + set1.insert(Obj(2718, 2871828)); + + TestSet set2(std::move(set1)); + VIXL_CHECK(set1.empty()); + VIXL_CHECK(set2.size() == 2); + VIXL_CHECK(set2.GetMinElement() == Obj(-123, 456)); + + // Test with more elements. 
+ for (unsigned i = 0; i < 4 * kNPreallocatedElements; i++) { + set2.insert(Obj(i, -1)); + } + + TestSet set3(std::move(set2)); + VIXL_CHECK(set2.empty()); + VIXL_CHECK(set3.size() == 2 + 4 * kNPreallocatedElements); + VIXL_CHECK(set3.GetMinElement() == Obj(-123, 456)); +} + } // namespace vixl diff --git a/test/test-pool-manager.cc b/test/test-pool-manager.cc index eb22ae6f..194154b3 100644 --- a/test/test-pool-manager.cc +++ b/test/test-pool-manager.cc @@ -376,7 +376,7 @@ TEST(FuzzObjectDeletedWhenPlaced) { } int32_t pc = 0; - for (int i = 0; !objects.empty(); ++i) { + while (!objects.empty()) { IF_VERBOSE(printf("PC = 0x%x (%d)\n", pc, pc)); int32_t pc_increment = RandomPCIncrement(); IF_VERBOSE(printf("Attempting to increment PC by %d\n", pc_increment)); @@ -451,7 +451,7 @@ TEST(FuzzObjectUpdatedWhenPlaced) { } int32_t pc = 0; - for (int i = 0; !objects.empty(); ++i) { + while (!objects.empty()) { IF_VERBOSE(printf("PC = 0x%x (%d)\n", pc, pc)); int32_t pc_increment = RandomPCIncrement(); diff --git a/tools/code_coverage.log b/tools/code_coverage.log index c27ab83a..d787f6fe 100644 --- a/tools/code_coverage.log +++ b/tools/code_coverage.log @@ -14,11 +14,22 @@ 1660224011 82.79% 97.51% 95.50% 1663161852 82.79% 97.51% 95.50% 1666104118 82.79% 97.51% 95.50% +1668785529 82.75% 97.44% 95.40% 1669202345 82.79% 97.51% 95.51% 1673432155 82.79% 97.51% 95.51% 1677171445 82.78% 97.56% 94.81% 1681814646 82.90% 97.57% 94.87% 1686666000 82.90% 97.57% 94.87% 1693487542 82.91% 97.57% 94.87% +1694008240 82.72% 97.50% 94.95% +1697036303 82.87% 97.56% 94.76% +1698228274 82.93% 97.68% 94.90% +1698330215 82.92% 97.57% 94.88% 1702052331 82.89% 97.59% 94.77% +1706691191 82.87% 97.59% 94.74% 1707395574 82.89% 97.59% 94.77% +1715261843 82.84% 97.60% 94.69% +1718190785 82.85% 97.60% 94.70% +1722595938 82.94% 97.78% 94.72% +1728570468 82.94% 97.78% 94.71% +1736874659 82.94% 97.63% 94.78% diff --git a/tools/lint.py b/tools/lint.py index 4820439d..f67799b2 100755 --- a/tools/lint.py +++ 
b/tools/lint.py @@ -91,7 +91,7 @@ def Lint(filename, progress_prefix = ''): printer.Print(outerr) # Find the number of errors in this file. - res = re.search('Total errors found: (\d+)', outerr) + res = re.search(r'Total errors found: (\d+)', outerr) if res: n_errors_str = res.string[res.start(1):res.end(1)] n_errors = int(n_errors_str) @@ -192,7 +192,7 @@ def IsCppLintAvailable(): return retcode == 0 -CPP_EXT_REGEXP = re.compile('\.(cc|h)$') +CPP_EXT_REGEXP = re.compile(r'\.(cc|h)$') def IsLinterInput(filename): # lint all C++ files. return CPP_EXT_REGEXP.search(filename) != None diff --git a/tools/util.py b/tools/util.py index ed41461e..240c6972 100644 --- a/tools/util.py +++ b/tools/util.py @@ -89,7 +89,7 @@ def GetCompilerDirectives(env): match.group(1): match.group(2) for match in [ # Capture macro name. - re.search('^#define (\S+?) (.+)$', macro) + re.search(r'^#define (\S+?) (.+)$', macro) for macro in out.split('\n') ] # Filter out non-matches. @@ -183,7 +183,7 @@ class CompilerInformation(object): # "{compiler}-{major}.{minor}". The comparison is done using the provided # `operator` argument. def CompareVersion(self, operator, description): - match = re.search('^(\S+)-(.*?)$', description) + match = re.search(r'^(\S+)-(.*?)$', description) if not match: raise Exception("A version number is required when comparing compilers") compiler, version = match.group(1), match.group(2)