Added SSE4.1 and SSE4.2 support (backported from box64)

This commit is contained in:
ptitSeb 2023-11-07 18:06:11 +01:00
parent 236de084ea
commit c10b105f5f
25 changed files with 2730 additions and 579 deletions

View File

@ -312,6 +312,7 @@ set(ELFLOADER_SRC
"${BOX86_ROOT}/src/librarian/dictionnary.c"
"${BOX86_ROOT}/src/librarian/symbols.c"
"${BOX86_ROOT}/src/librarian/globalsymbols.c"
"${BOX86_ROOT}/src/emu/x86compstrings.c"
"${BOX86_ROOT}/src/emu/x86emu.c"
"${BOX86_ROOT}/src/emu/x86run_private.c"
"${BOX86_ROOT}/src/emu/x86syscall.c"
@ -347,6 +348,7 @@ set(INTERPRETER
"${BOX86_ROOT}/src/emu/x86run660f.c"
"${BOX86_ROOT}/src/emu/x86run66d9.c"
"${BOX86_ROOT}/src/emu/x86run66dd.c"
"${BOX86_ROOT}/src/emu/x86run66f20f.c"
"${BOX86_ROOT}/src/emu/x86run67.c"
"${BOX86_ROOT}/src/emu/x86run6766.c"
"${BOX86_ROOT}/src/emu/x86rund8.c"
@ -920,4 +922,19 @@ add_test(NAME sse_optimized COMMAND ${CMAKE_COMMAND} -D TEST_PROGRAM=${CMAKE_BIN
set_tests_properties(sse_optimized PROPERTIES ENVIRONMENT "BOX86_DYNAREC_FASTNAN=0;BOX86_DYNAREC_FASTROUND=0")
add_test(bswap ${CMAKE_COMMAND} -D TEST_PROGRAM=${CMAKE_BINARY_DIR}/${BOX86}
-D TEST_ARGS=${CMAKE_SOURCE_DIR}/tests/test23 -D TEST_OUTPUT=tmpfile23.txt
-D TEST_REFERENCE=${CMAKE_SOURCE_DIR}/tests/ref23.txt
-P ${CMAKE_SOURCE_DIR}/runTest.cmake )
#add_test(feround ${CMAKE_COMMAND} -D TEST_PROGRAM=${CMAKE_BINARY_DIR}/${BOX86}
# -D TEST_ARGS=${CMAKE_SOURCE_DIR}/tests/test24 -D TEST_OUTPUT=tmpfile24.txt
# -D TEST_REFERENCE=${CMAKE_SOURCE_DIR}/tests/ref24.txt
# -P ${CMAKE_SOURCE_DIR}/runTest.cmake )
add_test(sse4_2 ${CMAKE_COMMAND} -D TEST_PROGRAM=${CMAKE_BINARY_DIR}/${BOX86}
-D TEST_ARGS=${CMAKE_SOURCE_DIR}/tests/test25 -D TEST_OUTPUT=tmpfile25.txt
-D TEST_REFERENCE=${CMAKE_SOURCE_DIR}/tests/ref25.txt
-P ${CMAKE_SOURCE_DIR}/runTest.cmake )
endif(BOX86LIB)

View File

@ -189,6 +189,9 @@ Op is 20-27
// and dst, src, #(imm8): unconditional form, the top nibble of the opcode word is fixed to 0xe
#define AND_IMM8(dst, src, imm8) \
EMIT(0xe2000000 | ((dst) << 12) | ((src) << 16) | brIMM(imm8) )
// and dst, src, #(imm8) with cond: same encoding as AND_IMM8, but the caller
// supplies the condition bits, which are OR'ed as-is into the opcode word
#define AND_IMM8_cond(cond, dst, src, imm8) \
EMIT((cond) | 0x02000000 | ((dst) << 12) | ((src) << 16) | brIMM(imm8) )
// and dst, src1, #imm ror rot*2: rot is placed at bit 8 and up of the opcode word
#define AND_IMM8_ROR(dst, src, imm8, rot) \
EMIT(0xe2000000 | ((dst) << 12) | ((src) << 16) | ((rot)<<8) | brIMM(imm8) )
@ -633,11 +636,15 @@ Op is 20-27
#define VMRS(Rt) EMIT(c__ | (0b1110<<24) | (0b1111<<20) | (0b0001<<16) | ((Rt)<<12) | (0b1010<<8) | (0b0001<<4) | (0b0000))
// Move to FPSCR from Arm register Rt, using the default condition c__
#define VMSR(Rt) EMIT(c__ | (0b1110<<24) | (0b1110<<20) | (0b0001<<16) | ((Rt)<<12) | (0b1010<<8) | (0b0001<<4) | (0b0000))
// Move to FPSCR from Arm register Rt, executed only under condition `cond`.
// `cond` is OR'ed as-is into the opcode word, so it must already be in its final
// bit position (e.g. cNE). It is parenthesized so that any expression (a ternary,
// for instance) can be passed without being torn apart by the surrounding `|`.
#define VMSR_cond(cond, Rt) EMIT((cond) | (0b1110<<24) | (0b1110<<20) | (0b0001<<16) | ((Rt)<<12) | (0b1010<<8) | (0b0001<<4) | (0b0000))
// Move the FPSCR condition flags to the Arm flags APSR (VMRS with Rt=15)
// NOTE(review): the earlier wording had the direction reversed; VMRS reads FPSCR — confirm against the ARM reference
#define VMRS_APSR() VMRS(15)
// Move Arm register Rt into single-precision register Sm (bit 20 clear = "to VFP"), default condition c__
#define VMOVtoV(Sm, Rt) EMIT(c__ | (0b1110<<24) | (0b000<<21) | (0<<20) | ((((Sm)&0b11110)>>1)<<16) | ((Rt)<<12) | (0b1010<<8) | (((Sm)&1)<<7) |(0b00<<6) | (1<<4))
// Move Arm register Rt into single-precision register Sm, executed only under
// condition `cond`. `cond` is OR'ed as-is into the opcode word (already in its
// final bit position, e.g. cNE) and is parenthesized so that any expression can
// be passed safely.
#define VMOVtoVcond(cond, Sm, Rt) EMIT((cond) | (0b1110<<24) | (0b000<<21) | (0<<20) | ((((Sm)&0b11110)>>1)<<16) | ((Rt)<<12) | (0b1010<<8) | (((Sm)&1)<<7) |(0b00<<6) | (1<<4))
// Move single-precision register Sm into Arm register Rt (bit 20 set = "from VFP"), default condition c__
#define VMOVfrV(Rt, Sm) EMIT(c__ | (0b1110<<24) | (0b000<<21) | (1<<20) | ((((Sm)&0b11110)>>1)<<16) | ((Rt)<<12) | (0b1010<<8) | (((Sm)&1)<<7) |(0b00<<6) | (1<<4))

View File

@ -484,7 +484,7 @@ uintptr_t dynarec0F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst,
} else { // mem <= reg
addr = geted(dyn, addr, ninst, nextop, &ed, x2, &fixedaddress, 4095, 0, 0, NULL);
REV(x1, gd);
STR_IMM9(gd, ed, fixedaddress);
STR_IMM9(x1, ed, fixedaddress);
SMWRITE();
}
break;

View File

@ -877,11 +877,21 @@ uintptr_t dynarec660F(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int nins
q1 = v1;
// need rounding!
//VCVTQ_S32_F32(v0, v1);
u8 = sse_setround(dyn, ninst, x1, x2, x14);
VCVTR_S32_F32(q0*2+0, q1*2+0);
VCVTR_S32_F32(q0*2+1, q1*2+1);
VCVTR_S32_F32(q0*2+2, q1*2+2);
VCVTR_S32_F32(q0*2+3, q1*2+3);
if(!box86_dynarec_fastround)
u8 = sse_setround_reset(dyn, ninst, x1, x2, x14);
else
u8 = sse_setround(dyn, ninst, x1, x2, x14);
for(int i=0; i<4; ++i) {
VCVTR_S32_F32(q0*2+i, q1*2+i);
if(!box86_dynarec_fastround) {
VMRS(x3); // get the FPCSR reg and test FPU exception (IO only)
TSTS_IMM8_ROR(x3, 0b00000001, 0);
MOV_IMM_COND(cNE, x1, 0b10, 1); // 0x80000000
VMOVtoVcond(cNE, q0*2+i, x1);
BIC_IMM8_COND(cNE, x3, x3, 1, 0); // reset FPU
VMSR_cond(cNE, x3);
}
}
if(q0!=v0) {
VMOVQ(v0, q0);
}

120
src/emu/x86compstrings.c Normal file
View File

@ -0,0 +1,120 @@
#include <stdint.h>
#include "box86stack.h"
#include "x86emu.h"
#include "x86run_private.h"
#include "x86emu_private.h"
#include "x86compstrings.h"
/*
 * Elementary comparison used by the SSE4.2 string compares.
 *
 * Compares element i of `reg` (string length lreg) with element j of `mem`
 * (string length lmem). The aggregation mode is imm8[3:2] and the element
 * format is imm8[1:0] (00 = unsigned bytes, 01 = unsigned words,
 * 10 = signed bytes, 11 = signed words).
 *
 * When an element index is at or past the end of its string, the element is
 * "invalid" and a fixed value replaces the comparison:
 *   - reg invalid, mem invalid: 1 for modes 0b10/0b11, 0 otherwise
 *   - reg invalid, mem valid  : 1 for mode 0b11, 0 otherwise
 *   - reg valid,   mem invalid: always 0
 *
 * With both elements valid, range mode (0b01) tests reg[i] <= mem[j] for even i
 * and reg[i] >= mem[j] for odd i; every other mode tests for equality.
 *
 * Returns 0 or 1.
 */
static int overrideIfDataInvalid(sse_regs_t* mem, int lmem, sse_regs_t* reg, int lreg, int j, int i, int imm8)
{
    const int aggregation = (imm8>>2)&3;
    const int reg_valid = (i<lreg);
    const int mem_valid = (j<lmem);

    if(!reg_valid) {
        if(mem_valid)
            return (aggregation==0b11)?1:0;
        // neither element is valid
        return (aggregation&0b10)?1:0;
    }
    if(!mem_valid)
        return 0;

    // both elements are valid from here on
    if(aggregation==0b01) {
        // range: even i is a lower bound, odd i an upper bound
        const int is_upper = i&1;
        switch(imm8&3) {
            case 0b00: // ub
                return is_upper?(reg->ub[i]>=mem->ub[j]):(reg->ub[i]<=mem->ub[j]);
            case 0b01: // uw
                return is_upper?(reg->uw[i]>=mem->uw[j]):(reg->uw[i]<=mem->uw[j]);
            case 0b10: // sb
                return is_upper?(reg->sb[i]>=mem->sb[j]):(reg->sb[i]<=mem->sb[j]);
            default:   // 0b11: sw
                return is_upper?(reg->sw[i]>=mem->sw[j]):(reg->sw[i]<=mem->sw[j]);
        }
    }

    // equal any / equal each / equal ordered: plain equality, signedness is irrelevant
    if(imm8&1)
        return (reg->uw[i] == mem->uw[j]);
    return (reg->ub[i] == mem->ub[j]);
}
/*
 * Core of the SSE4.2 explicit-length string compares.
 *
 *   emu   emulator state; its flags are rewritten (CF, ZF, SF, OF set from the
 *         result, AF and PF cleared)
 *   mem   the string whose length is lmem (callers in this code base pass the
 *         Ex operand with EDX)
 *   reg   the string whose length is lreg (callers pass the Gx operand with EAX)
 *   lmem, lreg  signed lengths; the absolute value is used, saturated to the
 *         number of packed elements (16 bytes or 8 words, chosen by imm8 bit 0)
 *   imm8  control byte: [1:0] element format, [3:2] aggregation, [5:4] polarity
 *
 * Returns IntRes2, one bit per element of `mem`.
 */
uint32_t sse42_compare_string_explicit_len(x86emu_t* emu, sse_regs_t* mem, int lmem, sse_regs_t* reg, int lreg, uint8_t imm8)
{
    // get number of packed byte/word
    int n_packed = (imm8&1)?8:16;
    // Saturate |length| to n_packed. The range test is done BEFORE negating so
    // that INT_MIN (register value 0x80000000) saturates instead of overflowing.
    if(lreg>n_packed || lreg<-n_packed) lreg = n_packed;
    else if(lreg<0) lreg = -lreg;
    if(lmem>n_packed || lmem<-n_packed) lmem = n_packed;
    else if(lmem<0) lmem = -lmem;
    // aggregate to intres1
    uint32_t intres1 = 0;
    switch((imm8>>2)&3) {
        case 0b00: //Equal any
            for(int j=0; j<n_packed; ++j)
                for(int i=0; i<n_packed; ++i) {
                    intres1 |= overrideIfDataInvalid(mem, lmem, reg, lreg, j, i, imm8)<<j;
                }
            break;
        case 0b01: // Range
            // reg holds (lower, upper) pairs: both bounds of a pair must match
            for(int j=0; j<n_packed; ++j)
                for(int i=0; i<n_packed; i+=2) {
                    intres1 |= (overrideIfDataInvalid(mem, lmem, reg, lreg, j, i, imm8) & overrideIfDataInvalid(mem, lmem, reg, lreg, j, i+1, imm8))<<j;
                }
            break;
        case 0b10: // Equal each
            for(int i=0; i<n_packed; ++i) {
                intres1 |= overrideIfDataInvalid(mem, lmem, reg, lreg, i, i, imm8)<<i;
            }
            break;
        case 0b11: // Equal ordered
            // start with every position matching, then clear bit j as soon as
            // one element of reg fails to match mem at offset j
            intres1 = (1<<n_packed)-1;
            for(int j=0; j<n_packed; ++j)
                for(int i=0; i<n_packed-j; ++i) {
                    int k = i+j;
                    intres1 &= (((1<<n_packed)-1)^(1<<j)) | overrideIfDataInvalid(mem, lmem, reg, lreg, k, i, imm8)<<j;
                }
            break;
    }
    // build intres2
    uint32_t intres2 = intres1;
    switch((imm8>>4)&3) {
        case 0b01: intres2 ^= ((1<<n_packed)-1); break; // invert every bit
        case 0b11: intres2 ^= ((1<<lmem)-1); break;     // invert only the bits of valid mem elements
    }
    // and now set the flags
    RESET_FLAGS(emu);
    CONDITIONAL_SET_FLAG(intres2, F_CF);
    CONDITIONAL_SET_FLAG(lmem<n_packed, F_ZF);
    CONDITIONAL_SET_FLAG(lreg<n_packed, F_SF);
    CONDITIONAL_SET_FLAG(intres2&1, F_OF);
    CLEAR_FLAG(F_AF);
    CLEAR_FLAG(F_PF);
    return intres2;
}
/*
 * Implicit-length variant of the SSE4.2 string compares.
 *
 * The length of each string is the index of its first zero element (byte or
 * word, chosen by imm8 bit 0), or the full element count (16 bytes / 8 words)
 * when no zero element is present. The comparison itself, including the flag
 * updates on `emu`, is delegated to sse42_compare_string_explicit_len().
 */
uint32_t sse42_compare_string_implicit_len(x86emu_t* emu, sse_regs_t* mem, sse_regs_t* reg, uint8_t imm8)
{
    const int is_word = imm8&1;
    const int n_elems = is_word?8:16;
    int lmem;
    int lreg;
    // scan each operand up to its terminating zero element
    for(lmem=0; lmem<n_elems; ++lmem) {
        if(!(is_word?mem->uw[lmem]:mem->ub[lmem]))
            break;
    }
    for(lreg=0; lreg<n_elems; ++lreg) {
        if(!(is_word?reg->uw[lreg]:reg->ub[lreg]))
            break;
    }
    return sse42_compare_string_explicit_len(emu, mem, lmem, reg, lreg, imm8);
}

13
src/emu/x86compstrings.h Normal file
View File

@ -0,0 +1,13 @@
#ifndef __X86_CMPSTRINGS_H__
#define __X86_CMPSTRINGS_H__
#include <stdint.h>
#include "regs.h"
typedef struct x86emu_s x86emu_t;
// SSE4.2 string compare with explicit lengths.
// mem/lmem and reg/lreg are the two strings and their signed lengths (the
// absolute value is used, capped to the element count selected by imm8 bit 0).
// Updates the flags of emu and returns the per-element result mask.
uint32_t sse42_compare_string_explicit_len(x86emu_t* emu, sse_regs_t* mem, int lmem, sse_regs_t* reg, int lreg, uint8_t imm8);
// SSE4.2 string compare with implicit lengths: each string ends at its first
// zero element. Updates the flags of emu and returns the per-element result mask.
uint32_t sse42_compare_string_implicit_len(x86emu_t* emu, sse_regs_t* mem, sse_regs_t* reg, uint8_t imm8);
#endif //__X86_CMPSTRINGS_H__

View File

@ -99,11 +99,21 @@ uintptr_t Run66(x86emu_t *emu, int rep, uintptr_t addr)
break;
case 0x0F: /* 66 0f prefix */
#ifdef TEST_INTERPRETER
return Test660F(test, addr);
#else
return Run660F(emu, addr);
#endif
switch(rep) {
case 2: return 0;
case 1:
#ifdef TEST_INTERPRETER
return Test66F20F(test, addr);
#else
return Run66F20F(emu, addr);
#endif
default:
#ifdef TEST_INTERPRETER
return Test660F(test, addr);
#else
return Run660F(emu, addr);
#endif
}
case 0x1E: /* PUSH DS */
Push16(emu, emu->segs[_DS]);

View File

@ -15,6 +15,7 @@
#include "box86context.h"
#include "modrm.h"
#include "x86compstrings.h"
static uint8_t ff_mult(uint8_t a, uint8_t b)
{
@ -37,6 +38,8 @@ static uint8_t ff_mult(uint8_t a, uint8_t b)
return retval;
}
#define MODREG ((nextop&0xC0)==0xC0)
#ifdef TEST_INTERPRETER
uintptr_t Test660F(x86test_t *test, uintptr_t addr)
#else
@ -54,6 +57,8 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
int32_t tmp32s;
sse_regs_t *opex, eax1, *opx2;
mmx87_regs_t *opem;
float tmpf;
double tmpd;
#ifndef NOALIGN
int is_nan;
#endif
@ -467,6 +472,15 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
}
break;
case 0x10: /* PBLENDVB Gx, Ex */
nextop = F8;
GET_EX;
for (int i=0; i<16; ++i) {
if(emu->xmm[0].ub[i]&0x80)
GX.ub[i] = EX->ub[i];
}
break;
case 0x14: /* BLENDVPS Gx, Ex */
nextop = F8;
GET_EX;
@ -475,6 +489,14 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
GX.ud[i] = EX->ud[i];
}
break;
case 0x15: /* BLENDVPD Gx, Ex */
nextop = F8;
GET_EX;
for (int i=0; i<2; ++i) {
if(emu->xmm[0].q[i]&0x8000000000000000LL)
GX.q[i] = EX->q[i];
}
break;
case 0x17: // PTEST GX, EX
nextop = F8;
@ -482,6 +504,10 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
CHECK_FLAGS(emu);
CONDITIONAL_SET_FLAG(!((GX.q[0]&EX->q[0])|(GX.q[1]&EX->q[1])), F_ZF);
CONDITIONAL_SET_FLAG(!(((~GX.q[0])&EX->q[0])|((~GX.q[1])&EX->q[1])), F_CF);
CLEAR_FLAG(F_AF);
CLEAR_FLAG(F_OF);
CLEAR_FLAG(F_SF);
CLEAR_FLAG(F_PF);
break;
case 0x1C: /* PABSB Gx, Ex */
@ -543,6 +569,36 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
GX.sq[i] = EX->sd[i];
break;
case 0x28: /* PMULDQ Gx, Ex */
nextop = F8;
GET_EX;
GX.sq[1] = ((int64_t)GX.sd[2])*(int64_t)EX->sd[2];
GX.sq[0] = ((int64_t)GX.sd[0])*(int64_t)EX->sd[0];
break;
case 0x29: /* PCMPEQQ Gx, Ex */
nextop = F8;
GET_EX;
for(int i=1; i>=0; --i)
GX.sq[i] = (GX.sq[i]==EX->sq[i])?-1LL:0LL;
break;
case 0x2A: /* MOVNTDQA Gx, Ex */
nextop = F8;
GET_EX;
GX.q[0] = EX->q[0];
GX.q[1] = EX->q[1];
break;
case 0x2B: /* PACKUSDW Gx, Ex */
nextop = F8;
GET_EX;
for(int i=0; i<4; ++i)
GX.uw[i] = (GX.sd[i]<0)?0:((GX.sd[i]>65535)?65535:GX.sd[i]);
if(&GX==EX)
GX.q[1] = GX.q[0];
else
for(int i=0; i<4; ++i)
GX.uw[i+4] = (EX->sd[i]<0)?0:((EX->sd[i]>65535)?65535:EX->sd[i]);
break;
case 0x30: /* PMOVZXBW Gx, Ex */
nextop = F8;
GET_EX;
@ -580,6 +636,19 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
GX.q[i] = EX->ud[i];
break;
case 0x37: /* PCMPGTQ Gx, Ex */
nextop = F8;
GET_EX;
for(int i=1; i>=0; --i)
GX.sq[i] = (GX.sq[i]>EX->sq[i])?-1LL:0LL;
break;
case 0x38: /* PMINSB Gx, Ex */
nextop = F8;
GET_EX;
for(int i=0; i<16; ++i)
if(GX.sb[i]>EX->sb[i])
GX.sb[i] = EX->sb[i];
break;
case 0x39: /* PMINSD Gx, Ex */
nextop = F8;
GET_EX;
@ -587,7 +656,27 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
if(GX.sd[i]>EX->sd[i])
GX.sd[i] = EX->sd[i];
break;
case 0x3A: /* PMINUW Gx, Ex */
nextop = F8;
GET_EX;
for(int i=0; i<8; ++i)
if(GX.uw[i]>EX->uw[i])
GX.uw[i] = EX->uw[i];
break;
case 0x3B: /* PMINUD Gx, Ex */
nextop = F8;
GET_EX;
for(int i=0; i<4; ++i)
if(GX.ud[i]>EX->ud[i])
GX.ud[i] = EX->ud[i];
break;
case 0x3C: /* PMAXSB Gx, Ex */
nextop = F8;
GET_EX;
for(int i=0; i<16; ++i)
if(GX.sb[i]<EX->sb[i])
GX.sb[i] = EX->sb[i];
break;
case 0x3D: /* PMAXSD Gx, Ex */
nextop = F8;
GET_EX;
@ -595,7 +684,20 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
if(GX.sd[i]<EX->sd[i])
GX.sd[i] = EX->sd[i];
break;
case 0x3E: /* PMAXUW Gx, Ex */
nextop = F8;
GET_EX;
for(int i=0; i<8; ++i)
if(GX.uw[i]<EX->uw[i])
GX.uw[i] = EX->uw[i];
break;
case 0x3F: /* PMAXUD Gx, Ex */
nextop = F8;
GET_EX;
for(int i=0; i<4; ++i)
if(GX.ud[i]<EX->ud[i])
GX.ud[i] = EX->ud[i];
break;
case 0x40: /* PMULLD Gx, Ex */
nextop = F8;
GET_EX;
@ -603,6 +705,22 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
GX.sd[i] *= EX->sd[i];
}
break;
case 0x41: /* PHMINPOSUW Gx, Ex */
nextop = F8;
GET_EX;
tmp16u = EX->uw[0];
tmp16s = 0;
for(int i=1; i<8; ++i) {
if(EX->uw[i]<tmp16u) {
tmp16u = EX->uw[i];
tmp16s = i;
}
}
GX.q[1] = 0;
GX.uw[0] = tmp16u;
GX.uw[1] = tmp16s;
GX.ud[1] = 0;
break;
case 0xDB: /* AESIMC Gx, Ex */
nextop = F8;
@ -701,12 +819,22 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
GX.q[1] ^= EX->q[1];
break;
case 0xF0: /* MOVBE Gw, Ew */
nextop = F8;
GET_ED;
GD.word[0] = __builtin_bswap16(ED->word[0]);
break;
case 0xF1: /* MOVBE Ew, Gw */
nextop = F8;
GET_ED;
ED->word[0] = __builtin_bswap16(GD.word[0]);
break;
default:
return 0;
}
break;
case 0x3A: // these are some SSE3 opcodes
case 0x3A: // these are some SSE3 & SSE4.x opcodes
opcode = F8;
switch(opcode) {
case 0x08: // roundps GX, EX, u8
@ -805,7 +933,24 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
break;
}
break;
case 0x0C: /* PBLENDPS Gx, Ex, Ib */
nextop = F8;
GET_EX;
tmp8u = F8;
for (int i=0; i<4; ++i) {
if(tmp8u&(1<<i))
GX.ud[i] = EX->ud[i];
}
break;
case 0x0D: /* PBLENDPD Gx, Ex, Ib */
nextop = F8;
GET_EX;
tmp8u = F8;
for (int i=0; i<2; ++i) {
if(tmp8u&(1<<i))
GX.q[i] = EX->q[i];
}
break;
case 0x0E: /* PBLENDW Gx, Ex, Ib */
nextop = F8;
GET_EX;
@ -830,12 +975,36 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
}
break;
case 0x14: // PEXTRB ED, GX, u8
nextop = F8;
GET_ED;
tmp8u = F8;
if(MODREG)
ED->dword[0] = GX.ub[tmp8u&0x0f];
else
ED->byte[0] = GX.ub[tmp8u&0x0f];
break;
case 0x15: // PEXTRW Ew,Gx,Ib
nextop = F8;
GET_ED;
tmp8u = F8;
if(MODREG)
ED->dword[0] = GX.uw[tmp8u&7]; // 16bits extract, 0 extended
else
ED->word[0] = GX.uw[tmp8u&7];
break;
case 0x16: // PEXTRD ED, GX, u8
nextop = F8;
GET_ED;
tmp8u = F8;
ED->dword[0] = GX.ud[tmp8u&3];
break;
case 0x17: // EXTRACTPS ED, GX, u8
nextop = F8;
GET_ED;
tmp8u = F8;
ED->dword[0] = GX.ud[tmp8u&3];
break;
case 0x20: // PINSRB GX, Ed, Ib
nextop = F8;
@ -860,6 +1029,49 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
GX.ud[tmp8u&0x3] = ED->dword[0];
break;
case 0x40: /* DPPS Gx, Ex, Ib */
nextop = F8;
GET_EX;
tmp8u = F8;
tmpf = 0.0f;
for(int i=0; i<4; ++i)
if(tmp8u&(1<<(i+4)))
tmpf += GX.f[i]*EX->f[i];
for(int i=0; i<4; ++i)
GX.f[i] = (tmp8u&(1<<i))?tmpf:0.0f;
break;
case 0x41: /* DPPD Gx, Ex, Ib */
nextop = F8;
GET_EX;
tmp8u = F8;
tmpd = 0.0;
if(tmp8u&(1<<(4+0)))
tmpd += GX.d[0]*EX->d[0];
if(tmp8u&(1<<(4+1)))
tmpd += GX.d[1]*EX->d[1];
GX.d[0] = (tmp8u&(1<<(0)))?tmpd:0.0;
GX.d[1] = (tmp8u&(1<<(1)))?tmpd:0.0;
break;
case 0x42: /* MPSADBW Gx, Ex, Ib */
nextop = F8;
GET_EX;
tmp8u = F8;
{
int src = tmp8u&3;
int dst = (tmp8u>>2)&1;
int b[11];
for (int i=0; i<11; ++i)
b[i] = GX.ub[dst*4+i];
for(int i=0; i<8; ++i) {
int tmp = abs(b[i+0]-EX->ub[src*4+0]);
tmp += abs(b[i+1]-EX->ub[src*4+1]);
tmp += abs(b[i+2]-EX->ub[src*4+2]);
tmp += abs(b[i+3]-EX->ub[src*4+3]);
GX.uw[i] = tmp;
}
}
break;
case 0x44: /* PCLMULQDQ Gx, Ex, Ib */
nextop = F8;
GET_EX;
@ -880,6 +1092,61 @@ uintptr_t Run660F(x86emu_t *emu, uintptr_t addr)
}
break;
case 0x60: /* PCMPESTRM */
nextop = F8;
GET_EX;
tmp8u = F8;
tmp32u = sse42_compare_string_explicit_len(emu, EX, R_EDX, &GX, R_EAX, tmp8u);
if(tmp8u&0b1000000) {
switch(tmp8u&1) {
case 0: for(int i=0; i<16; ++i) GX.ub[i] = ((tmp32u>>i)&1)?0xff:0x00; break;
case 1: for(int i=0; i<8; ++i) GX.uw[i] = ((tmp32u>>i)&1)?0xffff:0x0000; break;
}
} else {
GX.q[1] = GX.q[0] = 0;
GX.uw[0] = tmp32u;
}
break;
case 0x61: /* PCMPESTRI */
nextop = F8;
GET_EX;
tmp8u = F8;
tmp32u = sse42_compare_string_explicit_len(emu, EX, R_EDX, &GX, R_EAX, tmp8u);
if(!tmp32u)
R_ECX = (tmp8u&1)?8:16;
else if(tmp8u&0b1000000)
R_ECX = 31-__builtin_clz(tmp32u);
else
R_ECX = __builtin_ffs(tmp32u) - 1;
break;
case 0x62: /* PCMPISTRM */
nextop = F8;
GET_EX;
tmp8u = F8;
tmp32u = sse42_compare_string_implicit_len(emu, EX, &GX, tmp8u);
if(tmp8u&0b1000000) {
switch(tmp8u&1) {
case 0: for(int i=0; i<16; ++i) GX.ub[i] = ((tmp32u>>i)&1)?0xff:0x00; break;
case 1: for(int i=0; i<8; ++i) GX.uw[i] = ((tmp32u>>i)&1)?0xffff:0x0000; break;
}
} else {
GX.q[1] = GX.q[0] = 0;
GX.uw[0] = tmp32u;
}
break;
case 0x63: /* PCMPISTRI */
nextop = F8;
GET_EX;
tmp8u = F8;
tmp32u = sse42_compare_string_implicit_len(emu, EX, &GX, tmp8u);
if(!tmp32u)
R_ECX = (tmp8u&1)?8:16;
else if(tmp8u&0b1000000)
R_ECX = 31-__builtin_clz(tmp32u);
else
R_ECX = __builtin_ffs(tmp32u) - 1;
break;
case 0xDF: // AESKEYGENASSIST Gx, Ex, u8
nextop = F8;
GET_EX;

76
src/emu/x86run66f20f.c Normal file
View File

@ -0,0 +1,76 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include "debug.h"
#include "box86stack.h"
#include "x86emu.h"
#include "x86run.h"
#include "x86emu_private.h"
#include "x86run_private.h"
#include "x86primop.h"
#include "x86trace.h"
#include "box86context.h"
#include "modrm.h"
#include "x86compstrings.h"
#define MODREG ((nextop&0xC0)==0xC0)
/*
 * Interpreter for the opcodes that follow the 66 F2 0F prefix sequence.
 *
 * Built twice: as Run66F20F (works on emu) or, with TEST_INTERPRETER defined,
 * as Test66F20F (works on test->emu).
 *
 * addr is the address of the first opcode byte after the prefixes.
 * Returns the address following the decoded instruction, or 0 when the opcode
 * is not handled here.
 */
#ifdef TEST_INTERPRETER
uintptr_t Test66F20F(x86test_t *test, uintptr_t addr)
#else
uintptr_t Run66F20F(x86emu_t *emu, uintptr_t addr)
#endif
{
    // NOTE(review): several of these locals look unused, but the decoding macros
    // (F8, GET_EW, ...) may reference them by name, so they are all kept.
    uint8_t opcode;
    uint8_t nextop;
    reg32_t *oped;
    uint8_t tmp8u;
    int8_t tmp8s;
    uint16_t tmp16u;
    int16_t tmp16s;
    uint32_t tmp32u;
    int32_t tmp32s;
    sse_regs_t *opex, eax1, *opx2;
    mmx87_regs_t *opem;
    float tmpf;
    double tmpd;
    #ifdef TEST_INTERPRETER
    x86emu_t* emu = test->emu;
    #endif
    opcode = F8;
    switch(opcode) {
        case 0x38: // SSE 4.x
            opcode = F8;
            switch(opcode) {
                case 0xF1: // CRC32 Gd, Ew
                    nextop = F8;
                    GET_EW;
                    {
                        // Snapshot the 16-bit source first: when Ew and Gd name the
                        // same register, updating Gd for the first byte would
                        // otherwise corrupt the second source byte.
                        const uint8_t src[2] = { EW->byte[0], EW->byte[1] };
                        for(int j=0; j<2; ++j) {
                            GD.dword[0] ^= src[j];
                            // bitwise CRC step; 0x82f63b78 is the bit-reflected
                            // CRC-32C (Castagnoli) polynomial
                            for (int i = 0; i < 8; i++) {
                                if (GD.dword[0] & 1)
                                    GD.dword[0] = (GD.dword[0] >> 1) ^ 0x82f63b78;
                                else
                                    GD.dword[0] = (GD.dword[0] >> 1);
                            }
                        }
                    }
                    break;
                default:
                    return 0;
            }
            break;
        default:
            return 0;
    }
    return addr;
}

View File

@ -212,6 +212,7 @@ uintptr_t Run6466(x86emu_t *emu, uintptr_t tlsdata, uintptr_t addr);
uintptr_t Run6467(x86emu_t *emu, uintptr_t tlsdata, uintptr_t addr);
uintptr_t Run66(x86emu_t *emu, int rep, uintptr_t addr);
uintptr_t Run660F(x86emu_t *emu, uintptr_t addr);
uintptr_t Run66F20F(x86emu_t *emu, uintptr_t addr);
uintptr_t Run6664(x86emu_t *emu, int seg, uintptr_t addr);
uintptr_t Run66D9(x86emu_t *emu, uintptr_t addr);
uintptr_t Run66DD(x86emu_t *emu, uintptr_t addr);
@ -241,6 +242,7 @@ uintptr_t Test6466(x86test_t *test, uintptr_t tlsdata, uintptr_t addr);
uintptr_t Test6467(x86test_t *test, uintptr_t tlsdata, uintptr_t addr);
uintptr_t Test66(x86test_t *test, int rep, uintptr_t addr);
uintptr_t Test660F(x86test_t *test, uintptr_t addr);
uintptr_t Test66F20F(x86test_t *test, uintptr_t addr);
uintptr_t Test6664(x86test_t *test, int seg, uintptr_t addr);
uintptr_t Test66D9(x86test_t *test, uintptr_t addr);
uintptr_t Test66DD(x86test_t *test, uintptr_t addr);

View File

@ -103,6 +103,42 @@ uintptr_t RunF20F(x86emu_t *emu, uintptr_t addr, int *step)
break;
}
break;
case 0x38: // more opcodes
opcode = F8;
switch(opcode) {
case 0xF0: // CRC32 Gd, Eb
nextop = F8;
GET_EB;
GD.dword[0] ^= EB->byte[0];
for (int i = 0; i < 8; i++) {
if (GD.dword[0] & 1)
GD.dword[0] = (GD.dword[0] >> 1) ^ 0x82f63b78;
else
GD.dword[0] = (GD.dword[0] >> 1);
}
break;
case 0xF1: // CRC32 Gd, Ed
nextop = F8;
GET_ED;
for(int j=0; j<4; ++j) {
GD.dword[0] ^= ED->byte[j];
for (int i = 0; i < 8; i++) {
if (GD.dword[0] & 1)
GD.dword[0] = (GD.dword[0] >> 1) ^ 0x82f63b78;
else
GD.dword[0] = (GD.dword[0] >> 1);
}
}
break;
default:
return 0;
}
break;
case 0x51: /* SQRTSD Gx, Ex */
nextop = F8;
GET_EX;

View File

@ -210,7 +210,10 @@ void my_cpuid(x86emu_t* emu, uint32_t tmp32u)
| 1<<9 // SSSE3
| 1<<12 // fma
| 1<<13 // cx16 (cmpxchg16)
| 1<<19 // SSE4_1
| 1<<20 // SSE4_2
| 1<<22 // MOVBE
| 1<<23 // POPCOUNT
| 1<<25 // aesni
;
break;

View File

@ -1694,7 +1694,7 @@ void CreateCPUInfoFile(int fd)
P;
sprintf(buff, "bogomips\t: %g\n", getBogoMips());
P;
sprintf(buff, "flags\t\t: fpu cx8 sep ht cmov clflush mmx sse sse2 rdtscp ssse3 fma cpuid pclmulqdq cx16 aes movbe pni\n");
sprintf(buff, "flags\t\t: fpu cx8 sep ht cmov clflush mmx sse sse2 rdtscp ssse3 fma cpuid pclmulqdq cx16 aes movbe pni sse4_1 sse4_2 lzcnt popcnt\n");
P;
sprintf(buff, "\n");
P;

View File

@ -1,43 +1,25 @@
ucomiss 1.000000, 2.000000 => 0x203
ucomiss 2.000000, 1.000000 => 0x202
ucomiss -1.000000, 2.000000 => 0x203
ucomiss 2.000000, -1.000000 => 0x202
ucomiss -1.000000, 340282346638528859811704183484516925440.000000 => 0x203
ucomiss 340282346638528859811704183484516925440.000000, -1.000000 => 0x202
ucomiss -1.000000, -340282346638528859811704183484516925440.000000 => 0x202
ucomiss -340282346638528859811704183484516925440.000000, -1.000000 => 0x203
ucomiss 340282346638528859811704183484516925440.000000, -340282346638528859811704183484516925440.000000 => 0x202
ucomiss -340282346638528859811704183484516925440.000000, 340282346638528859811704183484516925440.000000 => 0x203
ucomiss -0.000000, 0.000000 => 0x242
ucomiss 0.000000, -0.000000 => 0x242
ucomiss -2.000000, -2.000000 => 0x242
ucomiss 5.000000, 5.000000 => 0x242
ucomiss 5.000000, inf => 0x203
ucomiss inf, 5.000000 => 0x202
ucomiss 5.000000, -inf => 0x202
ucomiss -inf, 5.000000 => 0x203
ucomiss 5.000000, nan => 0x203
ucomiss nan, 5.000000 => 0x203
ucomiss 5.000000, 5.000000 => 0x242
ucomiss 5.000000, 5.000000 => 0x242
ucomiss 1.000000, inf => 0x203
ucomiss inf, 1.000000 => 0x202
ucomiss 1.000000, -inf => 0x202
ucomiss -inf, 1.000000 => 0x203
ucomiss 1.000000, nan => 0x203
ucomiss nan, 1.000000 => 0x203
ucomiss 1.000000, 1.000000 => 0x242
ucomiss 1.000000, 1.000000 => 0x242
ucomiss inf, inf => 0x242
ucomiss -inf, inf => 0x203
ucomiss inf, -inf => 0x202
ucomiss nan, nan => 0x203
minss 1, 2 => 1
minss 2, 1 => 1
minss -1, 2 => -1
minss 2, -1 => -1
minss -0, 0 => 0
minss 0, -0 => -0
minss 5, -10 => -10
minss -10, 5 => -10
minss -inf, -10 => -inf
minss -10, -inf => -inf
minss inf, -10 => -10
minss -10, inf => -10
minss nan, -10 => -10
minss -10, nan => nan
minss -inf, 2 => -inf
minss 2, -inf => -inf
minss inf, 2 => 2
minss 2, inf => 2
minss nan, 2 => 2
minss 2, nan => nan
minss nan, 3.40282e+38 => 3.40282e+38
minss 3.40282e+38, nan => nan
minss -inf, 3.40282e+38 => -inf
@ -46,18 +28,12 @@ minss inf, 3.40282e+38 => 3.40282e+38
minss 3.40282e+38, inf => 3.40282e+38
maxss 1, 2 => 2
maxss 2, 1 => 2
maxss -1, 2 => 2
maxss 2, -1 => 2
maxss -0, 0 => 0
maxss 0, -0 => -0
maxss 5, -10 => 5
maxss -10, 5 => 5
maxss -inf, -10 => -10
maxss -10, -inf => -10
maxss inf, -10 => inf
maxss -10, inf => inf
maxss nan, -10 => -10
maxss -10, nan => nan
maxss -inf, 2 => 2
maxss 2, -inf => 2
maxss inf, 2 => inf
maxss 2, inf => inf
maxss nan, 2 => 2
maxss 2, nan => nan
maxss nan, 3.40282e+38 => 3.40282e+38
maxss 3.40282e+38, nan => nan
maxss -inf, 3.40282e+38 => 3.40282e+38
@ -66,208 +42,112 @@ maxss inf, 3.40282e+38 => inf
maxss 3.40282e+38, inf => inf
cmpss 0 1.000000, 2.000000 => 0x0
cmpss 0 2.000000, 1.000000 => 0x0
cmpss 0 -1.000000, 2.000000 => 0x0
cmpss 0 2.000000, -1.000000 => 0x0
cmpss 0 -1.000000, 340282346638528859811704183484516925440.000000 => 0x0
cmpss 0 340282346638528859811704183484516925440.000000, -1.000000 => 0x0
cmpss 0 -1.000000, -340282346638528859811704183484516925440.000000 => 0x0
cmpss 0 -340282346638528859811704183484516925440.000000, -1.000000 => 0x0
cmpss 0 340282346638528859811704183484516925440.000000, -340282346638528859811704183484516925440.000000 => 0x0
cmpss 0 -340282346638528859811704183484516925440.000000, 340282346638528859811704183484516925440.000000 => 0x0
cmpss 0 -0.000000, 0.000000 => 0xffffffff
cmpss 0 0.000000, -0.000000 => 0xffffffff
cmpss 0 -2.000000, -2.000000 => 0xffffffff
cmpss 0 5.000000, 5.000000 => 0xffffffff
cmpss 0 5.000000, inf => 0x0
cmpss 0 inf, 5.000000 => 0x0
cmpss 0 5.000000, -inf => 0x0
cmpss 0 -inf, 5.000000 => 0x0
cmpss 0 5.000000, nan => 0x0
cmpss 0 nan, 5.000000 => 0x0
cmpss 0 5.000000, 5.000000 => 0xffffffff
cmpss 0 5.000000, 5.000000 => 0xffffffff
cmpss 0 1.000000, inf => 0x0
cmpss 0 inf, 1.000000 => 0x0
cmpss 0 1.000000, -inf => 0x0
cmpss 0 -inf, 1.000000 => 0x0
cmpss 0 1.000000, nan => 0x0
cmpss 0 nan, 1.000000 => 0x0
cmpss 0 1.000000, 1.000000 => 0xffffffff
cmpss 0 1.000000, 1.000000 => 0xffffffff
cmpss 0 inf, inf => 0xffffffff
cmpss 0 -inf, inf => 0x0
cmpss 0 inf, -inf => 0x0
cmpss 0 nan, nan => 0x0
cmpss 1 1.000000, 2.000000 => 0xffffffff
cmpss 1 2.000000, 1.000000 => 0x0
cmpss 1 -1.000000, 2.000000 => 0xffffffff
cmpss 1 2.000000, -1.000000 => 0x0
cmpss 1 -1.000000, 340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 1 340282346638528859811704183484516925440.000000, -1.000000 => 0x0
cmpss 1 -1.000000, -340282346638528859811704183484516925440.000000 => 0x0
cmpss 1 -340282346638528859811704183484516925440.000000, -1.000000 => 0xffffffff
cmpss 1 340282346638528859811704183484516925440.000000, -340282346638528859811704183484516925440.000000 => 0x0
cmpss 1 -340282346638528859811704183484516925440.000000, 340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 1 -0.000000, 0.000000 => 0x0
cmpss 1 0.000000, -0.000000 => 0x0
cmpss 1 -2.000000, -2.000000 => 0x0
cmpss 1 5.000000, 5.000000 => 0x0
cmpss 1 5.000000, inf => 0xffffffff
cmpss 1 inf, 5.000000 => 0x0
cmpss 1 5.000000, -inf => 0x0
cmpss 1 -inf, 5.000000 => 0xffffffff
cmpss 1 5.000000, nan => 0x0
cmpss 1 nan, 5.000000 => 0x0
cmpss 1 5.000000, 5.000000 => 0x0
cmpss 1 5.000000, 5.000000 => 0x0
cmpss 1 1.000000, inf => 0xffffffff
cmpss 1 inf, 1.000000 => 0x0
cmpss 1 1.000000, -inf => 0x0
cmpss 1 -inf, 1.000000 => 0xffffffff
cmpss 1 1.000000, nan => 0x0
cmpss 1 nan, 1.000000 => 0x0
cmpss 1 1.000000, 1.000000 => 0x0
cmpss 1 1.000000, 1.000000 => 0x0
cmpss 1 inf, inf => 0x0
cmpss 1 -inf, inf => 0xffffffff
cmpss 1 inf, -inf => 0x0
cmpss 1 nan, nan => 0x0
cmpss 2 1.000000, 2.000000 => 0xffffffff
cmpss 2 2.000000, 1.000000 => 0x0
cmpss 2 -1.000000, 2.000000 => 0xffffffff
cmpss 2 2.000000, -1.000000 => 0x0
cmpss 2 -1.000000, 340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 2 340282346638528859811704183484516925440.000000, -1.000000 => 0x0
cmpss 2 -1.000000, -340282346638528859811704183484516925440.000000 => 0x0
cmpss 2 -340282346638528859811704183484516925440.000000, -1.000000 => 0xffffffff
cmpss 2 340282346638528859811704183484516925440.000000, -340282346638528859811704183484516925440.000000 => 0x0
cmpss 2 -340282346638528859811704183484516925440.000000, 340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 2 -0.000000, 0.000000 => 0xffffffff
cmpss 2 0.000000, -0.000000 => 0xffffffff
cmpss 2 -2.000000, -2.000000 => 0xffffffff
cmpss 2 5.000000, 5.000000 => 0xffffffff
cmpss 2 5.000000, inf => 0xffffffff
cmpss 2 inf, 5.000000 => 0x0
cmpss 2 5.000000, -inf => 0x0
cmpss 2 -inf, 5.000000 => 0xffffffff
cmpss 2 5.000000, nan => 0x0
cmpss 2 nan, 5.000000 => 0x0
cmpss 2 5.000000, 5.000000 => 0xffffffff
cmpss 2 5.000000, 5.000000 => 0xffffffff
cmpss 2 1.000000, inf => 0xffffffff
cmpss 2 inf, 1.000000 => 0x0
cmpss 2 1.000000, -inf => 0x0
cmpss 2 -inf, 1.000000 => 0xffffffff
cmpss 2 1.000000, nan => 0x0
cmpss 2 nan, 1.000000 => 0x0
cmpss 2 1.000000, 1.000000 => 0xffffffff
cmpss 2 1.000000, 1.000000 => 0xffffffff
cmpss 2 inf, inf => 0xffffffff
cmpss 2 -inf, inf => 0xffffffff
cmpss 2 inf, -inf => 0x0
cmpss 2 nan, nan => 0x0
cmpss 3 1.000000, 2.000000 => 0x0
cmpss 3 2.000000, 1.000000 => 0x0
cmpss 3 -1.000000, 2.000000 => 0x0
cmpss 3 2.000000, -1.000000 => 0x0
cmpss 3 -1.000000, 340282346638528859811704183484516925440.000000 => 0x0
cmpss 3 340282346638528859811704183484516925440.000000, -1.000000 => 0x0
cmpss 3 -1.000000, -340282346638528859811704183484516925440.000000 => 0x0
cmpss 3 -340282346638528859811704183484516925440.000000, -1.000000 => 0x0
cmpss 3 340282346638528859811704183484516925440.000000, -340282346638528859811704183484516925440.000000 => 0x0
cmpss 3 -340282346638528859811704183484516925440.000000, 340282346638528859811704183484516925440.000000 => 0x0
cmpss 3 -0.000000, 0.000000 => 0x0
cmpss 3 0.000000, -0.000000 => 0x0
cmpss 3 -2.000000, -2.000000 => 0x0
cmpss 3 5.000000, 5.000000 => 0x0
cmpss 3 5.000000, inf => 0x0
cmpss 3 inf, 5.000000 => 0x0
cmpss 3 5.000000, -inf => 0x0
cmpss 3 -inf, 5.000000 => 0x0
cmpss 3 5.000000, nan => 0xffffffff
cmpss 3 nan, 5.000000 => 0xffffffff
cmpss 3 5.000000, 5.000000 => 0x0
cmpss 3 5.000000, 5.000000 => 0x0
cmpss 3 1.000000, inf => 0x0
cmpss 3 inf, 1.000000 => 0x0
cmpss 3 1.000000, -inf => 0x0
cmpss 3 -inf, 1.000000 => 0x0
cmpss 3 1.000000, nan => 0xffffffff
cmpss 3 nan, 1.000000 => 0xffffffff
cmpss 3 1.000000, 1.000000 => 0x0
cmpss 3 1.000000, 1.000000 => 0x0
cmpss 3 inf, inf => 0x0
cmpss 3 -inf, inf => 0x0
cmpss 3 inf, -inf => 0x0
cmpss 3 nan, nan => 0xffffffff
cmpss 4 1.000000, 2.000000 => 0xffffffff
cmpss 4 2.000000, 1.000000 => 0xffffffff
cmpss 4 -1.000000, 2.000000 => 0xffffffff
cmpss 4 2.000000, -1.000000 => 0xffffffff
cmpss 4 -1.000000, 340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 4 340282346638528859811704183484516925440.000000, -1.000000 => 0xffffffff
cmpss 4 -1.000000, -340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 4 -340282346638528859811704183484516925440.000000, -1.000000 => 0xffffffff
cmpss 4 340282346638528859811704183484516925440.000000, -340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 4 -340282346638528859811704183484516925440.000000, 340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 4 -0.000000, 0.000000 => 0x0
cmpss 4 0.000000, -0.000000 => 0x0
cmpss 4 -2.000000, -2.000000 => 0x0
cmpss 4 5.000000, 5.000000 => 0x0
cmpss 4 5.000000, inf => 0xffffffff
cmpss 4 inf, 5.000000 => 0xffffffff
cmpss 4 5.000000, -inf => 0xffffffff
cmpss 4 -inf, 5.000000 => 0xffffffff
cmpss 4 5.000000, nan => 0xffffffff
cmpss 4 nan, 5.000000 => 0xffffffff
cmpss 4 5.000000, 5.000000 => 0x0
cmpss 4 5.000000, 5.000000 => 0x0
cmpss 4 1.000000, inf => 0xffffffff
cmpss 4 inf, 1.000000 => 0xffffffff
cmpss 4 1.000000, -inf => 0xffffffff
cmpss 4 -inf, 1.000000 => 0xffffffff
cmpss 4 1.000000, nan => 0xffffffff
cmpss 4 nan, 1.000000 => 0xffffffff
cmpss 4 1.000000, 1.000000 => 0x0
cmpss 4 1.000000, 1.000000 => 0x0
cmpss 4 inf, inf => 0x0
cmpss 4 -inf, inf => 0xffffffff
cmpss 4 inf, -inf => 0xffffffff
cmpss 4 nan, nan => 0xffffffff
cmpss 5 1.000000, 2.000000 => 0x0
cmpss 5 2.000000, 1.000000 => 0xffffffff
cmpss 5 -1.000000, 2.000000 => 0x0
cmpss 5 2.000000, -1.000000 => 0xffffffff
cmpss 5 -1.000000, 340282346638528859811704183484516925440.000000 => 0x0
cmpss 5 340282346638528859811704183484516925440.000000, -1.000000 => 0xffffffff
cmpss 5 -1.000000, -340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 5 -340282346638528859811704183484516925440.000000, -1.000000 => 0x0
cmpss 5 340282346638528859811704183484516925440.000000, -340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 5 -340282346638528859811704183484516925440.000000, 340282346638528859811704183484516925440.000000 => 0x0
cmpss 5 -0.000000, 0.000000 => 0xffffffff
cmpss 5 0.000000, -0.000000 => 0xffffffff
cmpss 5 -2.000000, -2.000000 => 0xffffffff
cmpss 5 5.000000, 5.000000 => 0xffffffff
cmpss 5 5.000000, inf => 0x0
cmpss 5 inf, 5.000000 => 0xffffffff
cmpss 5 5.000000, -inf => 0xffffffff
cmpss 5 -inf, 5.000000 => 0x0
cmpss 5 5.000000, nan => 0xffffffff
cmpss 5 nan, 5.000000 => 0xffffffff
cmpss 5 5.000000, 5.000000 => 0xffffffff
cmpss 5 5.000000, 5.000000 => 0xffffffff
cmpss 5 1.000000, inf => 0x0
cmpss 5 inf, 1.000000 => 0xffffffff
cmpss 5 1.000000, -inf => 0xffffffff
cmpss 5 -inf, 1.000000 => 0x0
cmpss 5 1.000000, nan => 0xffffffff
cmpss 5 nan, 1.000000 => 0xffffffff
cmpss 5 1.000000, 1.000000 => 0xffffffff
cmpss 5 1.000000, 1.000000 => 0xffffffff
cmpss 5 inf, inf => 0xffffffff
cmpss 5 -inf, inf => 0x0
cmpss 5 inf, -inf => 0xffffffff
cmpss 5 nan, nan => 0xffffffff
cmpss 6 1.000000, 2.000000 => 0x0
cmpss 6 2.000000, 1.000000 => 0xffffffff
cmpss 6 -1.000000, 2.000000 => 0x0
cmpss 6 2.000000, -1.000000 => 0xffffffff
cmpss 6 -1.000000, 340282346638528859811704183484516925440.000000 => 0x0
cmpss 6 340282346638528859811704183484516925440.000000, -1.000000 => 0xffffffff
cmpss 6 -1.000000, -340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 6 -340282346638528859811704183484516925440.000000, -1.000000 => 0x0
cmpss 6 340282346638528859811704183484516925440.000000, -340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 6 -340282346638528859811704183484516925440.000000, 340282346638528859811704183484516925440.000000 => 0x0
cmpss 6 -0.000000, 0.000000 => 0x0
cmpss 6 0.000000, -0.000000 => 0x0
cmpss 6 -2.000000, -2.000000 => 0x0
cmpss 6 5.000000, 5.000000 => 0x0
cmpss 6 5.000000, inf => 0x0
cmpss 6 inf, 5.000000 => 0xffffffff
cmpss 6 5.000000, -inf => 0xffffffff
cmpss 6 -inf, 5.000000 => 0x0
cmpss 6 5.000000, nan => 0xffffffff
cmpss 6 nan, 5.000000 => 0xffffffff
cmpss 6 5.000000, 5.000000 => 0x0
cmpss 6 5.000000, 5.000000 => 0x0
cmpss 6 1.000000, inf => 0x0
cmpss 6 inf, 1.000000 => 0xffffffff
cmpss 6 1.000000, -inf => 0xffffffff
cmpss 6 -inf, 1.000000 => 0x0
cmpss 6 1.000000, nan => 0xffffffff
cmpss 6 nan, 1.000000 => 0xffffffff
cmpss 6 1.000000, 1.000000 => 0x0
cmpss 6 1.000000, 1.000000 => 0x0
cmpss 6 inf, inf => 0x0
cmpss 6 -inf, inf => 0x0
cmpss 6 inf, -inf => 0xffffffff
cmpss 6 nan, nan => 0xffffffff
cmpss 7 1.000000, 2.000000 => 0xffffffff
cmpss 7 2.000000, 1.000000 => 0xffffffff
cmpss 7 -1.000000, 2.000000 => 0xffffffff
cmpss 7 2.000000, -1.000000 => 0xffffffff
cmpss 7 -1.000000, 340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 7 340282346638528859811704183484516925440.000000, -1.000000 => 0xffffffff
cmpss 7 -1.000000, -340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 7 -340282346638528859811704183484516925440.000000, -1.000000 => 0xffffffff
cmpss 7 340282346638528859811704183484516925440.000000, -340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 7 -340282346638528859811704183484516925440.000000, 340282346638528859811704183484516925440.000000 => 0xffffffff
cmpss 7 -0.000000, 0.000000 => 0xffffffff
cmpss 7 0.000000, -0.000000 => 0xffffffff
cmpss 7 -2.000000, -2.000000 => 0xffffffff
cmpss 7 5.000000, 5.000000 => 0xffffffff
cmpss 7 5.000000, inf => 0xffffffff
cmpss 7 inf, 5.000000 => 0xffffffff
cmpss 7 5.000000, -inf => 0xffffffff
cmpss 7 -inf, 5.000000 => 0xffffffff
cmpss 7 5.000000, nan => 0x0
cmpss 7 nan, 5.000000 => 0x0
cmpss 7 5.000000, 5.000000 => 0xffffffff
cmpss 7 5.000000, 5.000000 => 0xffffffff
cmpss 7 1.000000, inf => 0xffffffff
cmpss 7 inf, 1.000000 => 0xffffffff
cmpss 7 1.000000, -inf => 0xffffffff
cmpss 7 -inf, 1.000000 => 0xffffffff
cmpss 7 1.000000, nan => 0x0
cmpss 7 nan, 1.000000 => 0x0
cmpss 7 1.000000, 1.000000 => 0xffffffff
cmpss 7 1.000000, 1.000000 => 0xffffffff
cmpss 7 inf, inf => 0xffffffff
cmpss 7 -inf, inf => 0xffffffff
cmpss 7 inf, -inf => 0xffffffff
@ -485,13 +365,6 @@ psqrtps(1 2 3 -4 ) = 1 1.41421 1.73205 nan
psqrtps(0 -2 -10 0.5 ) = 0 nan nan 0.707107
psqrtps(inf -inf -inf 1 ) = inf nan nan 1
psqrtps(nan -0 nan inf ) = nan -0 nan inf
prsqrtps(1 2 3 -4 ) = 1 0.71 0.58 nan
prsqrtps(0 -2 -10 0.5 ) = inf nan nan 1.4
prsqrtps(inf -inf -inf 1 ) = 0 nan nan 1
prsqrtps(nan -0 nan inf ) = nan -inf nan 0
prcpps(1 2 3 -4 ) = 1 0.5 0.33 -0.25
prcpps(0 -2 -10 0.5 ) = inf -0.5 -0.1 2
prcpps(inf -inf -inf 1 ) = 0 -0 -0 1
prcpps(nan -0 nan inf ) = nan -inf nan 0
andps(1 2 3 -4 , 0 -2 -10 0.5 ) = 0 2 2 0
andps(0 -2 -10 0.5 , inf -inf -inf 1 ) = 0 -2 -8 0.5
@ -661,180 +534,31 @@ maxsd(1 2 , -inf inf ) = 1 2
maxsd(1 2 , -0 0x7ff8000000000000 ) = 1 2
maxsd(0 -2 , -0 0x7ff8000000000000 ) = -0 -2
maxsd(0 -2 , -0 0x7ff8000000000000 ) = -0 -2
cvttss2si(1) = 0x1
cvttss2si(1.49) = 0x1
cvttss2si(1.5) = 0x1
cvttss2si(1.9) = 0x1
cvttss2si(-1) = 0xffffffff
cvttss2si(-1.49) = 0xffffffff
cvttss2si(-1.5) = 0xffffffff
cvttss2si(-1.9) = 0xffffffff
cvttss2si(1e+30) = 0x80000000
cvttss2si(-1e+30) = 0x80000000
cvttss2si(inf) = 0x80000000
cvttss2si(-inf) = 0x80000000
cvttss2si(nan) = 0x80000000
cvttsd2si(1) = 0x1
cvttsd2si(1.49) = 0x1
cvttsd2si(1.5) = 0x1
cvttsd2si(1.9) = 0x1
cvttsd2si(-1) = 0xffffffff
cvttsd2si(-1.49) = 0xffffffff
cvttsd2si(-1.5) = 0xffffffff
cvttsd2si(-1.9) = 0xffffffff
cvttsd2si(1e+300) = 0x80000000
cvttsd2si(-1e+300) = 0x80000000
cvttsd2si(inf) = 0x80000000
cvttsd2si(-inf) = 0x80000000
cvttsd2si(nan) = 0x80000000
default rounding
cvtss2si(1) = 0x1
cvtss2si(1.49) = 0x1
cvtss2si(1.5) = 0x2
cvtss2si(1.9) = 0x2
cvtss2si(-1) = 0xffffffff
cvtss2si(-1.49) = 0xffffffff
cvtss2si(-1.5) = 0xfffffffe
cvtss2si(-1.9) = 0xfffffffe
cvtss2si(1e+30) = 0x80000000
cvtss2si(-1e+30) = 0x80000000
cvtss2si(inf) = 0x80000000
cvtss2si(-inf) = 0x80000000
cvtss2si(nan) = 0x80000000
cvtsd2si(1) = 0x1
cvtsd2si(1.49) = 0x1
cvtsd2si(1.5) = 0x2
cvtsd2si(1.9) = 0x2
cvtsd2si(-1) = 0xffffffff
cvtsd2si(-1.49) = 0xffffffff
cvtsd2si(-1.5) = 0xfffffffe
cvtsd2si(-1.9) = 0xfffffffe
cvtsd2si(1e+300) = 0x80000000
cvtsd2si(-1e+300) = 0x80000000
cvtsd2si(inf) = 0x80000000
cvtsd2si(-inf) = 0x80000000
cvtsd2si(nan) = 0x80000000
Round(0)
cvtss2si(1) = 0x1
cvtss2si(1.49) = 0x1
cvtss2si(1.5) = 0x2
cvtss2si(1.9) = 0x2
cvtss2si(-1) = 0xffffffff
cvtss2si(-1.49) = 0xffffffff
cvtss2si(-1.5) = 0xfffffffe
cvtss2si(-1.9) = 0xfffffffe
cvtss2si(1e+30) = 0x80000000
cvtss2si(-1e+30) = 0x80000000
cvtss2si(inf) = 0x80000000
cvtss2si(-inf) = 0x80000000
cvtss2si(nan) = 0x80000000
cvtsd2si(1) = 0x1
cvtsd2si(1.49) = 0x1
cvtsd2si(1.5) = 0x2
cvtsd2si(1.9) = 0x2
cvtsd2si(-1) = 0xffffffff
cvtsd2si(-1.49) = 0xffffffff
cvtsd2si(-1.5) = 0xfffffffe
cvtsd2si(-1.9) = 0xfffffffe
cvtsd2si(1e+300) = 0x80000000
cvtsd2si(-1e+300) = 0x80000000
cvtsd2si(inf) = 0x80000000
cvtsd2si(-inf) = 0x80000000
cvtsd2si(nan) = 0x80000000
cvtss2si(1.4) -> 1 cvtsi2ss -> 1
cvtss2si(-2.9) -> -3 cvtsi2ss -> -3
cvtss2si(1.6) -> 2 cvtsi2ss -> 2
cvtss2si(1e+38) -> -2147483648 cvtsi2ss -> -2.14748e+09
Round(1)
cvtss2si(1) = 0x1
cvtss2si(1.49) = 0x1
cvtss2si(1.5) = 0x1
cvtss2si(1.9) = 0x1
cvtss2si(-1) = 0xffffffff
cvtss2si(-1.49) = 0xfffffffe
cvtss2si(-1.5) = 0xfffffffe
cvtss2si(-1.9) = 0xfffffffe
cvtss2si(1e+30) = 0x80000000
cvtss2si(-1e+30) = 0x80000000
cvtss2si(inf) = 0x80000000
cvtss2si(-inf) = 0x80000000
cvtss2si(nan) = 0x80000000
cvtsd2si(1) = 0x1
cvtsd2si(1.49) = 0x1
cvtsd2si(1.5) = 0x1
cvtsd2si(1.9) = 0x1
cvtsd2si(-1) = 0xffffffff
cvtsd2si(-1.49) = 0xfffffffe
cvtsd2si(-1.5) = 0xfffffffe
cvtsd2si(-1.9) = 0xfffffffe
cvtsd2si(1e+300) = 0x80000000
cvtsd2si(-1e+300) = 0x80000000
cvtsd2si(inf) = 0x80000000
cvtsd2si(-inf) = 0x80000000
cvtsd2si(nan) = 0x80000000
cvtss2si(1.4) -> 1 cvtsi2ss -> 1
cvtss2si(-2.9) -> -3 cvtsi2ss -> -3
cvtss2si(1.6) -> 1 cvtsi2ss -> 1
cvtss2si(1e+38) -> -2147483648 cvtsi2ss -> -2.14748e+09
Round(2)
cvtss2si(1) = 0x1
cvtss2si(1.49) = 0x2
cvtss2si(1.5) = 0x2
cvtss2si(1.9) = 0x2
cvtss2si(-1) = 0xffffffff
cvtss2si(-1.49) = 0xffffffff
cvtss2si(-1.5) = 0xffffffff
cvtss2si(-1.9) = 0xffffffff
cvtss2si(1e+30) = 0x80000000
cvtss2si(-1e+30) = 0x80000000
cvtss2si(inf) = 0x80000000
cvtss2si(-inf) = 0x80000000
cvtss2si(nan) = 0x80000000
cvtsd2si(1) = 0x1
cvtsd2si(1.49) = 0x2
cvtsd2si(1.5) = 0x2
cvtsd2si(1.9) = 0x2
cvtsd2si(-1) = 0xffffffff
cvtsd2si(-1.49) = 0xffffffff
cvtsd2si(-1.5) = 0xffffffff
cvtsd2si(-1.9) = 0xffffffff
cvtsd2si(1e+300) = 0x80000000
cvtsd2si(-1e+300) = 0x80000000
cvtsd2si(inf) = 0x80000000
cvtsd2si(-inf) = 0x80000000
cvtsd2si(nan) = 0x80000000
cvtss2si(1.4) -> 2 cvtsi2ss -> 2
cvtss2si(-2.9) -> -2 cvtsi2ss -> -2
cvtss2si(1.6) -> 2 cvtsi2ss -> 2
cvtss2si(1e+38) -> -2147483648 cvtsi2ss -> -2.14748e+09
Round(3)
cvtss2si(1) = 0x1
cvtss2si(1.49) = 0x1
cvtss2si(1.5) = 0x1
cvtss2si(1.9) = 0x1
cvtss2si(-1) = 0xffffffff
cvtss2si(-1.49) = 0xffffffff
cvtss2si(-1.5) = 0xffffffff
cvtss2si(-1.9) = 0xffffffff
cvtss2si(1e+30) = 0x80000000
cvtss2si(-1e+30) = 0x80000000
cvtss2si(inf) = 0x80000000
cvtss2si(-inf) = 0x80000000
cvtss2si(nan) = 0x80000000
cvtsd2si(1) = 0x1
cvtsd2si(1.49) = 0x1
cvtsd2si(1.5) = 0x1
cvtsd2si(1.9) = 0x1
cvtsd2si(-1) = 0xffffffff
cvtsd2si(-1.49) = 0xffffffff
cvtsd2si(-1.5) = 0xffffffff
cvtsd2si(-1.9) = 0xffffffff
cvtsd2si(1e+300) = 0x80000000
cvtsd2si(-1e+300) = 0x80000000
cvtsd2si(inf) = 0x80000000
cvtsd2si(-inf) = 0x80000000
cvtsd2si(nan) = 0x80000000
cvtss2si(1.4) -> 1 cvtsi2ss -> 1
cvtss2si(-2.9) -> -2 cvtsi2ss -> -2
cvtss2si(1.6) -> 1 cvtsi2ss -> 1
cvtss2si(1e+38) -> -2147483648 cvtsi2ss -> -2.14748e+09
cvtps2dq(1 2 3 -4 ) = 0x1 0x2 0x3 0xfffffffc
cvtps2dq(0 -2 -10 0.5 ) = 0x0 0xfffffffe 0xfffffff6 0x0
cvtps2dq(inf -inf -inf 1 ) = 0x80000000 0x80000000 0x80000000 0x1
cvtps2dq(nan -0 nan inf ) = 0x80000000 0x0 0x80000000 0x80000000
dpps(1 2 3 -4 , 0 -2 -10 0.5 , 255) = -36 -36 -36 -36
dpps(0 -2 -10 0.5 , inf -inf -inf 1 , 255) = nan nan nan nan
dpps(1 2 3 -4 , nan -0 nan inf , 255) = nan nan nan nan
dpps(0 -2 -10 0.5 , nan -0 nan inf , 255) = nan nan nan nan
dpps(inf -inf -inf 1 , nan -0 nan inf , 255) = nan nan nan nan
dpps(nan -0 nan inf , nan -0 nan inf , 255) = nan nan nan nan
dpps(1 2 3 -4 , 0 -2 -10 0.5 , 63) = -4 -4 -4 -4
dpps(0 -2 -10 0.5 , inf -inf -inf 1 , 63) = nan nan nan nan
dpps(1 2 3 -4 , nan -0 nan inf , 63) = nan nan nan nan
dpps(0 -2 -10 0.5 , nan -0 nan inf , 63) = nan nan nan nan
dpps(inf -inf -inf 1 , nan -0 nan inf , 63) = nan nan nan nan
dpps(nan -0 nan inf , nan -0 nan inf , 63) = nan nan nan nan
dpps(1 2 3 -4 , 0 -2 -10 0.5 , 243) = -36 -36 0 0
dpps(0 -2 -10 0.5 , inf -inf -inf 1 , 243) = nan nan 0 0
dpps(1 2 3 -4 , nan -0 nan inf , 243) = nan nan 0 0
dpps(0 -2 -10 0.5 , nan -0 nan inf , 243) = nan nan 0 0
dpps(inf -inf -inf 1 , nan -0 nan inf , 243) = nan nan 0 0
dpps(nan -0 nan inf , nan -0 nan inf , 243) = nan nan 0 0
dpps(1 2 3 -4 , 0 -2 -10 0.5 , 83) = -30 -30 0 0
dpps(0 -2 -10 0.5 , inf -inf -inf 1 , 83) = nan nan 0 0
dpps(1 2 3 -4 , nan -0 nan inf , 83) = nan nan 0 0
dpps(0 -2 -10 0.5 , nan -0 nan inf , 83) = nan nan 0 0
dpps(inf -inf -inf 1 , nan -0 nan inf , 83) = nan nan 0 0
dpps(nan -0 nan inf , nan -0 nan inf , 83) = nan nan 0 0

5
tests/ref23.txt Normal file
View File

@ -0,0 +1,5 @@
ret = 0x78563412
ret = 0x3412
ret = 0x78563412
ret = 0x3412
ret = 0x12345678

640
tests/ref24.txt Normal file
View File

@ -0,0 +1,640 @@
Testing rint(1.000000)
FE_UPWARD: 1.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing rint(1.300000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing rint(1.500000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing rint(1.800000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing rint(2.000000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing rint(2.300000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing rint(2.500000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing rint(2.800000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 3.0
Current rounding mode: 0x0
Testing rintf(1.000000)
FE_UPWARD: 1.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing rintf(1.300000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing rintf(1.500000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing rintf(1.800000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing rintf(2.000000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing rintf(2.300000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing rintf(2.500000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing rintf(2.800000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 3.0
Current rounding mode: 0x0
Testing nearbyint(1.000000)
FE_UPWARD: 1.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing nearbyint(1.300000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing nearbyint(1.500000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing nearbyint(1.800000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing nearbyint(2.000000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing nearbyint(2.300000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing nearbyint(2.500000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing nearbyint(2.800000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 3.0
Current rounding mode: 0x0
Testing nearbyintf(1.000000)
FE_UPWARD: 1.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing nearbyintf(1.300000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing nearbyintf(1.500000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing nearbyintf(1.800000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing nearbyintf(2.000000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing nearbyintf(2.300000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing nearbyintf(2.500000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing nearbyintf(2.800000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 3.0
Current rounding mode: 0x0
Testing llrintf(1.000000)
FE_UPWARD: 1.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing llrintf(1.300000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing llrintf(1.500000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing llrintf(1.800000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing llrintf(2.000000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing llrintf(2.300000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing llrintf(2.500000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing llrintf(2.800000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 3.0
Current rounding mode: 0x0
Testing llrint(1.000000)
FE_UPWARD: 1.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing llrint(1.300000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing llrint(1.500000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing llrint(1.800000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing llrint(2.000000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing llrint(2.300000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing llrint(2.500000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing llrint(2.800000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 3.0
Current rounding mode: 0x0
Testing lrintf(1.000000)
FE_UPWARD: 1.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing lrintf(1.300000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing lrintf(1.500000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing lrintf(1.800000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing lrintf(2.000000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing lrintf(2.300000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing lrintf(2.500000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing lrintf(2.800000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 3.0
Current rounding mode: 0x0
Testing lrint(1.000000)
FE_UPWARD: 1.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing lrint(1.300000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 1.0
Current rounding mode: 0x0
Testing lrint(1.500000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing lrint(1.800000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 1.0
Current rounding mode: 0x400
FE_TOWARDZERO: 1.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing lrint(2.000000)
FE_UPWARD: 2.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing lrint(2.300000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing lrint(2.500000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 2.0
Current rounding mode: 0x0
Testing lrint(2.800000)
FE_UPWARD: 3.0
Current rounding mode: 0x800
FE_DOWNWARD: 2.0
Current rounding mode: 0x400
FE_TOWARDZERO: 2.0
Current rounding mode: 0xc00
FE_TONEAREST: 3.0
Current rounding mode: 0x0

1062
tests/ref25.txt Normal file

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -1,5 +1,6 @@
// build with gcc -march=corei7 -O2 -g -msse -msse2 test17.c -o test17
// build with gcc -O0 -g -msse -msse2 -mssse3 -msse4.1 test17.c -o test17
// and -m32 for 32bits version
#include <inttypes.h>
#include <string.h>
#include <stdio.h>
#include <stddef.h>
@ -141,7 +142,7 @@ void print_32(v128 v) {
}
void print_64(v128 v) {
for(int i=0; i<2; ++i)
printf("0x%llx ", v.u64[i]);
printf("0x%"PRIx64" ", v.u64[i]);
}
#define print_128 print_64
void print_ps(v128 v) {
@ -151,33 +152,19 @@ void print_ps(v128 v) {
else
printf("%g ", v.f32[i]);
}
void print_ps_approx(v128 v) {
for(int i=0; i<4; ++i)
if(isnanf(v.f32[i]))
printf("nan ");
else
printf("%.2g ", v.f32[i]);
}
void print_pd(v128 v) {
for(int i=0; i<2; ++i)
if(isnan(v.d64[i]))
printf("0x%llx ", v.u64[i]);
printf("0x%"PRIx64" ", v.u64[i]);
else
printf("%g ", v.d64[i]);
}
void print_pd_approx(v128 v) {
for(int i=0; i<2; ++i)
if(isnan(v.d64[i]))
printf("0x%llx ", v.u64[i]);
else
printf("%.4g ", v.d64[i]);
}
#define print_sd print_pd
int main(int argc, const char** argv)
{
float a, b;
uint64_t flags;
uint32_t flags;
uint32_t maxf = 0x7f7fffff;
uint32_t minf = 0xff7fffff;
uint32_t r;
@ -185,71 +172,40 @@ int main(int argc, const char** argv)
#define GO1(A, N) \
a = 1.0f; b = 2.0f; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
printf(N " %f, %f => 0x%"PRIx32"\n", a, b, flags); \
flags = A(b, a); \
printf(N " %f, %f => 0x%lx\n", b, a, flags); \
a = -1.0f; b = 2.0f; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
flags = A(b, a); \
printf(N " %f, %f => 0x%lx\n", b, a, flags); \
a = -1.0f; b = *(float*)&maxf; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
flags = A(b, a); \
printf(N " %f, %f => 0x%lx\n", b, a, flags); \
a = -1.0f; b = *(float*)&minf; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
flags = A(b, a); \
printf(N " %f, %f => 0x%lx\n", b, a, flags); \
a = *(float*)&maxf; b = *(float*)&minf; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
flags = A(b, a); \
printf(N " %f, %f => 0x%lx\n", b, a, flags); \
a = -0.0f; b = 0.0f; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
flags = A(b, a); \
printf(N " %f, %f => 0x%lx\n", b, a, flags); \
a = -2.0f; b = -2.0f; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
a = 5.0f; b = 5.0f; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
printf(N " %f, %f => 0x%"PRIx32"\n", b, a, flags); \
b = INFINITY; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
printf(N " %f, %f => 0x%"PRIx32"\n", a, b, flags); \
flags = A(b, a); \
printf(N " %f, %f => 0x%lx\n", b, a, flags); \
printf(N " %f, %f => 0x%"PRIx32"\n", b, a, flags); \
b = -INFINITY; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
printf(N " %f, %f => 0x%"PRIx32"\n", a, b, flags); \
flags = A(b, a); \
printf(N " %f, %f => 0x%lx\n", b, a, flags); \
printf(N " %f, %f => 0x%"PRIx32"\n", b, a, flags); \
b = NAN; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
printf(N " %f, %f => 0x%"PRIx32"\n", a, b, flags); \
flags = A(b, a); \
printf(N " %f, %f => 0x%lx\n", b, a, flags); \
printf(N " %f, %f => 0x%"PRIx32"\n", b, a, flags); \
b = a; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
printf(N " %f, %f => 0x%"PRIx32"\n", a, b, flags); \
flags = A(b, a); \
printf(N " %f, %f => 0x%lx\n", b, a, flags); \
printf(N " %f, %f => 0x%"PRIx32"\n", b, a, flags); \
a = b = INFINITY; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
printf(N " %f, %f => 0x%"PRIx32"\n", a, b, flags); \
a = -INFINITY; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags); \
printf(N " %f, %f => 0x%"PRIx32"\n", a, b, flags); \
flags = A(b, a); \
printf(N " %f, %f => 0x%lx\n", b, a, flags); \
printf(N " %f, %f => 0x%"PRIx32"\n", b, a, flags); \
a = b = NAN; \
flags = A(a, b); \
printf(N " %f, %f => 0x%lx\n", a, b, flags);
printf(N " %f, %f => 0x%"PRIx32"\n", a, b, flags);
#define GO2(A, N) \
a = 1.0f; b = 2.0f; \
@ -257,21 +213,6 @@ r = A(a, b); \
printf(N " %g, %g => %g\n", a, b, *(float*)&r); \
r = A(b, a); \
printf(N " %g, %g => %g\n", b, a, *(float*)&r); \
a = -1.0f; b = 2.0f; \
r = A(a, b); \
printf(N " %g, %g => %g\n", a, b, *(float*)&r); \
r = A(b, a); \
printf(N " %g, %g => %g\n", b, a, *(float*)&r); \
a = -0.0f; b = 0.0f; \
r = A(a, b); \
printf(N " %g, %g => %g\n", a, b, *(float*)&r); \
r = A(b, a); \
printf(N " %g, %g => %g\n", b, a, *(float*)&r); \
a = 5.0f; b = -10.0f; \
r = A(a, b); \
printf(N " %g, %g => %g\n", a, b, *(float*)&r); \
r = A(b, a); \
printf(N " %g, %g => %g\n", b, a, *(float*)&r); \
a = -INFINITY; \
r = A(a, b); \
printf(N " %g, %g => %g\n", a, b, *(float*)&r); \
@ -317,7 +258,7 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
#undef GO1
#undef GO2
v128 a128, b128;
v128 a128;
int i;
#define GO1(A, N, C) \
@ -367,10 +308,6 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
a128.md = _mm_##A##_pd(A1.md); \
printf("%s(", #C); print_pd(A1); \
printf(") = "); print_pd(a128); printf("\n");
#define GO1pda(A, C, A1) \
a128.md = _mm_##A##_pd(A1.md); \
printf("%s(", #C); print_pd(A1); \
printf(") = "); print_pd_approx(a128); printf("\n");
#define GO2pd(A, C, A1, A2) \
a128.md = _mm_##A##_pd(A1.md, A2.md); \
printf("%s(", #C); print_pd(A1); \
@ -382,23 +319,13 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
printf(", "); print_pd(A2); \
printf(", %d) = ", I); print_pd(a128); printf("\n");
#define GO1isd(A, C, A1) \
a128.md[0] = A1; \
i = _mm_##A##_si32(a128.md); \
printf("%s(%g", #C, A1); \
printf(") = 0x%x\n", i);
#define GO1iss(A, C, A1) \
a128.mf[0] = A1; \
i = _mm_##A##_si32(a128.mf); \
printf("%s(%g", #C, A1); \
i = _mm_##A##_sd(A1.md); \
printf("%s(", #C); print_64(A1); \
printf(") = 0x%x\n", i);
#define GO1sd(A, C, A1) \
a128.md = _mm_##A##_sd(A1); \
printf("%s(", #C); print_sd(A1); \
printf(") = "); print_sd(a128); printf("\n");
#define GO1sda(A, C, A1) \
a128.md = _mm_##A##_sd(A1.md); \
printf("%s(", #C); print_sd(A1); \
printf(") = "); print_sd_approx(a128); printf("\n");
printf(") = "); print_sd(a128); printf("\n");
#define GO2sd(A, C, A1, A2) \
a128.md = _mm_##A##_sd(A1.md, A2.md); \
printf("%s(", #C); print_sd(A1); \
@ -417,10 +344,6 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
a128.mf = _mm_##A##_ps(A1.mf); \
printf("%s(", #C); print_ps(A1); \
printf(") = "); print_ps(a128); printf("\n");
#define GO1psa(A, C, A1) \
a128.mf = _mm_##A##_ps(A1.mf); \
printf("%s(", #C); print_ps(A1); \
printf(") = "); print_ps_approx(a128); printf("\n");
#define GO2ps(A, C, A1, A2) \
a128.mf = _mm_##A##_ps(A1.mf, A2.mf); \
printf("%s(", #C); print_ps(A1); \
@ -431,6 +354,10 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
printf("%s(", #C); print_ps(A1); \
printf(", "); print_ps(A2); \
printf(", %d) = ", I); print_ps(a128); printf("\n");
#define GO1ps2dq(A, C, A1) \
a128.mm = _mm_##A##_epi32(A1.mf); \
printf("%s(", #C); print_ps(A1); \
printf(") = "); print_32(a128); printf("\n");
#define MULITGO2pd(A, B) \
GO2pd(A, B, a128_pd, b128_pd) \
@ -456,6 +383,12 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
GO2ps(A, B, c128_ps, d128_ps) \
GO2ps(A, B, d128_ps, d128_ps)
#define MULTIGO1ps2dq(A, B) \
GO1ps2dq(A, B, a128_ps) \
GO1ps2dq(A, B, b128_ps) \
GO1ps2dq(A, B, c128_ps) \
GO1ps2dq(A, B, d128_ps)
#define MULITGO2Cps(A, B, I) \
GO2Cps(A, B, a128_ps, b128_ps, I) \
GO2Cps(A, B, b128_ps, c128_ps, I) \
@ -478,35 +411,6 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
GO2sd(A, B, b128_pd, reverse_pd(d128_pd)) \
GO2sd(A, B, b128_pd, reverse_pd(d128_pd))
#define MULTIGO1iss(A, B) \
GO1iss(A, B, 1.0f); \
GO1iss(A, B, 1.49f); \
GO1iss(A, B, 1.5f); \
GO1iss(A, B, 1.9f); \
GO1iss(A, B, -1.0f); \
GO1iss(A, B, -1.49f); \
GO1iss(A, B, -1.5f); \
GO1iss(A, B, -1.9f); \
GO1iss(A, B, 1e30f); \
GO1iss(A, B, -1e30f); \
GO1iss(A, B, INFINITY); \
GO1iss(A, B, -INFINITY); \
GO1iss(A, B, NAN);
#define MULTIGO1isd(A, B) \
GO1isd(A, B, 1.0f); \
GO1isd(A, B, 1.49f); \
GO1isd(A, B, 1.5f); \
GO1isd(A, B, 1.9f); \
GO1isd(A, B, -1.0f); \
GO1isd(A, B, -1.49f); \
GO1isd(A, B, -1.5f); \
GO1isd(A, B, -1.9f); \
GO1isd(A, B, 1e300); \
GO1isd(A, B, -1e300); \
GO1isd(A, B, INFINITY); \
GO1isd(A, B, -INFINITY); \
GO1isd(A, B, NAN);
GO2(shuffle, 8, pshufb, a128_8, b128_8)
GO2(hadd, 16, phaddw, a128_16, b128_16)
@ -554,14 +458,6 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
GO1pd(sqrt, psqrtpd, b128_pd)
GO1pd(sqrt, psqrtpd, c128_pd)
GO1pd(sqrt, psqrtpd, d128_pd)
//GO1pda(rsqrt, prsqrtps, a128_pd)
//GO1pda(rsqrt, prsqrtps, b128_pd)
//GO1pda(rsqrt, prsqrtps, c128_pd)
//GO1pda(rsqrt, prsqrtps, d128_pd)
//GO1pda(rcp, prcpps, a128_pd)
//GO1pda(rcp, prcpps, b128_pd)
//GO1pda(rcp, prcpps, c128_pd)
//GO1psa(rcp, prcpps, d128_pd)
MULITGO2pd(and, andpd)
MULITGO2pd(andnot, andnpd)
MULITGO2pd(or, orpd)
@ -674,14 +570,14 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
GO1ps(sqrt, psqrtps, b128_ps)
GO1ps(sqrt, psqrtps, c128_ps)
GO1ps(sqrt, psqrtps, d128_ps)
GO1psa(rsqrt, prsqrtps, a128_ps)
GO1psa(rsqrt, prsqrtps, b128_ps)
GO1psa(rsqrt, prsqrtps, c128_ps)
GO1psa(rsqrt, prsqrtps, d128_ps)
GO1psa(rcp, prcpps, a128_ps)
GO1psa(rcp, prcpps, b128_ps)
GO1psa(rcp, prcpps, c128_ps)
GO1psa(rcp, prcpps, d128_ps)
//GO1ps(rsqrt, prsqrtps, a128_ps) // difference in precision
//GO1ps(rsqrt, prsqrtps, b128_ps) // same
//GO1ps(rsqrt, prsqrtps, c128_ps) // same
//GO1ps(rsqrt, prsqrtps, d128_ps) // difference in the handling of NAN, (-)0, and INF in Dynarec
//GO1ps(rcp, prcpps, a128_ps) // difference in precision
//GO1ps(rcp, prcpps, b128_ps) // difference in precision
//GO1ps(rcp, prcpps, c128_ps) // difference in precision
GO1ps(rcp, prcpps, d128_ps)
MULITGO2ps(and, andps)
MULITGO2ps(andnot, andnps)
MULITGO2ps(or, orps)
@ -692,7 +588,7 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
MULITGO2ps(min, minps)
MULITGO2ps(div, divps)
MULITGO2ps(max, maxps)
// MULITGO2Cps(cmp, cmpps, 0) // use avx for some reason
//MULITGO2Cps(cmp, cmpps, 0) // use avx for some reason
MULITGO2Cps(shuffle, shufps, 0)
MULITGO2Cps(shuffle, shufps, 0x15)
MULITGO2Cps(shuffle, shufps, 0xff)
@ -704,35 +600,12 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
MULTIGO2sd(min, minsd)
MULTIGO2sd(div, divsd)
MULTIGO2sd(max, maxsd)
MULTIGO1iss(cvttss, cvttss2si)
MULTIGO1isd(cvttsd, cvttsd2si)
printf("default rounding\n");
MULTIGO1iss(cvtss, cvtss2si)
MULTIGO1isd(cvtsd, cvtsd2si)
unsigned int old_mxcsr = _mm_getcsr();
for(unsigned int rr = 0; rr<4; ++rr) {
printf("Round(%d)\n", rr);
_mm_setcsr((old_mxcsr&~0x6000)|(rr<<13));
MULTIGO1iss(cvtss, cvtss2si)
MULTIGO1isd(cvtsd, cvtsd2si)
a128.mf[0]=1.4f;
i = _mm_cvtss_si32(a128.mf);
b128.mf = _mm_cvtsi32_ss(a128.mf, i);
printf("cvtss2si(%g) -> %d cvtsi2ss -> %g\n", a128.mf[0], i, b128.mf[0]);
a128.mf[0]=-2.9f;
i = _mm_cvtss_si32(a128.mf);
b128.mf = _mm_cvtsi32_ss(a128.mf, i);
printf("cvtss2si(%g) -> %d cvtsi2ss -> %g\n", a128.mf[0], i, b128.mf[0]);
a128.mf[0]=1.6f;
i = _mm_cvtss_si32(a128.mf);
b128.mf = _mm_cvtsi32_ss(a128.mf, i);
printf("cvtss2si(%g) -> %d cvtsi2ss -> %g\n", a128.mf[0], i, b128.mf[0]);
a128.mf[0]=1e38f;
i = _mm_cvtss_si32(a128.mf);
b128.mf = _mm_cvtsi32_ss(a128.mf, i);
printf("cvtss2si(%g) -> %d cvtsi2ss -> %g\n", a128.mf[0], i, b128.mf[0]);
}
_mm_setcsr(old_mxcsr);
MULTIGO1ps2dq(cvtps, cvtps2dq)
MULITGO2Cps(dp, dpps, 0xff)
MULITGO2Cps(dp, dpps, 0x3f)
MULITGO2Cps(dp, dpps, 0xf3)
MULITGO2Cps(dp, dpps, 0x53)
return 0;
}

BIN
tests/test23 Executable file

Binary file not shown.

52
tests/test23.c Normal file
View File

@ -0,0 +1,52 @@
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include <stdint.h>
// Build with `gcc -march=core2 -O2 -m32 test23.c -o test23`
// 64-bit pattern; not referenced by main() in this file.
uint64_t a = 0x12345678abcdefed;
// 32-bit source operand for the 32-bit movbe load/store forms in main().
uint32_t b = 0x12345678;
// 16-bit source operand for the 16-bit movbe load/store forms in main().
uint16_t c = 0x1234;
int main()
{
uint32_t ret2;
uint16_t ret3;
asm volatile(
"movbe %1, %0\n"
: "=r"(ret2)
: "m"(b)
: "memory");
printf("ret = 0x%x\n", ret2);
asm volatile(
"movbe %1, %0\n"
: "=r"(ret3)
: "m"(c)
: "memory");
printf("ret = 0x%x\n", ret3);
asm volatile(
"movbe %1, %0\n"
: "=m"(ret2)
: "r"(b)
: "memory");
printf("ret = 0x%x\n", ret2);
asm volatile(
"movbe %1, %0\n"
: "=m"(ret3)
: "r"(c)
: "memory");
printf("ret = 0x%x\n", ret3);
asm volatile(
"bswap %0\n"
: "+r"(ret2)
:
:);
printf("ret = 0x%x\n", ret2);
return 0;
}

BIN
tests/test24 Executable file

Binary file not shown.

104
tests/test24.c Normal file
View File

@ -0,0 +1,104 @@
#include <stdio.h>
#include <fenv.h>
#include <math.h>
// Build with `gcc -march=core2 -O0 -m32 test24.c -o test24 -lm`
// TEST(fn, val): prints fn(val) under each rounding mode in the order
// FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO, FE_TONEAREST, echoing fegetround()
// after each call. The rounding mode is left at FE_TONEAREST afterwards.
//   fn  - name of the rounding function; also stringified into the header line
//   val - argument passed to fn and printed with %f
// Wrapped in do { } while (0) so the expansion is a single statement and stays
// correct as the body of an unbraced if/else; callers supply the trailing ';'.
#define TEST(fn, val) \
    do { \
        printf("Testing %s(%f)\n", #fn, val); \
        fesetround(FE_UPWARD); \
        printf("FE_UPWARD: %.1f\n", (double)fn(val)); \
        printf("Current rounding mode: 0x%x\n", fegetround()); \
        fesetround(FE_DOWNWARD); \
        printf("FE_DOWNWARD: %.1f\n", (double)fn(val)); \
        printf("Current rounding mode: 0x%x\n", fegetround()); \
        fesetround(FE_TOWARDZERO); \
        printf("FE_TOWARDZERO: %.1f\n", (double)fn(val)); \
        printf("Current rounding mode: 0x%x\n", fegetround()); \
        fesetround(FE_TONEAREST); \
        printf("FE_TONEAREST: %.1f\n", (double)fn(val)); \
        printf("Current rounding mode: 0x%x\n\n", fegetround()); \
    } while (0)
// Runs TEST for one rounding function over the eight sample inputs
// 1.0, 1.3, 1.5, 1.8, 2.0, 2.3, 2.5, 2.8 (float literals), in that order.
#define TEST_ALL_INPUTS(fn) \
    TEST(fn, 1.0f); \
    TEST(fn, 1.3f); \
    TEST(fn, 1.5f); \
    TEST(fn, 1.8f); \
    TEST(fn, 2.0f); \
    TEST(fn, 2.3f); \
    TEST(fn, 2.5f); \
    TEST(fn, 2.8f)

// Prints the result of each libm rounding function for every sample input
// under all four rounding modes (see TEST).
int main()
{
    TEST_ALL_INPUTS(rint);
    TEST_ALL_INPUTS(rintf);
    TEST_ALL_INPUTS(nearbyint);
    TEST_ALL_INPUTS(nearbyintf);
    TEST_ALL_INPUTS(llrintf);
    TEST_ALL_INPUTS(llrint);
    TEST_ALL_INPUTS(lrintf);
    TEST_ALL_INPUTS(lrint);
    return 0;
}

BIN
tests/test25 Executable file

Binary file not shown.

130
tests/test25.c Normal file
View File

@ -0,0 +1,130 @@
// Build with `gcc -O0 -g -msse -msse2 -mssse3 -msse4.1 -msse4.2 -m32 test25.c -o test25`
// (-m32 is already part of the command above, so this produces the 32-bit binary)
#include <inttypes.h>
#include <string.h>
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <pmmintrin.h>
#include <immintrin.h>
// 16-byte vector types built with the GCC vector_size attribute; each one is
// used as a lane view inside the v128 union.
typedef unsigned char u8x16 __attribute__ ((vector_size (16)));
typedef unsigned short u16x8 __attribute__ ((vector_size (16)));
typedef unsigned int u32x4 __attribute__ ((vector_size (16)));
// NOTE(review): the lane count here follows sizeof(unsigned long int). With the
// -m32 build given at the top of this file that is presumably 4 bytes, which
// would make this four 32-bit lanes rather than two 64-bit ones - confirm
// before relying on the "64x2" in the name.
typedef unsigned long int u64x2 __attribute__ ((vector_size (16)));
typedef float f32x4 __attribute__ ((vector_size (16)));
typedef double d64x2 __attribute__ ((vector_size (16)));
// Operands for the string-compare tests. load_string() copies 16 bytes from
// its argument unconditionally (it does not stop at a NUL), so every literal
// is exactly 16 characters long; the shorter ones are padded with spaces after
// their embedded "\0". The padding sits beyond both strlen() and the implicit
// length, so it does not change what is printed with %s.
const char* string1 = "This is a string";
const char* string2 = "This\0 string    ";
const char* string3 = "is\0             ";
const char* string4 = "maestrum-foo-bar";
const char* string5 = "\0               ";
// One 128-bit value viewed either as an SSE intrinsic type or as lanes of one
// of the vector typedefs declared in this file.
typedef union {
__m128i mm;  // integer view, passed to the _mm_cmp?str? intrinsics
__m128 mf;   // packed-float intrinsic view
__m128d md;  // packed-double intrinsic view
u8x16 u8;    // byte lanes, filled by load_string()
u16x8 u16;   // 16-bit lanes, filled by load_stringw()
u32x4 u32;
u64x2 u64;   // lanes printed by main() for the mask results
f32x4 f32;
d64x2 d64;
} v128;
// Builds a v128 whose 16 byte lanes are the first 16 chars found at s.
// The copy does not stop at a NUL, so s must point to at least 16 readable
// bytes. Lanes are written through the u8 view only.
v128 load_string(const char* s)
{
    v128 out;
    int lane = 0;
    while(lane < 16) {
        out.u8[lane] = s[lane];
        ++lane;
    }
    return out;
}
// Builds a v128 whose eight 16-bit lanes hold the first 8 chars found at s,
// one char per lane (each char is converted to the unsigned short lane type).
// The copy does not stop at a NUL, so s must point to at least 8 readable
// bytes. Lanes are written through the u16 view only.
v128 load_stringw(const char* s)
{
    v128 out;
    int lane = 0;
    while(lane < 8) {
        out.u16[lane] = s[lane];
        ++lane;
    }
    return out;
}
int main(int argc, const char** argv)
{
printf("test SSE 4.2\n");
v128 a, b, c;
int ret;
int fa, fc, fo, fs, fz;
#define GO1(A, B, C) \
ret = _mm_cmpestri(a.mm, strlen(A), b.mm, strlen(B), C); \
printf("_mm_cmpestri(\"%s\", %d, \"%s\", %d, 0x%x) => %d\n", A, strlen(A), B, strlen(B), C, ret); \
fa = _mm_cmpestra(a.mm, strlen(A), b.mm, strlen(B), C); \
fc = _mm_cmpestrc(a.mm, strlen(A), b.mm, strlen(B), C); \
fo = _mm_cmpestro(a.mm, strlen(A), b.mm, strlen(B), C); \
fs = _mm_cmpestrs(a.mm, strlen(A), b.mm, strlen(B), C); \
fz = _mm_cmpestrz(a.mm, strlen(A), b.mm, strlen(B), C); \
printf("_mm_cmpestri(\"%s\", %d, \"%s\", %d, 0x%x) flags: a:%d s:%d z:%d c:%d o:%d\n", A, strlen(A), B, strlen(B), C, fa, fs, fz, fc, fo); \
c.mm = _mm_cmpestrm(a.mm, strlen(A), b.mm, strlen(B), C); \
printf("mm_cmpestrm(\"%s\", %d, \"%s\", %d, 0x%x) = %016x-%016x\n", A, strlen(A), B, strlen(B), C, c.u64[1], c.u64[0]); \
ret = _mm_cmpistri(a.mm, b.mm, C); \
printf("_mm_cmpistri(\"%s\", \"%s\", 0x%x) => %d\n", A, B, C, ret); \
fa = _mm_cmpistra(a.mm, b.mm, C); \
fc = _mm_cmpistrc(a.mm, b.mm, C); \
fo = _mm_cmpistro(a.mm, b.mm, C); \
fs = _mm_cmpistrs(a.mm, b.mm, C); \
fz = _mm_cmpistrz(a.mm, b.mm, C); \
printf("_mm_cmpestri(\"%s\", \"%s\", 0x%x) flags: a:%d s:%d z:%d c:%d o:%d\n", A, B, C, fa, fs, fz, fc, fo); \
c.mm = _mm_cmpistrm(a.mm, b.mm, C); \
printf("mm_cmpestrm(\"%s\", \"%s\", 0x%x) = %016x-%016x\n", A, B, C, c.u64[1], c.u64[0])
#define GO(A, B, C) \
a = load_string(A); \
b = load_string(B); \
GO1(A, B, C); \
a = load_stringw(A);\
b = load_stringw(B);\
GO1(A, B, C+1) \
#define GO2(C) \
GO(string1, string2, C); \
GO(string2, string1, C); \
GO(string1, string3, C); \
GO(string3, string1, C); \
GO(string1, string4, C); \
GO(string4, string1, C); \
GO(string1, string5, C); \
GO(string5, string1, C);
GO2(0x00)
GO2(0x04)
GO2(0x08)
GO2(0x0c)
GO2(0x10)
GO2(0x30)
GO2(0b1001100)
GO2(0b0101100)
GO2(0b0110100)
GO2(0b0110110)
GO2(0b1110100)
unsigned int crc = 0;
printf("crc32(0x%x, byte:0x%x) => ", crc, 0);
crc = _mm_crc32_u8(crc, 0);
printf("0x%x\n", crc);
printf("crc32(0x%x, byte:0x%x) => ", crc, 10);
crc = _mm_crc32_u8(crc, 10);
printf("0x%x\n", crc);
printf("crc32(0x%x, dword:0x%x) => ", crc, 0);
crc = _mm_crc32_u32(crc, 0);
printf("0x%x\n", crc);
printf("crc32(0x%x, dword:0x%x) => ", crc, 0x123456);
crc = _mm_crc32_u32(crc, 0x123456);
printf("0x%x\n", crc);
printf("crc32(0x%x, word:0x%x) => ", crc, 0x8765);
crc = _mm_crc32_u16(crc, 0x8765);
printf("0x%x\n", crc);
return 0;
}