box64/tests/test17.c
Yang Liu 24f2720df4
[TESTS] Disable compiler optimizations for test17 and some related fixes (#677)
* [TESTS] Disable compiler optimizations for test17

* [INTERP] Fixed MAXSD/MINSD when both sources are 0

* [RV64_DYNAREC] Fixed F2 0F 5D MINSD opcode
2023-04-09 08:41:36 +02:00

607 lines
22 KiB
C

// Build with: gcc -O0 -g -msse -msse2 -mssse3 -msse4.1 test17.c -o test17
// Add -m32 to build the 32-bit version.
#include <inttypes.h>
#include <string.h>
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <pmmintrin.h>
#include <immintrin.h>
/*
 * 128-bit vector views (GCC/Clang vector_size extension). Each type is
 * 16 bytes wide, so the lane count is 16 / sizeof(element).
 */
typedef unsigned char u8x16 __attribute__ ((vector_size (16)));
typedef unsigned short u16x8 __attribute__ ((vector_size (16)));
typedef unsigned int u32x4 __attribute__ ((vector_size (16)));
/*
 * uint64_t instead of `unsigned long int`: long is only 32 bits wide in a
 * -m32 build, which would turn this into a 4-lane vector of 32-bit values
 * and break every user that indexes two 64-bit lanes.
 */
typedef uint64_t u64x2 __attribute__ ((vector_size (16)));
typedef float f32x4 __attribute__ ((vector_size (16)));
typedef double d64x2 __attribute__ ((vector_size (16)));

/*
 * One 128-bit value that can be accessed either as an intrinsic operand
 * type (mm / mf / md) or lane by lane through the typed vector members.
 */
typedef union {
    __m128i mm;     /* integer intrinsic operand */
    __m128 mf;      /* packed-float intrinsic operand */
    __m128d md;     /* packed-double intrinsic operand */
    u8x16 u8;       /* 16 x 8-bit lanes */
    u16x8 u16;      /* 8 x 16-bit lanes */
    u32x4 u32;      /* 4 x 32-bit lanes */
    u64x2 u64;      /* 2 x 64-bit lanes */
    f32x4 f32;      /* 4 x float lanes */
    d64x2 d64;      /* 2 x double lanes */
} v128;
/*
 * Compares the floats a and b (placed in lane 0 of two vectors) with the
 * _mm_ucomi*_ss intrinsics and encodes the outcome as an integer:
 *   greater-than reported         -> 0x202
 *   less-than reported            -> 0x203
 *   equal reported                -> 0x242
 *   none of the three reported    -> 0x247
 * The checks are made in exactly that order; the first one that reports
 * true decides the result.
 * NOTE(review): the values look like x86 EFLAGS images (CF=0x001,
 * PF=0x004, ZF=0x040 on top of a 0x202 base), the last one being the
 * unordered/NaN case - confirm against the expected output.
 */
uint64_t _ucomiss_(float a, float b)
{
    const uint64_t base = 0x202;
    v128 lhs, rhs;

    /* Only lane 0 takes part in the scalar comparisons below. */
    lhs.f32[0] = a;
    rhs.f32[0] = b;

    if (_mm_ucomigt_ss(lhs.mf, rhs.mf)) {
        return base | 0x000;
    }
    if (_mm_ucomilt_ss(lhs.mf, rhs.mf)) {
        return base | 0x001;
    }
    if (_mm_ucomieq_ss(lhs.mf, rhs.mf)) {
        return base | 0x040;
    }
    return base | 0x045;
}
/*
 * Runs _mm_min_ss on a and b and returns the low 64 bits of the result
 * vector. The low 32 bits are the bit pattern of the scalar result.
 *
 * The operands are built with _mm_set_ss, which zeroes lanes 1..3. The
 * intrinsic copies the upper lanes from its first operand, so the upper
 * 32 bits of the return value are always zero instead of whatever an
 * uninitialised vector happened to contain.
 */
uint64_t _minss_(float a, float b)
{
    v128 va, vb, ret;
    va.mf = _mm_set_ss(a);
    vb.mf = _mm_set_ss(b);
    ret.mf = _mm_min_ss(va.mf, vb.mf);
    return ret.u64[0];
}
/*
 * Runs _mm_max_ss on a and b and returns the low 64 bits of the result
 * vector. The low 32 bits are the bit pattern of the scalar result.
 *
 * The operands are built with _mm_set_ss, which zeroes lanes 1..3. The
 * intrinsic copies the upper lanes from its first operand, so the upper
 * 32 bits of the return value are always zero instead of whatever an
 * uninitialised vector happened to contain.
 */
uint64_t _maxss_(float a, float b)
{
    v128 va, vb, ret;
    va.mf = _mm_set_ss(a);
    vb.mf = _mm_set_ss(b);
    ret.mf = _mm_max_ss(va.mf, vb.mf);
    return ret.u64[0];
}
/*
 * CMPSS(A, B) defines uint64_t _cmpss_<A>(float a, float b), which applies
 * the scalar comparison intrinsic _mm_cmp<B>_ss to a and b and returns the
 * low 64 bits of the resulting vector (the comparison mask sits in the low
 * 32 bits).
 *
 * Operands are built with _mm_set_ss so lanes 1..3 are zero; the upper
 * half of the returned value is therefore deterministic rather than
 * inherited from an uninitialised vector.
 *
 * The numeric suffixes 0..7 are paired with eq, lt, le, unord, neq, nlt,
 * nle, ord in that order.
 * NOTE(review): this matches the CMPSS immediate predicate numbering -
 * confirm that is the intent.
 */
#define CMPSS(A, B)                                 \
    uint64_t _cmpss_##A(float a, float b)           \
    {                                               \
        v128 va, vb, ret;                           \
        va.mf = _mm_set_ss(a);                      \
        vb.mf = _mm_set_ss(b);                      \
        ret.mf = _mm_cmp##B##_ss(va.mf, vb.mf);     \
        return ret.u64[0];                          \
    }
CMPSS(0, eq)
CMPSS(1, lt)
CMPSS(2, le)
CMPSS(3, unord)
CMPSS(4, neq)
CMPSS(5, nlt)
CMPSS(6, nle)
CMPSS(7, ord)
#undef CMPSS
/*
 * Constant input vectors for the tests in main(). The a128_* values are
 * passed as the first operand and the b128_* values as the second; the
 * numeric suffix is the lane width in bits used to initialise them.
 */
/* First operand, 16 byte lanes. */
const v128 a128_8 = {.u8 = {
0xff, 0x80, 0x7f, 0x00, 0x01, 0x02, 0x03, 0x81,
0xfe, 0x84, 0x72, 0x52, 0xa5, 0x00, 0xc0, 0x32
}};
/* First operand, 8 lanes of 16 bits. */
const v128 a128_16 = {.u16 = {
0xffff, 0x8000, 0x7fff, 0x0000, 0x0001, 0x0002, 0x0003, 0x8001
}};
/* First operand, 4 lanes of 32 bits. */
const v128 a128_32 = {.u32 = {
0xffffffff, 0x80000000, 0x7fffffff, 0x00000000
}};
/* First operand, 2 lanes of 64 bits. */
const v128 a128_64 = {.u64 = {
0xffffffffffffffffLL, 0x8000000000000000LL
}};
/* Second operand, 16 byte lanes. */
const v128 b128_8 = {.u8 = {
0x00, 0x01, 0x05, 0x15, 0x20, 0x80, 0xff, 0x00,
0x08, 0x07, 0x81, 0x06, 0x0a, 0x0f, 0x10, 0x01
}};
/* Second operand, 8 lanes of 16 bits. */
const v128 b128_16 = {.u16 = {
0x8000, 0x7fff, 0xffff, 0xffff, 0x0050, 0x9000, 0xfffe, 0x8001
}};
/* Second operand, 4 lanes of 32 bits. */
const v128 b128_32 = {.u32 = {
0x00000001, 0x80000000, 0x00000005, 0xfffffffe
}};
/* Second operand, 2 lanes of 64 bits. */
const v128 b128_64 = {.u64 = {
0x0000000000000001LL, 0x8000000000000000LL
}};
/* Third operand of the blendv test in main(); lanes 1 and 2 have the top bit set. */
const v128 c128_32 = {.u32 = {
0x00000001, 0x80000000, 0x80000005, 0x0000fffe
}};
/* Double-precision inputs: ordinary values, zero, infinities, NaN and -0.0. */
const v128 a128_pd = {.d64 = { 1.0, 2.0}};
const v128 b128_pd = {.d64 = { 0.0, -2.0}};
const v128 c128_pd = {.d64 = { INFINITY, -INFINITY}};
const v128 d128_pd = {.d64 = { NAN, -0.0}};
/* Single-precision inputs: ordinary values, zero, infinities, NaN, -NaN and -0.0. */
const v128 a128_ps = {.f32 = { 1.0, 2.0, 3.0, -4.0}};
const v128 b128_ps = {.f32 = { 0.0, -2.0, -10.0, 0.5}};
const v128 c128_ps = {.f32 = { INFINITY, -INFINITY, -INFINITY, 1.0}};
const v128 d128_ps = {.f32 = { NAN, -0.0, -NAN, INFINITY}};
/*
 * Returns a copy of a with its two double lanes exchanged.
 * The argument is passed by value, so the caller's vector is untouched.
 */
v128 reverse_pd(v128 a) {
    /* Selector 1: lane 0 <- high lane of the first operand,
     * lane 1 <- low lane of the second operand (both operands are a). */
    a.md = _mm_shuffle_pd(a.md, a.md, 1);
    return a;
}
/* Prints the 16 byte lanes of v to stdout as "0x<hex> ", lowest lane first. */
void print_8(v128 v) {
    enum { LANES = 16 };
    for (int lane = 0; lane < LANES; lane++) {
        printf("0x%x ", v.u8[lane]);
    }
}
/* Prints the 8 16-bit lanes of v to stdout as "0x<hex> ", lowest lane first. */
void print_16(v128 v) {
    enum { LANES = 8 };
    for (int lane = 0; lane < LANES; lane++) {
        printf("0x%x ", v.u16[lane]);
    }
}
/* Prints the 4 32-bit lanes of v to stdout as "0x<hex> ", lowest lane first. */
void print_32(v128 v) {
    enum { LANES = 4 };
    for (int lane = 0; lane < LANES; lane++) {
        printf("0x%x ", v.u32[lane]);
    }
}
/* Prints the 2 64-bit lanes of v to stdout as "0x<hex> ", lowest lane first. */
void print_64(v128 v) {
    enum { LANES = 2 };
    for (int lane = 0; lane < LANES; lane++) {
        printf("0x%"PRIx64" ", v.u64[lane]);
    }
}
/* A whole 128-bit value is displayed as its two 64-bit lanes. */
#define print_128 print_64
/*
 * Prints the 4 float lanes of v to stdout, lowest lane first.
 * Any NaN lane is printed as the fixed text "nan " (sign and payload are
 * not shown); every other lane is printed with "%g ".
 *
 * Uses the standard C99 isnan() macro rather than the non-standard
 * isnanf(), matching print_pd and keeping the code buildable in strict
 * ISO C modes.
 */
void print_ps(v128 v) {
    for (int i = 0; i < 4; ++i) {
        if (isnan(v.f32[i])) {
            printf("nan ");
        } else {
            printf("%g ", v.f32[i]);
        }
    }
}
/*
 * Prints the 2 double lanes of v to stdout, lowest lane first.
 * A NaN lane is shown as its raw 64-bit pattern in hex ("0x<hex> "), so
 * sign and payload stay visible; every other lane is printed with "%g ".
 */
void print_pd(v128 v) {
    enum { LANES = 2 };
    for (int lane = 0; lane < LANES; lane++) {
        if (!isnan(v.d64[lane])) {
            printf("%g ", v.d64[lane]);
            continue;
        }
        printf("0x%"PRIx64" ", v.u64[lane]);
    }
}
/* Scalar-double results are displayed with the packed-double printer. */
#define print_sd print_pd
/*
 * Reinterprets the bit pattern of a 32-bit integer as a float.
 * memcpy is used instead of a `*(float*)&x` cast, which would violate the
 * strict-aliasing rule.
 */
static float u32_as_float(uint32_t bits)
{
    float value;
    memcpy(&value, &bits, sizeof value);
    return value;
}

/*
 * Test driver. Runs the scalar helpers (_ucomiss_, _minss_, _maxss_,
 * _cmpss_0.._cmpss_7) and a long list of SSE-family intrinsics on the
 * constant vectors defined at file scope, printing one line per case to
 * stdout. Command-line arguments are ignored. Always returns 0.
 *
 * NOTE(review): the printed text is presumably compared against a stored
 * reference output, so every label is kept exactly as it was - including
 * the ones flagged below as looking wrong. Fix them together with the
 * reference if that assumption holds.
 */
int main(int argc, const char** argv)
{
    (void)argc;
    (void)argv;

    float a, b;
    uint32_t flags;
    uint32_t maxf = 0x7f7fffff;     /* bit pattern of the largest finite float */
    uint32_t r;

    /* One case: flags = A(X, Y) truncated to 32 bits, printed in hex. */
#define FLAGS_CASE(A, N, X, Y) \
    flags = A(X, Y); \
    printf(N " %f, %f => 0x%"PRIx32"\n", X, Y, flags);
    /* One case: r = A(X, Y) truncated to 32 bits, printed as a float. */
#define VALUE_CASE(A, N, X, Y) \
    r = A(X, Y); \
    printf(N " %g, %g => %g\n", X, Y, u32_as_float(r));

    /* Runs A over ordered, infinite, NaN and equal operand pairs, mostly in both orders. */
#define GO1(A, N) \
    a = 1.0f; b = 2.0f; \
    FLAGS_CASE(A, N, a, b) \
    FLAGS_CASE(A, N, b, a) \
    b = INFINITY; \
    FLAGS_CASE(A, N, a, b) \
    FLAGS_CASE(A, N, b, a) \
    b = -INFINITY; \
    FLAGS_CASE(A, N, a, b) \
    FLAGS_CASE(A, N, b, a) \
    b = NAN; \
    FLAGS_CASE(A, N, a, b) \
    FLAGS_CASE(A, N, b, a) \
    b = a; \
    FLAGS_CASE(A, N, a, b) \
    FLAGS_CASE(A, N, b, a) \
    a = b = INFINITY; \
    FLAGS_CASE(A, N, a, b) \
    a = -INFINITY; \
    FLAGS_CASE(A, N, a, b) \
    FLAGS_CASE(A, N, b, a) \
    a = b = NAN; \
    FLAGS_CASE(A, N, a, b)

    /* Runs A over finite, infinite, NaN and largest-finite operands, in both orders. */
#define GO2(A, N) \
    a = 1.0f; b = 2.0f; \
    VALUE_CASE(A, N, a, b) \
    VALUE_CASE(A, N, b, a) \
    a = -INFINITY; \
    VALUE_CASE(A, N, a, b) \
    VALUE_CASE(A, N, b, a) \
    a = +INFINITY; \
    VALUE_CASE(A, N, a, b) \
    VALUE_CASE(A, N, b, a) \
    a = NAN; \
    VALUE_CASE(A, N, a, b) \
    VALUE_CASE(A, N, b, a) \
    b = u32_as_float(maxf); \
    VALUE_CASE(A, N, a, b) \
    VALUE_CASE(A, N, b, a) \
    a = -INFINITY; \
    VALUE_CASE(A, N, a, b) \
    VALUE_CASE(A, N, b, a) \
    a = +INFINITY; \
    VALUE_CASE(A, N, a, b) \
    VALUE_CASE(A, N, b, a)

    GO1(_ucomiss_, "ucomiss")
    GO2(_minss_, "minss")
    GO2(_maxss_, "maxss")
    GO1(_cmpss_0, "cmpss 0")
    GO1(_cmpss_1, "cmpss 1")
    GO1(_cmpss_2, "cmpss 2")
    GO1(_cmpss_3, "cmpss 3")
    GO1(_cmpss_4, "cmpss 4")
    GO1(_cmpss_5, "cmpss 5")
    GO1(_cmpss_6, "cmpss 6")
    GO1(_cmpss_7, "cmpss 7")
#undef GO1
#undef GO2
#undef FLAGS_CASE
#undef VALUE_CASE

    v128 a128;      /* receives the vector result of each case */
    int i;          /* receives the integer result of each case */

    /*
     * Vector test macros. Common parameters:
     *   A        middle part of the intrinsic name (_mm_<A>_...)
     *   N        lane width; selects the intrinsic suffix and the printer
     *   C        label printed in front of the operands
     *   A1..A3   operand vectors (v128)
     *   I        immediate operand
     * Each macro runs one intrinsic and prints "label(operands) = result".
     */

    /* Unary integer op on the file-scope a128_<N> vector. */
#define GO1(A, N, C) \
    a128.mm = _mm_##A##_epi##N(a128_##N.mm); \
    printf("%s(", #C); print_##N(a128_##N); \
    printf(") = "); print_##N(a128); printf("\n");
    /* Integer op with one vector operand and an immediate. */
#define GO1C(A, N, C, A1, I) \
    a128.mm = _mm_##A##_epi##N(A1.mm, I); \
    printf("%s(", #C); print_##N(A1); \
    printf("%d) = ", I); print_##N(a128); printf("\n");
    /* Binary integer op, _epi<N> flavour. */
#define GO2(A, N, C, A1, A2) \
    a128.mm = _mm_##A##_epi##N(A1.mm, A2.mm); \
    printf("%s(", #C); print_##N(A1); \
    printf(", "); print_##N(A2); \
    printf(") = "); print_##N(a128); printf("\n");
    /* Binary integer op, _epu<N> flavour. */
#define GO2u(A, N, C, A1, A2) \
    a128.mm = _mm_##A##_epu##N(A1.mm, A2.mm); \
    printf("%s(", #C); print_##N(A1); \
    printf(", "); print_##N(A2); \
    printf(") = "); print_##N(a128); printf("\n");
    /* Binary whole-register op, _si128 flavour. */
#define GO2f(A, C, A1, A2) \
    a128.mm = _mm_##A##_si128(A1.mm, A2.mm); \
    printf("%s(", #C); print_128(A1); \
    printf(", "); print_128(A2); \
    printf(") = "); print_128(a128); printf("\n");
    /* Binary integer op with an immediate. */
#define GO2C(A, N, C, A1, A2, I) \
    a128.mm = _mm_##A##_epi##N(A1.mm, A2.mm, I); \
    printf("%s(", #C); print_##N(A1); \
    printf(", "); print_##N(A2); \
    printf("%d) = ", I); print_##N(a128); printf("\n");
    /* Binary _si128 op returning an int; the label is "p" followed by A. */
#define GO2i(A, A1, A2) \
    i = _mm_##A##_si128(A1.mm, A2.mm); \
    printf("p%s(", #A); print_64(A1); \
    printf(", "); print_64(A2); \
    printf(") = %d\n", i);
    /* Ternary packed-float op; operands and result are printed with print_<N>. */
#define GO3PS(A, N, A1, A2, A3) \
    a128.mf = _mm_##A##_ps(A1.mf, A2.mf, A3.mf); \
    printf("p%s%s(", #A, "ps"); print_##N(A1); \
    printf(", "); print_##N(A2); \
    printf(", "); print_##N(A3); \
    printf(") = "); print_##N(a128); printf("\n");
    /* Packed-double variants. */
#define GO1ipd(A, C, A1) \
    i = _mm_##A##_pd(A1.md); \
    printf("%s(", #C); print_64(A1); \
    printf(") = 0x%x\n", i);
#define GO1pd(A, C, A1) \
    a128.md = _mm_##A##_pd(A1.md); \
    printf("%s(", #C); print_pd(A1); \
    printf(") = "); print_pd(a128); printf("\n");
#define GO2pd(A, C, A1, A2) \
    a128.md = _mm_##A##_pd(A1.md, A2.md); \
    printf("%s(", #C); print_pd(A1); \
    printf(", "); print_pd(A2); \
    printf(") = "); print_pd(a128); printf("\n");
#define GO2Cpd(A, C, A1, A2, I) \
    a128.md = _mm_##A##_pd(A1.md, A2.md, I); \
    printf("%s(", #C); print_pd(A1); \
    printf(", "); print_pd(A2); \
    printf(", %d) = ", I); print_pd(a128); printf("\n");
    /* Scalar-double variants. */
#define GO1isd(A, C, A1) \
    i = _mm_##A##_sd(A1.md); \
    printf("%s(", #C); print_64(A1); \
    printf(") = 0x%x\n", i);
#define GO1sd(A, C, A1) \
    a128.md = _mm_##A##_sd(A1.md); \
    printf("%s(", #C); print_sd(A1); \
    printf(") = "); print_sd(a128); printf("\n");
#define GO2sd(A, C, A1, A2) \
    a128.md = _mm_##A##_sd(A1.md, A2.md); \
    printf("%s(", #C); print_sd(A1); \
    printf(", "); print_sd(A2); \
    printf(") = "); print_sd(a128); printf("\n");
#define GO2Csd(A, C, A1, A2, I) \
    a128.md = _mm_##A##_sd(A1.md, A2.md, I); \
    printf("%s(", #C); print_sd(A1); \
    printf(", "); print_sd(A2); \
    printf(", %d) = ", I); print_sd(a128); printf("\n");
    /* Packed-float variants. */
#define GO1ips(A, C, A1) \
    i = _mm_##A##_ps(A1.mf); \
    printf("%s(", #C); print_32(A1); \
    printf(") = 0x%x\n", i);
#define GO1ps(A, C, A1) \
    a128.mf = _mm_##A##_ps(A1.mf); \
    printf("%s(", #C); print_ps(A1); \
    printf(") = "); print_ps(a128); printf("\n");
#define GO2ps(A, C, A1, A2) \
    a128.mf = _mm_##A##_ps(A1.mf, A2.mf); \
    printf("%s(", #C); print_ps(A1); \
    printf(", "); print_ps(A2); \
    printf(") = "); print_ps(a128); printf("\n");
#define GO2Cps(A, C, A1, A2, I) \
    a128.mf = _mm_##A##_ps(A1.mf, A2.mf, I); \
    printf("%s(", #C); print_ps(A1); \
    printf(", "); print_ps(A2); \
    printf(", %d) = ", I); print_ps(a128); printf("\n");
    /* Packed-float input converted to a 32-bit integer vector. */
#define GO1ps2dq(A, C, A1) \
    a128.mm = _mm_##A##_epi32(A1.mf); \
    printf("%s(", #C); print_ps(A1); \
    printf(") = "); print_32(a128); printf("\n");

    /* The MULTI* / MULIT* macros repeat one op over a fixed list of operand pairs. */
#define MULITGO2pd(A, B) \
    GO2pd(A, B, a128_pd, b128_pd) \
    GO2pd(A, B, b128_pd, c128_pd) \
    GO2pd(A, B, a128_pd, d128_pd) \
    GO2pd(A, B, b128_pd, d128_pd) \
    GO2pd(A, B, c128_pd, d128_pd) \
    GO2pd(A, B, d128_pd, d128_pd)
#define MULITGO2Cpd(A, B, I) \
    GO2Cpd(A, B, a128_pd, b128_pd, I) \
    GO2Cpd(A, B, b128_pd, c128_pd, I) \
    GO2Cpd(A, B, a128_pd, d128_pd, I) \
    GO2Cpd(A, B, b128_pd, d128_pd, I) \
    GO2Cpd(A, B, c128_pd, d128_pd, I) \
    GO2Cpd(A, B, d128_pd, d128_pd, I)
#define MULITGO2ps(A, B) \
    GO2ps(A, B, a128_ps, b128_ps) \
    GO2ps(A, B, b128_ps, c128_ps) \
    GO2ps(A, B, a128_ps, d128_ps) \
    GO2ps(A, B, b128_ps, d128_ps) \
    GO2ps(A, B, c128_ps, d128_ps) \
    GO2ps(A, B, d128_ps, d128_ps)
#define MULTIGO1ps2dq(A, B) \
    GO1ps2dq(A, B, a128_ps) \
    GO1ps2dq(A, B, b128_ps) \
    GO1ps2dq(A, B, c128_ps) \
    GO1ps2dq(A, B, d128_ps)
#define MULITGO2Cps(A, B, I) \
    GO2Cps(A, B, a128_ps, b128_ps, I) \
    GO2Cps(A, B, b128_ps, c128_ps, I) \
    GO2Cps(A, B, a128_ps, d128_ps, I) \
    GO2Cps(A, B, b128_ps, d128_ps, I) \
    GO2Cps(A, B, c128_ps, d128_ps, I) \
    GO2Cps(A, B, d128_ps, d128_ps, I)
    /*
     * NOTE(review): the last two entries are identical; judging by the
     * non-reversed list above them, the final one was probably meant to use
     * c128_pd as first operand. Left unchanged to keep the output stable.
     */
#define MULTIGO2sd(A, B) \
    GO2sd(A, B, a128_pd, a128_pd) \
    GO2sd(A, B, a128_pd, b128_pd) \
    GO2sd(A, B, a128_pd, c128_pd) \
    GO2sd(A, B, a128_pd, d128_pd) \
    GO2sd(A, B, b128_pd, d128_pd) \
    GO2sd(A, B, c128_pd, d128_pd) \
    GO2sd(A, B, a128_pd, reverse_pd(a128_pd)) \
    GO2sd(A, B, a128_pd, reverse_pd(b128_pd)) \
    GO2sd(A, B, a128_pd, reverse_pd(c128_pd)) \
    GO2sd(A, B, a128_pd, reverse_pd(d128_pd)) \
    GO2sd(A, B, b128_pd, reverse_pd(d128_pd)) \
    GO2sd(A, B, b128_pd, reverse_pd(d128_pd))

    /* Packed integer operations. */
    GO2(shuffle, 8, pshufb, a128_8, b128_8)
    GO2(hadd, 16, phaddw, a128_16, b128_16)
    GO2(hadd, 32, phaddd, a128_32, b128_32)
    GO2(hadds, 16, phaddsw, a128_16, b128_16)
    GO2(maddubs, 16, pmaddubsw, a128_8, b128_8)
    GO2(hsub, 16, phsubw, a128_16, b128_16)
    GO2(sign, 8, psignb, a128_8, b128_8)
    GO2(sign, 16, psignw, a128_16, b128_16)
    GO2(sign, 32, psignd, a128_32, b128_32)
    GO2(mulhrs, 16, pmulhrsw, a128_16, b128_16)
    GO3PS(blendv, 32, a128_32, b128_32, c128_32)
    GO2i(testz, a128_32, b128_32)
    GO2i(testc, a128_32, b128_32)
    GO2i(testnzc, a128_32, b128_32)
    GO1(abs, 8, pabsb)
    GO1(abs, 16, pabsw)
    GO1(abs, 32, pabsd)
    GO1(cvtepi8, 16, pmovsxbw);
    GO1(cvtepi8, 32, pmovsxbd);
    GO1(cvtepi8, 64, pmovsxbq);
    GO1(cvtepi16, 32, pmovsxwd);
    GO1(cvtepi16, 64, pmovsxwq);
    GO1(cvtepi32, 64, pmovsxdq);
    GO1(cvtepu8, 16, pmovzxbw);
    GO1(cvtepu8, 32, pmovzxbd);
    GO1(cvtepu8, 64, pmovzxbq);
    GO1(cvtepu16, 32, pmovzxwd);
    GO1(cvtepu16, 64, pmovzxwq);
    GO1(cvtepu32, 64, pmovzxdq);
    GO2(min, 32, pminsd, a128_32, b128_32)
    GO2(max, 32, pmaxsd, a128_32, b128_32)
    GO2C(blend, 16, pblendw, a128_16, b128_16, 0)
    GO2C(blend, 16, pblendw, a128_16, b128_16, 0xff)
    GO2C(blend, 16, pblendw, a128_16, b128_16, 0xaa)
    GO2C(blend, 16, pblendw, a128_16, b128_16, 2)
    GO2C(alignr, 8, palignr, a128_8, b128_8, 0)
    GO2C(alignr, 8, palignr, a128_8, b128_8, 2)
    GO2C(alignr, 8, palignr, a128_8, b128_8, 7)
    GO2C(alignr, 8, palignr, a128_8, b128_8, 15)
    GO2C(alignr, 8, palignr, a128_8, b128_8, 16)
    GO2C(alignr, 8, palignr, a128_8, b128_8, 0xff)

    /* Packed double operations. */
    GO1ipd(movemask, movmskpd, a128_64)
    GO1pd(sqrt, psqrtpd, a128_pd)
    GO1pd(sqrt, psqrtpd, b128_pd)
    GO1pd(sqrt, psqrtpd, c128_pd)
    GO1pd(sqrt, psqrtpd, d128_pd)
    MULITGO2pd(and, andpd)
    MULITGO2pd(andnot, andnpd)
    MULITGO2pd(or, orpd)
    MULITGO2pd(xor, xorpd)
    MULITGO2pd(add, addpd)
    MULITGO2pd(mul, mulpd)
    MULITGO2pd(sub, subpd)
    MULITGO2pd(min, minpd)
    MULITGO2pd(div, divpd)
    MULITGO2pd(max, maxpd)

    /* More packed integer operations. */
    GO2(unpacklo, 8, punpcklbw, a128_8, b128_8)
    GO2(unpacklo, 16, punpcklwd, a128_16, b128_16)
    GO2(unpacklo, 32, punpckldq, a128_32, b128_32)
    GO2(packs, 16, ppacksswb, a128_16, b128_16)
    GO2(cmpgt, 8, pcmpgtb, a128_8, b128_8)
    GO2(cmpgt, 16, pcmpgtw, a128_16, b128_16)
    GO2(cmpgt, 32, pcmpgtd, a128_32, b128_32)
    GO2(packus, 16, packuswb, a128_16, b128_16)
    GO2(unpackhi, 8, punpckhbw, a128_8, b128_8)
    GO2(unpackhi, 16, punpckhwd, a128_16, b128_16)
    GO2(unpackhi, 32, punpckhdq, a128_32, b128_32)
    GO2(packs, 32, ppackssdw, a128_32, b128_32)
    GO2(unpacklo, 64, punpcklqdq, a128_64, b128_64)
    GO2(unpackhi, 64, punpckhqdq, a128_64, b128_64)
    GO1C(shuffle, 32, pshufd, a128_32, 0)
    GO1C(shuffle, 32, pshufd, a128_32, 0xff)
    GO1C(shuffle, 32, pshufd, a128_32, 0xaa)
    GO1C(shuffle, 32, pshufd, a128_32, 2)
    GO1C(srli, 16, psrlw, a128_16, 0)
    GO1C(srli, 16, psrlw, a128_16, 0xff)
    GO1C(srli, 16, psrlw, a128_16, 0xaa)
    GO1C(srli, 16, psrlw, a128_16, 2)
    GO1C(srli, 32, psrld, a128_32, 0)
    GO1C(srli, 32, psrld, a128_32, 0xff)
    GO1C(srli, 32, psrld, a128_32, 0xaa)
    GO1C(srli, 32, psrld, a128_32, 2)
    GO1C(srli, 64, psrlq, a128_64, 0)
    GO1C(srli, 64, psrlq, a128_64, 0xff)
    GO1C(srli, 64, psrlq, a128_64, 0xaa)
    GO1C(srli, 64, psrlq, a128_64, 2)
    GO1C(srai, 16, psraw, a128_16, 0)
    GO1C(srai, 16, psraw, a128_16, 0xff)
    GO1C(srai, 16, psraw, a128_16, 0xaa)
    GO1C(srai, 16, psraw, a128_16, 2)
    GO1C(srai, 32, psrad, a128_32, 0)
    GO1C(srai, 32, psrad, a128_32, 0xff)
    GO1C(srai, 32, psrad, a128_32, 0xaa)
    GO1C(srai, 32, psrad, a128_32, 2)
    GO1C(slli, 16, psllw, a128_16, 0)
    GO1C(slli, 16, psllw, a128_16, 0xff)
    GO1C(slli, 16, psllw, a128_16, 0xaa)
    GO1C(slli, 16, psllw, a128_16, 2)
    GO1C(slli, 32, pslld, a128_32, 0)
    GO1C(slli, 32, pslld, a128_32, 0xff)
    GO1C(slli, 32, pslld, a128_32, 0xaa)
    GO1C(slli, 32, pslld, a128_32, 2)
    GO1C(slli, 64, psllq, a128_64, 0)
    GO1C(slli, 64, psllq, a128_64, 0xff)
    GO1C(slli, 64, psllq, a128_64, 0xaa)
    GO1C(slli, 64, psllq, a128_64, 2)
    GO2(cmpeq, 8, pcmpeqb, a128_8, b128_8)
    GO2(cmpeq, 16, pcmpeqw, a128_16, b128_16)
    GO2(cmpeq, 32, pcmpeqd, a128_32, b128_32)
    MULITGO2pd(hadd, haddpd)
    GO2(srl, 16, psrlw, a128_16, b128_16)
    GO2(srl, 32, psrld, a128_32, b128_32)
    GO2(srl, 64, psrlq, a128_64, b128_64)
    GO2(add, 64, paddq, a128_64, b128_64)
    GO2(mullo, 16, pmullw, a128_16, b128_16)
    GO2u(subs, 8, psubusb, a128_8, b128_8)
    GO2u(subs, 16, psubusw, a128_16, b128_16)
    GO2u(min, 8, pminub, a128_8, b128_8)
    GO2f(and, pand, a128_8, b128_8)
    GO2u(adds, 8, paddusb, a128_8, b128_8)
    GO2u(adds, 16, paddusw, a128_16, b128_16)
    GO2u(max, 8, pmaxub, a128_8, b128_8)
    GO2f(andnot, pandn, a128_8, b128_8)
    GO2u(avg, 8, pavgb, a128_8, b128_8)
    GO2(sra, 16, psraw, a128_16, b128_16)
    GO2(sra, 32, psrad, a128_32, b128_32)
    /* NOTE(review): 16-bit average, but the label says pavgb (pavgw expected?). */
    GO2u(avg, 16, pavgb, a128_16, b128_16)
    GO2u(mulhi, 16, pmulhuw, a128_16, b128_16)
    GO2(mulhi, 16, pmulhw, a128_16, b128_16)
    GO2(subs, 8, psubsb, a128_8, b128_8)
    GO2(subs, 16, psubsw, a128_16, b128_16)
    GO2(min, 16, pminsw, a128_16, b128_16)
    GO2f(or, por, a128_8, b128_8)
    /* NOTE(review): these two call the _epi (signed) adds but carry the
     * unsigned labels paddusb/paddusw (paddsb/paddsw expected?). */
    GO2(adds, 8, paddusb, a128_8, b128_8)
    GO2(adds, 16, paddusw, a128_16, b128_16)
    GO2(max, 16, pmaxsw, a128_16, b128_16)
    GO2f(xor, pxor, a128_8, b128_8)
    GO2(sll, 16, psllw, a128_16, b128_16)
    GO2(sll, 32, pslld, a128_32, b128_32)
    GO2(sll, 64, psllq, a128_64, b128_64)
    GO2u(mul, 32, pmuludq, a128_32, b128_32)
    GO2(madd, 16, pmaddwd, a128_16, b128_16)
    GO2u(sad, 8, psadbw, a128_8, b128_8)
    GO2(sub, 8, psubb, a128_8, b128_8)
    GO2(sub, 16, psubw, a128_16, b128_16)
    GO2(sub, 32, psubd, a128_32, b128_32)
    GO2(sub, 64, psubq, a128_64, b128_64)
    GO2(add, 8, paddb, a128_8, b128_8)
    GO2(add, 16, paddw, a128_16, b128_16)
    GO2(add, 32, paddd, a128_32, b128_32)

    /* Packed float operations. */
    GO2ps(movehl, pmovhlps, a128_ps, b128_ps)
    GO2ps(unpacklo, unpcklps, a128_ps, b128_ps)
    GO2ps(unpackhi, unpckhps, a128_ps, b128_ps)
    GO2ps(movelh, pmovhps, a128_ps, b128_ps)
    GO1ps(sqrt, psqrtps, a128_ps)
    GO1ps(sqrt, psqrtps, b128_ps)
    GO1ps(sqrt, psqrtps, c128_ps)
    GO1ps(sqrt, psqrtps, d128_ps)
    //GO1ps(rsqrt, prsqrtps, a128_ps) // difference in precision
    //GO1ps(rsqrt, prsqrtps, b128_ps) // same
    //GO1ps(rsqrt, prsqrtps, c128_ps) // same
    //GO1ps(rsqrt, prsqrtps, d128_ps) // difference in the handling of NAN, (-)0, and INF in Dynarec
    //GO1ps(rcp, prcpps, a128_ps) // difference in precision
    //GO1ps(rcp, prcpps, b128_ps) // difference in precision
    //GO1ps(rcp, prcpps, c128_ps) // difference in precision
    GO1ps(rcp, prcpps, d128_ps)
    MULITGO2ps(and, andps)
    MULITGO2ps(andnot, andnps)
    MULITGO2ps(or, orps)
    MULITGO2ps(xor, xorps)
    MULITGO2ps(add, addps)
    MULITGO2ps(mul, mulps)
    MULITGO2ps(sub, subps)
    MULITGO2ps(min, minps)
    MULITGO2ps(div, divps)
    MULITGO2ps(max, maxps)
    //MULITGO2Cps(cmp, cmpps, 0) // use avx for some reason
    MULITGO2Cps(shuffle, shufps, 0)
    MULITGO2Cps(shuffle, shufps, 0x15)
    MULITGO2Cps(shuffle, shufps, 0xff)
    MULITGO2Cps(shuffle, shufps, 0x02)

    /* Scalar double operations. */
    MULTIGO2sd(sqrt, sqrtsd)
    MULTIGO2sd(add, addsd)
    MULTIGO2sd(mul, mulsd)
    MULTIGO2sd(sub, subsd)
    MULTIGO2sd(min, minsd)
    MULTIGO2sd(div, divsd)
    MULTIGO2sd(max, maxsd)

    /* NOTE(review): this calls _mm_cvtps_epi32 but is labelled cvtps2pd
     * (cvtps2dq expected?). */
    MULTIGO1ps2dq(cvtps, cvtps2pd)
    return 0;
}