Rewrite CMake & First Work for Windows Support

2026-01-31 01:15:21 +01:00 · 2025-06-18 20:50:02 +02:00
parent 66aa17ee9e
commit 453d49ed44
7 changed files with 825 additions and 580 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,32 +1,23 @@
 cmake_minimum_required(VERSION 3.14)

-set(PROJECT_SHARED rem)
-
-set(CMAKE_C_COMPILER gcc)
-set(CMAKE_CXX_COMPILER g++)
-
-set(SOURCE_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/src/)
-set(LIBRARY_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/libraries/)
+project(rem)

 if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
 endif()

-set(CMAKE_CXX_FLAGS "-g")
-
-project(${PROJECT_SHARED})
-
-add_definitions(-Ofast)
-add_definitions(-Wno-c++11-narrowing)
-add_definitions(-std=c++17)
 add_compile_definitions(XBYAK64)

-set(CMAKE_BUILD_TYPE Debug)
+set(SOURCE_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/src)
+set(LIBRARY_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/libraries)

-file(GLOB_RECURSE SOURCES "${SOURCE_DIRECTORY}*.cpp")
-file(GLOB_RECURSE HEADERS "${SOURCE_DIRECTORY}*.h")
+file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS "${SOURCE_DIRECTORY}/*.cpp") # re-glob at build time so new files are picked up
+file(GLOB_RECURSE HEADERS CONFIGURE_DEPENDS "${SOURCE_DIRECTORY}/*.h")

-include_directories(${PROJECT_SHARED} ${LIBRARY_DIRECTORY})
-include_directories(${PROJECT_SHARED} ${SOURCE_DIRECTORY})
+add_library(rem STATIC ${SOURCES} ${HEADERS})
+target_compile_features(rem PUBLIC cxx_std_17) # portable replacement for the removed -std=c++17 flag
+target_include_directories(rem PUBLIC ${SOURCE_DIRECTORY} PRIVATE ${LIBRARY_DIRECTORY}) # target-scoped; 'rem' is a target, not a directory

-add_library(${PROJECT_SHARED} SHARED ${SOURCES} ${HEADERS})
+if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") # Clang/AppleClang only: MSVC does not recognise this flag
+ target_compile_options(rem PRIVATE -Wno-c++11-narrowing)
+endif()
--- a/libraries/xbyak/xbyak.h
+++ b/libraries/xbyak/xbyak.h
@@ -1,4 +1,4 @@
-
+#pragma once
 #ifndef XBYAK_XBYAK_H_
 #define XBYAK_XBYAK_H_
 /*!
@@ -151,11 +151,17 @@
 	#pragma GCC diagnostic ignored "-Warray-bounds"
 #endif

+// Define this macro as 0 to disable strict checking of memory operand and register size matching.
+// This macro may be removed in future versions.
+#ifndef XBYAK_STRICT_CHECK_MEM_REG_SIZE
+	#define XBYAK_STRICT_CHECK_MEM_REG_SIZE 1
+#endif
+
 namespace Xbyak {

 enum {
 	DEFAULT_MAX_CODE_SIZE = 4096,
-	VERSION = 0x7070 /* 0xABCD = A.BC(.D) */
+	VERSION = 0x7260 /* 0xABCD = A.BC(.D) */
 };

 #ifndef MIE_INTEGER_TYPE_DEFINED
@@ -232,6 +238,8 @@ enum {
 	ERR_CANT_USE_REX2,
 	ERR_INVALID_DFV,
 	ERR_INVALID_REG_IDX,
+	ERR_BAD_ENCODING_MODE,
+	ERR_CANT_USE_ABCDH,
 	ERR_INTERNAL // Put it at last.
 };

@@ -290,6 +298,8 @@ inline const char *ConvertErrorToString(int err)
 		"can't use rex2",
 		"invalid dfv",
 		"invalid reg index",
+		"bad encoding mode",
+		"can't use [abcd]h with rex",
 		"internal error"
 	};
 	assert(ERR_INTERNAL + 1 == sizeof(errTbl) / sizeof(*errTbl));
@@ -634,12 +644,12 @@ public:
 	void setBit(int bit);
 	void setOpmaskIdx(int idx, bool /*ignore_idx0*/ = true)
 	{
-		if (mask_) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET)
+		if (mask_ && (mask_ != idx)) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET)
 		mask_ = idx;
 	}
 	void setRounding(int idx)
 	{
-		if (rounding_) XBYAK_THROW(ERR_ROUNDING_IS_ALREADY_SET)
+		if (rounding_ && (rounding_ != idx)) XBYAK_THROW(ERR_ROUNDING_IS_ALREADY_SET)
 		rounding_ = idx;
 	}
 	void setZero() { zero_ = true; }
@@ -784,6 +794,9 @@ class Label;
 struct Reg8;
 struct Reg16;
 struct Reg32;
+struct Xmm;
+struct Ymm;
+struct Zmm;
 #ifdef XBYAK64
 struct Reg64;
 #endif
@@ -799,6 +812,9 @@ public:
 #ifdef XBYAK64
 	Reg64 cvt64() const;
 #endif
+	Xmm cvt128() const;
+	Ymm cvt256() const;
+	Zmm cvt512() const;
 	Reg operator|(const ApxFlagNF&) const { Reg r(*this); r.setNF(); return r; }
 	Reg operator|(const ApxFlagZU&) const { Reg r(*this); r.setZU(); return r; }
 };
@@ -936,6 +952,21 @@ inline Reg64 Reg::cvt64() const
 }
 #endif

+inline Xmm Reg::cvt128() const
+{
+	return Xmm(changeBit(128).getIdx());
+}
+
+inline Ymm Reg::cvt256() const
+{
+	return Ymm(changeBit(256).getIdx());
+}
+
+inline Zmm Reg::cvt512() const
+{
+	return Zmm(changeBit(512).getIdx());
+}
+
 #ifndef XBYAK_DISABLE_SEGMENT
 // not derived from Reg
 class Segment {
@@ -1673,7 +1704,9 @@ inline const uint8_t* Label::getAddress() const
 typedef enum {
 	DefaultEncoding,
 	VexEncoding,
-	EvexEncoding
+	EvexEncoding,
+	PreAVX10v2Encoding,
+	AVX10v2Encoding
 } PreferredEncoding;

 class CodeGenerator : public CodeArray {
@@ -1730,10 +1763,10 @@ private:
 	{
 		return op1.isREG(i32e) && ((op2.isREG(i32e) && op1.getBit() == op2.getBit()) || op2.isMEM());
 	}
-	static inline bool isValidSSE(const Operand& op1)
+	static inline bool isValidSSE(const Operand& op)
 	{
 		// SSE instructions do not support XMM16 - XMM31
-		return !(op1.isXMM() && op1.getIdx() >= 16);
+		return !(op.isXMM() && op.getIdx() >= 16);
 	}
 	static inline uint8_t rexRXB(int bit, int bit3, const Reg& r, const Reg& b, const Reg& x = Reg())
 	{
@@ -1815,12 +1848,11 @@ private:
 	static const uint64_t T_0F = 1ull << 8;
 	static const uint64_t T_0F38 = 1ull << 9;
 	static const uint64_t T_0F3A = 1ull << 10;
-	static const uint64_t T_L0 = 1ull << 11;
+	static const uint64_t T_MAP5 = 1ull << 11;
 	static const uint64_t T_L1 = 1ull << 12;
-	static const uint64_t T_W0 = 1ull << 13;
-	static const uint64_t T_W1 = 1ull << 14;
-	static const uint64_t T_EW0 = 1ull << 15;
-	static const uint64_t T_EW1 = 1ull << 16;
+	static const uint64_t T_W0 = 1ull << 13; // T_EW0 = T_W0
+	static const uint64_t T_W1 = 1ull << 14; // for VEX
+	static const uint64_t T_EW1 = 1ull << 16; // for EVEX
 	static const uint64_t T_YMM = 1ull << 17; // support YMM, ZMM
 	static const uint64_t T_EVEX = 1ull << 18;
 	static const uint64_t T_ER_X = 1ull << 19; // xmm{er}
@@ -1836,23 +1868,29 @@ private:
 	static const uint64_t T_M_K = 1ull << 28; // mem{k}
 	static const uint64_t T_VSIB = 1ull << 29;
 	static const uint64_t T_MEM_EVEX = 1ull << 30; // use evex if mem
-	static const uint64_t T_FP16 = 1ull << 31; // avx512-fp16
-	static const uint64_t T_MAP5 = T_FP16 | T_0F;
-	static const uint64_t T_MAP6 = T_FP16 | T_0F38;
+	static const uint64_t T_MAP6 = 1ull << 31;
 	static const uint64_t T_NF = 1ull << 32; // T_nf
 	static const uint64_t T_CODE1_IF1 = 1ull << 33; // code|=1 if !r.isBit(8)

 	static const uint64_t T_ND1 = 1ull << 35; // ND=1
 	static const uint64_t T_ZU = 1ull << 36; // ND=ZU
 	static const uint64_t T_F2 = 1ull << 37; // pp = 3
+	static const uint64_t T_SENTRY = (1ull << 38)-1; // attribute(>=T_SENTRY) is for error check
+	static const uint64_t T_ALLOW_DIFF_SIZE = 1ull << 38; // allow difference reg size
+	static const uint64_t T_ALLOW_ABCDH = 1ull << 39; // allow [abcd]h reg
 	// T_66 = 1, T_F3 = 2, T_F2 = 3
 	static inline uint32_t getPP(uint64_t type) { return (type & T_66) ? 1 : (type & T_F3) ? 2 : (type & T_F2) ? 3 : 0; }
 	// @@@end of avx_type_def.h
-	static inline uint32_t getMap(uint64_t type) { return (type & T_0F) ? 1 : (type & T_0F38) ? 2 : (type & T_0F3A) ? 3 : 0; }
+	static inline uint32_t getMap(uint64_t type)
+	{
+		if (type & T_MAP6) return 6;
+		if (type & T_MAP5) return 5;
+		return (type & T_0F) ? 1 : (type & T_0F38) ? 2 : (type & T_0F3A) ? 3 : 0;
+	}
 	void vex(const Reg& reg, const Reg& base, const Operand *v, uint64_t type, int code, bool x = false)
 	{
 		int w = (type & T_W1) ? 1 : 0;
-		bool is256 = (type & T_L1) ? true : (type & T_L0) ? false : reg.isYMM();
+		bool is256 = (type & T_L1) ? true : reg.isYMM();
 		bool r = reg.isExtIdx();
 		bool b = base.isExtIdx();
 		int idx = v ? v->getIdx() : 0;
@@ -1867,16 +1905,19 @@ private:
 		}
 		db(code);
 	}
-	void verifySAE(const Reg& r, uint64_t type) const
+	// Allow YMM embedded rounding for AVX10.2 to minimize flag modifications
+	bool verifySAE(const Reg& r, const Reg& b, uint64_t type) const
 	{
-		if (((type & T_SAE_X) && r.isXMM()) || ((type & T_SAE_Y) && r.isYMM()) || ((type & T_SAE_Z) && r.isZMM())) return;
-		XBYAK_THROW(ERR_SAE_IS_INVALID)
+		if (((type & T_SAE_X) && (r.isYMM() && b.isXMM())) || ((type & T_SAE_Y) && b.isXMM()) || ((type & T_SAE_Z) && b.isYMM())) return true;
+		if (((type & T_SAE_X) && b.isXMM()) || ((type & T_SAE_Y) && b.isYMM()) || ((type & T_SAE_Z) && b.isZMM())) return false;
+		XBYAK_THROW_RET(ERR_SAE_IS_INVALID, false)
 	}
-	void verifyER(const Reg& r, uint64_t type) const
+	bool verifyER(const Reg& r, const Reg& b, uint64_t type) const // returns true when the caller must clear EVEX.U (AVX10.2 YMM embedded rounding)
 	{
-		if ((type & T_ER_R) && r.isREG(32|64)) return;
-		if (((type & T_ER_X) && r.isXMM()) || ((type & T_ER_Y) && r.isYMM()) || ((type & T_ER_Z) && r.isZMM())) return;
-		XBYAK_THROW(ERR_ER_IS_INVALID)
+		if ((type & T_ER_R) && b.isREG(32|64)) return false;
+		if (((type & T_ER_X) && (r.isYMM() && b.isXMM())) || ((type & T_ER_Y) && b.isXMM()) || ((type & T_ER_Z) && b.isYMM())) return true;
+		if (((type & T_ER_X) && b.isXMM()) || ((type & T_ER_Y) && b.isYMM()) || ((type & T_ER_Z) && b.isZMM())) return false;
+		XBYAK_THROW_RET(ERR_ER_IS_INVALID, false) // an invalid {er} operand reports the ER error (not the SAE one), as it did before this change
 	}
 	// (a, b, c) contains non zero two or three values then err
 	int verifyDuplicate(int a, int b, int c, int err)
@@ -1890,31 +1931,32 @@ private:
 		if (!(type & (T_EVEX | T_MUST_EVEX))) XBYAK_THROW_RET(ERR_EVEX_IS_INVALID, 0)
 		int w = (type & T_EW1) ? 1 : 0;
 		uint32_t mmm = getMap(type);
-		if (type & T_FP16) mmm |= 4;
 		uint32_t pp = getPP(type);
 		int idx = v ? v->getIdx() : 0;
 		uint32_t vvvv = ~idx;

 		bool R = reg.isExtIdx();
 		bool X3 = (x && x->isExtIdx()) || (base.isSIMD() && base.isExtIdx2());
-		bool B4 = base.isREG() && base.isExtIdx2();
-		bool X4 = x && (x->isREG() && x->isExtIdx2());
+		uint8_t B4 = (base.isREG() && base.isExtIdx2()) ? 8 : 0;
+		uint8_t U = (x && (x->isREG() && x->isExtIdx2())) ? 0 : 4;
 		bool B = base.isExtIdx();
 		bool Rp = reg.isExtIdx2();
 		int LL;
 		int rounding = verifyDuplicate(reg.getRounding(), base.getRounding(), v ? v->getRounding() : 0, ERR_ROUNDING_IS_ALREADY_SET);
 		int disp8N = 1;
 		if (rounding) {
+			bool isUzero = false;
 			if (rounding == EvexModifierRounding::T_SAE) {
-				verifySAE(base, type); LL = 0;
+				isUzero = verifySAE(reg, base, type); LL = 0;
 			} else {
-				verifyER(base, type); LL = rounding - 1;
+				isUzero = verifyER(reg, base, type); LL = rounding - 1;
 			}
+			if (isUzero) U = 0; // avx10.2 Evex.U
 			b = true;
 		} else {
 			if (v) VL = (std::max)(VL, v->getBit());
 			VL = (std::max)((std::max)(reg.getBit(), base.getBit()), VL);
-			LL = (VL == 512) ? 2 : (VL == 256) ? 1 : 0;
+			LL = (VL >= 512 /* tmm */) ? 2 : (VL == 256) ? 1 : 0;
 			if (b) {
 				disp8N = ((type & T_B16) == T_B16) ? 2 : (type & T_B32) ? 4 : 8;
 			} else if ((type & T_NX_MASK) == T_DUP) {
@@ -1935,8 +1977,8 @@ private:
 		if (aaa == 0) aaa = verifyDuplicate(base.getOpmaskIdx(), reg.getOpmaskIdx(), (v ? v->getOpmaskIdx() : 0), ERR_OPMASK_IS_ALREADY_SET);
 		if (aaa == 0) z = 0; // clear T_z if mask is not set
 		db(0x62);
-		db((R ? 0 : 0x80) | (X3 ? 0 : 0x40) | (B ? 0 : 0x20) | (Rp ? 0 : 0x10) | (B4 ? 8 : 0) | mmm);
-		db((w == 1 ? 0x80 : 0) | ((vvvv & 15) << 3) | (X4 ? 0 : 4) | (pp & 3));
+		db((R ? 0 : 0x80) | (X3 ? 0 : 0x40) | (B ? 0 : 0x20) | (Rp ? 0 : 0x10) | B4 | mmm);
+		db((w == 1 ? 0x80 : 0) | ((vvvv & 15) << 3) | U | (pp & 3));
 		db((z ? 0x80 : 0) | ((LL & 3) << 5) | (b ? 0x10 : 0) | (V4 ? 0 : 8) | (aaa & 7));
 		db(code);
 		return disp8N;
@@ -2036,7 +2078,6 @@ private:
 		}
 	}
 	LabelManager labelMgr_;
-	bool isInDisp16(uint32_t x) const { return 0xFFFF8000 <= x || x <= 0x7FFF; }
 	void writeCode(uint64_t type, const Reg& r, int code, bool rex2 = false)
 	{
 		if (!(type&T_APX || rex2)) {
@@ -2048,19 +2089,24 @@ private:
 				db(0x0F); db(0x3A);
 			}
 		}
-		db(code | ((type == 0 || (type & T_CODE1_IF1)) && !r.isBit(8)));
+		db(code | (((type & T_SENTRY) == 0 || (type & T_CODE1_IF1)) && !r.isBit(8)));
 	}
-	void opRR(const Reg& reg1, const Reg& reg2, uint64_t type, int code)
+	void opRR(const Reg& r1, const Reg& r2, uint64_t type, int code)
 	{
-		bool rex2 = rex(reg2, reg1, type);
-		writeCode(type, reg1, code, rex2);
-		setModRM(3, reg1.getIdx(), reg2.getIdx());
+		if (!(type & T_ALLOW_DIFF_SIZE) && r1.isREG() && r2.isREG() && r1.getBit() != r2.getBit()) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER)
+		if (!(type & T_ALLOW_ABCDH) && (isBadCombination(r1, r2) || isBadCombination(r2, r1))) XBYAK_THROW(ERR_CANT_USE_ABCDH)
+		bool rex2 = rex(r2, r1, type);
+		writeCode(type, r1, code, rex2);
+		setModRM(3, r1.getIdx(), r2.getIdx());
 	}
 	void opMR(const Address& addr, const Reg& r, uint64_t type, int code, uint64_t type2 = 0, int code2 = NONE)
 	{
 		if (code2 == NONE) code2 = code;
 		if (type2 && opROO(Reg(), addr, r, type2, code2)) return;
 		if (addr.is64bitDisp()) XBYAK_THROW(ERR_CANT_USE_64BIT_DISP)
+#if XBYAK_STRICT_CHECK_MEM_REG_SIZE == 1
+		if (!(type & T_ALLOW_DIFF_SIZE) && r.getBit() <= BIT && addr.getBit() > 0 && addr.getBit() != r.getBit()) XBYAK_THROW(ERR_BAD_MEM_SIZE)
+#endif
 		bool rex2 = rex(addr, r, type);
 		writeCode(type, r, code, rex2);
 		opAddr(addr, r.getIdx());
@@ -2163,7 +2209,7 @@ private:
 			}
 		}
 	}
-	void opSSE(const Reg& r, const Operand& op, uint64_t type, int code, bool isValid(const Operand&, const Operand&), int imm8 = NONE)
+	void opSSE(const Reg& r, const Operand& op, uint64_t type, int code, bool isValid(const Operand&, const Operand&) = 0, int imm8 = NONE)
 	{
 		if (isValid && !isValid(r, op)) XBYAK_THROW(ERR_BAD_COMBINATION)
 		if (!isValidSSE(r) || !isValidSSE(op)) XBYAK_THROW(ERR_NOT_SUPPORTED)
@@ -2205,6 +2251,13 @@ private:
 			opSSE(mmx, op, T_66 | T_0F3A, code, isXMM_REG32orMEM, imm);
 		}
 	}
+	// r1 is [abcd]h and r2 is reg with rex
+	bool isBadCombination(const Reg& r1, const Reg& r2) const
+	{
+		if (!r1.isHigh8bit()) return false;
+		if (r2.isExt8bit() || r2.getIdx() >= 8) return true;
+		return false;
+	}
 	// (r, r, m) or (r, m, r)
 	bool opROO(const Reg& d, const Operand& op1, const Operand& op2, uint64_t type, int code, int immSize = 0, int sc = NONE)
 	{
@@ -2232,11 +2285,11 @@ private:
 		int opBit = op.getBit();
 		if (disableRex && opBit == 64) opBit = 32;
 		const Reg r(ext, Operand::REG, opBit);
-		if ((type & T_APX) && op.hasRex2NFZU() && opROO(d ? *d : Reg(0, Operand::REG, opBit), op, r, type, code)) return;
+		if ((type & T_APX) && (d != 0 || op.hasRex2NFZU()) && opROO(d ? *d : Reg(0, Operand::REG, opBit), op, r, type, code)) return;
 		if (op.isMEM()) {
 			opMR(op.getAddress(immSize), r, type, code);
 		} else if (op.isREG(bit)) {
-			opRR(r, op.getReg().changeBit(opBit), type, code);
+			opRR(r, op.getReg().changeBit(opBit), type | T_ALLOW_ABCDH, code);
 		} else {
 			XBYAK_THROW(ERR_BAD_COMBINATION)
 		}
@@ -2295,10 +2348,13 @@ private:
 			opRO(static_cast<const Reg&>(op2), op1, 0, code, op1.getKind() == op2.getKind());
 		}
 	}
+	bool isInDisp16(uint32_t x) const { return 0xFFFF8000 <= x || x <= 0x7FFF; }
+	// allow add(ax, 0x8000);
+	bool isInDisp16relaxed(uint32_t x) const { uint32_t v = x & 0xffff0000; return v == 0 || v == 0xffff0000; }
 	uint32_t getImmBit(const Operand& op, uint32_t imm)
 	{
 		verifyMemHasSize(op);
-		uint32_t immBit = inner::IsInDisp8(imm) ? 8 : isInDisp16(imm) ? 16 : 32;
+		uint32_t immBit = inner::IsInDisp8(imm) ? 8 : isInDisp16relaxed(imm) ? 16 : 32;
 		if (op.isBit(8)) immBit = 8;
 		if (op.getBit() < immBit) XBYAK_THROW_RET(ERR_IMM_IS_TOO_BIG, 0)
 		if (op.isBit(32|64) && immBit == 16) immBit = 32; /* don't use MEM16 if 32/64bit mode */
@@ -2352,7 +2408,7 @@ private:
 		if (op.isREG() && op.hasRex2()) {
 			const Reg& r = static_cast<const Reg&>(op);
 			rex2(0, rexRXB(3, 0, Reg(), r), Reg(), r);
-			db(alt);
+			db(alt | (r.getIdx() & 7));
 			return;
 		}
 		int bit = op.getBit();
@@ -2364,7 +2420,7 @@ private:
 				return;
 			}
 			if (op.isMEM()) {
-				opMR(op.getAddress(), Reg(ext, Operand::REG, 32), 0, code);
+				opMR(op.getAddress(), Reg(ext, Operand::REG, 32), T_ALLOW_DIFF_SIZE, code);
 				return;
 			}
 		}
@@ -2422,7 +2478,7 @@ private:
 		if (op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION)
 		int w = op.isBit(16);
 		if (!(reg.isREG() && (reg.getBit() > op.getBit()))) XBYAK_THROW(ERR_BAD_COMBINATION)
-		opRO(reg, op, T_0F, code | w);
+		opRO(reg, op, T_0F | T_ALLOW_DIFF_SIZE, code | w);
 	}
 	void opFpuMem(const Address& addr, uint8_t m16, uint8_t m32, uint8_t m64, uint8_t ext, uint8_t m64ext)
 	{
@@ -2554,6 +2610,18 @@ private:
 		Operand::Kind kind = op.isBit(128) ? Operand::XMM : op.isBit(256) ? Operand::YMM : Operand::ZMM;
 		opVex(x.copyAndSetKind(kind), &xm0, op, type, code);
 	}
+	// (x, x, x/m), (x, y, y/m), (y, z, z/m)
+	void opCvt6(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code)
+	{
+		int b1 = x1.getBit();
+		int b2 = x2.getBit();
+		int b3 = op.getBit();
+		if ((b1 == 128 && (b2 == 128 || b2 == 256) && (b2 == b3 || op.isMEM())) || (b1 == 256 && b2 == 512 && (b3 == b2 || op.isMEM()))) {
+			opVex(x1, &x2, op, type, code);
+			return;
+		}
+		XBYAK_THROW(ERR_BAD_COMBINATION);
+	}
 	const Xmm& cvtIdx0(const Operand& x) const
 	{
 		return x.isZMM() ? zm0 : x.isYMM() ? ym0 : xm0;
@@ -2568,8 +2636,7 @@ private:
 		if (reg.isBit(8)) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER)
 		bool is16bit = reg.isREG(16) && (op.isREG(16) || op.isMEM());
 		if (!is16bit && !(reg.isREG(i32e) && (op.isREG(reg.getBit()) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION)
-		if (is16bit) db(0x66);
-		opRO(reg.changeBit(i32e == 32 ? 32 : reg.getBit()), op, T_F3 | T_0F, code);
+		opRO(reg, op, T_F3 | T_0F, code);
 	}
 	void opGather(const Xmm& x1, const Address& addr, const Xmm& x2, uint64_t type, uint8_t code, int mode)
 	{
@@ -2644,21 +2711,24 @@ private:
 		if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING)
 		opVex(x, 0, addr, type, code);
 	}
-	void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding)
+	void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding enc, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0)
 	{
-		opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code);
+		opAVX_X_X_XM(x1, x2, op, type | orEvexIf(enc, typeVex, typeEvex, sel), code, imm);
 	}
-	int orEvexIf(PreferredEncoding encoding) {
-		if (encoding == DefaultEncoding) {
-			encoding = defaultEncoding_;
+	PreferredEncoding getEncoding(PreferredEncoding enc, int sel) const
+	{
+		if (enc == DefaultEncoding) {
+			enc = defaultEncoding_[sel];
 		}
-		if (encoding == EvexEncoding) {
+		if ((sel == 0 && enc != VexEncoding && enc != EvexEncoding) || (sel == 1 && enc != PreAVX10v2Encoding && enc != AVX10v2Encoding)) XBYAK_THROW_RET(ERR_BAD_ENCODING_MODE, VexEncoding)
 #ifdef XBYAK_DISABLE_AVX512
-			XBYAK_THROW(ERR_EVEX_IS_INVALID)
+		if (enc == EvexEncoding || enc == AVX10v2Encoding) XBYAK_THROW_RET(ERR_EVEX_IS_INVALID, VexEncoding)
 #endif
-			return T_MUST_EVEX;
-		}
-		return 0;
+		return enc;
+	}
+	uint64_t orEvexIf(PreferredEncoding enc, uint64_t typeVex, uint64_t typeEvex, int sel) {
+		enc = getEncoding(enc, sel);
+		return ((sel == 0 && enc == VexEncoding) || (sel == 1 && enc != AVX10v2Encoding)) ? typeVex : (T_MUST_EVEX | typeEvex);
 	}
 	void opInOut(const Reg& a, const Reg& d, uint8_t code)
 	{
@@ -2752,7 +2822,7 @@ private:
 		const Operand *p1 = &k, *p2 = &op;
 		if (code == 0x93) { std::swap(p1, p2); }
 		if (opROO(Reg(), *p2, *p1, T_APX|type, code)) return;
-		opVex(static_cast<const Reg&>(*p1), 0, *p2, T_L0|type, code);
+		opVex(static_cast<const Reg&>(*p1), 0, *p2, type, code);
 	}
 	void opEncodeKey(const Reg32& r1, const Reg32& r2, uint8_t code1, uint8_t code2)
 	{
@@ -2770,6 +2840,31 @@ private:
 		}
 		opSSE(x, op, type1, code1, isXMM_XMMorMEM, imm);
 	}
+	// AVX10 zero-extending for vmovd, vmovw
+	void opAVX10ZeroExt(const Operand& op1, const Operand& op2, const uint64_t typeTbl[4], const int codeTbl[4], PreferredEncoding enc, int bit)
+	{
+		const Operand *p1 = &op1;
+		const Operand *p2 = &op2;
+		bool rev = false;
+		if (p1->isMEM()) {
+			std::swap(p1, p2);
+			rev = true;
+		}
+		if (p1->isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION)
+		if (p1->isXMM()) {
+			std::swap(p1, p2);
+			rev = !rev;
+		}
+		enc = getEncoding(enc, 1);
+		int sel = -1;
+		if (p1->isXMM() || (p1->isMEM() && enc == AVX10v2Encoding)) {
+			sel = 2 + int(rev);
+		} else if (p1->isREG(bit) || p1->isMEM()) {
+			sel = int(rev);
+		}
+		if (sel == -1) XBYAK_THROW(ERR_BAD_COMBINATION)
+		opAVX_X_X_XM(*static_cast<const Xmm*>(p2), xm0, *p1, typeTbl[sel], codeTbl[sel]);
+	}
 public:
 	unsigned int getVersion() const { return VERSION; }
 	using CodeArray::db;
@@ -2828,7 +2923,7 @@ public:
 #endif
 private:
 	bool isDefaultJmpNEAR_;
-	PreferredEncoding defaultEncoding_;
+	PreferredEncoding defaultEncoding_[2]; // 0:vnni, 1:vmpsadbw
 public:
 	void L(const std::string& label) { labelMgr_.defineSlabel(label); }
 	void L(Label& label) { labelMgr_.defineClabel(label); }
@@ -2999,6 +3094,7 @@ public:
 			rex(*p2, *p1); db(0x90 | (p2->getIdx() & 7));
 			return;
 		}
+		if (p1->isREG() && p2->isREG()) std::swap(p1, p2); // adapt to NASM 2.16.03 behavior to pass tests
 		opRO(static_cast<const Reg&>(*p1), *p2, 0, 0x86 | (p1->isBit(8) ? 0 : 1), (p1->isREG() && (p1->getBit() == p2->getBit())));
 	}

@@ -3044,11 +3140,11 @@ public:
 	}
 	void mov(const Operand& op, const Segment& seg)
 	{
-		opRO(Reg8(seg.getIdx()), op, 0, 0x8C, op.isREG(16|i32e));
+		opRO(Reg8(seg.getIdx()), op, T_ALLOW_DIFF_SIZE | T_ALLOW_ABCDH, 0x8C, op.isREG(16|i32e));
 	}
 	void mov(const Segment& seg, const Operand& op)
 	{
-		opRO(Reg8(seg.getIdx()), op.isREG(16|i32e) ? static_cast<const Operand&>(op.getReg().cvt32()) : op, 0, 0x8E, op.isREG(16|i32e));
+		opRO(Reg8(seg.getIdx()), op.isREG(16|i32e) ? static_cast<const Operand&>(op.getReg().cvt32()) : op, T_ALLOW_DIFF_SIZE | T_ALLOW_ABCDH, 0x8E, op.isREG(16|i32e));
 	}
 #endif

@@ -3113,8 +3209,9 @@ public:
 		, es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs)
 #endif
 		, isDefaultJmpNEAR_(false)
-		, defaultEncoding_(EvexEncoding)
 	{
+		setDefaultEncoding();
+		setDefaultEncodingAVX10();
 		labelMgr_.set(this);
 	}
 	void reset()
@@ -3151,13 +3248,20 @@ public:
 	#undef jnl
 #endif

-	// set default encoding to select Vex or Evex
-	void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; }
-
-	void sha1msg12(const Xmm& x, const Operand& op)
+	// set default encoding of VNNI
+	// EvexEncoding : AVX512_VNNI, VexEncoding : AVX-VNNI
+	void setDefaultEncoding(PreferredEncoding enc = EvexEncoding)
 	{
-		opROO(Reg(), op, x, T_MUST_EVEX, 0xD9);
+		if (enc != VexEncoding && enc != EvexEncoding) XBYAK_THROW(ERR_BAD_ENCODING_MODE)
+		defaultEncoding_[0] = enc;
 	}
+	// default : PreAVX10v2Encoding : AVX-VNNI-INT8/AVX512-FP16
+	void setDefaultEncodingAVX10(PreferredEncoding enc = PreAVX10v2Encoding)
+	{
+		if (enc != PreAVX10v2Encoding && enc != AVX10v2Encoding) XBYAK_THROW(ERR_BAD_ENCODING_MODE)
+		defaultEncoding_[1] = enc;
+	}
+
 	void bswap(const Reg32e& r)
 	{
 		int idx = r.getIdx();
@@ -3170,6 +3274,24 @@ public:
 		}
 		db(0xC8 + (idx & 7));
 	}
+	void vmovd(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding)
+	{
+		const uint64_t typeTbl[] = {
+			T_EVEX|T_66|T_0F|T_W0|T_N4, T_EVEX|T_66|T_0F|T_W0|T_N4, // legacy, avx, avx512
+			T_MUST_EVEX|T_66|T_0F|T_N4, T_MUST_EVEX|T_F3|T_0F|T_N4, // avx10.2
+		};
+		const int codeTbl[] = { 0x7E, 0x6E, 0xD6, 0x7E };
+		opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, enc, 32);
+	}
+	void vmovw(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding)
+	{
+		const uint64_t typeTbl[] = {
+			T_MUST_EVEX|T_66|T_MAP5|T_N2, T_MUST_EVEX|T_66|T_MAP5|T_N2, // avx512-fp16
+			T_MUST_EVEX|T_F3|T_MAP5|T_N2, T_MUST_EVEX|T_F3|T_MAP5|T_N2, // avx10.2
+		};
+		const int codeTbl[] = { 0x7E, 0x6E, 0x7E, 0x6E };
+		opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, enc, 16|32|64);
+	}
 	/*
 		use single byte nop if useMultiByteNop = false
 	*/
--- a/libraries/xbyak/xbyak_mnemonic.h
+++ b/libraries/xbyak/xbyak_mnemonic.h
--- a/libraries/xbyak/xbyak_util.h
+++ b/libraries/xbyak/xbyak_util.h
@@ -548,6 +548,12 @@ public:
 	XBYAK_DEFINE_TYPE(88, tSSE4a);
 	XBYAK_DEFINE_TYPE(89, tCLWB);
 	XBYAK_DEFINE_TYPE(90, tTSXLDTRK);
+	XBYAK_DEFINE_TYPE(91, tAMX_TRANSPOSE);
+	XBYAK_DEFINE_TYPE(92, tAMX_TF32);
+	XBYAK_DEFINE_TYPE(93, tAMX_AVX512);
+	XBYAK_DEFINE_TYPE(94, tAMX_MOVRS);
+	XBYAK_DEFINE_TYPE(95, tAMX_FP8);
+	XBYAK_DEFINE_TYPE(96, tMOVRS);

 #undef XBYAK_SPLIT_ID
 #undef XBYAK_DEFINE_TYPE
@@ -702,12 +708,20 @@ public:
 				if (EAX & (1U << 7)) type_ |= tCMPCCXADD;
 				if (EAX & (1U << 21)) type_ |= tAMX_FP16;
 				if (EAX & (1U << 23)) type_ |= tAVX_IFMA;
+				if (EAX & (1U << 31)) type_ |= tMOVRS;
 				if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8;
 				if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT;
 				if (EDX & (1U << 10)) type_ |= tAVX_VNNI_INT16;
 				if (EDX & (1U << 14)) type_ |= tPREFETCHITI;
 				if (EDX & (1U << 19)) type_ |= tAVX10;
 				if (EDX & (1U << 21)) type_ |= tAPX_F;
+
+				getCpuidEx(0x1e, 1, data);
+				if (EAX & (1U << 4)) type_ |= tAMX_FP8;
+				if (EAX & (1U << 5)) type_ |= tAMX_TRANSPOSE;
+				if (EAX & (1U << 6)) type_ |= tAMX_TF32;
+				if (EAX & (1U << 7)) type_ |= tAMX_AVX512;
+				if (EAX & (1U << 8)) type_ |= tAMX_MOVRS;
 			}
 		}
 		if (maxNum >= 0x19) {
@@ -892,17 +906,17 @@ class StackFrame {
 #endif
 	static const int maxRegNum = 14; // maxRegNum = 16 - rsp - rax
 	Xbyak::CodeGenerator *code_;
-	int pNum_;
-	int tNum_;
-	bool useRcx_;
-	bool useRdx_;
-	int saveNum_;
-	int P_;
-	bool makeEpilog_;
 	Xbyak::Reg64 pTbl_[4];
 	Xbyak::Reg64 tTbl_[maxRegNum];
 	Pack p_;
 	Pack t_;
+	int pNum_;
+	int tNum_;
+	int saveNum_;
+	int P_;
+	bool useRcx_;
+	bool useRdx_;
+	bool makeEpilog_;
 	StackFrame(const StackFrame&);
 	void operator=(const StackFrame&);
 public:
@@ -928,10 +942,10 @@ public:
 		: code_(code)
 		, pNum_(pNum)
 		, tNum_(tNum & ~(UseRCX | UseRDX))
-		, useRcx_((tNum & UseRCX) != 0)
-		, useRdx_((tNum & UseRDX) != 0)
 		, saveNum_(0)
 		, P_(0)
+		, useRcx_((tNum & UseRCX) != 0)
+		, useRdx_((tNum & UseRDX) != 0)
 		, makeEpilog_(makeEpilog)
 		, p(p_)
 		, t(t_)
--- a/src/assembly/aarch64/aarch64_assembler.cpp
+++ b/src/assembly/aarch64/aarch64_assembler.cpp
@@ -1,11 +1,12 @@
 #include "aarch64_assembler.h"
 #include "jit/jit_context.h"
 #include <string.h>
-
-#include <sys/mman.h>
 #include <stdio.h>
-#include <string.h>
+
+#if defined(__unix__) || defined(__APPLE__) // POSIX-only headers: keep them for every Unix-like host, skip on Windows
+#include <sys/mman.h>
 #include <unistd.h>
+#endif

 #define ONE_MB 1 * 1024 * 1024

--- a/src/emulator/aarch64/aarch64_emit_context.cpp
+++ b/src/emulator/aarch64/aarch64_emit_context.cpp
@@ -1,3 +1,4 @@
+#include "aarch64_context_offsets.h"
 #include "aarch64_emit_context.h"
 #include "emulator/ssa_emit_context.h"
 #include "emulator/guest_process.h"
--- a/src/export.cpp
+++ b/src/export.cpp
@@ -1,3 +1,4 @@
+#include "emulator/aarch64/aarch64_context_offsets.h"
 #include "emulator/guest_process.h"
 #include "emulator/ssa_emit_context.h"