diff --git a/Source/COP_FPU.cpp b/Source/COP_FPU.cpp
index 73bb6aa2..eeeab5ba 100644
--- a/Source/COP_FPU.cpp
+++ b/Source/COP_FPU.cpp
@@ -212,28 +212,28 @@ void CCOP_FPU::BC1TL()
 //00
 void CCOP_FPU::ADD_S()
 {
-    m_codeGen->FPU_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFS * 2]));
-    m_codeGen->FPU_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFT * 2]));
-    m_codeGen->FPU_Add();
-    m_codeGen->FPU_PullSingle(offsetof(CMIPS, m_State.nCOP10[m_nFD * 2]));
+    m_codeGen->FP_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFS * 2]));    //operand: FS register slot
+    m_codeGen->FP_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFT * 2]));    //operand: FT register slot
+    m_codeGen->FP_Add();    //emits the scalar single-precision add of the two pushed operands
+    m_codeGen->FP_PullSingle(offsetof(CMIPS, m_State.nCOP10[m_nFD * 2]));    //stores the sum into the FD register slot
 }
 
 //01
 void CCOP_FPU::SUB_S()
 {
-    m_codeGen->FPU_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFS * 2]));
-    m_codeGen->FPU_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFT * 2]));
-    m_codeGen->FPU_Sub();
-    m_codeGen->FPU_PullSingle(offsetof(CMIPS, m_State.nCOP10[m_nFD * 2]));
+    m_codeGen->FP_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFS * 2]));    //operand: FS register slot
+    m_codeGen->FP_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFT * 2]));    //operand: FT register slot
+    m_codeGen->FP_Sub();    //emits the scalar subtract; presumably FS - FT (operand order comes from GetPattern) - TODO confirm
+    m_codeGen->FP_PullSingle(offsetof(CMIPS, m_State.nCOP10[m_nFD * 2]));    //stores the difference into the FD register slot
 }
 
 //02
 void CCOP_FPU::MUL_S()
 {
-    m_codeGen->FPU_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFS * 2]));
-    m_codeGen->FPU_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFT * 2]));
-    m_codeGen->FPU_Mul();
-    m_codeGen->FPU_PullSingle(offsetof(CMIPS, m_State.nCOP10[m_nFD * 2]));
+    m_codeGen->FP_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFS * 2]));    //operand: FS register slot
+    m_codeGen->FP_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFT * 2]));    //operand: FT register slot
+    m_codeGen->FP_Mul();    //emits the scalar single-precision multiply of the two pushed operands
+    m_codeGen->FP_PullSingle(offsetof(CMIPS, m_State.nCOP10[m_nFD * 2]));    //stores the product into the FD register slot
 }
 
 //03
@@ -251,10 +251,10 @@ void CCOP_FPU::DIV_S()
 	}
 	m_codeGen->BeginIfElseAlt();
 	{
-        m_codeGen->FPU_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFS * 2]));
-        m_codeGen->FPU_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFT * 2]));
-        m_codeGen->FPU_Div();
-        m_codeGen->FPU_PullSingle(offsetof(CMIPS, m_State.nCOP10[m_nFD * 2]));
+        m_codeGen->FP_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFS * 2]));
+        m_codeGen->FP_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFT * 2]));
+        m_codeGen->FP_Div();
+        m_codeGen->FP_PullSingle(offsetof(CMIPS, m_State.nCOP10[m_nFD * 2]));
 	}
 	m_codeGen->EndIf();
 }
@@ -367,8 +367,8 @@ void CCOP_FPU::CVT_W_S()
 {
 	//Load the rounding mode from FCSR?
     //PS2 only supports truncate rounding mode
-    m_codeGen->FPU_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFS * 2]));
-    m_codeGen->FPU_PullWordTruncate(offsetof(CMIPS, m_State.nCOP10[m_nFD * 2]));
+    m_codeGen->FP_PushSingle(offsetof(CMIPS, m_State.nCOP10[m_nFS * 2]));    //source: single-precision value in the FS slot
+    m_codeGen->FP_PullWordTruncate(offsetof(CMIPS, m_State.nCOP10[m_nFD * 2]));    //truncating convert to an integer word, stored in the FD slot
 }
 
 //32
@@ -420,8 +420,8 @@ void CCOP_FPU::C_LE_S()
 //20
 void CCOP_FPU::CVT_S_W()
 {
-	m_codeGen->FPU_PushWord(offsetof(CMIPS, m_State.nCOP10[m_nFS * 2]));
-	m_codeGen->FPU_PullSingle(offsetof(CMIPS, m_State.nCOP10[m_nFD * 2]));
+	m_codeGen->FP_PushWord(offsetof(CMIPS, m_State.nCOP10[m_nFS * 2]));	//converts the integer word in the FS slot to single precision (held in an XMM register)
+	m_codeGen->FP_PullSingle(offsetof(CMIPS, m_State.nCOP10[m_nFD * 2]));	//stores the converted value into the FD slot and releases the register
 }
 
 //////////////////////////////////////////////////
diff --git a/Source/CodeGen.cpp b/Source/CodeGen.cpp
index 96557129..fb14aad8 100644
--- a/Source/CodeGen.cpp
+++ b/Source/CodeGen.cpp
@@ -9,6 +9,7 @@
 
 using namespace boost;
 using namespace Framework;
+using namespace std;
 
 bool					CCodeGen::m_nBlockStarted = false;
 CCacheBlock*			CCodeGen::m_pBlock = NULL;
@@ -25,6 +26,7 @@ CX86Assembler           CCodeGen::m_Assembler
                                             );
 
 bool                    CCodeGen::m_nRegisterAllocated[MAX_REGISTER];
+bool                    CCodeGen::m_xmmRegisterAllocated[MAX_XMM_REGISTER];
 CStream*                CCodeGen::m_stream = NULL;
 
 CX86Assembler::REGISTER CCodeGen::g_nBaseRegister = CX86Assembler::rBP;
@@ -91,6 +93,16 @@ CX86Assembler::REGISTER CCodeGen::m_nRegisterLookupEx[MAX_REGISTER] =
 
 #endif
 
+CCodeGen::CCodeGen()
+{
+    //No per-instance initialisation is performed.
+}
+
+CCodeGen::~CCodeGen()
+{
+    //No per-instance cleanup is performed.
+}
+
 void CCodeGen::SetStream(CStream* stream)
 {
     m_stream = stream;
@@ -112,6 +124,11 @@ void CCodeGen::Begin(CCacheBlock* pBlock)
 	{
 		m_nRegisterAllocated[i] = false;		
 	}
+
+    for(unsigned int i = 0; i < MAX_XMM_REGISTER; i++)
+    {
+        m_xmmRegisterAllocated[i] = false;
+    }
 }
 
 void CCodeGen::End()
@@ -332,6 +349,34 @@ void CCodeGen::FreeRegister(unsigned int nRegister)
 	m_nRegisterAllocated[nRegister] = false;
 }
 
+//Reserves the lowest-numbered XMM register that is not currently in use.
+//Throws runtime_error when every register is already allocated.
+CCodeGen::XMMREGISTER CCodeGen::AllocateXmmRegister()
+{
+    for(unsigned int i = 0; i < MAX_XMM_REGISTER; i++)
+    {
+        if(!m_xmmRegisterAllocated[i])
+        {
+            m_xmmRegisterAllocated[i] = true;
+            return static_cast<XMMREGISTER>(i);
+        }
+    }
+
+    throw runtime_error("All registers exhausted.");
+}
+
+//Marks a register obtained from AllocateXmmRegister as available again.
+//XMMREGISTER names 16 registers but the allocation table only has
+//MAX_XMM_REGISTER entries (8 when AMD64 is not defined), so the index is
+//validated before the table is written. Throws runtime_error when the
+//register is outside the table.
+void CCodeGen::FreeXmmRegister(XMMREGISTER registerId)
+{
+    if(static_cast<unsigned int>(registerId) >= MAX_XMM_REGISTER)
+    {
+        throw runtime_error("Invalid XMM register.");
+    }
+    m_xmmRegisterAllocated[registerId] = false;
+}
+
 void CCodeGen::LoadVariableInRegister(unsigned int nRegister, uint32 nVariable)
 {
 	//mov reg, dword ptr[Variable]
diff --git a/Source/CodeGen.h b/Source/CodeGen.h
index c9022d81..4e0c765c 100644
--- a/Source/CodeGen.h
+++ b/Source/CodeGen.h
@@ -19,7 +19,9 @@ namespace CodeGen
 class CCodeGen
 {
 public:
-	enum CONDITION
+    typedef CX86Assembler::XMMREGISTER XMMREGISTER;
+
+    enum CONDITION
 	{
 		CONDITION_EQ,
 		CONDITION_NE,
@@ -41,6 +43,8 @@ public:
 #ifdef AMD64
 		REGISTER64,
 #endif
+        FP_SINGLE_RELATIVE,
+        FP_SINGLE_REGISTER,
 	};
 
 	enum ROUNDMODE
@@ -53,6 +57,9 @@ public:
 
     friend class					CodeGen::CFPU;
 
+                                    CCodeGen();
+    virtual                         ~CCodeGen();
+
 	static void						Begin(CCacheBlock*);
 	static void						End();
 
@@ -100,20 +107,17 @@ public:
 	static void						Xor();
 
     //FPU
-    void                            FPU_PushWord(size_t);
-    void                            FPU_PushSingle(size_t);
-    void                            FPU_PullWord(size_t);
-    void                            FPU_PullWordTruncate(size_t);
-    void                            FPU_PullSingle(size_t);
+    void                            FP_PushWord(size_t);    //converts the integer word at the base-relative offset to single precision in an XMM register and pushes that register
+    void                            FP_PushSingle(size_t);    //pushes a base-relative single-precision operand; emits no code
+    void                            FP_PullWordTruncate(size_t);    //truncating convert of a pushed base-relative single to an integer word, stored at the offset
+    void                            FP_PullSingle(size_t);    //stores a pushed XMM register operand at the offset and frees the register
+    void                            FP_PushSingleReg(XMMREGISTER);    //pushes an XMM register operand; emits no code
+    void                            FP_LoadSingleRelativeInRegister(XMMREGISTER, uint32);    //emits a movss load from base register + offset
 
-    void                            FPU_Add();
-    void                            FPU_Sub();
-    void                            FPU_Mul();
-    void                            FPU_Div();
-
-    void                            FPU_PushRoundingMode();
-    void                            FPU_PullRoundingMode();
-    void                            FPU_SetRoundingMode(ROUNDMODE);
+    void                            FP_Add();    //consumes two base-relative singles, pushes the XMM register holding the sum
+    void                            FP_Sub();    //same, for first - second
+    void                            FP_Mul();    //same, for the product
+    void                            FP_Div();    //same, for first / second
 
     void                            SetStream(Framework::CStream*);
     static CX86Assembler            m_Assembler;
@@ -139,6 +143,15 @@ private:
 #endif
 	};
 
+    enum MAX_XMM_REGISTER    //size of the XMM allocation table used by AllocateXmmRegister
+    {
+#ifdef AMD64
+        MAX_XMM_REGISTER = 16,    //xMM0 - xMM15
+#else
+        MAX_XMM_REGISTER = 8,    //xMM0 - xMM7
+#endif
+    };
+
 	enum REL_REGISTER
 	{
 		REL_REGISTER = 5,
@@ -178,6 +191,9 @@ private:
 	static void						LoadConstantInRegister64(unsigned int, uint64);
 #endif
 
+    XMMREGISTER                     AllocateXmmRegister();
+    void                            FreeXmmRegister(XMMREGISTER);
+
 	static void						LoadConditionInRegister(unsigned int, CONDITION);
 
 	static void						ReduceToRegister();
@@ -247,6 +263,7 @@ private:
 	static unsigned int				m_nRegisterLookup[MAX_REGISTER];
     static CX86Assembler::REGISTER  m_nRegisterLookupEx[MAX_REGISTER];
 	static CCacheBlock*				m_pBlock;
+    static bool                     m_xmmRegisterAllocated[MAX_XMM_REGISTER];
 
     static Framework::CStream*      m_stream;
     static CX86Assembler::REGISTER  g_nBaseRegister;
diff --git a/Source/CodeGen_FPU.cpp b/Source/CodeGen_FPU.cpp
index 3e54a9bb..d2f56b15 100644
--- a/Source/CodeGen_FPU.cpp
+++ b/Source/CodeGen_FPU.cpp
@@ -1,5 +1,6 @@
 #include <assert.h>
 #include "CodeGen_FPU.h"
+#include "CodeGen_StackPatterns.h"
 #include "PtrMacro.h"
 
 using namespace CodeGen;
@@ -226,76 +227,129 @@ void CFPU::Round()
 
 //New stuff
 
-void CCodeGen::FPU_PushSingle(size_t offset)
+void CCodeGen::FP_PushSingleReg(XMMREGISTER registerId)    //Records an XMM register operand on m_Shadow; emits no machine code
 {
-    m_Assembler.FldEd(CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, static_cast<uint32>(offset)));
+    m_Shadow.Push(registerId);    //operand value first...
+    m_Shadow.Push(FP_SINGLE_REGISTER);    //...then the tag identifying its kind
 }
 
-void CCodeGen::FPU_PushWord(size_t offset)
+void CCodeGen::FP_PushSingle(size_t offset)    //Records a base-relative single-precision operand on m_Shadow; emits no machine code
 {
-    m_Assembler.FildEd(CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, static_cast<uint32>(offset)));
+    m_Shadow.Push(static_cast<uint32>(offset));    //offset is narrowed to 32 bits before being recorded
+    m_Shadow.Push(FP_SINGLE_RELATIVE);    //tag identifying the operand kind
 }
 
-void CCodeGen::FPU_PullSingle(size_t offset)
+void CCodeGen::FP_LoadSingleRelativeInRegister(XMMREGISTER destination, uint32 source)    //Emits a movss load of [g_nBaseRegister + source] into destination
 {
-    m_Assembler.FstpEd(CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, static_cast<uint32>(offset)));
+    m_Assembler.MovssEd(destination,
+        CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, source));
 }
 
-void CCodeGen::FPU_PullWord(size_t offset)
+void CCodeGen::FP_PushWord(size_t offset)    //Emits cvtsi2ss from [g_nBaseRegister + offset] into a fresh XMM register and pushes that register
 {
-    m_Assembler.FistpEd(CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, static_cast<uint32>(offset)));
+    XMMREGISTER resultRegister = AllocateXmmRegister();    //throws if no XMM register is free
+    m_Assembler.Cvtsi2ssEd(resultRegister,
+        CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, static_cast<uint32>(offset)));
+    FP_PushSingleReg(resultRegister);    //the register is not freed here; FP_PullSingle releases it
 }
 
-void CCodeGen::FPU_PullWordTruncate(size_t offset)
+void CCodeGen::FP_PullSingle(size_t offset)    //Stores a pushed XMM register operand to [g_nBaseRegister + offset] and frees the register
 {
-    m_Assembler.FisttpEd(CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, static_cast<uint32>(offset)));
+    if(FitsPattern<SingleFpSingleRegister>())
+    {
+        XMMREGISTER valueRegister = static_cast<XMMREGISTER>(GetPattern<SingleFpSingleRegister>());    //presumably pops the operand from m_Shadow - TODO confirm
+        m_Assembler.MovssEd(CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, static_cast<uint32>(offset)),
+            valueRegister);
+        FreeXmmRegister(valueRegister);
+    }
+    else
+    {
+        assert(0);    //NOTE(review): only register operands are handled; assert is compiled out under NDEBUG, so nothing would be emitted
+    }
 }
 
-void CCodeGen::FPU_PushRoundingMode()
+void CCodeGen::FP_PullWordTruncate(size_t offset)    //Truncating convert of a pushed base-relative single to an integer word stored at [g_nBaseRegister + offset]
 {
-    m_Assembler.SubId(CX86Assembler::MakeRegisterAddress(CX86Assembler::rSP), 4);
-    m_Assembler.Fwait();
-    m_Assembler.FnstcwEw(CX86Assembler::MakeIndRegAddress(CX86Assembler::rSP));
+    if(FitsPattern<SingleFpSingleRelative>())
+    {
+        SingleFpSingleRelative::PatternValue op = GetPattern<SingleFpSingleRelative>();    //base-relative offset of the source value
+        unsigned int valueRegister = AllocateRegister();    //general-purpose scratch register for the integer result
+        m_Assembler.Cvttss2siEd(m_nRegisterLookupEx[valueRegister],
+            CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, op));
+        m_Assembler.MovGd(CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, static_cast<uint32>(offset)),
+            m_nRegisterLookupEx[valueRegister]);
+        FreeRegister(valueRegister);
+    }
+    else
+    {
+        assert(0);    //NOTE(review): XMM register operands (e.g. after FP_Add) are not handled; assert is compiled out under NDEBUG
+    }
 }
 
-void CCodeGen::FPU_PullRoundingMode()
+void CCodeGen::FP_Add()    //Adds two pushed base-relative singles and pushes the XMM register holding the sum
 {
-    m_Assembler.FldcwEw(CX86Assembler::MakeIndRegAddress(CX86Assembler::rSP));
-    m_Assembler.AddId(CX86Assembler::MakeRegisterAddress(CX86Assembler::rSP), 4);
+    if(FitsPattern<DualFpSingleRelative>())
+    {
+        DualFpSingleRelative::PatternValue ops = GetPattern<DualFpSingleRelative>();    //pair of base-relative offsets
+        XMMREGISTER resultRegister = AllocateXmmRegister();
+        FP_LoadSingleRelativeInRegister(resultRegister, ops.first);    //result = first operand
+        m_Assembler.AddssEd(resultRegister,
+            CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, ops.second));    //result += second operand
+        FP_PushSingleReg(resultRegister);    //register stays allocated until the result is pulled
+    }
+    else
+    {
+        assert(0);    //NOTE(review): register operands are not handled; assert is compiled out under NDEBUG
+    }
 }
 
-void CCodeGen::FPU_SetRoundingMode(ROUNDMODE roundingMode)
+void CCodeGen::FP_Sub()    //Subtracts two pushed base-relative singles (first - second) and pushes the XMM register holding the result
 {
-    //Load current control word
-    m_Assembler.SubId(CX86Assembler::MakeRegisterAddress(CX86Assembler::rSP), 4);
-    m_Assembler.Fwait();
-    m_Assembler.FnstcwEw(CX86Assembler::MakeIndRegAddress(CX86Assembler::rSP));
-    //Set new rounding mode
-    m_Assembler.AndId(CX86Assembler::MakeIndRegAddress(CX86Assembler::rSP),
-        0xFFFFF3FF);
-    m_Assembler.OrId(CX86Assembler::MakeIndRegAddress(CX86Assembler::rSP),
-        roundingMode << 10);
-    //Save control word
-    m_Assembler.FldcwEw(CX86Assembler::MakeIndRegAddress(CX86Assembler::rSP));
-    m_Assembler.AddId(CX86Assembler::MakeRegisterAddress(CX86Assembler::rSP), 4);
+    if(FitsPattern<DualFpSingleRelative>())
+    {
+        DualFpSingleRelative::PatternValue ops = GetPattern<DualFpSingleRelative>();    //pair of base-relative offsets
+        XMMREGISTER resultRegister = AllocateXmmRegister();
+        FP_LoadSingleRelativeInRegister(resultRegister, ops.first);    //result = first operand
+        m_Assembler.SubssEd(resultRegister,
+            CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, ops.second));    //result -= second operand
+        FP_PushSingleReg(resultRegister);    //register stays allocated until the result is pulled
+    }
+    else
+    {
+        assert(0);    //NOTE(review): register operands are not handled; assert is compiled out under NDEBUG
+    }
 }
 
-void CCodeGen::FPU_Add()
+void CCodeGen::FP_Mul()    //Multiplies two pushed base-relative singles and pushes the XMM register holding the product
 {
-    m_Assembler.FaddpSt(1);
+    if(FitsPattern<DualFpSingleRelative>())
+    {
+        DualFpSingleRelative::PatternValue ops = GetPattern<DualFpSingleRelative>();    //pair of base-relative offsets
+        XMMREGISTER resultRegister = AllocateXmmRegister();
+        FP_LoadSingleRelativeInRegister(resultRegister, ops.first);    //result = first operand
+        m_Assembler.MulssEd(resultRegister,
+            CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, ops.second));    //result *= second operand
+        FP_PushSingleReg(resultRegister);    //register stays allocated until the result is pulled
+    }
+    else
+    {
+        assert(0);    //NOTE(review): register operands are not handled; assert is compiled out under NDEBUG
+    }
 }
 
-void CCodeGen::FPU_Sub()
+void CCodeGen::FP_Div()    //Divides two pushed base-relative singles (first / second) and pushes the XMM register holding the quotient
 {
-    m_Assembler.FsubpSt(1);
-}
-
-void CCodeGen::FPU_Mul()
-{
-    m_Assembler.FmulpSt(1);
-}
-
-void CCodeGen::FPU_Div()
-{
-    m_Assembler.FdivpSt(1);
+    if(FitsPattern<DualFpSingleRelative>())
+    {
+        DualFpSingleRelative::PatternValue ops = GetPattern<DualFpSingleRelative>();    //pair of base-relative offsets
+        XMMREGISTER resultRegister = AllocateXmmRegister();
+        FP_LoadSingleRelativeInRegister(resultRegister, ops.first);    //result = first operand
+        m_Assembler.DivssEd(resultRegister,
+            CX86Assembler::MakeIndRegOffAddress(g_nBaseRegister, ops.second));    //result /= second operand
+        FP_PushSingleReg(resultRegister);    //register stays allocated until the result is pulled
+    }
+    else
+    {
+        assert(0);    //NOTE(review): register operands are not handled; assert is compiled out under NDEBUG
+    }
 }
diff --git a/Source/CodeGen_StackPatterns.h b/Source/CodeGen_StackPatterns.h
index a7ea1726..8ee0c209 100644
--- a/Source/CodeGen_StackPatterns.h
+++ b/Source/CodeGen_StackPatterns.h
@@ -294,11 +294,15 @@ struct ZeroWithSomethingCommutative64
 typedef GenericOneArgument<CCodeGen::RELATIVE> SingleRelative;
 typedef GenericOneArgument<CCodeGen::REGISTER> SingleRegister;
 typedef GenericOneArgument<CCodeGen::CONSTANT> SingleConstant;
+typedef GenericOneArgument<CCodeGen::FP_SINGLE_REGISTER> SingleFpSingleRegister;    //one XMM register operand (matched by FP_PullSingle)
+typedef GenericOneArgument<CCodeGen::FP_SINGLE_RELATIVE> SingleFpSingleRelative;    //one base-relative single operand (matched by FP_PullWordTruncate)
 typedef GenericTwoArguments<CCodeGen::RELATIVE, CCodeGen::CONSTANT> RelativeConstant;
 typedef GenericTwoArguments<CCodeGen::REGISTER, CCodeGen::CONSTANT> RegisterConstant;
 typedef GenericTwoArguments<CCodeGen::CONSTANT, CCodeGen::RELATIVE> ConstantRelative;
 typedef GenericTwoArguments<CCodeGen::CONSTANT, CCodeGen::CONSTANT> ConstantConstant;
 typedef GenericTwoArguments<CCodeGen::RELATIVE, CCodeGen::RELATIVE> RelativeRelative;
+typedef GenericTwoArguments<CCodeGen::FP_SINGLE_REGISTER, CCodeGen::FP_SINGLE_REGISTER> DualFpSingleRegister;    //two XMM register operands; NOTE(review): no FP_* emitter in this change matches it yet
+typedef GenericTwoArguments<CCodeGen::FP_SINGLE_RELATIVE, CCodeGen::FP_SINGLE_RELATIVE> DualFpSingleRelative;    //two base-relative single operands (matched by FP_Add/Sub/Mul/Div)
 typedef GenericCommutative<CCodeGen::REGISTER, CCodeGen::CONSTANT> CommutativeRegisterConstant;
 typedef GenericCommutative<CCodeGen::RELATIVE, CCodeGen::CONSTANT> CommutativeRelativeConstant;
 typedef GenericOneArgument64<CCodeGen::CONSTANT> SingleConstant64;
diff --git a/Source/DMAC.cpp b/Source/DMAC.cpp
index 087ac2c2..2cb96cde 100644
--- a/Source/DMAC.cpp
+++ b/Source/DMAC.cpp
@@ -624,7 +624,7 @@ void CDMAC::SetRegister(uint32 nAddress, uint32 nData)
 	}
 
 #ifdef _DEBUG
-	DisassembleSet(nAddress, nData);
+//	DisassembleSet(nAddress, nData);
 #endif
 
 #ifdef PROFILE
diff --git a/Source/GSHandler.cpp b/Source/GSHandler.cpp
index 7228a506..cb37d65e 100644
--- a/Source/GSHandler.cpp
+++ b/Source/GSHandler.cpp
@@ -427,7 +427,7 @@ void CGSHandler::WriteRegisterImpl(uint8 nRegister, uint64 nData)
 	}
 
 #ifdef _DEBUG
-	DisassembleWrite(nRegister, nData);
+//	DisassembleWrite(nRegister, nData);
 #endif
 }
 
diff --git a/Source/X86Assembler.h b/Source/X86Assembler.h
index e3a5a8f3..58eda944 100644
--- a/Source/X86Assembler.h
+++ b/Source/X86Assembler.h
@@ -28,6 +28,26 @@ public:
         r15,
     };
 
+    enum XMMREGISTER    //SSE register identifiers; the numeric value is used directly as the register field when encoding
+    {
+        xMM0 = 0,
+        xMM1,
+        xMM2,
+        xMM3,
+        xMM4,
+        xMM5,
+        xMM6,
+        xMM7,
+        xMM8,    //xMM8 - xMM15 are only handed out by CCodeGen when AMD64 is defined (see MAX_XMM_REGISTER)
+        xMM9,
+        xMM10,
+        xMM11,
+        xMM12,
+        xMM13,
+        xMM14,
+        xMM15,
+    };
+
     typedef std::tr1::function<void (uint8)>                WriteFunctionType;
     typedef std::tr1::function<void (unsigned int, uint8)>  WriteAtFunctionType;
     typedef std::tr1::function<size_t ()>                   TellFunctionType;
@@ -143,6 +163,16 @@ public:
     void                                    FnstcwEw(const CAddress&);
     void                                    FldcwEw(const CAddress&);
 
+    //SSE (scalar single-precision; every emitter starts with the 0xF3 prefix)
+    void                                    MovssEd(const CAddress&, XMMREGISTER);    //movss store: address <- xmm (opcode 0x11)
+    void                                    MovssEd(XMMREGISTER, const CAddress&);    //movss load: xmm <- address (opcode 0x10)
+    void                                    AddssEd(XMMREGISTER, const CAddress&);    //addss xmm, address (opcode 0x58)
+    void                                    SubssEd(XMMREGISTER, const CAddress&);    //subss xmm, address (opcode 0x5C)
+    void                                    MulssEd(XMMREGISTER, const CAddress&);    //mulss xmm, address (opcode 0x59)
+    void                                    DivssEd(XMMREGISTER, const CAddress&);    //divss xmm, address (opcode 0x5E)
+    void                                    Cvtsi2ssEd(XMMREGISTER, const CAddress&);    //cvtsi2ss xmm, address: integer -> single (opcode 0x2A)
+    void                                    Cvttss2siEd(REGISTER, const CAddress&);    //cvttss2si reg, address: single -> integer, truncating (opcode 0x2C)
+
 private:
     struct LABELREF
     {
@@ -159,6 +189,7 @@ private:
     void                                    WriteEvGvOp(uint8, bool, const CAddress&, REGISTER);
     void                                    WriteEvId(uint8, const CAddress&, uint32);
     void                                    WriteEvIq(uint8, const CAddress&, uint64);
+    void                                    WriteEdVdOp(uint8, const CAddress&, XMMREGISTER);    //encodes an instruction taking an XMM register and an address operand
     void                                    WriteStOp(uint8, uint8, uint8);
 
     void                                    CreateLabelReference(LABEL, unsigned int);
diff --git a/Source/X86Assembler_Fpu.cpp b/Source/X86Assembler_Fpu.cpp
index cd819cd5..59726d86 100644
--- a/Source/X86Assembler_Fpu.cpp
+++ b/Source/X86Assembler_Fpu.cpp
@@ -69,6 +69,62 @@ void CX86Assembler::FldcwEw(const CAddress& address)
     WriteEvOp(0xD9, 0x05, false, address);
 }
 
+void CX86Assembler::MovssEd(const CAddress& address, XMMREGISTER registerId)
+{
+    //movss m32, xmm : F3 [REX] 0F 11 /r
+    WriteByte(0xF3);
+    WriteEdVdOp(0x11, address, registerId);
+}
+
+void CX86Assembler::MovssEd(XMMREGISTER registerId, const CAddress& address)
+{
+    //movss xmm, m32 : F3 [REX] 0F 10 /r
+    WriteByte(0xF3);
+    WriteEdVdOp(0x10, address, registerId);
+}
+
+void CX86Assembler::AddssEd(XMMREGISTER registerId, const CAddress& address)
+{
+    //addss xmm, m32 : F3 [REX] 0F 58 /r
+    WriteByte(0xF3);
+    WriteEdVdOp(0x58, address, registerId);
+}
+
+void CX86Assembler::SubssEd(XMMREGISTER registerId, const CAddress& address)
+{
+    //subss xmm, m32 : F3 [REX] 0F 5C /r
+    WriteByte(0xF3);
+    WriteEdVdOp(0x5C, address, registerId);
+}
+
+void CX86Assembler::MulssEd(XMMREGISTER registerId, const CAddress& address)
+{
+    //mulss xmm, m32 : F3 [REX] 0F 59 /r
+    WriteByte(0xF3);
+    WriteEdVdOp(0x59, address, registerId);
+}
+
+void CX86Assembler::DivssEd(XMMREGISTER registerId, const CAddress& address)
+{
+    //divss xmm, m32 : F3 [REX] 0F 5E /r
+    WriteByte(0xF3);
+    WriteEdVdOp(0x5E, address, registerId);
+}
+
+void CX86Assembler::Cvtsi2ssEd(XMMREGISTER registerId, const CAddress& address)
+{
+    //cvtsi2ss xmm, m32 : F3 [REX] 0F 2A /r
+    WriteByte(0xF3);
+    WriteEdVdOp(0x2A, address, registerId);
+}
+
+void CX86Assembler::Cvttss2siEd(REGISTER registerId, const CAddress& address)
+{
+    //cvttss2si r32, m32 : F3 [REX] 0F 2C /r (destination is a general register; it shares the reg-field encoding path)
+    WriteByte(0xF3);
+    WriteEdVdOp(0x2C, address, static_cast<XMMREGISTER>(registerId));
+}
+
 void CX86Assembler::WriteStOp(uint8 opcode, uint8 subOpcode, uint8 stackId)
 {
     CAddress address;
@@ -78,3 +134,19 @@ void CX86Assembler::WriteStOp(uint8 opcode, uint8 subOpcode, uint8 stackId)
     WriteByte(opcode);
     WriteByte(address.ModRm.nByte);
 }
+
+//Writes everything that follows the mandatory 0xF3 prefix of a scalar SSE
+//instruction: the REX byte (if WriteRexByte produces one), the 0x0F escape,
+//the opcode and the encoded operands. The escape byte is written here, not
+//by the callers, because a REX prefix is only honoured when it immediately
+//precedes the escape/opcode bytes.
+void CX86Assembler::WriteEdVdOp(uint8 opcode, const CAddress& address, XMMREGISTER xmmRegisterId)
+{
+    REGISTER registerId = static_cast<REGISTER>(xmmRegisterId);
+    WriteRexByte(false, address, registerId);
+    CAddress NewAddress(address);
+    NewAddress.ModRm.nFnReg = registerId;
+    WriteByte(0x0F);
+    WriteByte(opcode);
+    NewAddress.Write(m_WriteFunction);
+}