From ca4692179622d407d5bf35c7e9341051574b2535 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Sat, 2 Jul 2016 18:56:21 +0200
Subject: [PATCH] MTVU: use acquire/release semantics for atomic operations

* Avoid generating memory barriers (mfence)
* Based on the fact that the previous code worked without any barrier

v2:
* keep plain assignments in the reset path
* use relaxed accesses for isBusy. The variable doesn't carry a
  load/store dependency; it is only a hint to optimize semaphore posts
---
 common/include/Utilities/Threading.h |  8 ++++----
 pcsx2/MTVU.cpp                       | 10 +++++-----
 pcsx2/MTVU.h                         |  4 ++--
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/common/include/Utilities/Threading.h b/common/include/Utilities/Threading.h
index 26967e5a8..f25610c00 100644
--- a/common/include/Utilities/Threading.h
+++ b/common/include/Utilities/Threading.h
@@ -399,17 +399,17 @@ namespace Threading
 	ScopedLockBool(Mutex& mutexToLock, std::atomic<bool>& isLockedBool)
 		: m_lock(mutexToLock)
 		, m_bool(isLockedBool) {
-		m_bool = m_lock.IsLocked();
+		m_bool.store(m_lock.IsLocked(), std::memory_order_relaxed);
 	}
 	virtual ~ScopedLockBool() throw() {
-		m_bool = false;
+		m_bool.store(false, std::memory_order_relaxed);
 	}
 	void Acquire() {
 		m_lock.Acquire();
-		m_bool = m_lock.IsLocked();
+		m_bool.store(m_lock.IsLocked(), std::memory_order_relaxed);
 	}
 	void Release() {
-		m_bool = false;
+		m_bool.store(false, std::memory_order_relaxed);
 		m_lock.Release();
 	}
 };
diff --git a/pcsx2/MTVU.cpp b/pcsx2/MTVU.cpp
index e049ba3df..7a2fd4a7b 100644
--- a/pcsx2/MTVU.cpp
+++ b/pcsx2/MTVU.cpp
@@ -75,11 +75,11 @@ void VU_Thread::Reset()
 {
 	ScopedLock lock(mtxBusy);
 
-	read_pos = 0;
 	write_pos = 0;
 	write_offset = 0;
 	vuCycleIdx = 0;
-	isBusy = false;
+	read_pos = 0;
+	isBusy = false;
 	memzero(vif);
 	memzero(vifRegs);
 	memzero(vuCycles);
@@ -202,7 +202,7 @@ __fi u32* VU_Thread::GetWritePtr()
 
 __fi void VU_Thread::incReadPos(s32 offset)
 { // Offset in u32 sizes
-	read_pos = (read_pos + offset) & buffer_mask;
+	read_pos.store((read_pos.load(std::memory_order_relaxed) + offset) & buffer_mask, std::memory_order_release);
 }
 __fi void VU_Thread::incWritePos()
 { // Adds write_offset
@@ -272,12 +272,12 @@ u32 VU_Thread::Get_vuCycles()
 void VU_Thread::KickStart(bool forceKick)
 {
 	if ((forceKick && !semaEvent.Count())
-	|| (!isBusy && GetReadPos() != write_pos)) semaEvent.Post();
+	|| (!isBusy.load(std::memory_order_relaxed) && GetReadPos() != write_pos)) semaEvent.Post();
 }
 
 bool VU_Thread::IsDone()
 {
-	return !isBusy && GetReadPos() == GetWritePos();
+	return !isBusy.load(std::memory_order_relaxed) && GetReadPos() == GetWritePos();
 }
 
 void VU_Thread::WaitVU()
diff --git a/pcsx2/MTVU.h b/pcsx2/MTVU.h
index 29cc1fcf3..8db6cde72 100644
--- a/pcsx2/MTVU.h
+++ b/pcsx2/MTVU.h
@@ -30,8 +30,8 @@ class VU_Thread : public pxThread {
 	static const s32 buffer_size = (_1mb * 16) / sizeof(s32);
 	static const u32 buffer_mask = buffer_size - 1;
 	__aligned(4) u32 buffer[buffer_size];
-	__aligned(4) std::atomic<int> read_pos; // Only modified by VU thread
-	__aligned(4) std::atomic<bool> isBusy; // Is thread processing data?
+	std::atomic<int> read_pos; // Only modified by VU thread
+	std::atomic<bool> isBusy; // Is thread processing data?
 	__aligned(4) s32 write_pos; // Only modified by EE thread
 	__aligned(4) s32 write_offset; // Only modified by EE thread
 	__aligned(4) Mutex mtxBusy;
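
Note on the read_pos change: the diff replaces the plain (seq_cst) update
with a relaxed load plus a release store. Below is a minimal standalone
sketch of that pattern — not PCSX2 code; `inc_read_pos`, `get_read_pos`,
and the buffer size are invented for illustration — assuming a single
consumer (VU thread) advancing the index and a single producer (EE thread)
polling it:

    #include <atomic>
    #include <cstdint>

    const uint32_t buffer_size = 1024;           // power of two, as in MTVU
    const uint32_t buffer_mask = buffer_size - 1;

    uint32_t buffer[buffer_size];
    std::atomic<int> read_pos(0); // only the consumer thread modifies it

    // Consumer: after processing `count` words of `buffer`, publish the new
    // position. The relaxed load is fine because only this thread ever
    // writes read_pos; the release store orders the preceding buffer
    // accesses before the index update.
    void inc_read_pos(int count)
    {
        int pos = read_pos.load(std::memory_order_relaxed);
        read_pos.store((pos + count) & buffer_mask, std::memory_order_release);
    }

    // Producer: the acquire load pairs with the release store above, so
    // once the producer observes the new index it also observes that the
    // consumer is done with the slots before it and may reuse them.
    int get_read_pos()
    {
        return read_pos.load(std::memory_order_acquire);
    }

On x86 both the release store and the acquire load compile to ordinary
MOVs; only the default seq_cst store needs a full barrier there, which is
what the commit message means by avoiding mfence.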
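
Note on the isBusy change: relaxed is enough because the flag carries no
data dependency; it only gates redundant semaphore posts. A sketch of that
idea — again not PCSX2 code; `should_kick` is an invented helper:

    #include <atomic>

    std::atomic<bool> isBusy(false); // worker sets/clears around its run loop

    // EE-thread side: decide whether to post the wake-up semaphore. If the
    // relaxed load returns a stale value, the worst case is one extra or
    // one slightly delayed post; the semaphore and the acquire/release ring
    // indices carry the actual correctness, never this hint.
    bool should_kick(int read_pos_snapshot, int write_pos)
    {
        return !isBusy.load(std::memory_order_relaxed)
            && read_pos_snapshot != write_pos;
    }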