Merge pull request #4716 from degasus/jitcache

JitCache: Use a container for overlapping blocks.
This commit is contained in:
Matthew Parlane 2017-01-24 09:18:42 +13:00 committed by GitHub
commit 8d0ce8ea47
10 changed files with 98 additions and 65 deletions

View File

@ -211,7 +211,7 @@ void CachedInterpreter::Jit(u32 address)
b->codeSize = (u32)(GetCodePtr() - b->checkedEntry);
b->originalSize = code_block.m_num_instructions;
m_block_cache.FinalizeBlock(*b, jo.enableBlocklink, b->checkedEntry);
m_block_cache.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses);
}
void CachedInterpreter::ClearCache()

View File

@ -590,7 +590,8 @@ void Jit64::Jit(u32 em_address)
}
JitBlock* b = blocks.AllocateBlock(em_address);
blocks.FinalizeBlock(*b, jo.enableBlocklink, DoJit(em_address, &code_buffer, b, nextPC));
DoJit(em_address, &code_buffer, b, nextPC);
blocks.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses);
}
const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC)

View File

@ -508,7 +508,8 @@ void JitIL::Jit(u32 em_address)
}
JitBlock* b = blocks.AllocateBlock(em_address);
blocks.FinalizeBlock(*b, jo.enableBlocklink, DoJit(em_address, &code_buffer, b, nextPC));
DoJit(em_address, &code_buffer, b, nextPC);
blocks.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses);
}
const u8* JitIL::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC)

View File

@ -399,7 +399,7 @@ void JitArm64::Jit(u32)
JitBlock* b = blocks.AllocateBlock(em_address);
const u8* BlockPtr = DoJit(em_address, &code_buffer, b, nextPC);
blocks.FinalizeBlock(*b, jo.enableBlocklink, BlockPtr);
blocks.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses);
}
const u8* JitArm64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer* code_buf, JitBlock* b, u32 nextPC)

View File

@ -36,6 +36,12 @@ static void ClearCacheThreadSafe(u64 userdata, s64 cyclesdata)
JitInterface::ClearCache();
}
// Check whether any of this block's occupied instruction addresses falls
// within the half-open physical range [address, address + length).
bool JitBlock::OverlapsPhysicalRange(u32 address, u32 length) const
{
  // The ordered set lets us bound the query with two binary searches: if both
  // bounds land on the same element, no occupied address lies in the range.
  const auto range_begin = physical_addresses.lower_bound(address);
  const auto range_end = physical_addresses.lower_bound(address + length);
  return range_begin != range_end;
}
// Store a reference to the owning JIT; all other members rely on their
// default initialization.
JitBaseBlockCache::JitBaseBlockCache(JitBase& jit) : m_jit{jit}
{
}
@ -64,13 +70,13 @@ void JitBaseBlockCache::Clear()
#endif
m_jit.js.fifoWriteAddresses.clear();
m_jit.js.pairedQuantizeAddresses.clear();
for (auto& e : start_block_map)
for (auto& e : block_map)
{
DestroyBlock(e.second);
}
start_block_map.clear();
links_to.clear();
block_map.clear();
links_to.clear();
block_range_map.clear();
valid_block.ClearAll();
@ -95,14 +101,14 @@ JitBlock** JitBaseBlockCache::GetFastBlockMap()
void JitBaseBlockCache::RunOnBlocks(std::function<void(const JitBlock&)> f)
{
for (const auto& e : start_block_map)
for (const auto& e : block_map)
f(e.second);
}
JitBlock* JitBaseBlockCache::AllocateBlock(u32 em_address)
{
u32 physicalAddress = PowerPC::JitCache_TranslateAddress(em_address).address;
JitBlock& b = start_block_map.emplace(physicalAddress, JitBlock())->second;
JitBlock& b = block_map.emplace(physicalAddress, JitBlock())->second;
b.effectiveAddress = em_address;
b.physicalAddress = physicalAddress;
b.msrBits = MSR & JIT_CACHE_MSR_MASK;
@ -111,30 +117,21 @@ JitBlock* JitBaseBlockCache::AllocateBlock(u32 em_address)
return &b;
}
// Remove the given block's entry from start_block_map. The map is keyed by
// physical start address, which several blocks may share, so only the entry
// whose stored JitBlock is this exact instance is erased.
void JitBaseBlockCache::FreeBlock(JitBlock* block)
{
  const auto range = start_block_map.equal_range(block->physicalAddress);
  for (auto it = range.first; it != range.second;)
  {
    if (&it->second == block)
      it = start_block_map.erase(it);
    else
      ++it;
  }
}
void JitBaseBlockCache::FinalizeBlock(JitBlock& block, bool block_link, const u8* code_ptr)
void JitBaseBlockCache::FinalizeBlock(JitBlock& block, bool block_link,
const std::set<u32>& physical_addresses)
{
size_t index = FastLookupIndexForAddress(block.effectiveAddress);
fast_block_map[index] = &block;
block.fast_block_map_index = index;
u32 pAddr = block.physicalAddress;
block.physical_addresses = physical_addresses;
for (u32 addr = pAddr / 32; addr <= (pAddr + (block.originalSize - 1) * 4) / 32; ++addr)
valid_block.Set(addr);
block_map.emplace(std::make_pair(pAddr + 4 * block.originalSize - 1, pAddr), &block);
u32 range_mask = ~(BLOCK_RANGE_MAP_ELEMENTS - 1);
for (u32 addr : physical_addresses)
{
valid_block.Set(addr / 32);
block_range_map[addr & range_mask].insert(&block);
}
if (block_link)
{
@ -162,7 +159,7 @@ JitBlock* JitBaseBlockCache::GetBlockFromStartAddress(u32 addr, u32 msr)
translated_addr = translated.address;
}
auto iter = start_block_map.equal_range(translated_addr);
auto iter = block_map.equal_range(translated_addr);
for (; iter.first != iter.second; iter.first++)
{
JitBlock& b = iter.first->second;
@ -186,7 +183,7 @@ const u8* JitBaseBlockCache::Dispatch()
return block->normalEntry;
}
void JitBaseBlockCache::InvalidateICache(u32 address, const u32 length, bool forced)
void JitBaseBlockCache::InvalidateICache(u32 address, u32 length, bool forced)
{
auto translated = PowerPC::JitCache_TranslateAddress(address);
if (!translated.valid)
@ -203,19 +200,10 @@ void JitBaseBlockCache::InvalidateICache(u32 address, const u32 length, bool for
valid_block.Clear(pAddr / 32);
}
// destroy JIT blocks
// !! this works correctly under assumption that any two overlapping blocks end at the same
// address
if (destroy_block)
{
auto it = block_map.lower_bound(std::make_pair(pAddr, 0));
while (it != block_map.end() && it->first.second < pAddr + length)
{
JitBlock* block = it->second;
DestroyBlock(*block);
FreeBlock(block);
it = block_map.erase(it);
}
// destroy JIT blocks
ErasePhysicalRange(pAddr, length);
// If the code was actually modified, we need to clear the relevant entries from the
// FIFO write address cache, so we don't end up with FIFO checks in places they shouldn't
@ -232,6 +220,46 @@ void JitBaseBlockCache::InvalidateICache(u32 address, const u32 length, bool for
}
}
// Destroy and unregister every JIT block that occupies at least one
// instruction inside the physical range [address, address + length).
void JitBaseBlockCache::ErasePhysicalRange(u32 address, u32 length)
{
// Iterate over all macro blocks which overlap the given range.
// This mask yields the macro-block base address; it relies on
// BLOCK_RANGE_MAP_ELEMENTS being a power of two.
u32 range_mask = ~(BLOCK_RANGE_MAP_ELEMENTS - 1);
auto start = block_range_map.lower_bound(address & range_mask);
auto end = block_range_map.lower_bound(address + length);
while (start != end)
{
// Iterate over all blocks in the macro block.
auto iter = start->second.begin();
while (iter != start->second.end())
{
JitBlock* block = *iter;
if (block->OverlapsPhysicalRange(address, length))
{
// If the block overlaps, also remove all other occupied slots in the other macro blocks.
// This will leak empty macro blocks, but they may be reused or cleared later on.
// The current macro block's own slot is skipped here; it is erased below
// so that `iter` stays valid.
for (u32 addr : block->physical_addresses)
if ((addr & range_mask) != start->first)
block_range_map[addr & range_mask].erase(block);
// And remove the block.
// NOTE(review): block_map is a multimap, so erasing by key removes every
// block starting at this physical address, not just `block` — verify this
// cannot strand pointers to sibling blocks in block_range_map.
DestroyBlock(*block);
block_map.erase(block->physicalAddress);
iter = start->second.erase(iter);
}
else
{
iter++;
}
}
// If the macro block is empty, drop it.
if (start->second.empty())
start = block_range_map.erase(start);
else
start++;
}
}
u32* JitBaseBlockCache::GetBlockBitSet() const
{
return valid_block.m_valid_block.get();

View File

@ -9,6 +9,7 @@
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <vector>
#include "Common/CommonTypes.h"
@ -24,6 +25,8 @@ class JitBase;
// address.
struct JitBlock
{
bool OverlapsPhysicalRange(u32 address, u32 length) const;
// A special entry point for block linking; usually used to check the
// downcount.
const u8* checkedEntry;
@ -35,8 +38,8 @@ struct JitBlock
// The MSR bits expected for this block to be valid; see JIT_CACHE_MSR_MASK.
u32 msrBits;
// The physical address of the code represented by this block.
// Various maps in the cache are indexed by this (start_block_map,
// block_map, and valid_block in particular). This is useful because of
// Various maps in the cache are indexed by this (block_map
// and valid_block in particular). This is useful because of
// the way the instruction cache works on PowerPC.
u32 physicalAddress;
// The number of bytes of JIT'ed code contained in this block. Mostly
@ -57,6 +60,9 @@ struct JitBlock
};
std::vector<LinkData> linkData;
// This set stores all physical addresses of all occupied instructions.
std::set<u32> physical_addresses;
// we don't really need to save start and stop
// TODO (mb2): ticStart and ticStop -> "local var" mean "in block" ... low priority ;)
u64 ticStart; // for profiling - time.
@ -124,8 +130,7 @@ public:
void RunOnBlocks(std::function<void(const JitBlock&)> f);
JitBlock* AllocateBlock(u32 em_address);
void FreeBlock(JitBlock* block);
void FinalizeBlock(JitBlock& block, bool block_link, const u8* code_ptr);
void FinalizeBlock(JitBlock& block, bool block_link, const std::set<u32>& physical_addresses);
// Look for the block in the slow but accurate way.
// This function shall be used if FastLookupIndexForAddress() failed.
@ -138,7 +143,8 @@ public:
// assembly version.)
const u8* Dispatch();
void InvalidateICache(u32 address, const u32 length, bool forced);
void InvalidateICache(u32 address, u32 length, bool forced);
void ErasePhysicalRange(u32 address, u32 length);
u32* GetBlockBitSet() const;
@ -163,20 +169,21 @@ private:
// It is used to query all blocks which links to an address.
std::multimap<u32, JitBlock*> links_to; // destination_PC -> number
// Map indexed by the physical memory location.
// It is used to invalidate blocks based on memory location.
std::multimap<std::pair<u32, u32>, JitBlock*> block_map; // (end_addr, start_addr) -> block
// Map indexed by the physical address of the entry point.
// This is used to query the block based on the current PC in a slow way.
// TODO: This is redundant with block_map.
std::multimap<u32, JitBlock> start_block_map; // start_addr -> block
std::multimap<u32, JitBlock> block_map; // start_addr -> block
// Range of overlapping code indexed by a masked physical address.
// This is used for invalidation of memory regions. The range is grouped
// in macro blocks of each 0x100 bytes.
static constexpr u32 BLOCK_RANGE_MAP_ELEMENTS = 0x100;
std::map<u32, std::set<JitBlock*>> block_range_map;
// This bitsets shows which cachelines overlap with any blocks.
// It is used to provide a fast way to query if no icache invalidation is needed.
ValidBlockBitSet valid_block;
// This array is indexed with the masked PC and likely holds the correct block id.
// This is used as a fast cache of start_block_map used in the assembly dispatcher.
// This is used as a fast cache of block_map used in the assembly dispatcher.
std::array<JitBlock*, FAST_BLOCK_MAP_ELEMENTS> fast_block_map; // start_addr & mask -> number
};

View File

@ -384,7 +384,7 @@ TryReadInstResult TryReadInstruction(u32 address)
auto tlb_addr = TranslateAddress<FLAG_OPCODE>(address);
if (!tlb_addr.Success())
{
return TryReadInstResult{false, false, 0};
return TryReadInstResult{false, false, 0, 0};
}
else
{
@ -403,7 +403,7 @@ TryReadInstResult TryReadInstruction(u32 address)
{
hex = PowerPC::ppcState.iCache.ReadInstruction(address);
}
return TryReadInstResult{true, from_bat, hex};
return TryReadInstResult{true, from_bat, hex, address};
}
u32 HostRead_Instruction(const u32 address)

View File

@ -646,6 +646,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
block->m_memory_exception = false;
block->m_num_instructions = 0;
block->m_gqr_used = BitSet8(0);
block->m_physical_addresses.clear();
CodeOp* code = buffer->codebuffer;
@ -653,7 +654,6 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
u32 return_address = 0;
u32 numFollows = 0;
u32 num_inst = 0;
bool prev_inst_from_bat = true;
for (u32 i = 0; i < blockSize; ++i)
{
@ -666,16 +666,6 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
}
UGeckoInstruction inst = result.hex;
// Slight hack: the JIT block cache currently assumes all blocks end at the same place,
// but broken blocks due to page faults break this assumption. Avoid this by just ending
// all virtual memory instruction blocks at page boundaries.
// FIXME: improve the JIT block cache so we don't need to do this.
if ((!result.from_bat || !prev_inst_from_bat) && i > 0 && (address & 0xfff) == 0)
{
break;
}
prev_inst_from_bat = result.from_bat;
num_inst++;
memset(&code[i], 0, sizeof(CodeOp));
GekkoOPInfo* opinfo = GetOpInfo(inst);
@ -687,6 +677,7 @@ u32 PPCAnalyzer::Analyze(u32 address, CodeBlock* block, CodeBuffer* buffer, u32
code[i].branchToIndex = -1;
code[i].skip = false;
block->m_stats->numCycles += opinfo->numCycles;
block->m_physical_addresses.insert(result.physical_address);
SetInstructionStats(block, &code[i], opinfo, i);

View File

@ -7,6 +7,7 @@
#include <algorithm>
#include <cstdlib>
#include <map>
#include <set>
#include <string>
#include <vector>
@ -157,6 +158,9 @@ struct CodeBlock
// Which GPRs this block reads from before defining, if any.
BitSet32 m_gpr_inputs;
// Which memory locations are occupied by this block.
std::set<u32> m_physical_addresses;
};
class PPCAnalyzer

View File

@ -232,6 +232,7 @@ struct TryReadInstResult
bool valid;
bool from_bat;
u32 hex;
u32 physical_address;
};
TryReadInstResult TryReadInstruction(const u32 address);