Skip to content

Commit

Permalink
Latte: Implement better index caching (#1443)
Browse files Browse the repository at this point in the history
  • Loading branch information
Exzap authored Jan 12, 2025
1 parent 1923b7a commit 8dd809d
Show file tree
Hide file tree
Showing 16 changed files with 527 additions and 192 deletions.
15 changes: 15 additions & 0 deletions src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,14 @@ class DrawPassContext

void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx);

// called whenever the GPU runs out of commands or hits a wait condition (semaphores, HLE waits)
void LatteCP_signalEnterWait()
{
// based on the assumption that games won't do a rugpull and swap out buffer data in the middle of an uninterrupted sequence of drawcalls,
// we only flush caches when the GPU goes idle or has to wait for any operation
LatteIndices_invalidateAll();
}

/*
* Read a U32 from the command buffer
* If no data is available then wait in a busy loop
Expand Down Expand Up @@ -466,6 +474,8 @@ LatteCMDPtr LatteCP_itWaitRegMem(LatteCMDPtr cmd, uint32 nWords)
const uint32 GPU7_WAIT_MEM_OP_GREATER = 6;
const uint32 GPU7_WAIT_MEM_OP_NEVER = 7;

LatteCP_signalEnterWait();

bool stalls = false;
if ((word0 & 0x10) != 0)
{
Expand Down Expand Up @@ -594,6 +604,7 @@ LatteCMDPtr LatteCP_itMemSemaphore(LatteCMDPtr cmd, uint32 nWords)
else if(SEM_SIGNAL == 7)
{
// wait
LatteCP_signalEnterWait();
size_t loopCount = 0;
while (true)
{
Expand Down Expand Up @@ -1305,11 +1316,13 @@ void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx)
}
case IT_HLE_TRIGGER_SCANBUFFER_SWAP:
{
LatteCP_signalEnterWait();
LatteCP_itHLESwapScanBuffer(cmdData, nWords);
break;
}
case IT_HLE_WAIT_FOR_FLIP:
{
LatteCP_signalEnterWait();
LatteCP_itHLEWaitForFlip(cmdData, nWords);
break;
}
Expand Down Expand Up @@ -1594,12 +1607,14 @@ void LatteCP_ProcessRingbuffer()
}
case IT_HLE_TRIGGER_SCANBUFFER_SWAP:
{
LatteCP_signalEnterWait();
LatteCP_itHLESwapScanBuffer(cmd, nWords);
timerRecheck += CP_TIMER_RECHECK / 64;
break;
}
case IT_HLE_WAIT_FOR_FLIP:
{
LatteCP_signalEnterWait();
LatteCP_itHLEWaitForFlip(cmd, nWords);
timerRecheck += CP_TIMER_RECHECK / 1;
break;
Expand Down
114 changes: 74 additions & 40 deletions src/Cafe/HW/Latte/Core/LatteIndices.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "Cafe/HW/Latte/Core/LatteConst.h"
#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
#include "Common/cpu_features.h"

#if defined(ARCH_X86_64) && defined(__GNUC__)
Expand All @@ -9,32 +10,53 @@

struct
{
const void* lastPtr;
uint32 lastCount;
LattePrimitiveMode lastPrimitiveMode;
LatteIndexType lastIndexType;
// output
uint32 indexMin;
uint32 indexMax;
Renderer::INDEX_TYPE renderIndexType;
uint32 outputCount;
uint32 indexBufferOffset;
uint32 indexBufferIndex;
struct CacheEntry
{
// input data
const void* lastPtr;
uint32 lastCount;
LattePrimitiveMode lastPrimitiveMode;
LatteIndexType lastIndexType;
uint64 lastUsed;
// output
uint32 indexMin;
uint32 indexMax;
Renderer::INDEX_TYPE renderIndexType;
uint32 outputCount;
Renderer::IndexAllocation indexAllocation;
};
std::array<CacheEntry, 8> entry;
uint64 currentUsageCounter{0};
}LatteIndexCache{};

void LatteIndices_invalidate(const void* memPtr, uint32 size)
{
if (LatteIndexCache.lastPtr >= memPtr && (LatteIndexCache.lastPtr < ((uint8*)memPtr + size)) )
for(auto& entry : LatteIndexCache.entry)
{
LatteIndexCache.lastPtr = nullptr;
LatteIndexCache.lastCount = 0;
if (entry.lastPtr >= memPtr && (entry.lastPtr < ((uint8*)memPtr + size)) )
{
if(entry.lastPtr != nullptr)
g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
entry.lastPtr = nullptr;
entry.lastCount = 0;
}
}
}

void LatteIndices_invalidateAll()
{
LatteIndexCache.lastPtr = nullptr;
LatteIndexCache.lastCount = 0;
for(auto& entry : LatteIndexCache.entry)
{
if (entry.lastPtr != nullptr)
g_renderer->indexData_releaseIndexMemory(entry.indexAllocation);
entry.lastPtr = nullptr;
entry.lastCount = 0;
}
}

uint64 LatteIndices_GetNextUsageIndex()
{
return LatteIndexCache.currentUsageCounter++;
}

uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, LatteIndexType indexType, uint32 count)
Expand Down Expand Up @@ -532,7 +554,7 @@ void LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, LatteIn
}
}

void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex)
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation)
{
// what this should do:
// [x] use fast SIMD-based index decoding
Expand All @@ -542,17 +564,18 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
// [ ] better cache implementation, allow to cache across frames

// reuse from cache if data didn't change
if (LatteIndexCache.lastPtr == indexData &&
LatteIndexCache.lastCount == count &&
LatteIndexCache.lastPrimitiveMode == primitiveMode &&
LatteIndexCache.lastIndexType == indexType)
{
indexMin = LatteIndexCache.indexMin;
indexMax = LatteIndexCache.indexMax;
renderIndexType = LatteIndexCache.renderIndexType;
outputCount = LatteIndexCache.outputCount;
indexBufferOffset = LatteIndexCache.indexBufferOffset;
indexBufferIndex = LatteIndexCache.indexBufferIndex;
auto cacheEntry = std::find_if(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [indexData, count, primitiveMode, indexType](const auto& entry)
{
return entry.lastPtr == indexData && entry.lastCount == count && entry.lastPrimitiveMode == primitiveMode && entry.lastIndexType == indexType;
});
if (cacheEntry != LatteIndexCache.entry.end())
{
indexMin = cacheEntry->indexMin;
indexMax = cacheEntry->indexMax;
renderIndexType = cacheEntry->renderIndexType;
outputCount = cacheEntry->outputCount;
indexAllocation = cacheEntry->indexAllocation;
cacheEntry->lastUsed = LatteIndices_GetNextUsageIndex();
return;
}

Expand All @@ -576,10 +599,12 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
indexMin = 0;
indexMax = std::max(count, 1u)-1;
renderIndexType = Renderer::INDEX_TYPE::NONE;
indexAllocation = {};
return; // no indices
}
// query index buffer from renderer
void* indexOutputPtr = g_renderer->indexData_reserveIndexMemory(indexOutputSize, indexBufferOffset, indexBufferIndex);
indexAllocation = g_renderer->indexData_reserveIndexMemory(indexOutputSize);
void* indexOutputPtr = indexAllocation.mem;

// decode indices
indexMin = std::numeric_limits<uint32>::max();
Expand Down Expand Up @@ -704,16 +729,25 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
// recalculate index range but filter out primitive restart index
LatteIndices_alternativeCalculateIndexMinMax(indexData, indexType, count, indexMin, indexMax);
}
g_renderer->indexData_uploadIndexMemory(indexBufferOffset, indexOutputSize);
g_renderer->indexData_uploadIndexMemory(indexAllocation);
performanceMonitor.cycle[performanceMonitor.cycleIndex].indexDataUploaded += indexOutputSize;
// get least recently used cache entry
auto lruEntry = std::min_element(LatteIndexCache.entry.begin(), LatteIndexCache.entry.end(), [](const auto& a, const auto& b)
{
return a.lastUsed < b.lastUsed;
});
// invalidate previous allocation
if(lruEntry->lastPtr != nullptr)
g_renderer->indexData_releaseIndexMemory(lruEntry->indexAllocation);
// update cache
LatteIndexCache.lastPtr = indexData;
LatteIndexCache.lastCount = count;
LatteIndexCache.lastPrimitiveMode = primitiveMode;
LatteIndexCache.lastIndexType = indexType;
LatteIndexCache.indexMin = indexMin;
LatteIndexCache.indexMax = indexMax;
LatteIndexCache.renderIndexType = renderIndexType;
LatteIndexCache.outputCount = outputCount;
LatteIndexCache.indexBufferOffset = indexBufferOffset;
LatteIndexCache.indexBufferIndex = indexBufferIndex;
lruEntry->lastPtr = indexData;
lruEntry->lastCount = count;
lruEntry->lastPrimitiveMode = primitiveMode;
lruEntry->lastIndexType = indexType;
lruEntry->indexMin = indexMin;
lruEntry->indexMax = indexMax;
lruEntry->renderIndexType = renderIndexType;
lruEntry->outputCount = outputCount;
lruEntry->indexAllocation = indexAllocation;
lruEntry->lastUsed = LatteIndices_GetNextUsageIndex();
}
2 changes: 1 addition & 1 deletion src/Cafe/HW/Latte/Core/LatteIndices.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@

void LatteIndices_invalidate(const void* memPtr, uint32 size);
void LatteIndices_invalidateAll();
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex);
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, Renderer::IndexAllocation& indexAllocation);
6 changes: 6 additions & 0 deletions src/Cafe/HW/Latte/Core/LatteOverlay.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,13 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio
ImGui::Text("VRAM: %dMB / %dMB", g_state.vramUsage, g_state.vramTotal);

if (config.overlay.debug)
{
// general debug info
ImGui::Text("--- Debug info ---");
ImGui::Text("IndexUploadPerFrame: %dKB", (performanceMonitor.stats.indexDataUploadPerFrame+1023)/1024);
// backend specific info
g_renderer->AppendOverlayDebugInfo();
}

position.y += (ImGui::GetWindowSize().y + 10.0f) * direction;
}
Expand Down
3 changes: 1 addition & 2 deletions src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,14 @@ void LattePerformanceMonitor_frameEnd()
uniformBankDataUploadedPerFrame /= 1024ULL;
uint32 uniformBankCountUploadedPerFrame = (uint32)(uniformBankUploadedCount / (uint64)elapsedFrames);
uint64 indexDataUploadPerFrame = (indexDataUploaded / (uint64)elapsedFrames);
indexDataUploadPerFrame /= 1024ULL;

double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS;
uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames;
passedCycles = passedCycles * 1000ULL / totalElapsedTime;
uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime);
uint32 tlps = (uint32)((uint64)threadLeaveCount * 1000ULL / (uint64)totalElapsedTime);
// set stats

performanceMonitor.stats.indexDataUploadPerFrame = indexDataUploadPerFrame;
// next counter cycle
sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES;
performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0;
Expand Down
6 changes: 6 additions & 0 deletions src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,12 @@ typedef struct
LattePerfStatCounter numDrawBarriersPerFrame;
LattePerfStatCounter numBeginRenderpassPerFrame;
}vk;

// calculated stats (per frame)
struct
{
uint32 indexDataUploadPerFrame;
}stats;
}performanceMonitor_t;

extern performanceMonitor_t performanceMonitor;
Expand Down
1 change: 0 additions & 1 deletion src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#include "Cafe/HW/Latte/Core/LattePerformanceMonitor.h"
#include "Cafe/GraphicPack/GraphicPack2.h"
#include "config/ActiveSettings.h"
#include "Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.h"
#include "gui/guiWrapper.h"
#include "Cafe/OS/libs/erreula/erreula.h"
#include "input/InputManager.h"
Expand Down
17 changes: 11 additions & 6 deletions src/Cafe/HW/Latte/Renderer/OpenGL/OpenGLRenderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,16 +102,21 @@ class OpenGLRenderer : public Renderer
static void SetAttributeArrayState(uint32 index, bool isEnabled, sint32 aluDivisor);
static void SetArrayElementBuffer(GLuint arrayElementBuffer);

// index
void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) override
// index (not used by OpenGL renderer yet)
IndexAllocation indexData_reserveIndexMemory(uint32 size) override
{
assert_dbg();
return nullptr;
cemu_assert_unimplemented();
return {};
}

void indexData_uploadIndexMemory(uint32 offset, uint32 size) override
void indexData_releaseIndexMemory(IndexAllocation& allocation) override
{
assert_dbg();
cemu_assert_unimplemented();
}

void indexData_uploadIndexMemory(IndexAllocation& allocation) override
{
cemu_assert_unimplemented();
}

// uniform
Expand Down
11 changes: 9 additions & 2 deletions src/Cafe/HW/Latte/Renderer/Renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,15 @@ class Renderer
virtual void draw_endSequence() = 0;

// index
virtual void* indexData_reserveIndexMemory(uint32 size, uint32& offset, uint32& bufferIndex) = 0;
virtual void indexData_uploadIndexMemory(uint32 offset, uint32 size) = 0;
struct IndexAllocation
{
void* mem; // pointer to index data inside buffer
void* rendererInternal; // for renderer use
};

virtual IndexAllocation indexData_reserveIndexMemory(uint32 size) = 0;
virtual void indexData_releaseIndexMemory(IndexAllocation& allocation) = 0;
virtual void indexData_uploadIndexMemory(IndexAllocation& allocation) = 0;

// occlusion queries
virtual LatteQueryObject* occlusionQuery_create() = 0;
Expand Down
Loading

0 comments on commit 8dd809d

Please sign in to comment.