From 2772ba09ab5160e67604213efb976031b69f088c Mon Sep 17 00:00:00 2001 From: Finn Wilkinson Date: Thu, 13 Jul 2023 20:54:52 +0100 Subject: [PATCH 1/2] Updated all Arch64 single structure vector/sve loads to extract their values from a single block of fetched memory in Instruction_execute. --- src/lib/arch/aarch64/Instruction_execute.cc | 73 ++++++++++----------- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 6e4752adae..506f8ef131 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -2442,13 +2442,13 @@ void Instruction::execute() { const uint32_t sliceNum = (ws + metadata.operands[0].sme_index.disp) % partition_num; - uint16_t index = 0; + const uint32_t* data = memoryData[0].getAsVector(); + uint32_t out[64] = {0}; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 16) * 4); if (pg[i / 16] & shifted_active) { - out[i] = memoryData[index].get(); - index++; + out[i] = data[i]; } else { out[i] = 0; } @@ -2479,14 +2479,14 @@ void Instruction::execute() { const uint32_t sliceNum = (ws + metadata.operands[0].sme_index.disp) % partition_num; - uint16_t index = 0; + const uint32_t* data = memoryData[0].getAsVector(); + for (int i = 0; i < partition_num; i++) { uint32_t* row = const_cast(operands[i].getAsVector()); uint64_t shifted_active = 1ull << ((i % 16) * 4); if (pg[i / 16] & shifted_active) { - row[sliceNum] = memoryData[index].get(); - index++; + row[sliceNum] = data[i]; } else { row[sliceNum] = 0; } @@ -2499,13 +2499,13 @@ void Instruction::execute() { const uint64_t* p = operands[0].getAsVector(); const uint16_t partition_num = VL_bits / 8; - uint16_t index = 0; + const uint8_t* data = memoryData[0].getAsVector(); + uint8_t out[256] = {0}; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << (i % 64); if (p[i / 64] & shifted_active) { - out[i] = memoryData[index].get(); - index++; + out[i] = data[i]; } else { out[i] = 0; } @@ -2518,13 +2518,13 @@ void Instruction::execute() { const uint64_t* p = operands[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; - uint16_t index = 0; + const uint64_t* data = memoryData[0].getAsVector(); + uint64_t out[32] = {0}; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - out[i] = memoryData[index].get(); - index++; + out[i] = data[i]; } else { out[i] = 0; } @@ -2538,13 +2538,13 @@ void Instruction::execute() { const uint64_t* p = operands[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; - uint16_t index = 0; + const uint64_t* data = memoryData[0].getAsVector(); + uint64_t out[32] = {0}; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - out[i] = memoryData[index].get(); - index++; + out[i] = data[i]; } else { out[i] = 0; } @@ -2557,13 +2557,13 @@ void Instruction::execute() { const uint64_t* p = operands[0].getAsVector(); const uint16_t partition_num = VL_bits / 16; - uint16_t index = 0; + const uint16_t* data = memoryData[0].getAsVector(); + uint16_t out[128] = {0}; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 32) * 2); if (p[i / 32] & shifted_active) { - out[i] = memoryData[index].get(); - index++; + out[i] = data[i]; } else { out[i] = 0; } @@ -2612,15 +2612,14 @@ void Instruction::execute() { const uint64_t* p = operands[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; uint64_t out[32] = {0}; - uint16_t index = 0; + const uint64_t* data = memoryData[0].getAsVector(); // Get mini-vector (quadword) uint64_t mini[2] = {0}; for (int i = 0; i < 2; i++) { uint64_t shifted_active = 1ull << ((i % 8) * 8); if (p[i / 8] & shifted_active) { - mini[i] = memoryData[index].get(); - index++; + mini[i] = data[i]; } } @@ -2637,15 +2636,15 @@ void Instruction::execute() { const uint64_t* p = operands[0].getAsVector(); const uint16_t partition_num = VL_bits / 32; uint32_t out[64] = {0}; - uint16_t index = 0; + const uint32_t* data = memoryData[0].getAsVector(); // Get mini-vector (quadword) uint32_t mini[4] = {0}; for (int i = 0; i < 4; i++) { uint64_t shifted_active = 1ull << ((i % 16) * 4); - if (p[i / 16] & shifted_active) - mini[i] = memoryData[index].get(); - index++; + if (p[i / 16] & shifted_active) { + mini[i] = data[i]; + } } // Duplicate mini-vector into output vector @@ -2662,7 +2661,7 @@ void Instruction::execute() { // LOAD const uint16_t partition_num = VL_bits / 32; uint32_t out[64] = {0}; - uint16_t index = 0; + // Check if any lanes are active, otherwise set all to 0 and break early bool active = false; const uint64_t* p = operands[0].getAsVector(); @@ -2675,10 +2674,8 @@ void Instruction::execute() { if (active) { uint32_t data = memoryData[0].get(); for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = p[index / 16] & 1ull - << ((index % 16) * 4); + uint64_t shifted_active = p[i / 16] & 1ull << ((i % 16) * 4); out[i] = shifted_active ? data : 0; - index++; } } results[0] = {out, 256}; @@ -2827,13 +2824,13 @@ void Instruction::execute() { const uint64_t* p = operands[0].getAsVector(); const uint16_t partition_num = VL_bits / 32; - uint16_t index = 0; + const uint32_t* data = memoryData[0].getAsVector(); + uint32_t out[64] = {0}; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 16) * 4); if (p[i / 16] & shifted_active) { - out[i] = memoryData[index].get(); - index++; + out[i] = data[i]; } else { out[i] = 0; } @@ -2847,13 +2844,13 @@ void Instruction::execute() { const uint64_t* p = operands[0].getAsVector(); const uint16_t partition_num = VL_bits / 32; - uint16_t index = 0; + const uint32_t* data = memoryData[0].getAsVector(); + uint32_t out[64] = {0}; for (int i = 0; i < partition_num; i++) { uint64_t shifted_active = 1ull << ((i % 16) * 4); if (p[i / 16] & shifted_active) { - out[i] = memoryData[index].get(); - index++; + out[i] = data[i]; } else { out[i] = 0; } @@ -3331,10 +3328,11 @@ void Instruction::execute() { // LOAD const uint64_t PL_bits = VL_bits / 8; const uint16_t partition_num = PL_bits / 8; + const uint8_t* memData = memoryData[0].getAsVector(); uint64_t out[4] = {0}; for (int i = 0; i < partition_num; i++) { - uint8_t data = memoryData[i].get(); + uint8_t data = memData[i]; for (int j = 0; j < 8; j++) { out[i / 8] |= (data & (1 << j)) ? 1ull << ((j + (i * 8)) % 64) : 0; } @@ -3346,9 +3344,10 @@ void Instruction::execute() { // LOAD const uint16_t partition_num = VL_bits / 8; uint8_t out[256] = {0}; + const uint8_t* data = memoryData[0].getAsVector(); for (int i = 0; i < partition_num; i++) { - out[i] = memoryData[i].get(); + out[i] = data[i]; } results[0] = {out, 256}; break; From bb4e117a7b05fbb7992c07b2d8d8f403eabe2c42 Mon Sep 17 00:00:00 2001 From: Jack Jones Date: Fri, 14 Jul 2023 08:37:04 +0100 Subject: [PATCH 2/2] Updated Instruction_Address entries for those loads altered in prior commit --- src/include/simeng/MemoryInterface.hh | 2 +- .../simeng/arch/aarch64/Instruction.hh | 2 + src/lib/arch/aarch64/Instruction.cc | 6 + src/lib/arch/aarch64/Instruction_address.cc | 165 ++---------------- 4 files changed, 28 insertions(+), 147 deletions(-) diff --git a/src/include/simeng/MemoryInterface.hh b/src/include/simeng/MemoryInterface.hh index aeddc57d0c..49eb23a68a 100644 --- a/src/include/simeng/MemoryInterface.hh +++ b/src/include/simeng/MemoryInterface.hh @@ -18,7 +18,7 @@ struct MemoryAccessTarget { /** The address to access. */ uint64_t address; /** The number of bytes to access at `address`. */ - uint8_t size; + uint16_t size; /** Check for equality of two access targets. */ bool operator==(const MemoryAccessTarget& other) const { diff --git a/src/include/simeng/arch/aarch64/Instruction.hh b/src/include/simeng/arch/aarch64/Instruction.hh index d4ae80aa16..5d747860cd 100644 --- a/src/include/simeng/arch/aarch64/Instruction.hh +++ b/src/include/simeng/arch/aarch64/Instruction.hh @@ -453,6 +453,8 @@ class Instruction : public simeng::Instruction { void setMemoryAddresses(std::vector&& addresses); + void setMemoryAddresses(MemoryAccessTarget address); + /** The memory addresses this instruction accesses, as a vector of {offset, * width} pairs. */ std::vector memoryAddresses; diff --git a/src/lib/arch/aarch64/Instruction.cc b/src/lib/arch/aarch64/Instruction.cc index 676f705b40..4b4a41db2a 100644 --- a/src/lib/arch/aarch64/Instruction.cc +++ b/src/lib/arch/aarch64/Instruction.cc @@ -109,6 +109,12 @@ void Instruction::setMemoryAddresses( memoryAddresses = std::move(addresses); } +void Instruction::setMemoryAddresses(MemoryAccessTarget address) { + dataPending_ = 1; + memoryData.resize(1); + memoryAddresses.push_back(address); +} + span Instruction::getGeneratedAddresses() const { return {memoryAddresses.data(), memoryAddresses.size()}; } diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index d1068fde2c..8552580948 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -66,23 +66,11 @@ span Instruction::generateAddresses() { // [{, xm, LSL #2}] // SME const uint16_t partition_num = VL_bits / 32; - const uint64_t* pg = - operands[partition_num + 1].getAsVector(); const uint64_t n = operands[partition_num + 2].get(); uint64_t m = 0; if (metadata.operands[2].mem.index) m = operands[partition_num + 3].get() << 2; - - std::vector addresses; - addresses.reserve(partition_num); - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % 16) * 4); - if (pg[i / 16] & shifted_active) { - addresses.push_back({(n + m) + (i * 4), 4}); - } - } - setMemoryAddresses(std::move(addresses)); + setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); break; } case Opcode::AArch64_LD1i32: { // ld1 {vt.s}[index], [xn] @@ -110,41 +98,15 @@ span Instruction::generateAddresses() { break; } case Opcode::AArch64_LD1RQ_D_IMM: { // ld1rqd {zd.d}, pg/z, [xn{, #imm}] - const uint64_t* p = operands[0].getAsVector(); - uint64_t addr = operands[1].get() + metadata.operands[2].mem.disp; - - std::vector addresses; - addresses.reserve(2); - - for (int i = 0; i < 2; i++) { - uint64_t shifted_active = 1ull << ((i % 8) * 8); - if (p[i / 8] & shifted_active) { - addresses.push_back({addr, 8}); - } - addr += 8; - } - setMemoryAddresses(std::move(addresses)); + setMemoryAddresses({addr, static_cast(16)}); break; } case Opcode::AArch64_LD1RQ_W_IMM: { // ld1rqw {zd.s}, pg/z, [xn{, #imm}] - const uint64_t* p = operands[0].getAsVector(); - uint64_t addr = operands[1].get() + metadata.operands[2].mem.disp; - - std::vector addresses; - addresses.reserve(4); - - for (int i = 0; i < 4; i++) { - uint64_t shifted_active = 1ull << ((i % 16) * 4); - if (p[i / 16] & shifted_active) { - addresses.push_back({addr, 4}); - } - addr += 4; - } - setMemoryAddresses(std::move(addresses)); + setMemoryAddresses({addr, static_cast(16)}); break; } case Opcode::AArch64_LD1RW_IMM: { // ld1rw {zt.s}, pg/z, [xn, #imm] @@ -243,133 +205,58 @@ span Instruction::generateAddresses() { break; } case Opcode::AArch64_LD1B: { // ld1b {zt.b}, pg/z, [xn, xm] - const uint64_t* p = operands[0].getAsVector(); - const uint16_t partition_num = VL_bits / 8; - const uint64_t base = operands[1].get(); const uint64_t offset = operands[2].get(); - std::vector addresses; - addresses.reserve(partition_num); - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << (i % 64); - if (p[i / 64] & shifted_active) { - addresses.push_back({base + (offset + i), 1}); - } - } - - setMemoryAddresses(std::move(addresses)); + setMemoryAddresses({base + offset, static_cast(VL_bits / 8)}); break; } case Opcode::AArch64_LD1D: { // ld1d {zt.d}, pg/z, [xn, xm, lsl #3] - const uint64_t* p = operands[0].getAsVector(); - const uint16_t partition_num = VL_bits / 64; - const uint64_t base = operands[1].get(); const uint64_t offset = operands[2].get(); + const uint64_t addr = base + (offset * 8); - std::vector addresses; - addresses.reserve(partition_num); - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % 8) * 8); - if (p[i / 8] & shifted_active) { - addresses.push_back({base + ((offset + i) * 8), 8}); - } - } - - setMemoryAddresses(std::move(addresses)); + setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } case Opcode::AArch64_LD1D_IMM_REAL: { // ld1d {zt.d}, pg/z, [xn{, #imm, // mul vl}] - const uint64_t* p = operands[0].getAsVector(); const uint16_t partition_num = VL_bits / 64; const uint64_t base = operands[1].get(); const uint64_t offset = static_cast(metadata.operands[2].mem.disp); + const uint64_t addr = base + (offset * partition_num * 8); - std::vector addresses; - addresses.reserve(partition_num); - - uint64_t addr = base + (offset * partition_num * 8); - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % 8) * 8); - if (p[i / 8] & shifted_active) { - addresses.push_back({addr, 8}); - } - addr += 8; - } - - setMemoryAddresses(std::move(addresses)); + setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } case Opcode::AArch64_LD1H: { // ld1h {zt.h}, pg/z, [xn, xm, lsl #1] - const uint64_t* p = operands[0].getAsVector(); - const uint16_t partition_num = VL_bits / 16; - const uint64_t base = operands[1].get(); const uint64_t offset = operands[2].get(); + const uint64_t addr = base + (offset * 2); - std::vector addresses; - addresses.reserve(partition_num); - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % 32) * 2); - if (p[i / 32] & shifted_active) { - addresses.push_back({base + ((offset + i) * 2), 2}); - } - } - - setMemoryAddresses(addresses); + setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } case Opcode::AArch64_LD1W: { // ld1w {zt.s}, pg/z, [xn, xm, lsl #2] - const uint64_t* p = operands[0].getAsVector(); - const uint16_t partition_num = VL_bits / 32; - const uint64_t base = operands[1].get(); const uint64_t offset = operands[2].get(); + const uint64_t addr = base + (offset * 4); - std::vector addresses; - addresses.reserve(partition_num); - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % 16) * 4); - if (p[i / 16] & shifted_active) { - addresses.push_back({base + ((offset + i) * 4), 4}); - } - } - - setMemoryAddresses(std::move(addresses)); + setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } case Opcode::AArch64_LD1W_IMM_REAL: { // ld1w {zt.s}, pg/z, [xn{, #imm, // mul vl}] - const uint64_t* p = operands[0].getAsVector(); const uint16_t partition_num = VL_bits / 32; const uint64_t base = operands[1].get(); const uint64_t offset = static_cast(metadata.operands[2].mem.disp); + const uint64_t addr = base + (offset * partition_num * 4); - std::vector addresses; - addresses.reserve(partition_num); - - uint64_t addr = base + (offset * partition_num * 4); - - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % 16) * 4); - if (p[i / 16] & shifted_active) { - addresses.push_back({addr, 4}); - } - addr += 4; - } - - setMemoryAddresses(std::move(addresses)); + setMemoryAddresses({addr, static_cast(VL_bits / 8)}); break; } case Opcode::AArch64_LD2D: { // ld2d {zt1.d, zt2.d}, pg/z, [xn|sp, xm, @@ -668,34 +555,20 @@ span Instruction::generateAddresses() { const uint64_t offset = static_cast(metadata.operands[1].mem.disp); - std::vector addresses(partition_num); - uint64_t addr = base + (offset * partition_num); - for (int i = 0; i < partition_num; i++) { - addresses[i] = {addr, 1}; - addr += 1; - } - - setMemoryAddresses(std::move(addresses)); + setMemoryAddresses({addr, partition_num}); break; } case Opcode::AArch64_LDR_ZXI: { // ldr zt, [xn{, #imm, mul vl}] const uint16_t partition_num = VL_bits / 8; const uint64_t base = operands[0].get(); - const uint64_t offset = - static_cast(metadata.operands[1].mem.disp); - - std::vector addresses(partition_num); - - uint64_t addr = base + (offset * partition_num); - for (int i = 0; i < partition_num; i++) { - addresses[i] = {addr, 1}; - addr += 1; - } + const int64_t offset = + static_cast(metadata.operands[1].mem.disp); + const uint64_t addr = base + (offset * partition_num); - setMemoryAddresses(std::move(addresses)); + setMemoryAddresses({addr, partition_num}); break; } case Opcode::AArch64_LDNPSi: { // ldnp st1, st2, [xn, #imm]