From c1bae6747276088f21555842b6c2fe64ec76d346 Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Thu, 21 Nov 2019 11:39:51 +0800 Subject: [PATCH 1/4] disable pipelined adder and enable streamlined gemm execution --- .../src/main/scala/core/TensorGemm.scala | 49 +++++++++++++------ 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala b/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala index 3f5f38766738..058568ce15ec 100644 --- a/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala +++ b/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala @@ -61,6 +61,23 @@ class PipeAdder(aBits: Int = 8, bBits: Int = 8) extends Module { io.y := add } +/** Adder */ +class Adder(aBits: Int = 8, bBits: Int = 8) extends Module { + val outBits = Math.max(aBits, bBits) + 1 + val io = IO(new Bundle { + val a = Input(SInt(aBits.W)) + val b = Input(SInt(bBits.W)) + val y = Output(SInt(outBits.W)) + }) + val add = Wire(SInt(outBits.W)) + val rA = Wire(SInt(aBits.W)) + val rB = Wire(SInt(bBits.W)) + rA := io.a + rB := io.b + add := rA +& rB + io.y := add +} + /** Pipelined DotProduct based on MAC and PipeAdder */ class DotProduct(aBits: Int = 8, bBits: Int = 8, size: Int = 16) extends Module { @@ -80,7 +97,7 @@ class DotProduct(aBits: Int = 8, bBits: Int = 8, size: Int = 16) val m = Seq.fill(s(0))(Module(new MAC(aBits, bBits, cBits = 1))) // # of total vector pairs val a = Seq.tabulate(p)( i => - Seq.fill(s(i + 1))(Module(new PipeAdder( + Seq.fill(s(i + 1))(Module(new Adder( aBits = (b + i + 1), bBits = (b + i + 1))))) // # adders within each layer @@ -127,7 +144,7 @@ class MatrixVectorMultiplication(implicit p: Parameters) extends Module { val dot = Seq.fill(size)( Module(new DotProduct(aBits = inpBits, bBits = wgtBits, size))) val acc = Seq.fill(size)( - Module(new Pipe(UInt(accBits.W), latency = log2Ceil(size) + 1))) + Module(new Pipe(UInt(accBits.W), latency = 2))) val add = Seq.fill(size)(Wire(SInt(accBits.W))) val vld = Wire(Vec(size, Bool())) @@ -188,7 +205,7 @@ class TensorGemm(debug: Boolean = false)(implicit p: Parameters) val wgt_i = Reg(chiselTypeOf(dec.uop_end)) val pBits = log2Ceil(p(CoreKey).blockOut) + 1 val inflight = Reg(UInt(pBits.W)) - val wrpipe = Module(new Pipe(chiselTypeOf(dec.uop_end), latency = pBits)) + val wrpipe = Module(new Pipe(chiselTypeOf(dec.uop_end), latency = 2)) val done = inflight === 0.U & ((state === sExe & cnt_o === dec.lp_0 - 1.U & @@ -236,7 +253,9 @@ class TensorGemm(debug: Boolean = false)(implicit p: Parameters) when(state === sIdle) { inflight := 0.U }.elsewhen(!dec.reset) { - when(state === sReadTensor) { // issue a tensor + when((state === sReadTensor) && mvc.io.acc_o.data.valid) { // issue & commit + inflight := inflight + }.elsewhen(state === sReadTensor) { // issue a tensor inflight := inflight + 1.U }.elsewhen(mvc.io.acc_o.data.valid) { // commit a tensor inflight := inflight - 1.U @@ -273,18 +292,16 @@ class TensorGemm(debug: Boolean = false)(implicit p: Parameters) inp_i := 0.U wgt_i := 0.U }.elsewhen(state === sReadUop && cnt_i === dec.lp_1) { - cnt_i := 0.U - acc_i := acc_o - inp_i := inp_o - wgt_i := wgt_o - } - .elsewhen(state === sExe && - uop_idx === uop_end - 1.U) { - cnt_i := cnt_i + 1.U - acc_i := acc_i + dec.acc_1 - inp_i := inp_i + dec.inp_1 - wgt_i := wgt_i + dec.wgt_1 - } + cnt_i := 0.U + acc_i := acc_o + inp_i := inp_o + wgt_i := wgt_o + }.elsewhen(state === sExe && uop_idx === uop_end - 1.U) { + cnt_i := cnt_i + 1.U + acc_i := acc_i + dec.acc_1 + inp_i := inp_i + dec.inp_1 + wgt_i := wgt_i + dec.wgt_1 + } when(state === sComputeIdx && io.uop.data.valid) { uop_acc := io.uop.data.bits.u0 + acc_i From f02e979e2d0062e63e1916d31f04e363e34b7e2e Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Tue, 26 Nov 2019 11:17:01 +0800 Subject: [PATCH 2/4] pipeline first layer of adder --- .../src/main/scala/core/TensorGemm.scala | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala b/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala index 058568ce15ec..880eddd11cfe 100644 --- a/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala +++ b/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala @@ -97,9 +97,11 @@ class DotProduct(aBits: Int = 8, bBits: Int = 8, size: Int = 16) val m = Seq.fill(s(0))(Module(new MAC(aBits, bBits, cBits = 1))) // # of total vector pairs val a = Seq.tabulate(p)( i => - Seq.fill(s(i + 1))(Module(new Adder( - aBits = (b + i + 1), - bBits = (b + i + 1))))) // # adders within each layer + Seq.fill(s(i + 1))( + if (i == 0) + Module(new PipeAdder(aBits = (b + i + 1), bBits = (b + i + 1))) + else + Module(new Adder(aBits = (b + i + 1), bBits = (b + i + 1))))) // # adders within each layer // Vector MACs for (i <- 0 until s(0)) { @@ -143,8 +145,7 @@ class MatrixVectorMultiplication(implicit p: Parameters) extends Module { }) val dot = Seq.fill(size)( Module(new DotProduct(aBits = inpBits, bBits = wgtBits, size))) - val acc = Seq.fill(size)( - Module(new Pipe(UInt(accBits.W), latency = 2))) + val acc = Seq.fill(size)(Module(new Pipe(UInt(accBits.W), latency = 2))) val add = Seq.fill(size)(Wire(SInt(accBits.W))) val vld = Wire(Vec(size, Bool())) @@ -256,10 +257,11 @@ class TensorGemm(debug: Boolean = false)(implicit p: Parameters) when((state === sReadTensor) && mvc.io.acc_o.data.valid) { // issue & commit inflight := inflight }.elsewhen(state === sReadTensor) { // issue a tensor - inflight := inflight + 1.U - }.elsewhen(mvc.io.acc_o.data.valid) { // commit a tensor - inflight := inflight - 1.U - } + inflight := inflight + 1.U + } + .elsewhen(mvc.io.acc_o.data.valid) { // commit a tensor + inflight := inflight - 1.U + } } when( @@ -292,16 +294,17 @@ class TensorGemm(debug: Boolean = false)(implicit p: Parameters) inp_i := 0.U wgt_i := 0.U }.elsewhen(state === sReadUop && cnt_i === dec.lp_1) { - cnt_i := 0.U - acc_i := acc_o - inp_i := inp_o - wgt_i := wgt_o - }.elsewhen(state === sExe && uop_idx === uop_end - 1.U) { - cnt_i := cnt_i + 1.U - acc_i := acc_i + dec.acc_1 - inp_i := inp_i + dec.inp_1 - wgt_i := wgt_i + dec.wgt_1 - } + cnt_i := 0.U + acc_i := acc_o + inp_i := inp_o + wgt_i := wgt_o + } + .elsewhen(state === sExe && uop_idx === uop_end - 1.U) { + cnt_i := cnt_i + 1.U + acc_i := acc_i + dec.acc_1 + inp_i := inp_i + dec.inp_1 + wgt_i := wgt_i + dec.wgt_1 + } when(state === sComputeIdx && io.uop.data.valid) { uop_acc := io.uop.data.bits.u0 + acc_i From a7985cfc006ead5d933b06c6e15c6c8857064752 Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Tue, 26 Nov 2019 19:37:14 +0800 Subject: [PATCH 3/4] explain difference between pipeadder and adder --- .../chisel/src/main/scala/core/TensorGemm.scala | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala b/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala index 880eddd11cfe..5332b9b46178 100644 --- a/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala +++ b/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala @@ -46,7 +46,10 @@ class MAC(aBits: Int = 8, bBits: Int = 8, cBits: Int = 16) extends Module { io.y := add } -/** Pipelined adder */ +/** PipeAdder + * + * This unit loads input bits into register and performs addition in the next cycle + */ class PipeAdder(aBits: Int = 8, bBits: Int = 8) extends Module { val outBits = Math.max(aBits, bBits) + 1 val io = IO(new Bundle { @@ -61,7 +64,11 @@ class PipeAdder(aBits: Int = 8, bBits: Int = 8) extends Module { io.y := add } -/** Adder */ +/** Adder + * + * This unit wires input bits to an adder directly. + * The output comes out of combinational logic without waiting for another cycle. + */ class Adder(aBits: Int = 8, bBits: Int = 8) extends Module { val outBits = Math.max(aBits, bBits) + 1 val io = IO(new Bundle { From ab23556146a2c88a70565fc6a5b86d677db92ab7 Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Wed, 27 Nov 2019 14:58:40 +0800 Subject: [PATCH 4/4] add comment for explaining the hard-coded latency --- vta/hardware/chisel/src/main/scala/core/TensorGemm.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala b/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala index 5332b9b46178..7328c426978f 100644 --- a/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala +++ b/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala @@ -152,6 +152,8 @@ class MatrixVectorMultiplication(implicit p: Parameters) extends Module { }) val dot = Seq.fill(size)( Module(new DotProduct(aBits = inpBits, bBits = wgtBits, size))) + // Latency is defined as two in the following, because there is one cycle in the MAC module, + // and another cycle in the pipelined adders as the first layer of the accumulator val acc = Seq.fill(size)(Module(new Pipe(UInt(accBits.W), latency = 2))) val add = Seq.fill(size)(Wire(SInt(accBits.W))) val vld = Wire(Vec(size, Bool())) @@ -213,6 +215,8 @@ class TensorGemm(debug: Boolean = false)(implicit p: Parameters) val wgt_i = Reg(chiselTypeOf(dec.uop_end)) val pBits = log2Ceil(p(CoreKey).blockOut) + 1 val inflight = Reg(UInt(pBits.W)) + // Latency is defined as two in the following, because there is one cycle in the MAC module, + // and another cycle in the pipelined adders as the first layer of the accumulator val wrpipe = Module(new Pipe(chiselTypeOf(dec.uop_end), latency = 2)) val done = inflight === 0.U & ((state === sExe &