todo: gemm-gemm flow is still to be finished

CodingPlatelets · CodingPlatelets · commit de763341a348 · 2024-12-17T21:27:44.000+08:00
diff --git a/src/main/scala/kernel/alu/Softmax.scala b/src/main/scala/kernel/alu/Softmax.scala
@@ -45,8 +45,9 @@ class FixedPointExp extends Module with SoftmaxAccuracy with DebugLog {
 
 class Softmax(val arraySize: Int = 4) extends Module with SoftmaxAccuracy with DebugLog {
   val io = IO(new Bundle {
-    val x = Input(Valid(Vec(arraySize, UInt((I + F).W))))
-    val soft_x = Valid(Vec(arraySize, UInt((I + F).W)))
+    // val x = Input(Valid(Vec(arraySize, UInt((I + F).W))))
+    val x = Flipped(Decoupled(Vec(arraySize, UInt((I + F).W))))
+    val soft_x = Decoupled(Vec(arraySize, UInt((I + F).W)))
   })
 
   // first find the max value of x
diff --git a/src/main/scala/models/llama3/common/llamaConfig.scala b/src/main/scala/models/llama3/common/llamaConfig.scala
@@ -24,4 +24,7 @@ trait llamaConfig {
   // DAC for zb, stream for heads
   val stream_size = 8
 
+  // buffer size for gemm-gemm pipeline
+  val bufferSizeGemm = 32
+
 }
diff --git a/src/main/scala/models/llama3/metrixController.scala b/src/main/scala/models/llama3/metrixController.scala
@@ -303,3 +303,113 @@ class GenerationMatrixMul(
     p"stateReg: $stateReg,\t currentValid: ${io.current.valid},\t rowIdx: ${rowIdx.value},\t colIdx: ${colIdx.value},\t gemmValid: ${gemmGroup.io.out.valid}\n"
   )
 }
+
+/**
+ * Q/K generation followed by the Q*K product (work in progress).
+ *
+ * Two GenerationMatrixMul modules (wrapped as `qGen` and `kGen`, the "QKGEN" stage) generate
+ * q and k simultaneously; each wrapper buffers the generated blocks in a queue.
+ * Another GenerationMatrixMul (the "QKMUL" stage) is meant to multiply q and k, using the
+ * buffered QKGEN blocks to stitch the final result. That stage is NOT instantiated yet:
+ * only the index/valid/value registers of the final result exist so far.
+ *
+ * @param k1       systolic parameter of the q,k generation stage (forwarded to GenerationMatrixMul)
+ * @param n1       systolic parameter of the q,k generation stage (forwarded to GenerationMatrixMul)
+ * @param k2       same as `k1`, for the q,k multiplication stage
+ * @param n2       same as `n1`, for the q,k multiplication stage
+ * @param m        the input token vector holds `m * p` elements, the score vector `m * q`
+ * @param p        the weight vectors hold `p * q` elements
+ * @param q        see `m` and `p`
+ * @param gemmType data type selector forwarded to every GenerationMatrixMul
+ */
+class QKMul(
+  val k1:       Int,
+  val n1:       Int,
+  val k2:       Int,
+  val n2:       Int,
+  val m:        Int,
+  val p:        Int,
+  val q:        Int,
+  val gemmType: GEMMDataType.Type
+)(
+  implicit config: DataWidthConfig)
+    extends Module
+    with llamaConfig
+    with DebugLog {
+
+  val nk1: Int = k1 * n1
+  val nk2: Int = k2 * n2
+  // every dimension must split evenly into blocks of the stage that processes it
+  require(m % nk1 == 0, s"m ($m) must be a multiple of k1 * n1 ($nk1)")
+  require(p % nk1 == 0, s"p ($p) must be a multiple of k1 * n1 ($nk1)")
+  require(q % nk1 == 0, s"q ($q) must be a multiple of k1 * n1 ($nk1)")
+  require(m % nk2 == 0, s"m ($m) must be a multiple of k2 * n2 ($nk2)")
+  require(q % nk2 == 0, s"q ($q) must be a multiple of k2 * n2 ($nk2)")
+
+  /**
+   * One GenerationMatrixMul followed by a flushable queue that buffers its `current` output.
+   *
+   * @param k          forwarded to GenerationMatrixMul
+   * @param n          forwarded to GenerationMatrixMul
+   * @param m          `in_a` holds `m * p` elements
+   * @param p          `in_b` holds `p * q` elements
+   * @param q          see `p`
+   * @param gemmType   data type selector forwarded to GenerationMatrixMul
+   * @param bufferSize number of entries of the output queue
+   */
+  class QKGenerationMatrixMulWarper(
+    val k:          Int,
+    val n:          Int,
+    val m:          Int,
+    val p:          Int,
+    val q:          Int,
+    val gemmType:   GEMMDataType.Type,
+    val bufferSize: Int
+  )(
+    implicit config: DataWidthConfig)
+      extends Module
+      with llamaConfig
+      with DebugLog {
+
+    // Derived from this wrapper's own parameters instead of the enclosing QKMul's k1/n1/nk1,
+    // so the wrapper really is configured by what it is constructed with.
+    private val blockDim: Int = k * n
+
+    val io = IO(new Bundle {
+      val in_a = Flipped(Decoupled(Vec(m * p, UInt(config.inputWidth.W))))
+      val in_b = Flipped(Decoupled(Vec(p * q, UInt(config.inputWidth.W))))
+      val flush = Input(Bool())
+      val outMatrix = Decoupled(new currentSystolicGroupIdx(blockDim, m, p, q))
+    })
+
+    val qkGenMul = Module(new GenerationMatrixMul(k, n, m, p, q, gemmType))
+    io.in_a <> qkGenMul.io.in_a
+    io.in_b <> qkGenMul.io.in_b
+
+    val currentBuffer = Module(
+      new Queue(
+        new currentSystolicGroupIdx(blockDim, m, p, q),
+        entries = bufferSize,
+        pipe = true,
+        flow = false,
+        useSyncReadMem = false,
+        hasFlush = true
+      )
+    )
+
+    // the flush port only exists because hasFlush = true above
+    currentBuffer.io.flush.foreach(_ := io.flush)
+
+    // ATTENTION: the buffer is assumed to be large enough to hold the systolic group output,
+    // so the ready signal of enq is not used for back-pressure.
+    currentBuffer.io.enq.bits := qkGenMul.io.current.bits
+    currentBuffer.io.enq.valid := qkGenMul.io.current.valid
+
+    // Check the assumption above in simulation instead of silently dropping a block.
+    chisel3.assert(
+      !qkGenMul.io.current.valid || currentBuffer.io.enq.ready,
+      "QKGenerationMatrixMulWarper: output buffer overflow, a result block was dropped"
+    )
+
+    io.outMatrix <> currentBuffer.io.deq
+  }
+
+  val io = IO(new Bundle {
+    val inputToken = Flipped(Decoupled(Vec(m * p, UInt(config.inputWidth.W))))
+    val weightQ = Flipped(Decoupled(Vec(p * q, UInt(config.inputWidth.W))))
+    val weightK = Flipped(Decoupled(Vec(p * q, UInt(config.inputWidth.W))))
+    val score = Decoupled(Vec(m * q, UInt(config.inputWidth.W)))
+    val resetBuffer = Input(Bool())
+  })
+
+  // Sub-modules have to be wrapped in Module(...).
+  // Both generators use k1/n1: those are the generation-stage parameters, k2/n2 belong to the
+  // (not yet instantiated) multiplication stage.
+  val qGen = Module(new QKGenerationMatrixMulWarper(k1, n1, m, p, q, gemmType, bufferSizeGemm))
+  val kGen = Module(new QKGenerationMatrixMulWarper(k1, n1, m, p, q, gemmType, bufferSizeGemm))
+
+  // NOTE(review): inputToken is bulk-connected to two consumers, so the later connection
+  // decides inputToken.ready (kGen only). If qGen and kGen can be ready in different cycles,
+  // a proper fork of the handshake is needed here -- TODO confirm.
+  qGen.io.in_a <> io.inputToken
+  qGen.io.in_b <> io.weightQ
+  kGen.io.in_a <> io.inputToken
+  kGen.io.in_b <> io.weightK
+
+  qGen.io.flush := io.resetBuffer
+  kGen.io.flush := io.resetBuffer
+
+  // TODO: qGen.io.outMatrix / kGen.io.outMatrix are not consumed yet (their ready is undriven);
+  // the q,k multiplication stage still has to be added.
+
+  // final result idx
+  val rowIdx = RegInit(0.U(log2Ceil(m / nk2).W))
+  val colIdx = RegInit(0.U(log2Ceil(m / nk2).W))
+  val resValid = RegInit(false.B)
+  io.score.valid := resValid
+
+  // NOTE(review): the register elements are outputWidth wide while io.score is declared with
+  // inputWidth -- confirm which width is intended.
+  val scoreValue = RegInit(VecInit.fill(m * q)(0.U(config.outputWidth.W)))
+  io.score.bits := scoreValue
+
+  // drop valid once the consumer has accepted the score; nothing sets resValid yet
+  when(resValid && io.score.ready) {
+    resValid := false.B
+  }
+
+}
diff --git a/src/test/scala/models/llama3/metrixControllerTest.scala b/src/test/scala/models/llama3/metrixControllerTest.scala
@@ -42,7 +42,7 @@ class metrixControllerTest extends AnyFlatSpec with ChiselScalatestTester with P
         Array.fill(rows, cols)(
           numeric.fromInt(
             // r.nextInt(math.pow(2, config.inputWidth).toInt) - math.pow(2, config.inputWidth - 1).toInt
-            r.nextInt(16) - 8
+            r.nextInt(4) - 2
           )
         )
       case c if c == classOf[Float] =>
@@ -260,6 +260,8 @@ class metrixControllerTest extends AnyFlatSpec with ChiselScalatestTester with P
       val in_a = Flipped(Decoupled(Vec(m * p, UInt(config.inputWidth.W))))
       val in_b = Flipped(Decoupled(Vec(p * q, UInt(config.inputWidth.W))))
       val outMatrix = Valid(Vec(nk * nk, UInt(config.inputWidth.W)))
+      val rowIdx = Output(UInt(config.inputWidth.W))
+      val colIdx = Output(UInt(config.inputWidth.W))
     })
 
     val metrixController = Module(new GenerationMatrixMul(k, n, m, p, q, gemmType))
@@ -270,6 +272,8 @@ class metrixControllerTest extends AnyFlatSpec with ChiselScalatestTester with P
     matrixRestore.io.inBlocks := metrixController.io.current.bits.value
     io.outMatrix.bits := matrixRestore.io.outMatrix
     io.outMatrix.valid := metrixController.io.current.valid
+    io.rowIdx := metrixController.io.current.bits.row
+    io.colIdx := metrixController.io.current.bits.col
   }
 
   private def testMetrixController[T: Numeric: ClassTag](
@@ -328,6 +332,8 @@ class metrixControllerTest extends AnyFlatSpec with ChiselScalatestTester with P
         }
         // println(s"emptyRes: ${emptyRes.mkString(", ")}")
         // assert(emptyRes.sameElements(finalMatrix))
+        println(s"rowIdx: ${dut.io.rowIdx.peekInt()}")
+        println(s"colIdx: ${dut.io.colIdx.peekInt()}")
         printmat(emptyRes, nk, nk)
       }
       dut.clock.step()
@@ -350,7 +356,7 @@ class metrixControllerTest extends AnyFlatSpec with ChiselScalatestTester with P
 
   "GenerationMatrixMul" should "correctly multiply matrices" in {
     implicit val config: DataWidthConfig = FxpConfig
-    test(new MetrixControllerWarper(k = 1, n = 2, m = 4, p = 4, q = 4, GEMMDataType.Fxp))
+    test(new MetrixControllerWarper(k = 1, n = 2, m = 4, p = 6, q = 8, GEMMDataType.Fxp))
       .withAnnotations(Seq(VerilatorBackendAnnotation))(testMetrixController[Int])
   }
 }

Original file line number	Diff line number	Diff line change
`@@ -24,4 +24,7 @@ trait llamaConfig {`
`24`	`24`	`// DAC for zb, stream for heads`
`25`	`25`	`val stream_size = 8`
`26`	`26`
	`27`	`+ // buffer size for gemm-gemm pipeline`
	`28`	`+ val bufferSizeGemm = 32`
	`29`	`+`
`27`	`30`	`}`
Original file line number	Diff line number	Diff line change
`@@ -42,7 +42,7 @@ class metrixControllerTest extends AnyFlatSpec with ChiselScalatestTester with P`
`42`	`42`	`Array.fill(rows, cols)(`
`43`	`43`	`numeric.fromInt(`
`44`	`44`	`// r.nextInt(math.pow(2, config.inputWidth).toInt) - math.pow(2, config.inputWidth - 1).toInt`
`45`		`- r.nextInt(16) - 8`
	`45`	`+ r.nextInt(4) - 2`
`46`	`46`	`)`
`47`	`47`	`)`
`48`	`48`	`case c if c == classOf[Float] =>`
`@@ -260,6 +260,8 @@ class metrixControllerTest extends AnyFlatSpec with ChiselScalatestTester with P`
`260`	`260`	`val in_a = Flipped(Decoupled(Vec(m * p, UInt(config.inputWidth.W))))`
`261`	`261`	`val in_b = Flipped(Decoupled(Vec(p * q, UInt(config.inputWidth.W))))`
`262`	`262`	`val outMatrix = Valid(Vec(nk * nk, UInt(config.inputWidth.W)))`
	`263`	`+ val rowIdx = Output(UInt(config.inputWidth.W))`
	`264`	`+ val colIdx = Output(UInt(config.inputWidth.W))`
`263`	`265`	`})`
`264`	`266`
`265`	`267`	`val metrixController = Module(new GenerationMatrixMul(k, n, m, p, q, gemmType))`
`@@ -270,6 +272,8 @@ class metrixControllerTest extends AnyFlatSpec with ChiselScalatestTester with P`
`270`	`272`	`matrixRestore.io.inBlocks := metrixController.io.current.bits.value`
`271`	`273`	`io.outMatrix.bits := matrixRestore.io.outMatrix`
`272`	`274`	`io.outMatrix.valid := metrixController.io.current.valid`
	`275`	`+ io.rowIdx := metrixController.io.current.bits.row`
	`276`	`+ io.colIdx := metrixController.io.current.bits.col`
`273`	`277`	`}`
`274`	`278`
`275`	`279`	`private def testMetrixController[T: Numeric: ClassTag](`
`@@ -328,6 +332,8 @@ class metrixControllerTest extends AnyFlatSpec with ChiselScalatestTester with P`
`328`	`332`	`}`
`329`	`333`	`// println(s"emptyRes: ${emptyRes.mkString(", ")}")`
`330`	`334`	`// assert(emptyRes.sameElements(finalMatrix))`
	`335`	`+ println(s"rowIdx: ${dut.io.rowIdx.peekInt()}")`
	`336`	`+ println(s"colIdx: ${dut.io.colIdx.peekInt()}")`
`331`	`337`	`printmat(emptyRes, nk, nk)`
`332`	`338`	`}`
`333`	`339`	`dut.clock.step()`
`@@ -350,7 +356,7 @@ class metrixControllerTest extends AnyFlatSpec with ChiselScalatestTester with P`
`350`	`356`
`351`	`357`	`"GenerationMatrixMul" should "correctly multiply matrices" in {`
`352`	`358`	`implicit val config: DataWidthConfig = FxpConfig`
`353`		`- test(new MetrixControllerWarper(k = 1, n = 2, m = 4, p = 4, q = 4, GEMMDataType.Fxp))`
	`359`	`+ test(new MetrixControllerWarper(k = 1, n = 2, m = 4, p = 6, q = 8, GEMMDataType.Fxp))`
`354`	`360`	`.withAnnotations(Seq(VerilatorBackendAnnotation))(testMetrixController[Int])`
`355`	`361`	`}`
`356`	`362`	`}`