diff --git a/scripts/gemmini/chisel_wrappers/DMACommandTracker.scala b/scripts/gemmini/chisel_wrappers/DMACommandTracker.scala
index e1f39da..0fbad20 100644
--- a/scripts/gemmini/chisel_wrappers/DMACommandTracker.scala
+++ b/scripts/gemmini/chisel_wrappers/DMACommandTracker.scala
@@ -101,97 +101,3 @@ class DMACommandTracker[T <: Data](
   io.cmd_completed.bits.tag.rob_id := custom_tracker.io.io_cmd_completed_bits_tag_rob_id
   io.busy := custom_tracker.io.io_alloc_ready
 }
-
-//// This module is meant to go inside the Load controller, where it can track which commands are currently
-//// in flight and which are completed
-//class DMACommandTracker[T <: Data](val nCmds: Int, val maxBytes: Int, tag_t: => T) extends Module {
-//  def cmd_id_t = UInt((log2Ceil(nCmds) max 1).W)
-//
-//  val io = IO(new Bundle {
-//    // TODO is there an existing decoupled interface in the standard library which matches this use-case?
-//    val alloc = new Bundle {
-//      val valid = Input(Bool())
-//      val ready = Output(Bool())
-//
-//      class BitsT(tag_t: => T, cmd_id_t: UInt) extends Bundle {
-//        // This was only spun off as its own class to resolve CloneType errors
-//        val tag = Input(tag_t.cloneType)
-//        val bytes_to_read = Input(UInt(log2Up(maxBytes+1).W))
-//        val cmd_id = Output(cmd_id_t.cloneType)
-//      }
-//
-//      val bits = new BitsT(tag_t.cloneType, cmd_id_t.cloneType)
-//
-//      def fire(dummy: Int = 0) = valid && ready
-//    }
-//
-//    class RequestReturnedT(cmd_id_t: UInt) extends Bundle {
-//      // This was only spun off as its own class to resolve CloneType errors
-//      val bytes_read = UInt(log2Up(maxBytes+1).W)
-//      val cmd_id = cmd_id_t.cloneType
-//
-//    }
-//
-//    val request_returned = Flipped(Valid(new RequestReturnedT(cmd_id_t.cloneType)))
-//
-//    class CmdCompletedT(cmd_id_t: UInt, tag_t: T) extends Bundle {
-//      val cmd_id = cmd_id_t.cloneType
-//      val tag = tag_t.cloneType
-//
-//    }
-//
-//    val cmd_completed = Decoupled(new CmdCompletedT(cmd_id_t.cloneType, tag_t.cloneType))
-//
-//    val busy = Output(Bool())
-//  })
-//
-//  class Entry extends Bundle {
-//    val valid = Bool()
-//    val tag = tag_t.cloneType
-//    val bytes_left = UInt(log2Up(maxBytes+1).W)
-//
-//    def init(dummy: Int = 0): Unit = {
-//      valid := false.B
-//    }
-//  }
-//
-//  // val cmds = RegInit(VecInit(Seq.fill(nCmds)(entry_init)))
-//  val cmds = Reg(Vec(nCmds, new Entry))
-//  val cmd_valids = cmds.map(_.valid)
-//
-//  val next_empty_alloc = MuxCase(0.U, cmd_valids.zipWithIndex.map { case (v, i) => (!v) -> i.U })
-//
-//  io.alloc.ready := !cmd_valids.reduce(_ && _)
-//  io.alloc.bits.cmd_id := next_empty_alloc
-//
-//  io.busy := cmd_valids.reduce(_ || _)
-//
-//  val cmd_completed_id = MuxCase(0.U, cmds.zipWithIndex.map { case (cmd, i) =>
-//    (cmd.valid && cmd.bytes_left === 0.U) -> i.U
-//  })
-//  io.cmd_completed.valid := cmds.map(cmd => cmd.valid && cmd.bytes_left === 0.U).reduce(_ || _)
-//  io.cmd_completed.bits.cmd_id := cmd_completed_id
-//  io.cmd_completed.bits.tag := cmds(cmd_completed_id).tag
-//
-//  when (io.alloc.fire()) {
-//    cmds(next_empty_alloc).valid := true.B
-//    cmds(next_empty_alloc).tag := io.alloc.bits.tag
-//    cmds(next_empty_alloc).bytes_left := io.alloc.bits.bytes_to_read
-//  }
-//
-//  when (io.request_returned.fire) {
-//    val cmd_id = io.request_returned.bits.cmd_id
-//    cmds(cmd_id).bytes_left := cmds(cmd_id).bytes_left - io.request_returned.bits.bytes_read
-//
-//    assert(cmds(cmd_id).valid)
-//    assert(cmds(cmd_id).bytes_left >= io.request_returned.bits.bytes_read)
-//  }
-//
-//  when (io.cmd_completed.fire) {
-//    cmds(io.cmd_completed.bits.cmd_id).valid := false.B
-//  }
-//
-//  when (reset.asBool) {
-//    cmds.foreach(_.init())
-//  }
-//}
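The commented-out tracker removed above works by handing out a command ID together with a byte budget, decrementing that budget as DMA responses return, and reporting the ID as completed once the budget reaches zero. A minimal, hypothetical sketch of that bookkeeping follows; the module name and ports are illustrative rather than the Gemmini interface, and it assumes only chisel3:

// Illustrative sketch only, not the Gemmini DMACommandTracker: allocate an ID with a byte
// budget, subtract returned bytes, and report completion when the budget hits zero.
import chisel3._
import chisel3.util._

class SimpleDmaTracker(nCmds: Int, maxBytes: Int) extends Module {
  private def idW = log2Ceil(nCmds) max 1

  val io = IO(new Bundle {
    val alloc = Flipped(Decoupled(UInt(log2Up(maxBytes + 1).W)))  // bytes to read for a new command
    val alloc_id = Output(UInt(idW.W))                            // ID handed back on allocation
    val returned = Flipped(Valid(new Bundle {                     // a DMA response came back
      val id = UInt(idW.W)
      val bytes = UInt(log2Up(maxBytes + 1).W)
    }))
    val completed = Decoupled(UInt(idW.W))                        // command fully satisfied
    val busy = Output(Bool())
  })

  val valids = RegInit(VecInit(Seq.fill(nCmds)(false.B)))
  val bytesLeft = Reg(Vec(nCmds, UInt(log2Up(maxBytes + 1).W)))

  // Allocate into the first free slot, if any
  val freeSlot = PriorityEncoder(valids.map(!_))
  io.alloc.ready := !valids.reduce(_ && _)
  io.alloc_id := freeSlot
  io.busy := valids.reduce(_ || _)

  when(io.alloc.fire) {
    valids(freeSlot) := true.B
    bytesLeft(freeSlot) := io.alloc.bits
  }

  // Each returned response shrinks the outstanding byte count of its command
  when(io.returned.valid) {
    bytesLeft(io.returned.bits.id) := bytesLeft(io.returned.bits.id) - io.returned.bits.bytes
  }

  // A valid entry with zero bytes left is complete; free it once the consumer accepts it
  val doneVec = valids.zip(bytesLeft).map { case (v, b) => v && b === 0.U }
  io.completed.valid := doneVec.reduce(_ || _)
  io.completed.bits := PriorityEncoder(doneVec)
  when(io.completed.fire) {
    valids(io.completed.bits) := false.B
  }
}

The deleted Gemmini version additionally stores a caller-supplied tag (the reservation-station ID) in each entry and asserts that a response never returns more bytes than remain outstanding.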
diff --git a/scripts/gemmini/chisel_wrappers/LoadController.scala b/scripts/gemmini/chisel_wrappers/LoadController.scala
index 309ae7d..2667cd2 100644
--- a/scripts/gemmini/chisel_wrappers/LoadController.scala
+++ b/scripts/gemmini/chisel_wrappers/LoadController.scala
@@ -278,185 +278,3 @@ class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig
   io.completed.valid := custom_load_controller.io.io_completed_valid
   io.completed.bits := custom_load_controller.io.io_completed_bits
 }
-
-// // TODO we need to check for WAW errors here
-// // TODO deal with errors when reading scratchpad responses
-// class LoadController[T <: Data, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V], coreMaxAddrBits: Int,
-//                                                       local_addr_t: LocalAddr)
-//                                                      (implicit p: Parameters) extends Module {
-//   import config._
-
-//   val io = IO(new Bundle {
-//     val cmd = Flipped(Decoupled(new GemminiCmd(reservation_station_entries)))
-
-//     val dma = new ScratchpadReadMemIO(local_addr_t, mvin_scale_t_bits)
-
-//     val completed = Decoupled(UInt(log2Up(reservation_station_entries).W))
-
-//     val busy = Output(Bool())
-
-//     val counter = new CounterEventIO()
-//   })
-
-//   val waiting_for_command :: waiting_for_dma_req_ready :: sending_rows :: Nil = Enum(3)
-//   val control_state = RegInit(waiting_for_command)
-
-//   val strides = Reg(Vec(load_states, UInt(coreMaxAddrBits.W)))
-//   val scales = Reg(Vec(load_states, UInt(mvin_scale_t_bits.W)))
-//   val shrinks = Reg(Vec(load_states, Bool())) // Shrink inputs to accumulator
-//   val block_strides = Reg(Vec(load_states, UInt(block_stride_bits.W))) // Spad stride during block move-ins
-//   val pixel_repeats = Reg(Vec(load_states, UInt(pixel_repeats_bits.W)))
-//   val block_rows = meshRows * tileRows
-//   val block_cols = meshColumns * tileColumns
-//   val row_counter = RegInit(0.U(log2Ceil(block_rows).W))
-
-//   val cmd = Queue(io.cmd, ld_queue_length)
-
-//   val vaddr = cmd.bits.cmd.rs1
-//   val mvin_rs2 = cmd.bits.cmd.rs2.asTypeOf(new MvinRs2(mvin_rows_bits, mvin_cols_bits, local_addr_t))
-//   val localaddr = mvin_rs2.local_addr
-//   val cols = mvin_rs2.num_cols
-//   val rows = mvin_rs2.num_rows
-
-//   val config_stride = cmd.bits.cmd.rs2
-
-//   val config_mvin_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvinRs1(mvin_scale_t_bits, block_stride_bits, pixel_repeats_bits))
-
-//   val config_scale = config_mvin_rs1.scale
-//   val config_shrink = config_mvin_rs1.shrink
-//   val config_block_stride = config_mvin_rs1.stride
-//   val config_pixel_repeats = config_mvin_rs1.pixel_repeats
-
-//   val mstatus = cmd.bits.cmd.status
-
-//   val load_state_id = MuxCase(0.U, Seq((cmd.bits.cmd.inst.funct === LOAD2_CMD) -> 1.U,
-//     (cmd.bits.cmd.inst.funct === LOAD3_CMD) -> 2.U))
-//   val config_state_id = config_mvin_rs1.state_id
-//   val state_id = Mux(cmd.bits.cmd.inst.funct === CONFIG_CMD, config_state_id, load_state_id)
-
-//   val stride = strides(state_id)
-//   val scale = scales(state_id)
-//   val shrink = shrinks(state_id)
-//   val block_stride = block_strides(state_id)
-//   val pixel_repeat = pixel_repeats(state_id)
-
-//   val all_zeros = vaddr === 0.U
-
-//   val localaddr_plus_row_counter = localaddr + row_counter
-
-//   val actual_rows_read = Mux(stride === 0.U && !all_zeros, 1.U, rows)
-
-//   val DoConfig = cmd.bits.cmd.inst.funct === CONFIG_CMD
-//   val DoLoad = !DoConfig // TODO change this if more commands are added
-
-//   cmd.ready := false.B
-
-//   // Command tracker instantiation
-//   val nCmds = (max_in_flight_mem_reqs / block_rows) + 1
-
-//   val deps_t = new Bundle {
-//     val rob_id = UInt(log2Up(reservation_station_entries).W)
-//   }
-
-//   val maxBytesInRowRequest = config.dma_maxbytes max (block_cols * config.inputType.getWidth / 8) max
-//     (block_cols * config.accType.getWidth / 8)
-//   val maxBytesInMatRequest = block_rows * maxBytesInRowRequest
-
-//   val cmd_tracker = Module(new DMACommandTracker(nCmds, maxBytesInMatRequest, deps_t))
-
-//   io.busy := cmd.valid || cmd_tracker.io.busy
-
-//   // DMA IO wiring
-//   io.dma.req.valid := (control_state === waiting_for_command && cmd.valid && DoLoad && cmd_tracker.io.alloc.ready) ||
-//     control_state === waiting_for_dma_req_ready ||
-//     (control_state === sending_rows && row_counter =/= 0.U)
-//   io.dma.req.bits.vaddr := vaddr + row_counter * stride
-//   io.dma.req.bits.laddr := localaddr_plus_row_counter
-//   io.dma.req.bits.cols := cols
-//   io.dma.req.bits.repeats := Mux(stride === 0.U && !all_zeros, rows - 1.U, 0.U)
-//   io.dma.req.bits.block_stride := block_stride
-//   io.dma.req.bits.scale := scale
-//   io.dma.req.bits.has_acc_bitwidth := localaddr_plus_row_counter.is_acc_addr && !shrink
-//   io.dma.req.bits.all_zeros := all_zeros
-//   io.dma.req.bits.status := mstatus
-//   io.dma.req.bits.pixel_repeats := pixel_repeat
-
-//   // Command tracker IO
-//   cmd_tracker.io.alloc.valid := control_state === waiting_for_command && cmd.valid && DoLoad
-//   cmd_tracker.io.alloc.bits.bytes_to_read :=
-//     Mux(io.dma.req.bits.has_acc_bitwidth, cols * actual_rows_read * config.accType.getWidth.U,
-//       cols * actual_rows_read * config.inputType.getWidth.U) >> 3 // We replaced a very clear "/ 8.U" operation here with a ">> 3" operation, solely to satisfy Verilator's linter
-//   cmd_tracker.io.alloc.bits.tag.rob_id := cmd.bits.rob_id.bits
-//   cmd_tracker.io.request_returned.valid := io.dma.resp.fire // TODO use a bundle connect
-//   cmd_tracker.io.request_returned.bits.cmd_id := io.dma.resp.bits.cmd_id // TODO use a bundle connect
-//   cmd_tracker.io.request_returned.bits.bytes_read := io.dma.resp.bits.bytesRead
-//   cmd_tracker.io.cmd_completed.ready := io.completed.ready
-
-//   val cmd_id = RegEnableThru(cmd_tracker.io.alloc.bits.cmd_id, cmd_tracker.io.alloc.fire()) // TODO is this really better than a simple RegEnable?
-//   io.dma.req.bits.cmd_id := cmd_id
-
-//   io.completed.valid := cmd_tracker.io.cmd_completed.valid
-//   io.completed.bits := cmd_tracker.io.cmd_completed.bits.tag.rob_id
-
-//   io.busy := cmd.valid || cmd_tracker.io.busy
-
-//   // Row counter
-//   when (io.dma.req.fire) {
-//     row_counter := wrappingAdd(row_counter, 1.U, actual_rows_read)
-
-//     assert(block_stride >= rows)
-//   }
-
-//   // Control logic
-//   switch (control_state) {
-//     is (waiting_for_command) {
-//       when (cmd.valid) {
-//         when(DoConfig) {
-//           stride := config_stride
-//           scale := config_scale
-//           shrink := config_shrink
-//           block_stride := config_block_stride
-//           pixel_repeat := Mux(config_pixel_repeats === 0.U, 1.U, config_pixel_repeats) // TODO this default value was just added to maintain backwards compatibility. we should deprecate and remove it later
-//           cmd.ready := true.B
-//         }
-
-//         .elsewhen(DoLoad && cmd_tracker.io.alloc.fire()) {
-//           control_state := Mux(io.dma.req.fire, sending_rows, waiting_for_dma_req_ready)
-//         }
-//       }
-//     }
-
-//     is (waiting_for_dma_req_ready) {
-//       when (io.dma.req.fire) {
-//         control_state := sending_rows
-//       }
-//     }
-
-//     is (sending_rows) {
-//       val last_row = row_counter === 0.U || (row_counter === actual_rows_read-1.U && io.dma.req.fire)
-
-//       when (last_row) {
-//         control_state := waiting_for_command
-//         cmd.ready := true.B
-//       }
-//     }
-//   }
-
-//   // Optimizations based on config parameters
-//   if (!has_first_layer_optimizations)
-//     pixel_repeats.foreach(_ := 1.U)
-
-//   // Performance counter
-//   CounterEventIO.init(io.counter)
-//   io.counter.connectEventSignal(CounterEvent.LOAD_ACTIVE_CYCLE, control_state === sending_rows)
-//   io.counter.connectEventSignal(CounterEvent.LOAD_DMA_WAIT_CYCLE, control_state === waiting_for_dma_req_ready)
-//   io.counter.connectEventSignal(CounterEvent.LOAD_SCRATCHPAD_WAIT_CYCLE, io.dma.req.valid && !io.dma.req.ready)
-
-//   if (use_firesim_simulation_counters) {
-//     PerfCounter(io.dma.req.valid && !io.dma.req.ready, "load_dma_wait_cycle", "cycles during which load controller is waiting for DMA to be available")
-//   }
-
-//   // Assertions
-//   assert(!(cmd_tracker.io.alloc.fire() && cmd_tracker.io.alloc.bits.bytes_to_read === 0.U), "A single mvin instruction must load more than 0 bytes")
-//   assert(has_first_layer_optimizations.B || !(cmd.valid && DoConfig && config_pixel_repeats > 1.U), "If first-layer optimizations are not enabled, then pixel-repeats cannot be greater than 1")
-// }
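One detail worth calling out from the deleted load controller: when it allocates a tracker entry it computes bytes_to_read as (cols * rows * element-width-in-bits) >> 3, and the inline comment notes that the shift replaced an equivalent / 8.U purely to satisfy Verilator's linter. A small, hypothetical illustration of that bit-to-byte conversion (the module and port names below are made up; assumes chisel3):

// Illustrative sketch only: convert an element count times an element width in bits to bytes
// with a right shift by 3 instead of a division by 8.
import chisel3._

class BytesToRead(widthBits: Int) extends Module {
  val io = IO(new Bundle {
    val cols = Input(UInt(16.W))
    val rows = Input(UInt(16.W))
    val bytes = Output(UInt(40.W))
  })
  // widthBits plays the role of inputType.getWidth / accType.getWidth in the deleted code
  io.bytes := (io.cols * io.rows * widthBits.U) >> 3
}

For unsigned values, x >> 3 is floor(x / 8), and since the element widths involved are multiples of 8 bits the two forms give identical results.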
diff --git a/scripts/gemmini/chisel_wrappers/PE.scala b/scripts/gemmini/chisel_wrappers/PE.scala
index a09fab8..db03ec1 100644
--- a/scripts/gemmini/chisel_wrappers/PE.scala
+++ b/scripts/gemmini/chisel_wrappers/PE.scala
@@ -107,129 +107,3 @@ class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value,
   io.out_valid := custom_pe.io.io_out_valid
   io.bad_dataflow := custom_pe.io.io_bad_dataflow
 }
-
-/*
-// TODO update documentation
-/**
-  * A PE implementing a MAC operation. Configured as fully combinational when integrated into a Mesh.
-  * @param width Data width of operands
-  */
-class PE[T <: Data](inputType: T, outputType: T, accType: T, df: Dataflow.Value, max_simultaneous_matmuls: Int)
-                   (implicit ev: Arithmetic[T]) extends Module { // Debugging variables
-  import ev._
-
-  val io = IO(new Bundle {
-    val in_a = Input(inputType)
-    val in_b = Input(outputType)
-    val in_d = Input(outputType)
-    val out_a = Output(inputType)
-    val out_b = Output(outputType)
-    val out_c = Output(outputType)
-
-    val in_control = Input(new PEControl(accType))
-    val out_control = Output(new PEControl(accType))
-
-    val in_id = Input(UInt(log2Up(max_simultaneous_matmuls).W))
-    val out_id = Output(UInt(log2Up(max_simultaneous_matmuls).W))
-
-    val in_last = Input(Bool())
-    val out_last = Output(Bool())
-
-    val in_valid = Input(Bool())
-    val out_valid = Output(Bool())
-
-    val bad_dataflow = Output(Bool())
-  })
-
-  chisel3.dontTouch(io)
-
-  val cType = if (df == Dataflow.WS) inputType else accType
-
-  // When creating PEs that support multiple dataflows, the
-  // elaboration/synthesis tools often fail to consolidate and de-duplicate
-  // MAC units. To force mac circuitry to be re-used, we create a "mac_unit"
-  // module here which just performs a single MAC operation
-  val mac_unit = Module(new MacUnit(inputType, cType, outputType))
-
-  val a = io.in_a
-  val b = io.in_b
-  val d = io.in_d
-  val c1 = Reg(cType)
-  val c2 = Reg(cType)
-  val dataflow = io.in_control.dataflow
-  val prop = io.in_control.propagate
-  val shift = io.in_control.shift
-  val id = io.in_id
-  val last = io.in_last
-  val valid = io.in_valid
-
-  io.out_a := a
-  io.out_control.dataflow := dataflow
-  io.out_control.propagate := prop
-  io.out_control.shift := shift
-  io.out_id := id
-  io.out_last := last
-  io.out_valid := valid
-
-  mac_unit.io.in_a := a
-
-  val last_s = RegEnable(prop, valid)
-  val flip = last_s =/= prop
-  val shift_offset = Mux(flip, shift, 0.U)
-
-  // Which dataflow are we using?
-  val OUTPUT_STATIONARY = Dataflow.OS.id.U(1.W)
-  val WEIGHT_STATIONARY = Dataflow.WS.id.U(1.W)
-
-  // Is c1 being computed on, or propagated forward (in the output-stationary dataflow)?
-  val COMPUTE = 0.U(1.W)
-  val PROPAGATE = 1.U(1.W)
-
-  io.bad_dataflow := false.B
-  when ((df == Dataflow.OS).B || ((df == Dataflow.BOTH).B && dataflow === OUTPUT_STATIONARY)) {
-    when(prop === PROPAGATE) {
-      io.out_c := (c1 >> shift_offset).clippedToWidthOf(outputType)
-      io.out_b := b
-      mac_unit.io.in_b := b.asTypeOf(inputType)
-      mac_unit.io.in_c := c2
-      c2 := mac_unit.io.out_d
-      c1 := d.withWidthOf(cType)
-    }.otherwise {
-      io.out_c := (c2 >> shift_offset).clippedToWidthOf(outputType)
-      io.out_b := b
-      mac_unit.io.in_b := b.asTypeOf(inputType)
-      mac_unit.io.in_c := c1
-      c1 := mac_unit.io.out_d
-      c2 := d.withWidthOf(cType)
-    }
-  }.elsewhen ((df == Dataflow.WS).B || ((df == Dataflow.BOTH).B && dataflow === WEIGHT_STATIONARY)) {
-    when(prop === PROPAGATE) {
-      io.out_c := c1
-      mac_unit.io.in_b := c2.asTypeOf(inputType)
-      mac_unit.io.in_c := b
-      io.out_b := mac_unit.io.out_d
-      c1 := d
-    }.otherwise {
-      io.out_c := c2
-      mac_unit.io.in_b := c1.asTypeOf(inputType)
-      mac_unit.io.in_c := b
-      io.out_b := mac_unit.io.out_d
-      c2 := d
-    }
-  }.otherwise {
-    io.bad_dataflow := true.B
-    //assert(false.B, "unknown dataflow")
-    io.out_c := DontCare
-    io.out_b := DontCare
-    mac_unit.io.in_b := b.asTypeOf(inputType)
-    mac_unit.io.in_c := c2
-  }
-
-  when (!valid) {
-    c1 := c1
-    c2 := c2
-    mac_unit.io.in_b := DontCare
-    mac_unit.io.in_c := DontCare
-  }
-}
-*/
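The deleted PE's comment explains its key structural choice: instantiate a single mac_unit submodule and steer its operands from the surrounding when-branches, so synthesis cannot end up duplicating a multiply-accumulate per dataflow branch. A minimal, hypothetical sketch of that sharing pattern is below; MacOnce and SharedMacPe and their ports are illustrative, not Gemmini's MacUnit or PE interface, and only chisel3 is assumed:

// Illustrative sketch only: one MAC instance whose operands are muxed by the control logic,
// rather than writing a multiply-add inside every branch.
import chisel3._

class MacOnce(w: Int) extends Module {
  val io = IO(new Bundle {
    val a = Input(SInt(w.W))
    val b = Input(SInt(w.W))
    val c = Input(SInt((2 * w).W))
    val d = Output(SInt((2 * w).W))
  })
  io.d := io.a * io.b + io.c
}

class SharedMacPe(w: Int) extends Module {
  val io = IO(new Bundle {
    val a = Input(SInt(w.W))
    val b = Input(SInt(w.W))
    val propagate = Input(Bool())
    val out = Output(SInt((2 * w).W))
  })

  val acc0 = RegInit(0.S((2 * w).W))
  val acc1 = RegInit(0.S((2 * w).W))

  // Single MAC instance; both branches below reuse it by steering its accumulator operand.
  val mac = Module(new MacOnce(w))
  mac.io.a := io.a
  mac.io.b := io.b
  mac.io.c := Mux(io.propagate, acc1, acc0)

  when(io.propagate) {
    acc1 := mac.io.d   // accumulate into acc1 while acc0 is read out
    io.out := acc0
  }.otherwise {
    acc0 := mac.io.d   // accumulate into acc0 while acc1 is read out
    io.out := acc1
  }
}

The real PE additionally muxes between output-stationary and weight-stationary behaviour and clips the accumulator back to the output type; the sketch keeps only the operand-steering idea.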
diff --git a/scripts/gemmini/chisel_wrappers/StoreController.scala b/scripts/gemmini/chisel_wrappers/StoreController.scala
index 906214a..6d1a5a3 100644
--- a/scripts/gemmini/chisel_wrappers/StoreController.scala
+++ b/scripts/gemmini/chisel_wrappers/StoreController.scala
@@ -286,318 +286,3 @@ class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: Gemm
   io.completed.valid := custom_store_controller.io.io_completed_valid
   io.completed.bits := custom_store_controller.io.io_completed_bits
 }
-
-// // TODO this is almost a complete copy of LoadController. We should combine them into one class
-// // TODO deal with errors when reading scratchpad responses
-// class StoreController[T <: Data : Arithmetic, U <: Data, V <: Data](config: GemminiArrayConfig[T, U, V],
-//                                                                     coreMaxAddrBits: Int, local_addr_t: LocalAddr)(implicit p: Parameters) extends Module {
-//   import config._
-
-//   val io = IO(new Bundle {
-//     val cmd = Flipped(Decoupled(new GemminiCmd(reservation_station_entries)))
-
-//     val dma = new ScratchpadWriteMemIO(local_addr_t, accType.getWidth, acc_scale_t_bits)
-
-//     val completed = Decoupled(UInt(log2Up(reservation_station_entries).W))
-
-//     val busy = Output(Bool())
-
-//     val counter = new CounterEventIO()
-//   })
-
-//   // val waiting_for_command :: waiting_for_dma_req_ready :: sending_rows :: Nil = Enum(3)
-
-//   object State extends ChiselEnum {
-//     val waiting_for_command, waiting_for_dma_req_ready, sending_rows, pooling = Value
-//   }
-//   import State._
-
-//   val control_state = RegInit(waiting_for_command)
-
-//   val stride = Reg(UInt(coreMaxAddrBits.W))
-//   val block_rows = meshRows * tileRows
-//   val block_stride = block_rows.U
-//   val block_cols = meshColumns * tileColumns
-//   val max_blocks = (dma_maxbytes / (block_cols * inputType.getWidth / 8)) max 1
-
-//   val activation = Reg(UInt(Activation.bitwidth.W)) // TODO magic number
-//   val igelu_qb = Reg(accType)
-//   val igelu_qc = Reg(accType)
-//   val iexp_qln2 = Reg(accType)
-//   val iexp_qln2_inv = Reg(accType)
-//   val norm_stats_id = Reg(UInt(8.W)) // TODO magic number
-//   val acc_scale = Reg(acc_scale_t)
-
-//   //val row_counter = RegInit(0.U(log2Ceil(block_rows).W))
-//   val row_counter = RegInit(0.U(12.W)) // TODO magic number
-//   val block_counter = RegInit(0.U(8.W)) // TODO magic number
-
-//   // Pooling variables
-//   val pool_stride = Reg(UInt(CONFIG_MVOUT_RS1_MAX_POOLING_STRIDE_WIDTH.W)) // When this is 0, pooling is disabled
-//   val pool_size = Reg(UInt(CONFIG_MVOUT_RS1_MAX_POOLING_WINDOW_SIZE_WIDTH.W))
-//   val pool_out_dim = Reg(UInt(CONFIG_MVOUT_RS1_POOL_OUT_DIM_WIDTH.W))
-//   val pool_porows = Reg(UInt(CONFIG_MVOUT_RS1_POOL_OUT_ROWS_WIDTH.W))
-//   val pool_pocols = Reg(UInt(CONFIG_MVOUT_RS1_POOL_OUT_COLS_WIDTH.W))
-//   val pool_orows = Reg(UInt(CONFIG_MVOUT_RS1_OUT_ROWS_WIDTH.W))
-//   val pool_ocols = Reg(UInt(CONFIG_MVOUT_RS1_OUT_COLS_WIDTH.W))
-//   val pool_upad = Reg(UInt(CONFIG_MVOUT_RS1_UPPER_ZERO_PADDING_WIDTH.W))
-//   val pool_lpad = Reg(UInt(CONFIG_MVOUT_RS1_LEFT_ZERO_PADDING_WIDTH.W))
-
-//   val porow_counter = RegInit(0.U(pool_porows.getWidth.W))
-//   val pocol_counter = RegInit(0.U(pool_pocols.getWidth.W))
-//   val wrow_counter = RegInit(0.U(pool_size.getWidth.W))
-//   val wcol_counter = RegInit(0.U(pool_size.getWidth.W))
-
-//   val pooling_is_enabled = has_max_pool.B && pool_stride =/= 0.U
-//   val mvout_1d_enabled = pool_size =/= 0.U && !pooling_is_enabled //1-D move out enabled (no pooling)
-
-//   val orow = porow_counter * pool_stride +& wrow_counter - pool_upad // TODO get rid of this multiplication
-//   val orow_is_negative = porow_counter * pool_stride +& wrow_counter < pool_upad // TODO get rid of this multiplication
-
-//   val ocol = pocol_counter * pool_stride +& wcol_counter - pool_lpad // TODO get rid of this multiplication
-//   val ocol_is_negative = pocol_counter * pool_stride +& wcol_counter < pool_lpad // TODO get rid of this multiplication
-
-//   val pool_total_rows = pool_porows * pool_pocols * pool_size * pool_size // TODO get this value from software
-
-//   // Commands
-//   val cmd = Queue(io.cmd, st_queue_length)
-//   val vaddr = cmd.bits.cmd.rs1
-//   val mvout_rs2 = cmd.bits.cmd.rs2.asTypeOf(new MvoutRs2(mvout_rows_bits, mvout_cols_bits, local_addr_t))
-//   val localaddr = mvout_rs2.local_addr
-//   val cols = mvout_rs2.num_cols
-//   val rows = mvout_rs2.num_rows
-//   val blocks = (cols / block_cols.U(cols.getWidth.W)) + (cols % block_cols.U =/= 0.U)
-
-//   val config_mvout_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigMvoutRs1)
-//   val config_mvout_rs2 = cmd.bits.cmd.rs2.asTypeOf(new ConfigMvoutRs2(acc_scale_t_bits, 32))
-//   val config_cmd_type = config_mvout_rs1.cmd_type
-//   val config_stride = config_mvout_rs2.stride
-//   val config_activation = config_mvout_rs1.activation
-//   val config_acc_scale = config_mvout_rs2.acc_scale
-//   val config_pool_stride = config_mvout_rs1.pool_stride
-//   val config_pool_size = config_mvout_rs1.pool_size
-//   val config_pool_out_dim = config_mvout_rs1.pool_out_dim
-//   val config_porows = config_mvout_rs1.porows
-//   val config_pocols = config_mvout_rs1.pocols
-//   val config_orows = config_mvout_rs1.orows
-//   val config_ocols = config_mvout_rs1.ocols
-//   val config_upad = config_mvout_rs1.upad
-//   val config_lpad = config_mvout_rs1.lpad
-
-//   val config_norm_rs1 = cmd.bits.cmd.rs1.asTypeOf(new ConfigNormRs1(accType.getWidth))
-//   val config_norm_rs2 = cmd.bits.cmd.rs2.asTypeOf(new ConfigNormRs2(accType.getWidth))
-//   val config_stats_id = config_norm_rs1.norm_stats_id
-//   val config_activation_msb = config_norm_rs1.act_msb
-//   val config_set_stats_id_only = config_norm_rs1.set_stats_id_only
-//   val config_iexp_q_const_type = config_norm_rs1.q_const_type
-//   val config_iexp_q_const = config_norm_rs1.q_const
-//   val config_igelu_qb = config_norm_rs2.qb
-//   val config_igelu_qc = config_norm_rs2.qc
-
-//   assert(config_norm_rs1.cmd_type === config_mvout_rs1.cmd_type)
-
-//   val mstatus = cmd.bits.cmd.status
-
-//   val current_vaddr = vaddr + row_counter * stride
-//   val current_localaddr = WireInit(localaddr + (block_counter * block_stride + row_counter))
-
-//   val pool_row_addr = localaddr + (orow * pool_ocols +& ocol)
-//   when (orow_is_negative || ocol_is_negative || orow >= pool_orows || ocol >= pool_ocols) {
-//     pool_row_addr.make_this_garbage()
-//   }
-
-//   val pool_vaddr = vaddr + (porow_counter * pool_out_dim + pocol_counter) * stride // TODO get rid of these multiplications
-
-//   val DoConfig = cmd.bits.cmd.inst.funct === CONFIG_CMD && config_cmd_type === CONFIG_STORE
-//   val DoConfigNorm = config.has_normalizations.B && cmd.bits.cmd.inst.funct === CONFIG_CMD && config_cmd_type === CONFIG_NORM
-//   val DoStore = !DoConfig && !DoConfigNorm
-
-//   cmd.ready := false.B
-
-//   val mvout_1d_rows = pool_orows * pool_ocols //for 1D mvout
-//   // Command tracker instantiation
-//   val nCmds = (max_in_flight_mem_reqs / block_rows) + 1
-
-//   val deps_t = new Bundle {
-//     val rob_id = UInt(log2Up(reservation_station_entries).W)
-//   }
-
-//   val cmd_tracker_max_rows = ((block_rows * max_blocks) max
-//     (((1 << pool_orows.getWidth)-1) * ((1 << pool_ocols.getWidth)-1) + 2*((1 << pool_lpad.getWidth)-1) + 2*((1 << pool_upad.getWidth)-1))) min
-//     ((config.sp_banks * config.sp_bank_entries) max
-//       (config.acc_banks * config.acc_bank_entries))
-
-//   val cmd_tracker = Module(new DMACommandTracker(nCmds, cmd_tracker_max_rows, deps_t))
-
-//   // DMA IO wiring
-//   io.dma.req.valid := (control_state === waiting_for_command && cmd.valid && DoStore && cmd_tracker.io.alloc.ready) ||
-//     control_state === waiting_for_dma_req_ready ||
-//     (control_state === sending_rows && (block_counter =/= 0.U || row_counter =/= 0.U)) ||
-//     (control_state === pooling && (wcol_counter =/= 0.U || wrow_counter =/= 0.U || pocol_counter =/= 0.U || porow_counter =/= 0.U))
-
-//   io.dma.req.bits.vaddr := Mux(pooling_is_enabled || mvout_1d_enabled, pool_vaddr, current_vaddr)
-//   io.dma.req.bits.laddr := Mux(pooling_is_enabled, pool_row_addr, current_localaddr) //Todo: laddr for 1D?
-//   io.dma.req.bits.laddr.norm_cmd := Mux(block_counter === blocks - 1.U, current_localaddr.norm_cmd,
-//     NormCmd.non_reset_version(current_localaddr.norm_cmd))
-
-//   io.dma.req.bits.acc_act := activation
-//   io.dma.req.bits.acc_igelu_qb := igelu_qb.asTypeOf(io.dma.req.bits.acc_igelu_qb)
-//   io.dma.req.bits.acc_igelu_qc := igelu_qc.asTypeOf(io.dma.req.bits.acc_igelu_qc)
-//   io.dma.req.bits.acc_iexp_qln2 := iexp_qln2.asTypeOf(io.dma.req.bits.acc_iexp_qln2)
-//   io.dma.req.bits.acc_iexp_qln2_inv := iexp_qln2_inv.asTypeOf(io.dma.req.bits.acc_iexp_qln2_inv)
-//   io.dma.req.bits.acc_norm_stats_id := norm_stats_id
-//   io.dma.req.bits.acc_scale := acc_scale.asTypeOf(io.dma.req.bits.acc_scale)
-
-//   io.dma.req.bits.len := Mux(block_counter === blocks - 1.U, ((cols - 1.U) % block_cols.U) + 1.U, block_cols.U)
-//   io.dma.req.bits.block := block_counter
-//   io.dma.req.bits.status := mstatus
-//   io.dma.req.bits.pool_en := pooling_is_enabled && (wrow_counter =/= 0.U || wcol_counter =/= 0.U)
-//   io.dma.req.bits.store_en := Mux(pooling_is_enabled, wrow_counter === pool_size - 1.U && wcol_counter === pool_size - 1.U,
-//     block_counter === blocks - 1.U)
-
-//   // Command tracker IO
-//   cmd_tracker.io.alloc.valid := control_state === waiting_for_command && cmd.valid && DoStore
-//   cmd_tracker.io.alloc.bits.bytes_to_read := Mux(!pooling_is_enabled, Mux(mvout_1d_enabled, mvout_1d_rows, rows*blocks), pool_total_rows) // TODO do we have to add upad and lpad to this?
-//   cmd_tracker.io.alloc.bits.tag.rob_id := cmd.bits.rob_id.bits
-
-//   cmd_tracker.io.request_returned.valid := io.dma.resp.fire // TODO use a bundle connect
-//   cmd_tracker.io.request_returned.bits.cmd_id := io.dma.resp.bits.cmd_id // TODO use a bundle connect
-//   cmd_tracker.io.request_returned.bits.bytes_read := 1.U
-//   cmd_tracker.io.cmd_completed.ready := io.completed.ready
-
-//   val cmd_id = RegEnableThru(cmd_tracker.io.alloc.bits.cmd_id, cmd_tracker.io.alloc.fire()) // TODO is this really better than a simple RegEnable?
-//   io.dma.req.bits.cmd_id := cmd_id
-
-//   io.completed.valid := cmd_tracker.io.cmd_completed.valid
-//   io.completed.bits := cmd_tracker.io.cmd_completed.bits.tag.rob_id
-
-//   io.busy := cmd.valid || cmd_tracker.io.busy
-
-//   // Row counter
-//   when (io.dma.req.fire) {
-//     when (!pooling_is_enabled) {
-//       //where does rows come from?
-//       //row_counter := wrappingAdd(row_counter, 1.U, rows)
-//       when(mvout_1d_enabled){
-//         pocol_counter := wrappingAdd(pocol_counter, 1.U, pool_ocols)
-//         porow_counter := wrappingAdd(porow_counter, 1.U, pool_orows, pocol_counter === pool_ocols - 1.U)
-//       }
-
-//       block_counter := wrappingAdd(block_counter, 1.U, blocks)
-//       row_counter := Mux(mvout_1d_enabled, wrappingAdd(row_counter, 1.U, mvout_1d_rows), wrappingAdd(row_counter, 1.U, rows, block_counter === blocks - 1.U))
-//     }.otherwise {
-//       wcol_counter := wrappingAdd(wcol_counter, 1.U, pool_size)
-//       wrow_counter := wrappingAdd(wrow_counter, 1.U, pool_size, wcol_counter === pool_size - 1.U)
-//       pocol_counter := wrappingAdd(pocol_counter, 1.U, pool_pocols, wrow_counter === pool_size - 1.U && wcol_counter === pool_size - 1.U)
-//       porow_counter := wrappingAdd(porow_counter, 1.U, pool_porows, pocol_counter === pool_pocols - 1.U && wrow_counter === pool_size - 1.U && wcol_counter === pool_size - 1.U)
-//     }
-
-//     assert(!(io.dma.req.bits.laddr.read_full_acc_row && blocks > 1.U), "Block-mvouts are not permitted when moving out full accumulator data")
-//     assert(!((pooling_is_enabled || mvout_1d_enabled) && blocks > 1.U), "Block-mvouts are not permitted when pooling")
-//   }
-
-//   // Control logic
-//   switch (control_state) {
-//     is (waiting_for_command) {
-//       when (cmd.valid) {
-//         when(DoConfig) {
-//           stride := config_stride
-
-//           activation := config_activation
-//           when (!config_acc_scale.asUInt.andR) {
-//             acc_scale := config_acc_scale.asTypeOf(acc_scale_t)
-//           }
-
-//           pool_size := config_pool_size
-//           pool_stride := config_pool_stride
-//           when (config_pool_stride =/= 0.U) {
-//             pool_out_dim := config_pool_out_dim
-//             pool_porows := config_porows
-//             pool_pocols := config_pocols
-//             pool_orows := config_orows
-//             pool_ocols := config_ocols
-//             pool_upad := config_upad
-//             pool_lpad := config_lpad
-//           }.elsewhen(config_pool_size =/= 0.U){
-//             pool_orows := config_orows
-//             pool_ocols := config_ocols
-//             pool_out_dim := config_pool_out_dim
-//           }
-//           cmd.ready := true.B
-//         }
-//         .elsewhen(config.has_normalizations.B && DoConfigNorm) {
-//           when (!config_set_stats_id_only.asBool) {
-//             igelu_qb := config_igelu_qb.asTypeOf(igelu_qb)
-//             igelu_qc := config_igelu_qc.asTypeOf(igelu_qc)
-//             when(config_iexp_q_const_type === 0.U) {
-//               iexp_qln2 := config_iexp_q_const.asTypeOf(iexp_qln2)
-//             }.elsewhen(config_iexp_q_const_type === 1.U) {
-//               iexp_qln2_inv := config_iexp_q_const.asTypeOf(iexp_qln2_inv)
-//             }
-//             activation := Cat(config_activation_msb, activation(1, 0)) // TODO: magic number
-//           }
-//           norm_stats_id := config_stats_id
-//           cmd.ready := true.B
-//         }
-//         .elsewhen(DoStore && cmd_tracker.io.alloc.fire()) {
-//           val next_state = Mux(pooling_is_enabled, pooling, sending_rows)
-//           control_state := Mux(io.dma.req.fire, next_state, waiting_for_dma_req_ready)
-//         }
-//       }
-//     }
-
-//     is (waiting_for_dma_req_ready) {
-//       when (io.dma.req.fire) {
-//         control_state := Mux(pooling_is_enabled, pooling, sending_rows)
-//       }
-//     }
-
-//     is (sending_rows) {
-//       val last_block = block_counter === blocks - 1.U && io.dma.req.fire
-//       val last_row = Mux(mvout_1d_enabled, row_counter === mvout_1d_rows - 1.U, row_counter === rows - 1.U) && io.dma.req.fire
-//       //normal mvout: row, 1D mvout: orows*ocols
-
-//       val only_one_dma_req = block_counter === 0.U && row_counter === 0.U // This is a special case when only one DMA request is made
-
-//       when ((last_block && last_row) || only_one_dma_req) {
-//         control_state := waiting_for_command
-//         cmd.ready := true.B
-//       }
-//     }
-
-//     is (pooling) {
-//       // TODO Is it really possible for all the counters to be 0 here?
-//       val last_row = (porow_counter === 0.U && pocol_counter === 0.U && wrow_counter === 0.U && wcol_counter === 0.U) ||
-//         (porow_counter === pool_porows - 1.U && pocol_counter === pool_pocols - 1.U &&
-//           wrow_counter === pool_size - 1.U && wcol_counter === pool_size - 1.U && io.dma.req.fire)
-
-//       when (last_row) {
-//         control_state := waiting_for_command
-//         cmd.ready := true.B
-//       }
-//     }
-//   }
-
-//   // Optimizations when features are disabled
-//   if (!config.has_normalizations) {
-//     current_localaddr.norm_cmd := NormCmd.RESET
-
-//     igelu_qb := DontCare
-//     igelu_qc := DontCare
-//     iexp_qln2 := DontCare
-//     iexp_qln2_inv := DontCare
-//     norm_stats_id := 0.U
-//   }
-
-//   // Performance counter
-//   CounterEventIO.init(io.counter)
-//   io.counter.connectEventSignal(CounterEvent.STORE_ACTIVE_CYCLE, control_state === sending_rows || control_state === pooling)
-//   io.counter.connectEventSignal(CounterEvent.STORE_POOLING_CYCLE, pooling_is_enabled)
-//   io.counter.connectEventSignal(CounterEvent.STORE_DMA_WAIT_CYCLE, control_state === waiting_for_dma_req_ready)
-//   io.counter.connectEventSignal(CounterEvent.STORE_SCRATCHPAD_WAIT_CYCLE, io.dma.req.valid && !io.dma.req.ready)
-
-//   if (use_firesim_simulation_counters) {
-//     PerfCounter(pooling_is_enabled, "pooling_cycles", "cycles during which store controller is max-pooling")
-//     PerfCounter(io.dma.req.valid && !io.dma.req.ready, "st_dma_wait_cycle", "cycles during which store controller is stalling for the DMA to be ready")
-//   }
-// }
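The deleted store controller drives its max-pooling loop nest with chained wrapping counters: wcol wraps into wrow, wrow into pocol, and pocol into porow, each advanced by Gemmini's wrappingAdd utility whenever a DMA request fires. A minimal, hypothetical sketch of the same wrap-and-carry pattern, written with plain registers instead of wrappingAdd (module and port names are illustrative; assumes chisel3):

// Illustrative sketch only: a wrap-on-enable counter whose wrap pulse enables the next counter
// in the chain, mirroring the pooling loop nest of the deleted controller.
import chisel3._

class WrappingCounter(w: Int) extends Module {
  val io = IO(new Bundle {
    val en = Input(Bool())          // increment this cycle
    val max = Input(UInt(w.W))      // wrap after reaching max - 1
    val count = Output(UInt(w.W))
    val wrapped = Output(Bool())    // high on the cycle the counter wraps back to zero
  })

  val count = RegInit(0.U(w.W))
  val atMax = count === io.max - 1.U

  when(io.en) {
    count := Mux(atMax, 0.U, count + 1.U)
  }

  io.count := count
  io.wrapped := io.en && atMax
}

class PoolingLoopNest(w: Int) extends Module {
  val io = IO(new Bundle {
    val step = Input(Bool())          // one DMA request issued this cycle
    val windowSize = Input(UInt(w.W)) // pooling window dimension
    val outCols = Input(UInt(w.W))    // pooled output columns
    val done = Output(Bool())         // outermost counter in the sketch wrapped
  })

  val wcol = Module(new WrappingCounter(w))
  val wrow = Module(new WrappingCounter(w))
  val pocol = Module(new WrappingCounter(w))

  wcol.io.en := io.step
  wcol.io.max := io.windowSize

  wrow.io.en := wcol.io.wrapped     // carry into the next loop level on wrap
  wrow.io.max := io.windowSize

  pocol.io.en := wrow.io.wrapped
  pocol.io.max := io.outCols

  io.done := pocol.io.wrapped
}

The deleted code expresses the same carry chain inline by passing the "previous counter just wrapped" condition as the enable argument of each wrappingAdd call.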